IMPORT LIBRARIES

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import accuracy_score, precision_score, f1_score

In [None]:
# Download the NLTK resources this notebook relies on: the stopword lists
# (read through nltk.corpus.stopwords) and WordNet (used by WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

LOAD DATASET

In [None]:
# Each archive holds articles of a single class, so the label is attached
# per file before the two frames are merged.
true_df = pd.read_csv('/content/True.csv.zip')
fake_df = pd.read_csv('/content/Fake.csv.zip')
true_df["label"] = 1  # Real
fake_df["label"] = 0  # Fake

# Combine datasets
df = pd.concat([true_df, fake_df], axis=0).reset_index(drop=True)

# Shuffle data. The seed is fixed so the row order -- and therefore the
# preview below and anything downstream that depends on row order -- is
# the same on every Restart & Run All.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

print(df.head())
print(df['label'].value_counts())

                                               title  \
0  DAUGHTER OF SUNNI MUSLIM, George Clooney Wife,...   
1  (Video) Conservative Pundit: Liberals Are Maki...   
2  Australian police seize record A$1 billion met...   
3  TRUMP FALSELY ACCUSED OF ENCOURAGING Police to...   
4  Obama urges China to address industrial excess...   

                                                text       subject  \
0  There s nothing like a Brit telling Americans ...     left-news   
1  While I m not a huge O Reilly fan, I do believ...      politics   
2  SYDNEY (Reuters) - Australian police said on F...     worldnews   
3  What Trump actually said was: Our police are a...      politics   
4  UNITED NATIONS (Reuters) - U.S. President Bara...  politicsNews   

                  date  label  
0         Apr 27, 2016      0  
1         Jun 24, 2015      0  
2   December 22, 2017       1  
3         Sep 20, 2016      0  
4  September 19, 2016       1  
label
0    23481
1    21417
Name: count, dtype: int64

In [None]:
# Summary statistics; `label` is the only numeric column, so it is the only
# one summarised here.
df.describe()

Unnamed: 0,label
count,44898.0
mean,0.477015
std,0.499477
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [None]:
# Per-column dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   label    44898 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.7+ MB


In [None]:
# (rows, columns) of the combined frame.
df.shape

(44898, 5)

In [None]:
# Column labels of the combined frame.
df.columns

Index(['title', 'text', 'subject', 'date', 'label'], dtype='object')

TEXT PREPROCESSING (STOPWORD REMOVAL, LEMMATIZATION/STEMMING)

In [None]:
# A set gives constant-time membership checks in the per-token stopword filter.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
# NOTE: the stemmer is instantiated but preprocess_text does not call it;
# only lemmatization is applied.
stemmer = PorterStemmer()

# Compiled once instead of on every call: matches any character that is not
# a lowercase ASCII letter or whitespace.
_NON_LETTER_RE = re.compile(r'[^a-z\s]')


# Preprocessing function
def preprocess_text(text):
    """Normalise one document into a space-separated string of lemmas.

    Steps: lowercase, delete every character other than a-z and whitespace
    (digits and punctuation are removed, not replaced by a space), split on
    whitespace, drop English stopwords, lemmatize each remaining token.

    Parameters
    ----------
    text : str
        Raw document text. Any non-string value (for example the float NaN
        pandas uses for a missing cell, or None) is treated as an empty
        document instead of raising AttributeError.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; "" when nothing remains.
    """
    if not isinstance(text, str):
        return ""

    # NOTE(review): in the sampled preview, the real-news rows open with a
    # "<CITY> (Reuters) -" dateline and "reuters" survives cleaning. If that
    # holds across the real class it is a label shortcut -- TODO confirm and
    # consider stripping it.
    letters_only = _NON_LETTER_RE.sub('', text.lower())

    # Filter stopwords first, then lemmatize what is left.
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in letters_only.split()
        if token not in stop_words
    ]
    return " ".join(lemmas)

# Clean every article body and keep the result next to the raw text.
df["clean_text"] = df["text"].map(preprocess_text)

# Side-by-side preview of raw vs. cleaned text for the first rows.
print(df.loc[:, ["text", "clean_text"]].head())

                                                text  \
0  There s nothing like a Brit telling Americans ...   
1  While I m not a huge O Reilly fan, I do believ...   
2  SYDNEY (Reuters) - Australian police said on F...   
3  What Trump actually said was: Our police are a...   
4  UNITED NATIONS (Reuters) - U.S. President Bara...   

                                          clean_text  
0  nothing like brit telling american support pre...  
1  huge reilly fan believe comment true think man...  
2  sydney reuters australian police said friday s...  
3  trump actually said police amazing local polic...  
4  united nation reuters u president barack obama...  


TRAINING CLASSIFIERS (MODELS)

In [None]:
y = df['label']

# Train-Test Split
# Split the cleaned text BEFORE vectorizing. Fitting TF-IDF on the whole
# corpus lets test documents influence the vocabulary and IDF weights,
# which leaks information into training. Same test_size and random_state
# as before, so the same rows land in each split.
text_train, text_test, y_train, y_test = train_test_split(
    df['clean_text'], y, test_size=0.2, random_state=42
)

# Learn the vocabulary / IDF weights from the training documents only, then
# project the held-out documents with that fitted vectorizer.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Full-corpus matrix kept under its original name for any later cell that
# refers to `X`; it is built with the train-fitted vectorizer.
X = vectorizer.transform(df['clean_text'])

In [None]:
# LOGISTIC REGRESSION
# Fit on the training split, predict the held-out split, report accuracy.
lr_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_true=y_test, y_pred=lr_pred))

Logistic Regression Accuracy: 0.9893095768374165


In [None]:
# NAIVE BAYES
# Fit on the training split, predict the held-out split, report accuracy.
nb_model = MultinomialNB().fit(X_train, y_train)
nb_pred = nb_model.predict(X_test)
print("Naive Bayes Accuracy:", accuracy_score(y_true=y_test, y_pred=nb_pred))


Naive Bayes Accuracy: 0.9307349665924276


In [None]:
# RANDOM FOREST
# Fit on the training split, predict the held-out split, report accuracy.
# random_state pins the forest's randomness so the score is repeatable.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_true=y_test, y_pred=rf_pred))

Random Forest Accuracy: 0.9975501113585746


EVALUATION OF MODELS

In [None]:
# Reuse the estimators already fitted in the three cells above instead of
# constructing and training identical ones a second time. They use the same
# hyperparameters and seed on the same training split, so the metrics are
# unchanged while the redundant training is skipped. This cell therefore
# requires the three model cells to have been run first.
models = {
    "Logistic Regression": lr_model,
    "Naive Bayes": nb_model,
    "Random Forest": rf_model,
}

# Loop through each fitted model and evaluate it on the held-out split
for name, model in models.items():
    y_pred = model.predict(X_test)     # Make predictions

    # Calculate metrics (precision and F1 are computed for the positive
    # class, label 1 = real news)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Display results
    print(f"\n{name} Results:")
    print(f" Accuracy : {accuracy:.4f}")
    print(f" Precision: {precision:.4f}")
    print(f" F1-score : {f1:.4f}")


Logistic Regression Results:
 Accuracy : 0.9893
 Precision: 0.9880
 F1-score : 0.9889

Naive Bayes Results:
 Accuracy : 0.9307
 Precision: 0.9323
 F1-score : 0.9275

Random Forest Results:
 Accuracy : 0.9976
 Precision: 0.9968
 F1-score : 0.9975
