In [1]:
import os
import sys
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from joblib import dump, load

from preprocess import preprocess_corpus_for_classical  # from your project

In [3]:
true_df = pd.read_csv("../../data/True.csv")
fake_df = pd.read_csv("../../data/Fake.csv", on_bad_lines='skip')

print("True shape:", true_df.shape)
print("Fake shape:", fake_df.shape)
print("True columns:", true_df.columns.tolist())
print("Fake columns:", fake_df.columns.tolist())

True shape: (21417, 4)
Fake shape: (23481, 4)
True columns: ['title', 'text', 'subject', 'date']
Fake columns: ['title', 'text', 'subject', 'date']


In [4]:
true_df.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [5]:
fake_df.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [6]:
true_df.isnull().sum()

Unnamed: 0,0
title,0
text,0
subject,0
date,0


In [7]:
fake_df.isnull().sum()

Unnamed: 0,0
title,0
text,0
subject,0
date,0


In [8]:
# Add binary label
true_df["label"] = "real"
fake_df["label"] = "fake"

def combine_title_text(df):
    title = df["title"].fillna("")
    text = df["text"].fillna("")
    return (title + ". " + text).str.strip()

true_df["statement"] = combine_title_text(true_df)
fake_df["statement"] = combine_title_text(fake_df)

true_df = true_df[["statement", "label"]]
fake_df = fake_df[["statement", "label"]]

full_df = pd.concat([true_df, fake_df], axis=0).reset_index(drop=True)
full_df = full_df.dropna(subset=["statement", "label"]).reset_index(drop=True)

print("Full dataset shape:", full_df.shape)
print("Label distribution:")
print(full_df["label"].value_counts(normalize=True))
full_df.head()


Full dataset shape: (44898, 2)
Label distribution:
label
fake    0.522985
real    0.477015
Name: proportion, dtype: float64


Unnamed: 0,statement,label
0,"As U.S. budget fight looms, Republicans flip t...",real
1,U.S. military to accept transgender recruits o...,real
2,Senior U.S. Republican senator: 'Let Mr. Muell...,real
3,FBI Russia probe helped by Australian diplomat...,real
4,Trump wants Postal Service to charge 'much mor...,real


In [9]:
full_df = full_df.sample(frac=1, random_state=42).reset_index(drop=True)
full_df

Unnamed: 0,statement,label
0,BREAKING: GOP Chairman Grassley Has Had Enough...,fake
1,Failed GOP Candidates Remembered In Hilarious ...,fake
2,Mike Pence’s New DC Neighbors Are HILARIOUSLY ...,fake
3,California AG pledges to defend birth control ...,real
4,AZ RANCHERS Living On US-Mexico Border Destroy...,fake
...,...,...
44893,Nigeria says U.S. agrees delayed $593 million ...,real
44894,Boiler Room #62 – Fatal Illusions. Tune in to ...,fake
44895,ATHEISTS SUE GOVERNOR OF TEXAS Over Display on...,fake
44896,Republican tax plan would deal financial hit t...,real


In [10]:
train_df, temp_df = train_test_split(
    full_df,
    test_size=0.30,
    random_state=42,
    stratify=full_df["label"]
)

valid_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    random_state=42,
    stratify=temp_df["label"]
)

print("Train:", train_df.shape, "Valid:", valid_df.shape, "Test:", test_df.shape)
print("Train label distribution:")
print(train_df["label"].value_counts(normalize=True))


Train: (31428, 2) Valid: (6735, 2) Test: (6735, 2)
Train label distribution:
label
fake    0.522973
real    0.477027
Name: proportion, dtype: float64


In [11]:
le = LabelEncoder()
le.fit(train_df["label"].astype(str))
dump(le,"label_encoder.pkl")
y_train = le.transform(train_df["label"].astype(str))
y_valid = le.transform(valid_df["label"].astype(str))
y_test  = le.transform(test_df["label"].astype(str))
print("Classes:", list(le.classes_))  # should be ['fake', 'real']

Classes: ['fake', 'real']


In [13]:
X_train_raw = train_df["statement"].astype(str)
X_valid_raw = valid_df["statement"].astype(str)
X_test_raw  = test_df["statement"].astype(str)

X_train = preprocess_corpus_for_classical(X_train_raw)
X_valid = preprocess_corpus_for_classical(X_valid_raw)
X_test  = preprocess_corpus_for_classical(X_test_raw)

print("Sample raw:", X_train_raw.iloc[0][:200], "...")
print("Sample processed:", X_train[0][:200], "...")


Sample raw: Trump ‘Diversity Council’ Member Threatens to Quit If Trump Ends DACA…Bye, Bye! [Video]. A member of President Trump s  Diversity Council  is threatening to quit because he opposes Trump s cancelation ...
Sample processed: trump divers council member threaten quit trump end daca bye bye video member presid trump divers council threaten quit oppos trump cancel daca bye bye trump divers council member tell may quit counci ...


In [14]:
vectorizer = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 2),
    stop_words="english"
)

X_train_tfidf = vectorizer.fit_transform(X_train)
X_valid_tfidf = vectorizer.transform(X_valid)
X_test_tfidf  = vectorizer.transform(X_test)
print("TF-IDF shape (train):", X_train_tfidf.shape)

dump(vectorizer,"vectorizer.pkl")

TF-IDF shape (train): (31428, 10000)


['vectorizer.pkl']

In [15]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    n_jobs=-1,
    random_state=42
)

rf.fit(X_train_tfidf, y_train)

# Validation performance
y_valid_pred = rf.predict(X_valid_tfidf)
val_acc = accuracy_score(y_valid, y_valid_pred)
print("Validation Accuracy:", round(val_acc * 100, 2), "%")
print("Validation Report:")
print(classification_report(y_valid, y_valid_pred, target_names=le.classes_))

# Test performance
y_test_pred = rf.predict(X_test_tfidf)
test_acc = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", round(test_acc * 100, 2), "%")
print("Test Report:")
print(classification_report(y_test, y_test_pred, target_names=le.classes_))


Validation Accuracy: 99.64 %
Validation Report:
              precision    recall  f1-score   support

        fake       1.00      1.00      1.00      3522
        real       0.99      1.00      1.00      3213

    accuracy                           1.00      6735
   macro avg       1.00      1.00      1.00      6735
weighted avg       1.00      1.00      1.00      6735

Test Accuracy: 99.81 %
Test Report:
              precision    recall  f1-score   support

        fake       1.00      1.00      1.00      3523
        real       1.00      1.00      1.00      3212

    accuracy                           1.00      6735
   macro avg       1.00      1.00      1.00      6735
weighted avg       1.00      1.00      1.00      6735



In [16]:
df = pd.read_csv("../../data/WELFake_Dataset.csv")
print("Loaded new dataset:", df.shape)
print(df.head())
print(df.columns)

Loaded new dataset: (72134, 4)
   Unnamed: 0                                              title  \
0           0  LAW ENFORCEMENT ON HIGH ALERT Following Threat...   
1           1                                                NaN   
2           2  UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...   
3           3  Bobby Jindal, raised Hindu, uses story of Chri...   
4           4  SATAN 2: Russia unvelis an image of its terrif...   

                                                text  label  
0  No comment is expected from Barack Obama Membe...      1  
1     Did they post their votes for Hillary already?      1  
2   Now, most of the demonstrators gathered last ...      1  
3  A dozen politically active pastors came here f...      0  
4  The RS-28 Sarmat missile, dubbed Satan 2, will...      1  
Index(['Unnamed: 0', 'title', 'text', 'label'], dtype='object')


In [17]:
TEXT_COL = "text"   # change to your actual text column
LABEL_COL = "label" # change to your actual label column

In [18]:
df["label_norm"] = df[LABEL_COL].astype(str).str.lower().str.strip()

df["label_norm"] = df["label_norm"].replace({
    "fake": "fake",
    "0": "fake",
    "real": "real",
    "1": "real"
})

# Keep only rows with labels that exist in label encoder
valid_labels = set(le.classes_)
df = df[df["label_norm"].isin(valid_labels)].dropna(subset=[TEXT_COL]).reset_index(drop=True)

print("After label normalization:", df.shape)
print(df["label_norm"].value_counts())


After label normalization: (72095, 5)
label_norm
real    37067
fake    35028
Name: count, dtype: int64


In [19]:
df.columns

Index(['Unnamed: 0', 'title', 'text', 'label', 'label_norm'], dtype='object')

In [20]:
def combine_title_text(df):
    title = df["title"].fillna("")
    text = df["text"].fillna("")
    return (title + ". " + text).str.strip()

In [21]:
df["text"] = combine_title_text(df)

In [22]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label,label_norm
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,1,real
1,1,,. Did they post their votes for Hillary already?,1,real
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,1,real
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...","Bobby Jindal, raised Hindu, uses story of Chri...",0,fake
4,4,SATAN 2: Russia unvelis an image of its terrif...,SATAN 2: Russia unvelis an image of its terrif...,1,real


In [23]:
df=df.drop(['title', 'Unnamed: 0'], axis=1)

In [24]:
df.head()

Unnamed: 0,text,label,label_norm
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,1,real
1,. Did they post their votes for Hillary already?,1,real
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,1,real
3,"Bobby Jindal, raised Hindu, uses story of Chri...",0,fake
4,SATAN 2: Russia unvelis an image of its terrif...,1,real


In [25]:
X_new_raw = df[TEXT_COL].astype(str)

X_new_proc = preprocess_corpus_for_classical(X_new_raw)

X_new_tfidf = vectorizer.transform(X_new_proc)

y_true_new = le.transform(df["label_norm"].astype(str))

print("TF-IDF shape (new data):", X_new_tfidf.shape)


TF-IDF shape (new data): (72095, 10000)


In [26]:
# Predictions on WELFake dataset
y_pred_new = rf.predict(X_new_tfidf)

acc_new = accuracy_score(y_true_new, y_pred_new)
report_new = classification_report(y_true_new, y_pred_new, target_names=le.classes_, zero_division=0)

print(f"\nTest Accuracy of Random Forest on NEW dataset: {acc_new*100:.2f} %")
print("\nClassification Report (Random Forest on new dataset):\n")
print(report_new)



Test Accuracy of Random Forest on NEW dataset: 17.86 %

Classification Report (Random Forest on new dataset):

              precision    recall  f1-score   support

        fake       0.25      0.35      0.29     35028
        real       0.03      0.02      0.02     37067

    accuracy                           0.18     72095
   macro avg       0.14      0.18      0.16     72095
weighted avg       0.14      0.18      0.15     72095



In [27]:
dump(rf,"classical_model.pkl")

['classical_model.pkl']