#### **1. Imports**

In [None]:
# Silence every warning for the rest of the session to keep cell output compact.
# NOTE(review): this is a blanket filter, so any library warning raised later in
# the notebook (e.g. convergence or deprecation warnings, if any) is hidden too.
import warnings
warnings.filterwarnings('ignore')

# basic imports
from pathlib import Path
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm


# sklearn components
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import joblib


# gensim for Word2Vec
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from tqdm import tqdm  # just for progress bars

# Single seed passed to the train/test split, TruncatedSVD and the seeded
# classifiers below so that repeated runs use the same random state.
RANDOM_STATE = 42

#### **2. Load Data**

In [None]:
# Load the cleaned dataset; the helper column "text_length" is dropped when present.
df = pd.read_csv("../data/processed/cleaned_data.csv")
df = df.drop(columns=["text_length"], errors="ignore")

# pandas reads empty CSV fields back as NaN. Without the fillna, astype(str)
# would turn such rows into the literal token "nan" instead of an empty document.
X = df["text"].fillna("").astype(str)
y = df["label"]

In [98]:
# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,text,label
0,ben stein call circuit court committed ‘coup d...,0
1,trump drop steve bannon national security coun...,1
2,puerto rico expects lift jones act shipping re...,1
3,oops trump accidentally confirmed leaked israe...,0
4,donald trump head scotland reopen golf resort ...,1


#### **3. Train-Test Split**

In [None]:
# Hold out 20% of the rows for testing; stratifying on the labels keeps the
# ratio of class 0 to class 1 the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)


#### **4. TF-IDF and SVD**

In [None]:
# TF-IDF settings:
# - max_features caps the vocabulary at 20,000 terms
# - ngram_range=(1, 2) extracts both unigrams and bigrams
# - min_df / max_df are left at their defaults
print("Fitting TF-IDF...")
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=20000)

# The vocabulary and IDF weights are learned from the training split only;
# the test split is merely transformed with the fitted vectorizer.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


Fitting TF-IDF...


SVD reduces the TF-IDF feature space from 20,000 sparse features to 300 dense components.

In [101]:
print("Fitting SVD...")
# Project the TF-IDF matrix onto 300 components.
svd = TruncatedSVD(random_state=RANDOM_STATE, n_components=300)

# As with TF-IDF, the decomposition is fitted on the training split only and
# then applied unchanged to the test split.
X_train_svd = svd.fit_transform(X_train_tfidf)
X_test_svd = svd.transform(X_test_tfidf)

Fitting SVD...


#### **5. Train Models on TF-IDF+SVD**

In [102]:
def train_and_evaluate(model, X_train, y_train, X_test, y_test, name):
    """Fit ``model`` on the training data and report its test-set performance.

    Prints a progress line, the accuracy (4 decimals) and the full
    classification report for the predictions on ``X_test``.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels used for scoring.
        name: Label used in the printed messages.

    Returns:
        Tuple ``(accuracy, model)`` where ``model`` is the same, now fitted,
        object that was passed in.
    """
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"{name} Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))
    return accuracy, model

**Train the models:**

In [103]:
# Collects [model name, test accuracy] rows; the Word2Vec section appends to it too.
results = []

# joblib.dump does not create missing folders, so make sure the target exists.
Path("../model").mkdir(parents=True, exist_ok=True)

models_tfidf = {
    "TFIDF_LogReg": LogisticRegression(max_iter=2000, n_jobs=-1),
    # Seeded like the other stochastic models so repeated runs give the same fit.
    "TFIDF_LinearSVC": LinearSVC(random_state=RANDOM_STATE),
    "TFIDF_RandomForest": RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE),
    "TFIDF_MLP": MLPClassifier(hidden_layer_sizes=(256,128), max_iter=20, random_state=RANDOM_STATE)
}

# Train each classifier on the SVD-reduced TF-IDF features, record its test
# accuracy and persist the fitted estimator under its own name.
for name, model in models_tfidf.items():
    acc, fitted = train_and_evaluate(model, X_train_svd, y_train, X_test_svd, y_test, name)
    results.append([name, acc])
    joblib.dump(fitted, f"../model/{name}.joblib")



Training TFIDF_LogReg...
TFIDF_LogReg Accuracy: 0.9783
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      3580
           1       0.97      0.99      0.98      4239

    accuracy                           0.98      7819
   macro avg       0.98      0.98      0.98      7819
weighted avg       0.98      0.98      0.98      7819


Training TFIDF_LinearSVC...
TFIDF_LinearSVC Accuracy: 0.9834
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      3580
           1       0.98      0.99      0.98      4239

    accuracy                           0.98      7819
   macro avg       0.98      0.98      0.98      7819
weighted avg       0.98      0.98      0.98      7819


Training TFIDF_RandomForest...
TFIDF_RandomForest Accuracy: 0.9416
              precision    recall  f1-score   support

           0       0.96      0.91      0.93      3580
           1       0.93      0.97      0.95      42

#### **6. WORD2VEC**

In [105]:
print("Loading Word2Vec from gensim downloader...")

import gensim.downloader as api

# Load the pretrained "word2vec-google-news-300" vectors via gensim's downloader.
# NOTE(review): presumably fetched over the network on first use and cached
# locally afterwards — confirm before running offline.
w2v = api.load("word2vec-google-news-300")

# Read the embedding width from the loaded vectors instead of hard-coding 300,
# so the fallback zero vector always matches the model actually in use.
EMB_DIM = w2v.vector_size

print("Word2Vec loaded successfully!")


Loading Word2Vec from gensim downloader...
Word2Vec loaded successfully!


**Converting text -> average word embeddings**

In [None]:
# Helper function that converts a text into one averaged word vector.
def text_to_w2v(text, model=None, dim=None):
    """Return the mean embedding of the whitespace-separated words in ``text``.

    Args:
        text: Document as a string; it is split on whitespace.
        model: Embedding lookup supporting ``word in model`` and
            ``model[word]``. Defaults to the notebook-level ``w2v``.
        dim: Length of the zero vector returned when no word is found.
            Defaults to the notebook-level ``EMB_DIM``.

    Returns:
        A 1-D array: the element-wise mean of the vectors of all words found
        in ``model``, or float32 zeros of length ``dim`` when none is found.
    """
    if model is None:
        model = w2v
    if dim is None:
        dim = EMB_DIM

    vecs = [model[word] for word in text.split() if word in model]

    if not vecs:
        # np.zeros defaults to float64. The pretrained vectors are presumably
        # float32, so a float64 fallback would upcast the whole stacked matrix
        # as soon as one document has no known word.
        return np.zeros(dim, dtype=np.float32)

    return np.mean(vecs, axis=0)


In [None]:
# Embed every train and test document as one averaged Word2Vec vector
# (tqdm only wraps the iteration to show a progress bar).
print("Converting train to W2V...")
X_train_w2v = np.array(list(map(text_to_w2v, tqdm(X_train))))

print("Converting test to W2V...")
X_test_w2v = np.array(list(map(text_to_w2v, tqdm(X_test))))


Converting train to W2V...


100%|██████████| 31273/31273 [00:25<00:00, 1250.54it/s]


Converting test to W2V...


100%|██████████| 7819/7819 [00:02<00:00, 2611.41it/s]


#### **7. Train Models on Word2Vec**

In [None]:
# joblib.dump does not create missing folders, so make sure the target exists.
Path("../model").mkdir(parents=True, exist_ok=True)

# Same four classifier families as in the TF-IDF section, trained on the
# averaged Word2Vec features.
models_w2v = {
    "W2V_LogReg": LogisticRegression(max_iter=2000),
    # Seeded like the other stochastic models so repeated runs give the same fit.
    "W2V_LinearSVC": LinearSVC(random_state=RANDOM_STATE),
    "W2V_RandomForest": RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE),
    "W2V_MLP": MLPClassifier(hidden_layer_sizes=(256,128), max_iter=20, random_state=RANDOM_STATE)
}

# Train, score and persist each model; accuracies are appended to the shared
# `results` list so both feature sets appear in the final comparison.
for name, model in models_w2v.items():
    acc, fitted = train_and_evaluate(model, X_train_w2v, y_train, X_test_w2v, y_test, name)
    results.append([name, acc])
    joblib.dump(fitted, f"../model/{name}.joblib")



Training W2V_LogReg...
W2V_LogReg Accuracy: 0.9450
              precision    recall  f1-score   support

           0       0.95      0.93      0.94      3580
           1       0.94      0.96      0.95      4239

    accuracy                           0.95      7819
   macro avg       0.95      0.94      0.94      7819
weighted avg       0.95      0.95      0.94      7819


Training W2V_LinearSVC...
W2V_LinearSVC Accuracy: 0.9609
              precision    recall  f1-score   support

           0       0.96      0.95      0.96      3580
           1       0.96      0.97      0.96      4239

    accuracy                           0.96      7819
   macro avg       0.96      0.96      0.96      7819
weighted avg       0.96      0.96      0.96      7819


Training W2V_RandomForest...
W2V_RandomForest Accuracy: 0.9206
              precision    recall  f1-score   support

           0       0.93      0.89      0.91      3580
           1       0.91      0.95      0.93      4239

    accu

#### **8. FINAL COMPARISON**

In [109]:
# Build the leaderboard from the collected [name, accuracy] rows, best model first.
results_df = (
    pd.DataFrame(results, columns=["Model", "Accuracy"])
    .sort_values(by="Accuracy", ascending=False)
)

print("\nFinal Model Comparison:")
display(results_df)



Final Model Comparison:


Unnamed: 0,Model,Accuracy
3,TFIDF_MLP,0.989001
1,TFIDF_LinearSVC,0.983374
0,TFIDF_LogReg,0.978258
7,W2V_MLP,0.972375
5,W2V_LinearSVC,0.960865
4,W2V_LogReg,0.945006
2,TFIDF_RandomForest,0.941553
6,W2V_RandomForest,0.920578


#### **9. SAVE Transformations**

In [110]:
# Persist the fitted TF-IDF vectorizer and SVD projection so they can be
# reloaded together with the saved classifiers.
joblib.dump(value=tfidf, filename="../model/tfidf_vectorizer.joblib")
joblib.dump(value=svd, filename="../model/svd_300.joblib")

['../model/svd_300.joblib']