In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from warnings import filterwarnings
import random

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, recall_score, roc_auc_score, auc, roc_curve, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

def set_seed(seed):
    """Seed the global random number generators used in this notebook.

    Applies ``seed`` to both Python's built-in ``random`` module and NumPy's
    legacy global generator (``np.random``).

    Parameters
    ----------
    seed : int
        Value handed unchanged to ``random.seed`` and ``np.random.seed``.
    """
    # The two generators are independent, so the seeding order is irrelevant.
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

# Suppress every warning for the rest of the session, then make the
# random/NumPy generators reproducible.
# NOTE(review): the filter is not limited to a category or module, so library
# warnings raised later in the notebook are hidden as well.
filterwarnings(action="ignore")
set_seed(seed=42)

In [None]:
# Prerequisite: execute eda.ipynb beforehand; it is the notebook that writes
# data_final.csv, which is read here.
data_final_preprocessed = pd.read_csv(filepath_or_buffer='../data/data_final.csv')
data_final_preprocessed

Unnamed: 0,text,cleaned_text,label,word_count,sentence_count,lexical_diversity,polarity,subjectivity
0,Donald Trump just couldn t wish all Americans ...,donald trump could wish american happy new yea...,1,599,28,0.435726,0.082132,0.599895
1,House Intelligence Committee Chairman Devin Nu...,house intelligence committee chairman devin nu...,1,331,11,0.595166,-0.005004,0.334098
2,"On Friday, it was revealed that former Milwauk...",friday revealed former milwaukee sheriff david...,1,689,25,0.480406,-0.012345,0.541969
3,"On Christmas day, Donald Trump announced that ...",christmas day donald trump announced would bac...,1,519,15,0.502890,-0.023118,0.394086
4,Pope Francis used his annual Christmas Day mes...,pope francis used annual christmas day message...,1,458,19,0.504367,-0.011722,0.495222
...,...,...,...,...,...,...,...,...
38345,Two North Korean shipments to a Syrian governm...,two north korean shipment syrian goveright now...,0,483,18,0.480331,-0.042119,0.380029
38346,"LexisNexis, a provider of legal, regulatory an...",lexisnexis provider legal regulatory business ...,0,137,6,0.583942,0.022222,0.077778
38347,In the shadow of disused Sovietera factories i...,shadow disused sovietera factory minsk street ...,0,352,16,0.596591,0.054382,0.426609
38348,Vatican Secretary of State Cardinal Pietro Par...,vatican secretary state cardinal pietro paroli...,0,219,8,0.621005,0.021993,0.377753


In [4]:
def make_data(df, text_col, max_features=3000, ngram_range=(1, 2), artifacts_dir='../artifacts'):
    """Build the modelling table: TF-IDF features + standardized numerics + label.

    Fits a ``TfidfVectorizer`` on ``df[text_col]`` and a ``StandardScaler`` on
    every numeric column of ``df`` other than ``'label'``. Both fitted
    transformers are pickled into ``artifacts_dir`` as ``vectorizer.pkl`` and
    ``scaler.pkl``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table. Must contain ``text_col`` and a ``'label'`` column.
        It is left unmodified.
    text_col : str
        Name of the column holding the documents to vectorize.
    max_features : int, default 3000
        Forwarded to ``TfidfVectorizer``.
    ngram_range : tuple of (int, int), default (1, 2)
        Forwarded to ``TfidfVectorizer``.
    artifacts_dir : str, default '../artifacts'
        Directory receiving the pickled transformers; created when missing.

    Returns
    -------
    X_final : pandas.DataFrame
        TF-IDF columns, then the scaled numeric columns, then ``'label'``,
        indexed like ``df``.
    tfidf_df : pandas.DataFrame
        Dense TF-IDF matrix alone, indexed like ``df``.

    Notes
    -----
    Both transformers are fit on *all* rows of ``df``. If the result is split
    into train/test afterwards, the held-out rows have already contributed to
    the vocabulary, IDF weights and scaling statistics, so evaluation scores
    may be optimistic.
    """
    import os  # function-scope import: not part of the notebook's import cell

    vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)
    X = vectorizer.fit_transform(df[text_col])

    # Make sure the output directory exists before the first artifact is written.
    os.makedirs(artifacts_dir, exist_ok=True)
    with open(os.path.join(artifacts_dir, 'vectorizer.pkl'), 'wb') as f:
        pickle.dump(vectorizer, f)

    feature_names = vectorizer.get_feature_names_out()
    # toarray() materializes the full (n_rows x n_features) dense matrix.
    # Reusing df.index keeps the rows aligned with df in the concat below even
    # when df does not carry a default RangeIndex.
    tfidf_df = pd.DataFrame(X.toarray(), columns=feature_names, index=df.index)

    numeric_cols = [
        col
        for col in df.select_dtypes(include=['number']).columns.tolist()
        if col != 'label'
    ]
    scaler = StandardScaler()
    # Scale into a new frame instead of writing back into ``df`` so the caller's
    # data is not silently changed and repeated calls see the same input.
    scaled_numeric = pd.DataFrame(
        scaler.fit_transform(df[numeric_cols]),
        columns=numeric_cols,
        index=df.index,
    )

    with open(os.path.join(artifacts_dir, 'scaler.pkl'), 'wb') as f:
        pickle.dump(scaler, f)

    # NOTE(review): a TF-IDF token that happens to equal a numeric column name
    # or 'label' would create duplicate column names here — confirm this cannot
    # occur for the vocabulary in use.
    X_final = pd.concat([tfidf_df, scaled_numeric, df['label']], axis=1)
    return X_final, tfidf_df

# Build the feature table from the cleaned text. The name
# `data_final_preprocessed` is rebound from the loaded frame to the feature
# table, so re-execute the load cell before running this cell a second time.
data_final_preprocessed, tfidf_df = make_data(
    df=data_final_preprocessed,
    text_col='cleaned_text',
)


In [5]:
# Separate the target column from the feature columns.
y = data_final_preprocessed['label']
X = data_final_preprocessed.drop(labels=['label'], axis=1)

In [6]:
# Hold out 20% of the rows for evaluation; stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [19]:
# Candidate classifiers keyed by display name. Insertion order is the order in
# which they are trained and reported; every estimator uses random_state=42.
models = {
    'Logistic Regression': LogisticRegression(
        max_iter=1000,
        random_state=42,
    ),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    # NOTE(review): 'recall' does not appear to be one of LightGBM's documented
    # `metric` values — confirm it is accepted by the installed version.
    'LightGBM': LGBMClassifier(
        objective='binary',
        metric='recall',
        random_state=42,
        verbose=-1,
    ),
    # NOTE(review): `use_label_encoder` is reported as deprecated in recent
    # XGBoost releases — TODO confirm against the installed version.
    'XGboost': XGBClassifier(
        objective='binary:logistic',
        eval_metric='auc',
        use_label_encoder=False,
        random_state=42,
        verbosity=0,
    ),
    'Catboost': CatBoostClassifier(
        loss_function='Logloss',
        eval_metric='Recall',
        random_state=42,
        verbose=0,
    ),
}

In [20]:
# Train each candidate on the training partition, report its metrics on the
# held-out partition, and collect the positive-class (label 1) recall.
name_l = []
recall_l = []
for name, model in models.items():
    print(f'Model: {name}')
    name_l.append(name)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(classification_report(y_true=y_test, y_pred=y_pred))
    recall_s = recall_score(y_true=y_test, y_pred=y_pred, pos_label=1)
    print(f'Recall Score: {recall_s}\n')
    recall_l.append(recall_s)

    print('-' * 50)

# Leaderboard: one row per model, highest recall first.
results = (
    pd.DataFrame({'Model': name_l, 'Recall': recall_l})
    .sort_values(by='Recall', ascending=False)
)

Model: Logistic Regression
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      4192
           1       0.98      0.97      0.98      3478

    accuracy                           0.98      7670
   macro avg       0.98      0.98      0.98      7670
weighted avg       0.98      0.98      0.98      7670

Recall Score: 0.9695227142035653

--------------------------------------------------
Model: Decision Tree
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      4192
           1       0.91      0.91      0.91      3478

    accuracy                           0.92      7670
   macro avg       0.92      0.92      0.92      7670
weighted avg       0.92      0.92      0.92      7670

Recall Score: 0.9148936170212766

--------------------------------------------------
Model: Random Forest
              precision    recall  f1-score   support

           0       0.96      0.99      0.97      419

In [21]:
# Display the recall leaderboard built in the evaluation cell, where it was
# sorted with the highest recall first.
results

Unnamed: 0,Model,Recall
5,Catboost,0.972685
3,LightGBM,0.970385
4,XGboost,0.970385
0,Logistic Regression,0.969523
2,Random Forest,0.945946
1,Decision Tree,0.914894


In [18]:
# Final model: a CatBoost classifier with the same settings as the 'Catboost'
# candidate, refit on the complete dataset (X, y — the training and held-out
# rows together) and then persisted. The recall figures reported earlier come
# from models fit on the training partition only.
model_best = CatBoostClassifier(
    loss_function='Logloss',
    eval_metric='Recall',
    random_state=42,
    verbose=0,
)
model_best.fit(X, y)

# Serialize the fitted model next to the other artifacts.
with open('../artifacts/model.pkl', mode='wb') as f:
    pickle.dump(model_best, f)