In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, recall_score, roc_auc_score, auc, roc_curve, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [2]:
# Read the dataset from the relative CSV path, then show the frame as the cell output.
data_final_preprocessed = pd.read_csv('../data/data_final.csv')
data_final_preprocessed

Unnamed: 0,text,cleaned_text,label,word_count,sentence_count,lexical_diversity,polarity,subjectivity
0,Donald Trump just couldn t wish all Americans ...,donald trump could wish american happy new yea...,1,599,28,0.435726,0.082132,0.599895
1,House Intelligence Committee Chairman Devin Nu...,house intelligence committee chairman devin nu...,1,331,11,0.595166,-0.005004,0.334098
2,"On Friday, it was revealed that former Milwauk...",friday revealed former milwaukee sheriff david...,1,689,25,0.480406,-0.012345,0.541969
3,"On Christmas day, Donald Trump announced that ...",christmas day donald trump announced would bac...,1,519,15,0.502890,-0.023118,0.394086
4,Pope Francis used his annual Christmas Day mes...,pope francis used annual christmas day message...,1,458,19,0.504367,-0.011722,0.495222
...,...,...,...,...,...,...,...,...
38575,NATO allies on Tuesday welcomed President Dona...,nato ally tuesday welcomed president donald tr...,0,497,15,0.523139,0.219279,0.447884
38576,"LexisNexis, a provider of legal, regulatory an...",lexisnexis provider legal regulatory business ...,0,137,6,0.583942,0.022222,0.077778
38577,In the shadow of disused Sovietera factories i...,shadow disused sovietera factory minsk street ...,0,352,16,0.596591,0.054382,0.426609
38578,Vatican Secretary of State Cardinal Pietro Par...,vatican secretary state cardinal pietro paroli...,0,219,8,0.621005,0.021993,0.377753


In [9]:
# (row count, column count) of the frame currently bound to this name.
data_final_preprocessed.shape

(38580, 8)

In [4]:
# Print per-column non-null counts and dtypes, plus the frame's memory usage.
data_final_preprocessed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38580 entries, 0 to 38579
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   text               38580 non-null  object 
 1   cleaned_text       38580 non-null  object 
 2   label              38580 non-null  int64  
 3   word_count         38580 non-null  int64  
 4   sentence_count     38580 non-null  int64  
 5   lexical_diversity  38580 non-null  float64
 6   polarity           38580 non-null  float64
 7   subjectivity       38580 non-null  float64
dtypes: float64(3), int64(3), object(2)
memory usage: 2.4+ MB


In [3]:
def make_data(df, text_col, label_col='label'):
    """Build a model-ready frame of TF-IDF features plus standardized numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. It is only read; the caller's object is never modified.
    text_col : str
        Name of the column holding the documents to vectorize.
    label_col : str, default 'label'
        Name of the target column. When it is numeric it is carried into the
        result unscaled, so class labels keep their original values.

    Returns
    -------
    X_final : pandas.DataFrame
        The TF-IDF columns followed by every numeric column of ``df`` in its
        original order (features standardized, ``label_col`` untouched),
        indexed like ``df``.
    tfidf_df : pandas.DataFrame
        The TF-IDF columns on their own, indexed like ``df``.

    Notes
    -----
    The vectorizer and the scaler are both fitted on every row of ``df``. If
    the result is split into train/test afterwards, the held-out rows have
    already contributed to the vocabulary, the IDF weights and the scaling
    statistics.
    """
    vectorizer = TfidfVectorizer(max_features=3000, ngram_range=(1, 2))
    tfidf_matrix = vectorizer.fit_transform(df[text_col])
    feature_names = vectorizer.get_feature_names_out()
    # Reuse df's index: the column-wise concat below aligns on index labels,
    # so a fresh RangeIndex would misalign rows whenever df is not 0..n-1.
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names, index=df.index)

    numeric_cols = df.select_dtypes(include=['number']).columns.tolist()

    # A vocabulary term spelled like a numeric column (e.g. the token "label")
    # would produce duplicate column names; prefix only those terms.
    numeric_names = set(numeric_cols)
    clashing_terms = {term: f'tfidf_{term}' for term in feature_names if term in numeric_names}
    if clashing_terms:
        tfidf_df = tfidf_df.rename(columns=clashing_terms)

    # The target is numeric too, but standardizing it would replace the class
    # ids with arbitrary floats, so it is excluded from scaling.
    feature_cols = [col for col in numeric_cols if col != label_col]

    # astype() returns a new frame, so the assignment below cannot touch df.
    numeric_df = df[numeric_cols].astype({col: 'float64' for col in feature_cols})
    if feature_cols:
        scaler = StandardScaler()
        numeric_df[feature_cols] = scaler.fit_transform(numeric_df[feature_cols])

    X_final = pd.concat([tfidf_df, numeric_df], axis=1)
    return X_final, tfidf_df

# NOTE(review): this rebinds the raw frame's name to the feature matrix returned by
# make_data, which no longer contains the 'cleaned_text' column. Re-running this cell
# without first reloading the CSV therefore fails on the column lookup.
data_final_preprocessed, tfidf_df = make_data(data_final_preprocessed, 'cleaned_text')


In [4]:
# Pull the target column out first, then keep every remaining column as a feature.
y = data_final_preprocessed['label']
X = data_final_preprocessed.drop(labels='label', axis='columns')

In [5]:
# Hold out 20% of the rows for evaluation; stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the split repeatable.
(X_train, X_test, y_train, y_test) = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [5]:
# Candidate classifiers keyed by the display name used in the printed reports.
# Every estimator is seeded with 42; LightGBM and CatBoost have their logging turned down.
models = dict([
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('LightGBM', LGBMClassifier(random_state=42, verbose=-1)),
    ('XGboost', XGBClassifier(random_state=42)),
    ('Catboost', CatBoostClassifier(random_state=42, verbose=0)),
])

In [18]:
# Fit each candidate on the training split, report its test-split metrics, and
# collect the recall for class 1 so the models can be ranked afterwards.
name_l, recall_l = [], []
for name, model in models.items():
    print(f'Model: {name}')

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(classification_report(y_test, y_pred))
    recall_s = recall_score(y_test, y_pred, pos_label=1)
    print(f'Recall Score: {recall_s}\n')
    print('-'*50)

    # Record the pair only once the model has been scored.
    name_l.append(name)
    recall_l.append(recall_s)

# One row per model, highest recall first.
results = (
    pd.DataFrame({'Model': name_l, 'Recall': recall_l})
    .sort_values(by='Recall', ascending=False)
)

Model: Random Forest
              precision    recall  f1-score   support

           0       0.96      0.99      0.98      4238
           1       0.99      0.95      0.97      3478

    accuracy                           0.97      7716
   macro avg       0.98      0.97      0.97      7716
weighted avg       0.97      0.97      0.97      7716

Recall Score: 0.9531339850488787

--------------------------------------------------
Model: LightGBM
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      4238
           1       0.99      0.98      0.98      3478

    accuracy                           0.99      7716
   macro avg       0.99      0.98      0.99      7716
weighted avg       0.99      0.99      0.99      7716

Recall Score: 0.9798734905117884

--------------------------------------------------
Model: XGboost
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      4238
           1   

In [19]:
# Display the per-model recall table, which was sorted with the highest recall first.
results

Unnamed: 0,Model,Recall
1,LightGBM,0.979873
3,Catboost,0.979586
2,XGboost,0.978148
0,Random Forest,0.953134
