In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from textstat import textstat
import re
import nltk
from joblib import Parallel, delayed
from scipy.sparse import hstack
from xgboost import XGBClassifier
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet

In [2]:
nltk.download('stopwords')
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [3]:
df_train=pd.read_csv("train.csv")
df_train.head()

Unnamed: 0.1,Unnamed: 0,title,text,subject,date,label
0,2619,Ex-CIA head says Trump remarks on Russia inter...,Former CIA director John Brennan on Friday cri...,politicsNews,"July 22, 2017",1
1,16043,YOU WON’T BELIEVE HIS PUNISHMENT! HISPANIC STO...,How did this man come to OWN this store? There...,Government News,"Jun 19, 2017",0
2,876,Federal Reserve governor Powell's policy views...,President Donald Trump on Thursday tapped Fede...,politicsNews,"November 2, 2017",1
3,19963,SCOUNDREL HILLARY SUPPORTER STARTS “TrumpLeaks...,Hillary Clinton ally David Brock is offering t...,left-news,"Sep 17, 2016",0
4,10783,NANCY PELOSI ARROGANTLY DISMISSES Questions on...,Pleading ignorance is a perfect ploy for Nancy...,politics,"May 26, 2017",0


In [4]:
df_test=pd.read_csv("test.csv")
df_test.head()

Unnamed: 0.1,Unnamed: 0,title,text,subject,date,label
0,8104,Conservatives Will HATE What Donald Trump Just...,Donald Trump isn t exactly a stranger to makin...,News,"February 14, 2016",0
1,7467,Trump victory may create new tension between U...,Donald Trump’s U.S. election victory may creat...,politicsNews,"November 9, 2016",1
2,9473,WATCH: Hundreds of ILLEGAL ALIENS Storm Senate...,A couple of quick questions come to mind when ...,politics,"Nov 9, 2017",0
3,276,"Democratic Senator Franken to resign: CNN, cit...",U.S. Democratic Senator Al Franken will announ...,politicsNews,"December 7, 2017",1
4,19274,GANG OF DOMESTIC TERRORISTS Violently Attack L...,***WARNING*** Violence is graphic***This Trump...,left-news,"Jan 21, 2017",0


In [6]:
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def clean_text(text):
    text = re.sub(r'[^a-zA-Z]', ' ', text)  
    text = text.lower()  
    words = word_tokenize(text)  
    words = [word for word in words if word not in stopwords.words('english')] 
    pos_tags = pos_tag(words) 
    lemmatized_words = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags]
    return ' '.join(lemmatized_words)

In [7]:
df_train['cleaned_title'] = df_train['title'].apply(clean_text)
df_train['cleaned_text'] = df_train['text'].apply(clean_text)
df_test['cleaned_title'] = df_test['title'].apply(clean_text)
df_test['cleaned_text'] = df_test['text'].apply(clean_text)

In [8]:
df_train.head()

Unnamed: 0.1,Unnamed: 0,title,text,subject,date,label,cleaned_title,cleaned_text
0,2619,Ex-CIA head says Trump remarks on Russia inter...,Former CIA director John Brennan on Friday cri...,politicsNews,"July 22, 2017",1,ex cia head say trump remark russia interferen...,former cia director john brennan friday critic...
1,16043,YOU WON’T BELIEVE HIS PUNISHMENT! HISPANIC STO...,How did this man come to OWN this store? There...,Government News,"Jun 19, 2017",0,believe punishment hispanic store owner swindl...,man come store information much fraudster exce...
2,876,Federal Reserve governor Powell's policy views...,President Donald Trump on Thursday tapped Fede...,politicsNews,"November 2, 2017",1,federal reserve governor powell policy view word,president donald trump thursday tap federal re...
3,19963,SCOUNDREL HILLARY SUPPORTER STARTS “TrumpLeaks...,Hillary Clinton ally David Brock is offering t...,left-news,"Sep 17, 2016",0,scoundrel hillary supporter start trumpleaks c...,hillary clinton ally david brock offer pay new...
4,10783,NANCY PELOSI ARROGANTLY DISMISSES Questions on...,Pleading ignorance is a perfect ploy for Nancy...,politics,"May 26, 2017",0,nancy pelosi arrogantly dismiss question crook...,plead ignorance perfect ploy nancy pelosi beli...


In [9]:
df_test.head()

Unnamed: 0.1,Unnamed: 0,title,text,subject,date,label,cleaned_title,cleaned_text
0,8104,Conservatives Will HATE What Donald Trump Just...,Donald Trump isn t exactly a stranger to makin...,News,"February 14, 2016",0,conservative hate donald trump say plan parent...,donald trump exactly stranger make large group...
1,7467,Trump victory may create new tension between U...,Donald Trump’s U.S. election victory may creat...,politicsNews,"November 9, 2016",1,trump victory may create new tension u islam i...,donald trump u election victory may create fre...
2,9473,WATCH: Hundreds of ILLEGAL ALIENS Storm Senate...,A couple of quick questions come to mind when ...,politics,"Nov 9, 2017",0,watch hundred illegal alien storm senate build...,couple quick question come mind law abide amer...
3,276,"Democratic Senator Franken to resign: CNN, cit...",U.S. Democratic Senator Al Franken will announ...,politicsNews,"December 7, 2017",1,democratic senator franken resign cnn cite source,u democratic senator al franken announce resig...
4,19274,GANG OF DOMESTIC TERRORISTS Violently Attack L...,***WARNING*** Violence is graphic***This Trump...,left-news,"Jan 21, 2017",0,gang domestic terrorist violently attack lone ...,warn violence graphic trump supporter use fire...


In [10]:
def feature_engineering(data):
    data['title_length'] = data['title'].apply(lambda x: len(str(x).split()))
    data['text_length'] = data['text'].apply(lambda x: len(str(x).split()))
    data['title_readability_score'] = data['title'].apply(lambda x: textstat.flesch_kincaid_grade(x))
    data['text_readability_score'] = data['text'].apply(lambda x: textstat.flesch_kincaid_grade(x))
    sia = SentimentIntensityAnalyzer()
    data['title_sentiment'] = data['title'].apply(lambda x: sia.polarity_scores(str(x))['compound'])
    data['text_sentiment'] = data['text'].apply(lambda x: sia.polarity_scores(str(x))['compound'])
    data['title_word_density'] = data['title_length'] / (data['text_length'] + 1)
    return data

In [11]:
df_train = feature_engineering(df_train)
df_test = feature_engineering(df_test)

In [12]:
tfidf_title = TfidfVectorizer(max_features=3000, ngram_range=(1, 2)) 
tfidf_text = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))

In [13]:
title_features_train = tfidf_title.fit_transform(df_train['cleaned_title'])
text_features_train = tfidf_text.fit_transform(df_train['cleaned_text'])


In [14]:
title_features_test = tfidf_title.transform(df_test['cleaned_title'])
text_features_test = tfidf_text.transform(df_test['cleaned_text'])


In [15]:
svd_title = TruncatedSVD(n_components=300, random_state=42)  # Reduce to 300 dimensions
svd_text = TruncatedSVD(n_components=300, random_state=42)

title_features_train = svd_title.fit_transform(title_features_train)
text_features_train = svd_text.fit_transform(text_features_train)
title_features_test = svd_title.transform(title_features_test)
text_features_test = svd_text.transform(text_features_test)

In [16]:
X_train = np.hstack((title_features_train, text_features_train, 
                     df_train[['title_length', 'text_length', 'title_sentiment', 
                                 'text_sentiment', 'title_word_density', 'title_readability_score', 'text_readability_score']].values))
X_test = np.hstack((title_features_test, text_features_test, 
                    df_test[['title_length', 'text_length', 'title_sentiment', 
                               'text_sentiment', 'title_word_density', 'title_readability_score', 'text_readability_score']].values))

In [17]:
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

In [18]:
y_train = df_train['label']
y_test = df_test['label']

In [21]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier

models = {
    'Random Forest': RandomForestClassifier(random_state=42, n_jobs=-1),  # Parallel processing
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'SVM': SVC(probability=True, random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, n_jobs=-1),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Extra Trees': ExtraTreesClassifier(random_state=42, n_jobs=-1),
    'Bagging': BaggingClassifier(random_state=42, n_jobs=-1)
}

In [22]:
results = []
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else None

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc_roc = roc_auc_score(y_test, y_proba) if y_proba is not None else 'N/A'

    print(f"{name} Results:")
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print("AUC-ROC:", auc_roc)
    results.append((name, accuracy, precision, recall, f1, auc_roc))

Training Random Forest...
Random Forest Results:
Accuracy: 0.9743558727470667
Precision: 0.9714928732183046
Recall: 0.9753954305799648
F1 Score: 0.9734402405412177
AUC-ROC: 0.9968063089324775
Training Logistic Regression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Results:
Accuracy: 0.9870569735091327
Precision: 0.979465611083622
Recall: 0.9939743911624404
F1 Score: 0.9866666666666667
AUC-ROC: 0.9989739891269924
Training SVM...
SVM Results:
Accuracy: 0.8109350429418168
Precision: 0.7529264214046822
Recall: 0.9043434597037409
F1 Score: 0.8217178054066385
AUC-ROC: 0.8839645993136563
Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



XGBoost Results:
Accuracy: 0.9866940849159308
Precision: 0.9823163138231631
Recall: 0.9902083856389656
F1 Score: 0.9862465616404101
AUC-ROC: 0.9991335725854489
Training Decision Tree...
Decision Tree Results:
Accuracy: 0.9392766420708842
Precision: 0.9443451621138627
Recall: 0.9286969620888778
F1 Score: 0.9364556962025317
AUC-ROC: 0.9389049703068105
Training AdaBoost...




AdaBoost Results:
Accuracy: 0.9674609894762308
Precision: 0.9628614157527418
Recall: 0.9698719558122019
F1 Score: 0.96635397123202
AUC-ROC: 0.995400327676472
Training Extra Trees...
Extra Trees Results:
Accuracy: 0.9698802467642433
Precision: 0.9641969169567379
Recall: 0.9736379613356766
F1 Score: 0.968894440974391
AUC-ROC: 0.9959559687964231
Training Bagging...
Bagging Results:
Accuracy: 0.9598403290189912
Precision: 0.9653326535814428
Recall: 0.9507908611599297
F1 Score: 0.9580065772830761
AUC-ROC: 0.9876056456560363


In [23]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import VotingClassifier

In [24]:
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [25]:
# Define Models with Hyperparameter Tuning
param_xgb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 6],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

xgb = GridSearchCV(XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42), 
                   param_xgb, cv=3, scoring='accuracy', n_jobs=-1)

param_rf = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
}

rf = GridSearchCV(RandomForestClassifier(random_state=42), param_rf, cv=3, scoring='accuracy', n_jobs=-1)

# Train Models
xgb.fit(X_train, y_train)
rf.fit(X_train, y_train)

# Ensemble Model
voting_clf = VotingClassifier(estimators=[
    ('XGBoost', xgb.best_estimator_),
    ('Random Forest', rf.best_estimator_),
    ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=42))
], voting='soft', n_jobs=-1)

voting_clf.fit(X_train, y_train)

# Evaluate Models
y_pred = voting_clf.predict(X_test)
y_proba = voting_clf.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_proba)

print("Voting Classifier Results:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("AUC-ROC:", auc_roc)


Parameters: { "use_label_encoder" } are not used.



Voting Classifier Results:
Accuracy: 0.9883875650175397
Precision: 0.983337478239244
Recall: 0.9927190559879487
F1 Score: 0.9880059970014993
AUC-ROC: 0.9993754385175277
