In [None]:
# Fake News Detection Using NLP + Multiple Machine Learning Models
# MASTER SCRIPT: ALL 15 MODELS + ACCURACY TABLE + PREDICTION

In [6]:

#  IMPORT LIBRARIES
# =================
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import PassiveAggressiveClassifier, LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


# LOAD DATASET
# ============
df = pd.read_csv("news.csv")
dataset = df.drop("Unnamed: 0", axis=1)

X = dataset["text"]
y = dataset["label"]


# TRAIN TEST SPLIT
# =================
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=53
)


#  DEFINE VECTORIZERS
# ===================
vectorizers = {
    "COUNT": CountVectorizer(stop_words="english", max_features=5000),
    "TFIDF": TfidfVectorizer(stop_words="english", max_features=5000),
    "HASHING": HashingVectorizer(stop_words="english", n_features=2**14, alternate_sign=False)
}


#  DEFINE MODELS
# ===============
models = {
    "MULTINOMIAL NB": MultinomialNB(),
    "PASSIVE AGGRESSIVE": PassiveAggressiveClassifier(max_iter=1000),
    "LOGISTIC REGRESSION": LogisticRegression(max_iter=1000),
    "LINEAR SVM": LinearSVC(),
    "RANDOM FOREST": RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
}


#  TRAIN ALL MODELS
# ================================
results = []

for vec_name, vectorizer in vectorizers.items():

    # FIT TRANSFORM (except Hashing)
    if vec_name == "HASHING":
        x_train_vec = vectorizer.transform(x_train)
        x_test_vec = vectorizer.transform(x_test)
    else:
        x_train_vec = vectorizer.fit_transform(x_train)
        x_test_vec = vectorizer.transform(x_test)

    for model_name, model in models.items():

        # Skip invalid combination: Hashing + Multinomial NB requires non-negative
        if vec_name == "HASHING" and model_name == "MULTINOMIAL NB":
            pass
        
        #  TRAIN MODEL
        model.fit(x_train_vec, y_train)
        y_pred = model.predict(x_test_vec)

        acc = accuracy_score(y_test, y_pred)

        results.append([f"{vec_name} + {model_name}", acc])


# ACCURACY TABLE WITH RANK 
# =========================
accuracy_df = pd.DataFrame(results, columns=["Model Combination", "Accuracy"])

# Sort by Accuracy (Descending)
accuracy_df = accuracy_df.sort_values(by="Accuracy", ascending=False).reset_index(drop=True)

# Add Rank Column
accuracy_df.insert(0, "Rank", accuracy_df.index + 1)

print("\n =======  FINAL RANKED ACCURACY TABLE =======  \n")
print(accuracy_df.to_string(index=False))   # 



#  SAMPLE PREDICTION DEMO
# =======================
sample_text = dataset["text"].iloc[2]

best_vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
best_model = LinearSVC()

x_train_best = best_vectorizer.fit_transform(x_train)
x_test_best = best_vectorizer.transform(x_test)

best_model.fit(x_train_best, y_train)

sample_vector = best_vectorizer.transform([sample_text])
prediction = best_model.predict(sample_vector)[0]

print("\n ======== SAMPLE PREDICTION ======== ")
print("News Text:\n", sample_text)
print("\nActual Label:", y.iloc[2])
print("Predicted Label (TF-IDF + SVM):", prediction)






 Rank             Model Combination  Accuracy
    1            TFIDF + LINEAR SVM  0.933719
    2    TFIDF + PASSIVE AGGRESSIVE  0.924250
    3          HASHING + LINEAR SVM  0.923724
    4         COUNT + RANDOM FOREST  0.918464
    5         TFIDF + RANDOM FOREST  0.917938
    6       HASHING + RANDOM FOREST  0.915834
    7   COUNT + LOGISTIC REGRESSION  0.915308
    8  HASHING + PASSIVE AGGRESSIVE  0.915308
    9   TFIDF + LOGISTIC REGRESSION  0.913730
   10 HASHING + LOGISTIC REGRESSION  0.908469
   11    COUNT + PASSIVE AGGRESSIVE  0.890058
   12        TFIDF + MULTINOMIAL NB  0.884271
   13      HASHING + MULTINOMIAL NB  0.883219
   14            COUNT + LINEAR SVM  0.882693
   15        COUNT + MULTINOMIAL NB  0.865860

News Text:
 U.S. Secretary of State John F. Kerry said Monday that he will stop in Paris later this week, amid criticism that no top American officials attended Sunday’s unity march against terrorism.

Kerry said he expects to arrive in Paris Thursday evening, 