In [1]:
import pandas as pd
import numpy as np
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
import pandas as pd

df_fake = pd.read_csv("../data/Fake.csv", encoding="latin1")
df_true = pd.read_csv("../data/True.csv", encoding="latin1")

print("Fake shape:", df_fake.shape)
print("True shape:", df_true.shape)

df_fake.head()

Fake shape: (23481, 4)
True shape: (21417, 4)


Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Yearâ...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obamaâs Na...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [3]:
df_fake["label"] = 0   
df_true["label"] = 1   

df = pd.concat([df_fake, df_true], axis=0)

df = df.sample(frac=1, random_state=42).reset_index(drop=True)

df.head()

Unnamed: 0,title,text,subject,date,label
0,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",US_News,"February 13, 2017",0
1,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"April 5, 2017",1
2,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,politicsNews,"September 27, 2017",1
3,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",News,"May 22, 2017",0
4,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",politicsNews,"June 24, 2016",1


In [4]:
df["content"] = df["title"].fillna("") + " " + df["text"].fillna("")

X = df["content"]
y = df["label"]

print(X.iloc[0])
print(y.iloc[0])

Ben Stein Calls Out 9th Circuit Court: Committed a âCoup dâÃ©tatâ Against the Constitution 21st Century Wire says Ben Stein, reputable professor from, Pepperdine University (also of some Hollywood fame appearing in TV shows and films such as Ferris Bueller s Day Off) made some provocative statements on Judge Jeanine Pirro s show recently. While discussing the halt that was imposed on President Trump s Executive Order on travel. Stein referred to the judgement by the 9th Circuit Court in Washington state as a  Coup d tat against the executive branch and against the constitution.  Stein went on to call the Judges in Seattle  political puppets  and the judiciary  political pawns. Watch the interview below for the complete statements and note the stark contrast to the rhetoric of the leftist media and pundits who neglect to note that no court has ever blocked any Presidential orders in immigration in the past or discuss the legal efficacy of the halt or the actual text of the Executi

In [5]:
import re
import string

def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", " ", text)                         # remove URLs
    text = re.sub(r"\d+", " ", text)                             # remove digits
    text = text.translate(str.maketrans("", "", string.punctuation))  # remove punctuation
    text = re.sub(r"\s+", " ", text).strip()                     # remove extra spaces
    return text
X_clean = X.apply(clean_text)
X_clean.head()

0    ben stein calls out th circuit court committed...
1    trump drops steve bannon from national securit...
2    puerto rico expects us to lift jones act shipp...
3    oops trump just accidentally confirmed he leak...
4    donald trump heads for scotland to reopen a go...
Name: content, dtype: object

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X_clean, y, test_size=0.2, random_state=42
)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=5000)

xv_train = tfidf.fit_transform(X_train)
xv_test  = tfidf.transform(X_test)

xv_train.shape, xv_test.shape

((35918, 5000), (8980, 5000))

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

print("Training Logistic Regression...")

LR = LogisticRegression(max_iter=300)   # smaller max_iter
LR.fit(xv_train, y_train)

pred_lr = LR.predict(xv_test)
lr_acc = accuracy_score(y_test, pred_lr) * 100

print(f"Logistic Regression Accuracy: {lr_acc:.2f}%")

Training Logistic Regression...
Logistic Regression Accuracy: 98.78%


In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

print("Training Random Forest (smaller)...")

RFC = RandomForestClassifier(
    n_estimators=50,      # default is 100 – we cut it down
    max_depth=20,        # limit tree depth so it’s faster
    random_state=42,
    n_jobs=-1
)
RFC.fit(xv_train, y_train)

pred_rfc = RFC.predict(xv_test)
rfc_acc = accuracy_score(y_test, pred_rfc) * 100

print(f"Random Forest Accuracy: {rfc_acc:.2f}%")

Training Random Forest (smaller)...
Random Forest Accuracy: 99.57%


In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

print("Training Random Forest (smaller)...")

RFC = RandomForestClassifier(
    n_estimators=50,      # fewer trees = faster
    max_depth=20,        # limit depth
    random_state=42,
    n_jobs=-1
)
RFC.fit(xv_train, y_train)

pred_rfc = RFC.predict(xv_test)
rfc_acc = accuracy_score(y_test, pred_rfc) * 100

print(f"Random Forest Accuracy: {rfc_acc:.2f}%")

Training Random Forest (smaller)...
Random Forest Accuracy: 99.57%


In [18]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

print("Training Gradient Boosting (smaller)...")

GBC = GradientBoostingClassifier(
    n_estimators=50,     # smaller than default 100
    random_state=42
)
GBC.fit(xv_train, y_train)

pred_gbc = GBC.predict(xv_test)
gbc_acc = accuracy_score(y_test, pred_gbc) * 100

print(f"Gradient Boosting Accuracy: {gbc_acc:.2f}%")

Training Gradient Boosting (smaller)...
Gradient Boosting Accuracy: 99.52%


In [27]:
def manual_testing(news):
    new_df = pd.Series([news])

    new_df_clean = new_df.apply(clean_text)

    
    new_xv = tfidf.transform(new_df_clean)

    probabilities = RFC.predict_proba(new_xv)[0]

    fake_prob = probabilities[0] * 100
    real_prob = probabilities[1] * 100

    pred = RFC.predict(new_xv)[0]

    if pred == 0:
        return f"Prediction: FAKE NEWS\nConfidence: {fake_prob:.2f}%"
    else:
        return f"Prediction: REAL NEWS\nConfidence: {real_prob:.2f}%"

In [28]:
news = input("Enter a news headline or article: ")
print(manual_testing(news))

Enter a news headline or article:  NEW PRESIDENT IS DONALD TRUMP?


Prediction: FAKE NEWS
Confidence: 80.88%


In [26]:
import joblib

import os
if not os.path.exists("models"):
    os.makedirs("models")

joblib.dump(RFC, "models/fake_news_model.joblib")

joblib.dump(tfidf, "models/tfidf_vectorizer.joblib")

print("Model + Vectorizer saved successfully!")

Model + Vectorizer saved successfully!
