In [5]:
import pandas as pd
import numpy as np
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [3]:
# Load data: one CSV of real articles and one of fake articles.
true_df = pd.read_csv('../data/True.csv')
fake_df = pd.read_csv('../data/Fake.csv')

# Add label columns
true_df['label'] = 1  # Real
fake_df['label'] = 0  # Fake

# Combine and shuffle. The shuffle is seeded so that a fresh
# "Restart & Run All" yields the same row order, and therefore the same
# train/test split and metrics further down.
df = pd.concat([true_df, fake_df], ignore_index=True)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Combine title + text (optional but useful). Missing values are replaced by
# empty strings first; otherwise NaN + str yields NaN and the text cleaning
# step would fail on a float instead of a string.
df['content'] = df['title'].fillna('') + " " + df['text'].fillna('')

df.head()


Unnamed: 0,title,text,subject,date,label,content
0,Boiler Room EP #85.5 – Who’s Watching The Watc...,Tune in to the Alternate Current Radio Network...,US_News,"November 30, 2016",0,Boiler Room EP #85.5 – Who’s Watching The Watc...
1,"Australian senate rejects proposed visa, citiz...",SYDNEY (Reuters) - Australia s lawmakers have ...,worldnews,"October 19, 2017",1,"Australian senate rejects proposed visa, citiz..."
2,"Boiler Room EP #70 – Sticks, Stones & The Medi...",Tune in to the Alternate Current Radio Network...,US_News,"August 25, 2016",0,"Boiler Room EP #70 – Sticks, Stones & The Medi..."
3,Charles Koch Does A Brutally Honest Interview...,Charles Koch sat down with ABC News and did an...,News,"April 25, 2016",0,Charles Koch Does A Brutally Honest Interview...
4,LIBERAL LOSER Screams “This is my America!” Af...,,left-news,"Dec 19, 2016",0,LIBERAL LOSER Screams “This is my America!” Af...


In [6]:
# Shared preprocessing resources: the lemmatizer instance and the English
# stop-word list, held as a set for fast membership checks.
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

def clean_text(text):
    """Normalize raw article text into a space-separated string of tokens.

    The text is lowercased, every character that is neither a word character
    nor whitespace is removed, the remainder is tokenized, English stop words
    are dropped, and each surviving token is lemmatized.

    Relies on the module-level ``stop_words`` set and ``lemmatizer`` instance.

    Args:
        text: The raw text. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str()``.

    Returns:
        str: The cleaned tokens joined by single spaces ("" for missing input).
    """
    # Missing values would otherwise raise AttributeError on .lower().
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()  # Lowercase
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    words = re.findall(r"\b\w+\b", text)  # Tokenize words using regex
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return " ".join(words)

# Clean every article's combined content, then keep only the two columns the
# model needs: the cleaned text and its label.
df = df.assign(cleaned_content=df['content'].apply(clean_text))
df = df[['cleaned_content', 'label']]
df.head()

Unnamed: 0,cleaned_content,label
0,boiler room ep 855 who watching watcher tune a...,0
1,australian senate reject proposed visa citizen...,1
2,boiler room ep 70 stick stone medium hammer tu...,0
3,charles koch brutally honest interview republi...,0
4,liberal loser scream america elector vote trum...,0


In [7]:
# Features are the cleaned article text; the target is the 0/1 label.
X, y = df['cleaned_content'], df['label']

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Fit the TF-IDF vocabulary (capped at 5000 features) on the training text
# only, then apply that same fitted vectorizer to the test text.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [9]:
import os

# Fit a logistic-regression classifier on the TF-IDF training features.
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Save the model and vectorizer. The target directory is created first
# because joblib.dump does not create missing parent directories and would
# otherwise fail on a fresh checkout.
os.makedirs('../models', exist_ok=True)
joblib.dump(model, '../models/logistic_model.pkl')
joblib.dump(tfidf, '../models/tfidf_vectorizer.pkl')


['../models/tfidf_vectorizer.pkl']

In [10]:
# Predict labels for the held-out test features.
y_pred = model.predict(X_test_tfidf)

# Print each metric under its heading, comparing true and predicted labels.
for heading, metric in (
    ("Accuracy:", accuracy_score),
    ("\nClassification Report:\n", classification_report),
    ("\nConfusion Matrix:\n", confusion_matrix),
):
    print(heading, metric(y_test, y_pred))


Accuracy: 0.9880846325167038

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4677
           1       0.99      0.99      0.99      4303

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980


Confusion Matrix:
 [[4614   63]
 [  44 4259]]


In [11]:
# Sample prediction
def predict_news(text):
    """Classify a single piece of news text as "Real" or "Fake".

    The text is preprocessed with ``clean_text``, vectorized with the fitted
    ``tfidf`` vectorizer, and passed to ``model``. A predicted label of 1 maps
    to "Real"; anything else maps to "Fake".
    """
    features = tfidf.transform([clean_text(text)])
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return "Real"
    return "Fake"

# Try the classifier on one hand-written sentence.
sample_text = (
    "The prime minister announced a new vaccine today "
    "during the press conference."
)
print(predict_news(sample_text))


Fake


In [13]:
import pickle

# Write the fitted model and the fitted vectorizer as pickle files in the
# current working directory.
for artifact, filename in ((model, "model.pkl"), (tfidf, "vectorizer.pkl")):
    with open(filename, "wb") as f:
        pickle.dump(artifact, f)
