In [4]:
import numpy as np
import pandas as pd


In [13]:
# Read the two source CSVs and tag every row with its class label
# (1 = real article, 0 = fake article).
data_real = pd.read_csv('archive/True.csv').assign(label=1)
data_fake = pd.read_csv('archive/Fake.csv').assign(label=0)

In [14]:
# Stack both classes into one frame and shuffle the rows so the labels are
# interleaved. The shuffle is seeded: the later train/test split is seeded
# too, but it operates on this row order, so an unseeded shuffle would make
# the split (and the reported metrics) differ from run to run.
data = (
    pd.concat([data_real, data_fake])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [15]:
# Merge headline and body into a single text field for the model.
# Missing values are replaced with "" first: string + NaN yields NaN, and
# clean_text calls .lower() on every value, which fails on a float NaN.
data["content"] = data["title"].fillna("") + " " + data["text"].fillna("")

# Keep only the columns used downstream.
data = data[["content", "label"]]


In [16]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Stemmer instance shared by every clean_text call.
ps = PorterStemmer()

# Make sure the stopword corpus is available (the download is skipped when
# the package is already up to date), then keep the English list as a set
# for fast membership tests.
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

def clean_text(text):
    """Normalize one document into a space-separated string of stems.

    Steps, in order: lowercase, strip URLs, drop every character that is
    not a-z or whitespace, split on whitespace, discard English stopwords,
    and Porter-stem the remaining tokens.

    Relies on the module-level ``stop_words`` set and ``ps`` stemmer.
    """
    lowered = text.lower()
    without_urls = re.sub(r"http\S+", "", lowered)
    letters_only = re.sub(r"[^a-z\s]", "", without_urls)

    stems = []
    for token in letters_only.split():
        if token in stop_words:
            continue
        stems.append(ps.stem(token))
    return " ".join(stems)

# Run the normalizer over every document; the result feeds the vectorizer.
data["clean_text"] = data["content"].map(clean_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/animesh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF features, capped at the 5000 most frequent terms.
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))

# Keep the matrix sparse. Densifying ~45k documents x 5000 float64 features
# needs on the order of 1.8 GB, and both train_test_split and
# LogisticRegression accept scipy sparse input directly.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split, so vocabulary and IDF weights see the test documents.
# Consider splitting first and fitting on the training portion only.
X = tfidf.fit_transform(data["clean_text"])
y = data["label"].values


In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The split is stratified on the
# label so both parts keep the same class balance, and seeded so it is
# repeatable for a given row order.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)


In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Train the classifier; fit() returns the estimator itself, so the fitted
# model can be bound in one step.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score the held-out rows and print per-class precision/recall/F1.
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)



              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4696
           1       0.99      0.99      0.99      4284

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [20]:
import os

import joblib

# joblib.dump writes to the given path but does not create missing parent
# directories, so make sure the output folder exists before saving.
os.makedirs("model", exist_ok=True)

# Persist the classifier together with the fitted vectorizer: both are
# needed to score new text later.
joblib.dump(model, "model/fake_news_model.pkl")
joblib.dump(tfidf, "model/vectorizer.pkl")


['model/vectorizer.pkl']