In [2]:
import pandas as pd

In [3]:
# Reload data
fake = pd.read_csv(r"C:\projects\WEEK 30\fake_news_detector\data\Fake.csv")
true = pd.read_csv(r"C:\projects\WEEK 30\fake_news_detector\data\True.csv")

# Balance: sample equal amount
min_len = min(len(fake), len(true))
fake = fake.sample(min_len)
true = true.sample(min_len)

# Label and merge
fake["label"] = 0
true["label"] = 1
data = pd.concat([fake, true]).sample(frac=1).reset_index(drop=True)


In [4]:
print(data.head())

                                               title  \
0  Kremlin dismisses Trump's 'imperialist' securi...   
1  Iran says no need to increase missile range as...   
2  Chorus of sexual harassment allegations spread...   
3   Watch Elizabeth Warren Clobber Trump And Scot...   
4     Uber CEO quits Trump's business advisory group   

                                                text       subject  \
0  MOSCOW (Reuters) - The Kremlin dismissed U.S. ...     worldnews   
1  LONDON (Reuters) - Iran has no need to increas...     worldnews   
2  (Reuters) - Minnesota Governor Mark Dayton on ...  politicsNews   
3  Massachusetts Senator Elizabeth Warren once ag...          News   
4  SAN FRANCISCO/WASHINGTON (Reuters) - Uber Tech...  politicsNews   

                 date  label  
0  December 19, 2017       1  
1   October 31, 2017       1  
2   November 9, 2017       1  
3       June 19, 2016      0  
4   February 2, 2017       1  


In [5]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text.lower())
    words = text.split()
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    return " ".join(words)

data["full_text"] = data["title"] + " " + data["text"]
data["clean_text"] = data["full_text"].apply(clean_text)


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(data["clean_text"])
y = data["label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)


In [7]:
from sklearn.metrics import classification_report
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.99      0.98      0.99      4268
           1       0.98      0.99      0.99      4299

    accuracy                           0.99      8567
   macro avg       0.99      0.99      0.99      8567
weighted avg       0.99      0.99      0.99      8567



In [8]:
import joblib
import os

os.makedirs("model", exist_ok=True)
joblib.dump(model, "model/fake_news_model.pkl")
joblib.dump(vectorizer, "model/tfidf_vectorizer.pkl")


['model/tfidf_vectorizer.pkl']

In [9]:
data['label'].value_counts()


label
1    21417
0    21417
Name: count, dtype: int64