In [5]:
# ===============================
# FEATURE EXTRACTION + BASELINE MODEL
# TF-IDF + Logistic Regression
# ===============================

import pandas as pd
import joblib
import os

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score


# ---------- LOAD DATA ----------
df = pd.read_csv("../data/raw/news.csv")

# Convert labels to numeric: FAKE -> 1, REAL -> 0.
# Whitespace and case are normalized first so that variants such as "fake"
# or " REAL " are mapped instead of silently becoming NaN and being dropped.
normalized_labels = df["label"].astype(str).str.strip().str.upper()
df["label"] = normalized_labels.map({"FAKE": 1, "REAL": 0})

# Anything that still failed to map (missing or unknown label) cannot be
# used for training; report how many rows are discarded instead of hiding it.
n_unlabeled = int(df["label"].isna().sum())
if n_unlabeled:
    print(f"Warning: dropping {n_unlabeled} row(s) with missing or unrecognized labels")

# .copy() gives an independent frame so the column assignments below do not
# write into a view of the original data.
df = df.dropna(subset=["label"]).copy()

# map() produces floats as soon as a single NaN is present; restore integer
# class labels so the classifier and the report see 0/1 rather than 0.0/1.0.
df["label"] = df["label"].astype("int64")

# Combine title + text. Both sides are NaN-filled before concatenation, so
# the result can never contain NaN and no further fillna is required.
df["content"] = df["title"].fillna("") + " " + df["text"].fillna("")

X = df["content"]
y = df["label"]

print("Dataset size:", X.shape)


# ---------- TRAIN / TEST SPLIT ----------
# test_size=0.2 holds out a fifth of the rows for evaluation, random_state
# fixes the shuffle seed, and stratify=y passes the labels so the split is
# stratified on them.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


# ---------- PIPELINE: TF-IDF + LOGISTIC REGRESSION ----------
# Text vectorizer: English stop-word list, max_df=0.7, at most 5000 features.
tfidf_step = TfidfVectorizer(stop_words="english", max_df=0.7, max_features=5000)

# Classifier: logistic regression with max_iter raised to 1000.
clf_step = LogisticRegression(max_iter=1000)

# Chain both stages so raw text goes in and class predictions come out.
model = Pipeline(steps=[("tfidf", tfidf_step), ("clf", clf_step)])


# ---------- TRAIN ----------
# Fit the vectorizer and the classifier on the training portion only.
model.fit(X_train, y_train)


# ---------- EVALUATE ----------
# Predict on the held-out rows, then print overall accuracy followed by the
# classification report.
y_pred = model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

print("\nClassification Report:\n")
report_text = classification_report(y_test, y_pred)
print(report_text)


# ---------- SAVE MODEL ----------
# Single source of truth for the artifact location; the target directory and
# the confirmation message are both derived from it.
model_path = "../models/fake_news_model.pkl"

# Create the output directory if it does not exist yet, then serialize the
# fitted pipeline.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(model, model_path)

print(f"\nModel saved as: {model_path}")

Dataset size: (6335,)
Accuracy: 0.920284135753749

Classification Report:

              precision    recall  f1-score   support

           0       0.93      0.91      0.92       634
           1       0.91      0.93      0.92       633

    accuracy                           0.92      1267
   macro avg       0.92      0.92      0.92      1267
weighted avg       0.92      0.92      0.92      1267


Model saved as: ../models/fake_news_model.pkl
