<a href="https://colab.research.google.com/github/aman685892/Fake-News-Classifier/blob/main/Fake_News_Classifier_%E2%80%93_TF_IDF_%2B_Logistic_Regression_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
import pandas as pd
import numpy as  np
import re
import string


In [29]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix


In [30]:
import joblib

In [31]:
# Read the fake and real article dumps; the CSVs are loaded straight from their .zip archives.
df_fake, df_true = map(
    pd.read_csv,
    ("/content/Fake.csv.zip", "/content/True.csv.zip"),
)

In [32]:
# Tag each source frame with its target class: 0 = Fake, 1 = Real.
for frame, target in ((df_fake, 0), (df_true, 1)):
    frame["label"] = target

In [33]:
# Stack fake and real articles into one frame, keeping only the columns used downstream.
keep_cols = ["title", "text", "label"]
df = pd.concat([df_fake[keep_cols], df_true[keep_cols]], ignore_index=True)


In [34]:
# Basic cleanup: replace missing text/title values with empty strings and force str dtype.
# Note: no rows are dropped here — missing values are only filled.
for col in ("text", "title"):
    df[col] = df[col].fillna("").astype(str)

In [35]:
# Combine title + text into a single field so the vectorizer sees both (stronger signal).
title_and_text = df["title"] + " " + df["text"]
df["input_text"] = title_and_text.str.strip()

# Sanity check: overall size and class balance of the combined dataset.
label_counts = df["label"].value_counts()
print("Dataset shape:", df.shape)
print(label_counts)

Dataset shape: (44898, 4)
label
0    23481
1    21417
Name: count, dtype: int64


In [36]:
# Train / Test split: hold out 20% of the rows, stratified on the label so both
# splits keep the same fake/real proportions; the fixed seed makes the split repeatable.
features, target = df["input_text"], df["label"]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    stratify=target,
    random_state=42,
)


In [37]:
# Two simple candidate models are built for comparison.
# Model A: TF-IDF features fed into Logistic Regression.
logreg_steps = [
    ("tfidf", TfidfVectorizer(max_df=0.7, stop_words="english")),
    ("clf", LogisticRegression(max_iter=1000)),
]
logreg_clf = Pipeline(steps=logreg_steps)

In [38]:
# Model B: TF-IDF features fed into Multinomial Naive Bayes (fast baseline).
nb_steps = [
    ("tfidf", TfidfVectorizer(max_df=0.7, stop_words="english")),
    ("clf", MultinomialNB()),
]
nb_clf = Pipeline(steps=nb_steps)

In [39]:
# Fit both pipelines on the training split.
logreg_clf.fit(X=X_train, y=y_train)
nb_clf.fit(X=X_train, y=y_train)

In [40]:
#  Evaluate
def evaluate(model, name, X=None, y=None):
    """Print accuracy, a classification report and a confusion matrix for a fitted model.

    Parameters
    ----------
    model : object
        Fitted estimator (or pipeline) exposing ``predict``.
    name : str
        Label shown in the banner line above the metrics.
    X, y : optional
        Inputs and true labels to score against. When both are omitted, the
        notebook-level ``X_test`` / ``y_test`` split is used, so existing
        two-argument calls behave exactly as before.

    Raises
    ------
    ValueError
        If only one of ``X`` / ``y`` is supplied, since mixing custom inputs
        with the global labels (or vice versa) would score mismatched data.
    """
    if (X is None) != (y is None):
        raise ValueError("Provide both X and y, or neither.")

    # Fall back to the global hold-out split when no data is passed explicitly.
    inputs = X_test if X is None else X
    truth = y_test if y is None else y

    y_pred = model.predict(inputs)
    acc = accuracy_score(truth, y_pred)
    print(f"\n====== {name} ======")
    print("Accuracy:", round(acc, 4))
    print("\nClassification Report:\n", classification_report(truth, y_pred, digits=4))
    print("Confusion Matrix:\n", confusion_matrix(truth, y_pred))

# Report metrics for both fitted pipelines, one after the other.
for fitted, title in (
    (logreg_clf, "TF-IDF + Logistic Regression"),
    (nb_clf, "TF-IDF + Naive Bayes"),
):
    evaluate(fitted, title)


Accuracy: 0.9857

Classification Report:
               precision    recall  f1-score   support

           0     0.9893    0.9834    0.9863      4696
           1     0.9819    0.9883    0.9851      4284

    accuracy                         0.9857      8980
   macro avg     0.9856    0.9859    0.9857      8980
weighted avg     0.9858    0.9857    0.9857      8980

Confusion Matrix:
 [[4618   78]
 [  50 4234]]

Accuracy: 0.9372

Classification Report:
               precision    recall  f1-score   support

           0     0.9375    0.9427    0.9401      4696
           1     0.9368    0.9311    0.9340      4284

    accuracy                         0.9372      8980
   macro avg     0.9372    0.9369    0.9370      8980
weighted avg     0.9372    0.9372    0.9372      8980

Confusion Matrix:
 [[4427  269]
 [ 295 3989]]


In [41]:
# Use the logistic-regression pipeline as the final model and try it on hand-written texts.
best_model = logreg_clf

samples = [
    "Breaking: Government announces major policy reform on education next month.",
    "Shocking! Scientists confirm aliens have secretly replaced the moon with a hologram.",
    "Ministry releases official GDP figures; growth beats expectations in Q2.",
]
preds = best_model.predict(samples)

# Show each sample next to its predicted class (1 = Real, anything else reported as Fake).
for sample_text, predicted in zip(samples, preds):
    if predicted == 1:
        verdict = "Real (1)"
    else:
        verdict = "Fake (0)"
    print("\nText:", sample_text)
    print("Prediction:", verdict)


Text: Breaking: Government announces major policy reform on education next month.
Prediction: Fake (0)

Text: Shocking! Scientists confirm aliens have secretly replaced the moon with a hologram.
Prediction: Fake (0)

Text: Ministry releases official GDP figures; growth beats expectations in Q2.
Prediction: Real (1)


In [42]:
#  Save vectorizer+model
# The pipeline already contains the vectorizer and the classifier, so saving the
# pipeline object alone is enough.
model_path = "/content/fake_news_pipeline.joblib"
joblib.dump(best_model, model_path)
# The message is built from the same variable so it cannot drift from the real location.
print(f"\nSaved model to {model_path}")


Saved model to /content/fake_news_pipeline.joblib


In [43]:
#  Quick helper for user input
def classify_text(text, model=None):
    """Classify a single piece of text and report both class probabilities.

    Parameters
    ----------
    text : str
        The headline/article text to classify.
    model : optional
        Fitted classifier exposing ``predict``, ``predict_proba`` and ``classes_``.
        Defaults to the notebook-level ``best_model``.

    Returns
    -------
    dict
        ``{"prediction": int, "proba_fake": float, "proba_real": float}`` where
        label 0 means fake and label 1 means real.

    Raises
    ------
    ValueError
        If the model's ``classes_`` does not contain both labels 0 and 1.
    """
    clf = best_model if model is None else model

    batch = [text]  # the model API expects an iterable of documents
    pred = clf.predict(batch)[0]
    proba = clf.predict_proba(batch)[0]

    # predict_proba columns follow the order of classes_; look the columns up by
    # label instead of assuming that column 0 is "fake" and column 1 is "real".
    class_order = list(clf.classes_)
    fake_col = class_order.index(0)
    real_col = class_order.index(1)

    return {
        "prediction": int(pred),
        "proba_fake": float(proba[fake_col]),
        "proba_real": float(proba[real_col]),
    }


In [44]:
# Example: classify one made-up headline and show the label with both class probabilities.
quick_result = classify_text("Experts report a new vaccine approved by the health authority.")
print("\nQuick test:", quick_result)



Quick test: {'prediction': 0, 'proba_fake': 0.5017755591716113, 'proba_real': 0.49822444082838874}


In [45]:
# Second spot check: a neutral, official-sounding sentence.
package_result = classify_text("Prime Minister announced a new economic package today.")
print(package_result)

{'prediction': 1, 'proba_fake': 0.3569778479806627, 'proba_real': 0.6430221520193373}


In [46]:
# NOTE(review): this sentence appears to be factually wrong, yet the recorded run
# labels it as real — presumably because the model only sees word statistics and
# cannot check facts. Treat predictions as a style signal; confirm before relying on them.
claim_result = classify_text("Prime Minister of india is nitish kumar")
print(claim_result)

{'prediction': 1, 'proba_fake': 0.1852470029672293, 'proba_real': 0.8147529970327707}
