<a href="https://colab.research.google.com/github/badhanamitroy/FakeNewsDetector/blob/main/Fake_news_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# =============================
# STEP 1: Mount Google Drive
# =============================
from google.colab import drive

# CSV locations on the mounted Drive: one file of fake articles, one of real
# articles. They are only read after the mount below has completed.
custom_fake_path = "/content/drive/MyDrive/Colab Notebooks/FakeNews/Fake.csv"
custom_real_path = "/content/drive/MyDrive/Colab Notebooks/RealNews/True.csv"

# Attach Google Drive to the Colab filesystem at this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# =============================
# STEP 2: Import Libraries
# =============================
import os
import re
import string

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [None]:
# =============================
# STEP 4: Load Custom Datasets
# =============================
def load_custom_dataset(path, true_label):
    """
    Read one news CSV and return it as a labelled table of statements.

    The 'title' and 'text' columns (whichever exist) are joined with a single
    space into a 'statement' column; missing values are treated as empty
    strings. Every row receives the same label and a 'source' tag.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    true_label : str
        Label assigned to every row. "fake" yields source "fake_csv"; any
        other value yields source "real_csv".

    Returns
    -------
    pandas.DataFrame or None
        Frame with columns 'statement', 'label' and 'source' (rows containing
        missing values dropped), or None when the path does not exist, is not
        a regular file, or the CSV has neither a 'title' nor a 'text' column.
    """
    print(f"\nAttempting to load: {path}")

    # Guard clauses: report an unusable path and give up early.
    if not os.path.exists(path):
        print(f"Error: File not found at {path}")
        return None
    if not os.path.isfile(path):
        print(f"Error: Path is a directory, not a file: {path}")
        return None

    raw = pd.read_csv(path)
    print(f"Loaded {len(raw)} rows. Columns: {raw.columns.tolist()}")

    # Collect the available text columns in a fixed order: title, then text.
    text_columns = [
        raw[column].fillna("").astype(str)
        for column in ("title", "text")
        if column in raw.columns
    ]
    if len(text_columns) == 0:
        print(f"Error: Could not find 'title' or 'text' columns")
        return None

    # Join the pieces with a single space between them.
    statement = text_columns[0]
    for extra in text_columns[1:]:
        statement = statement + " " + extra

    if true_label == "fake":
        origin = "fake_csv"
    else:
        origin = "real_csv"

    labelled = raw.assign(statement=statement, label=true_label, source=origin)

    # Only the statement, its label and its origin are kept.
    return labelled[["statement", "label", "source"]].copy().dropna()

# Read both labelled CSVs; each call returns None when its file is unusable.
df_fake = load_custom_dataset(custom_fake_path, "fake")
df_real = load_custom_dataset(custom_real_path, "real")

# Both frames are required by the later steps, so stop here if either is missing.
datasets_loaded = df_fake is not None and df_real is not None
if not datasets_loaded:
    raise ValueError("Cannot proceed without custom datasets")

fake_rows, real_rows = len(df_fake), len(df_real)
print(f"\n✓ Fake News Dataset: {fake_rows} rows")
print(f"✓ Real News Dataset: {real_rows} rows")


Attempting to load: /content/drive/MyDrive/Colab Notebooks/FakeNews/Fake.csv
Loaded 23481 rows. Columns: ['title', 'text', 'subject', 'date']

Attempting to load: /content/drive/MyDrive/Colab Notebooks/RealNews/True.csv
Loaded 21417 rows. Columns: ['title', 'text', 'subject', 'date']

✓ Fake News Dataset: 23481 rows
✓ Real News Dataset: 21417 rows


In [None]:
# =============================
# STEP 5: Merge All Datasets
# =============================
# The optional LIAR dataset is controlled by USE_LIAR_DATASET / df_liar_all, but
# no cell shown above defines either name. Read them defensively so a fresh
# kernel ("Restart & Run All") falls back to the two custom CSVs instead of
# raising NameError.
use_liar = bool(globals().get("USE_LIAR_DATASET", False))
liar_frame = globals().get("df_liar_all")

if use_liar and liar_frame is not None:
    # Tag the LIAR rows on a copy so re-running this cell never mutates df_liar_all.
    liar_tagged = liar_frame.assign(source="liar")
    combined = pd.concat([liar_tagged, df_fake, df_real], ignore_index=True)
else:
    combined = pd.concat([df_fake, df_real], ignore_index=True)

# Summary of the merged frame: size, class balance and per-source row counts.
print("\n" + "="*50)
print("COMBINED DATASET STATISTICS")
print("="*50)
print(f"Total dataset size: {combined.shape[0]} rows")
print("\nLabel distribution:")
print(combined['label'].value_counts())
print("\nLabel percentages:")
print(combined['label'].value_counts(normalize=True) * 100)
print("\nSource distribution:")
print(combined['source'].value_counts())


COMBINED DATASET STATISTICS
Total dataset size: 44898 rows

Label distribution:
label
fake    23481
real    21417
Name: count, dtype: int64

Label percentages:
label
fake    52.298543
real    47.701457
Name: proportion, dtype: float64

Source distribution:
source
fake_csv    23481
real_csv    21417
Name: count, dtype: int64


In [None]:
# =============================
# STEP 6: Preprocess Text
# =============================
def clean_text(text):
    """
    Normalise a piece of text for vectorisation.

    The input is converted with ``str`` and lower-cased. Tokens starting at
    "http" and then tokens starting at "www" are removed (each up to the next
    whitespace), ASCII punctuation is stripped, and runs of whitespace are
    collapsed to single spaces with the ends trimmed.

    Parameters
    ----------
    text : object
        Value to clean; anything accepted by ``str``.

    Returns
    -------
    str
        The cleaned text.
    """
    lowered = str(text).lower()

    # The two URL passes run one after the other, "http" first and then "www".
    without_http = re.sub(r"http\S+", "", lowered)
    without_urls = re.sub(r"www\S+", "", without_http)

    # Delete every character listed in string.punctuation.
    punctuation_remover = str.maketrans("", "", string.punctuation)
    letters_only = without_urls.translate(punctuation_remover)

    collapsed = re.sub(r'\s+', ' ', letters_only)
    return collapsed.strip()

# Normalise every statement in place, then keep only rows whose cleaned text
# is longer than 10 characters.
combined["statement"] = combined["statement"].apply(clean_text)
long_enough = combined["statement"].str.len() > 10
combined = combined[long_enough]

rows_after_cleaning = combined.shape[0]
print(f"\nDataset size after cleaning: {rows_after_cleaning} rows")


Dataset size after cleaning: 44889 rows


In [None]:
# =============================
# STEP 7: Train/Test Split
# =============================
# Features are the cleaned statements; targets are their labels.
statements = combined["statement"]
labels = combined["label"]

# 80/20 split, stratified on the label so both parts keep the same class mix;
# the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    statements,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

train_rows, test_rows = len(X_train), len(X_test)
print(f"\nTraining set: {train_rows} rows")
print(f"Test set: {test_rows} rows")
print(f"Training label distribution:\n{y_train.value_counts()}")


Training set: 35911 rows
Test set: 8978 rows
Training label distribution:
label
fake    18777
real    17134
Name: count, dtype: int64


In [None]:
# =============================
# STEP 8: TF-IDF + Logistic Regression
# =============================
print("\n" + "=" * 50)
print("TRAINING MODEL")
print("=" * 50)

# Vectoriser settings: keep at most 10,000 features built from 1- to 3-grams,
# dropping terms seen in fewer than 2 documents or in more than 95% of them.
tfidf_settings = dict(
    max_features=10000,
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.95,
)
vectorizer = TfidfVectorizer(**tfidf_settings)

# Fit the vocabulary on the training split only; the test split is transformed
# with that fitted vocabulary.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Classifier settings: C is the regularization parameter and
# class_weight='balanced' handles class imbalance.
classifier_settings = dict(
    max_iter=1000,
    random_state=42,
    C=1.0,
    class_weight='balanced',
)
model = LogisticRegression(**classifier_settings)
model.fit(X_train_tfidf, y_train)

print("✓ Model training complete!")


TRAINING MODEL
✓ Model training complete!


In [None]:
# =============================
# STEP 9: Evaluation
# =============================
print("\n" + "="*50)
print("MODEL EVALUATION")
print("="*50)

# Score the held-out test split with the trained classifier.
y_pred = model.predict(X_test_tfidf)
print("\nClassification Report:")
print(classification_report(y_test, y_pred, digits=4))

print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Rows are true labels and columns are predicted labels. With the fake/real
# labels used here, 'fake' occupies row/column 0, so 'real' plays the role of
# the positive class in the breakdown below. That breakdown is only meaningful
# for a two-class matrix, so it is skipped otherwise.
if cm.shape == (2, 2):
    tn, fp, fn, tp = cm.ravel()
    print(f"\n[True Negatives: {tn}, False Positives: {fp}]")
    print(f"[False Negatives: {fn}, True Positives: {tp}]")

# accuracy_score is imported with the other sklearn.metrics names in the
# import cell at the top of the notebook.
accuracy = accuracy_score(y_test, y_pred)
print(f"\n✓ Overall Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")


MODEL EVALUATION

Classification Report:
              precision    recall  f1-score   support

        fake     0.9917    0.9887    0.9902      4695
        real     0.9877    0.9909    0.9893      4283

    accuracy                         0.9898      8978
   macro avg     0.9897    0.9898    0.9897      8978
weighted avg     0.9898    0.9898    0.9898      8978


Confusion Matrix:
[[4642   53]
 [  39 4244]]

[True Negatives: 4642, False Positives: 53]
[False Negatives: 39, True Positives: 4244]

✓ Overall Accuracy: 0.9898 (98.98%)


In [None]:
# =============================
# STEP 10: Analyze Top Features
# =============================
print("\n" + "=" * 50)
print("TOP PREDICTIVE FEATURES")
print("=" * 50)

feature_names = vectorizer.get_feature_names_out()
coefficients = model.coef_[0]

# Sort feature indices once by weight (ascending): the most negative weights
# are listed as FAKE indicators, the most positive as REAL indicators.
weight_order = np.argsort(coefficients)
top_fake_indices = weight_order[:20]
top_real_indices = weight_order[-20:]

print("\nTop 20 words/phrases indicating FAKE news:")
for feature_idx in top_fake_indices:
    term, weight = feature_names[feature_idx], coefficients[feature_idx]
    print(f"  '{term}' (weight: {weight:.4f})")

print("\nTop 20 words/phrases indicating REAL news:")
# Walk the REAL slice backwards so the largest weight is printed first.
for feature_idx in reversed(top_real_indices):
    term, weight = feature_names[feature_idx], coefficients[feature_idx]
    print(f"  '{term}' (weight: {weight:.4f})")


TOP PREDICTIVE FEATURES

Top 20 words/phrases indicating FAKE news:
  'video' (weight: -8.8628)
  'via' (weight: -6.4218)
  'this' (weight: -5.4639)
  'is' (weight: -4.9503)
  'hillary' (weight: -4.8526)
  'just' (weight: -4.7428)
  'president trump' (weight: -4.6221)
  'that' (weight: -4.4794)
  'gop' (weight: -4.1859)
  'obama' (weight: -4.0987)
  'image' (weight: -3.6995)
  'mr' (weight: -3.6061)
  'america' (weight: -3.5017)
  'you' (weight: -3.4726)
  'breaking' (weight: -3.3245)
  'image via' (weight: -3.2364)
  'watch' (weight: -3.0806)
  'our' (weight: -3.0556)
  'american' (weight: -2.9285)
  'even' (weight: -2.7114)

Top 20 words/phrases indicating REAL news:
  'reuters' (weight: 18.1435)
  'said' (weight: 15.7858)
  'washington reuters' (weight: 7.1132)
  'on' (weight: 6.4436)
  'said on' (weight: 5.7021)
  'reuters the' (weight: 4.7447)
  'on wednesday' (weight: 4.6673)
  'on tuesday' (weight: 4.4180)
  'on thursday' (weight: 4.3009)
  'on friday' (weight: 4.1342)
  'us' (

In [None]:
# =============================
# STEP 11: Prediction Function
# =============================
def predict_news(news_text, show_details=True):
    """
    Predict whether a news article is fake or real.

    The text is normalised with ``clean_text``, vectorised with the fitted
    ``vectorizer`` and scored by the trained ``model``.

    Parameters
    ----------
    news_text : str
        Article text (title and/or body). Non-string values are converted
        with ``str``.
    show_details : bool, default True
        When True, print a preview of the text, the predicted label and the
        FAKE / REAL probabilities.

    Returns
    -------
    tuple
        ``(prediction, confidence)`` where ``prediction`` is the label
        returned by ``model.predict`` and ``confidence`` is the larger of the
        FAKE and REAL probabilities, as a percentage. When the cleaned text
        has fewer than 10 characters, ``("Error: Text too short", 0.0)`` is
        returned instead.
    """
    cleaned = clean_text(news_text)

    if len(cleaned) < 10:
        return "Error: Text too short", 0.0

    text_tfidf = vectorizer.transform([cleaned])
    prediction = model.predict(text_tfidf)[0]
    probabilities = model.predict_proba(text_tfidf)[0]

    # The columns of predict_proba follow model.classes_, so look each
    # probability up by its label rather than assuming a fixed position.
    percent_by_label = {
        label: probability * 100
        for label, probability in zip(model.classes_, probabilities)
    }
    fake_prob = percent_by_label.get("fake", 0.0)
    real_prob = percent_by_label.get("real", 0.0)

    if show_details:
        # clean_text accepts non-string input, so the preview must as well.
        preview = str(news_text)[:100]
        print(f"\n{'='*50}")
        print("PREDICTION RESULT")
        print(f"{'='*50}")
        print(f"Text preview: {preview}...")
        print(f"\nPrediction: {prediction.upper()}")
        print("Confidence scores:")
        print(f"  - FAKE: {fake_prob:.2f}%")
        print(f"  - REAL: {real_prob:.2f}%")
        print(f"{'='*50}\n")

    return prediction, max(fake_prob, real_prob)

In [None]:
# =============================
# STEP 12: Interactive Testing
# =============================
print("\n" + "="*50)
print("READY FOR PREDICTIONS")
print("="*50)
print("Use: predict_news('your news text here')\n")

# Test with actual samples from your datasets
print("Testing with samples from your datasets:\n")

if len(df_fake) > 0:
    print("=" * 60)
    print("TEST 1: Sample from Fake.csv")
    print("=" * 60)
    fake_sample = df_fake.iloc[0]['statement']
    predict_news(fake_sample[:500])  # Use first 500 chars

if len(df_real) > 0:
    print("=" * 60)
    print("TEST 2: Sample from True.csv")
    print("=" * 60)
    real_sample = df_real.iloc[0]['statement']
    predict_news(real_sample[:500])  # Use first 500 chars

print("\n" + "="*50)
print("Now try with your own text!")
print("="*50)

# Interactive user input loop: keeps prompting until the user types an exit
# keyword, the prompt is interrupted, or no interactive stdin is available.
while True:
    print("\nEnter news text to analyze (or type 'exit' to quit):")
    try:
        user_input = input("> ")
    except (EOFError, KeyboardInterrupt, NotImplementedError):
        # EOFError: stdin is closed; KeyboardInterrupt: the prompt was
        # interrupted; NotImplementedError: the frontend cannot answer input
        # requests (e.g. a non-interactive "Run All"). End the loop cleanly
        # instead of failing the cell.
        print("Exiting prediction mode. Goodbye!")
        break

    if user_input.lower().strip() in ['exit', 'quit', 'q']:
        print("Exiting prediction mode. Goodbye!")
        break

    if len(user_input.strip()) < 10:
        print("⚠️  Please enter a longer text (at least 10 characters)")
        continue

    predict_news(user_input)




READY FOR PREDICTIONS
Use: predict_news('your news text here')

Testing with samples from your datasets:

TEST 1: Sample from Fake.csv

PREDICTION RESULT
Text preview:  Donald Trump Sends Out Embarrassing New Year’s Eve Message; This is Disturbing Donald Trump just co...

Prediction: FAKE
Confidence scores:
  - FAKE: 98.32%
  - REAL: 1.68%

TEST 2: Sample from True.csv

PREDICTION RESULT
Text preview: As U.S. budget fight looms, Republicans flip their fiscal script WASHINGTON (Reuters) - The head of ...

Prediction: REAL
Confidence scores:
  - FAKE: 1.03%
  - REAL: 98.97%


Now try with your own text!

Enter news text to analyze (or type 'exit' to quit):

PREDICTION RESULT
Text preview: SHOCKING: Doctors Reveal That Drinking Lemon Water Every Morning Cures Diabetes! Big Pharma Doesn't ...

Prediction: FAKE
Confidence scores:
  - FAKE: 99.20%
  - REAL: 0.80%


Enter news text to analyze (or type 'exit' to quit):
