<a href="https://colab.research.google.com/github/badhanamitroy/FakeNewsDetector/blob/main/Fake_news_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# =============================
# STEP 1: Mount Google Drive
# =============================
from google.colab import drive
# Mount Google Drive so the dataset files stored there can be read from this runtime.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

# Single Drive folder that holds every dataset used in this notebook.
# Change it here only; all paths below are derived from it.
DATA_DIR = f"{DRIVE_MOUNT_POINT}/MyDrive/Colab Notebooks"

# Path to LIAR dataset (trailing slash is required: file names are appended to it directly)
liar_path = f"{DATA_DIR}/liar_dataset/"

# Paths to your custom datasets (replace with your actual file names)
custom_ds1 = f"{DATA_DIR}/Fake.csv"
custom_ds2 = f"{DATA_DIR}/True.csv"


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# =============================
# STEP 2: Import Libraries
# =============================
import pandas as pd
import numpy as np
import re, string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# =============================
# STEP 3: Load LIAR Dataset
# =============================
# Read the three LIAR splits; header=None because the TSV files are read without a header row.
liar_files = ("train.tsv", "valid.tsv", "test.tsv")
train_liar, valid_liar, test_liar = [
    pd.read_csv(liar_path + file_name, sep='\t', header=None)
    for file_name in liar_files
]

# Names assigned positionally to the 14 columns of every split.
cols = [
    "id", "label", "statement", "subject", "speaker", "job", "state", "party",
    "barely_true_counts", "false_counts", "half_true_counts",
    "mostly_true_counts", "pants_on_fire_counts", "context",
]
train_liar.columns = cols
valid_liar.columns = cols
test_liar.columns = cols
# Simplify LIAR labels → Fake / Real
def simplify_label_liar(x):
    """Collapse a LIAR label into the binary scheme used by this notebook.

    Returns "fake" for 'false', 'barely-true' and 'pants-fire' (exact,
    case-sensitive match); every other value is returned as "real".
    """
    fake_verdicts = ("false", "barely-true", "pants-fire")
    return "fake" if x in fake_verdicts else "real"

# Binarise the labels of each split, then discard every column except
# id / statement / label (the 'id' column is kept for now).
keep_cols = ("id", "statement", "label")
drop_cols = [name for name in cols if name not in keep_cols]
for liar_split in (train_liar, valid_liar, test_liar):
    liar_split["label"] = liar_split["label"].map(simplify_label_liar)
    liar_split.drop(columns=drop_cols, inplace=True)

In [None]:
# =============================
# STEP 4: Load Custom Datasets
# =============================
import os

def _is_fake_label(value):
    """Return True when a raw label value denotes the fake class."""
    token = str(value).strip().lower()
    if token in ("fake", "false", "0"):
        return True
    # A 0/1 column that contains gaps is parsed as float, so 0 arrives as
    # 0.0 and stringifies to "0.0"; compare numerically to catch that case.
    try:
        return float(token) == 0
    except ValueError:
        return False


def load_custom_dataset(path, default_label=None):
    """
    Load a custom CSV dataset and standardize it to 'statement' + 'label'.

    Format: the file must have 'title' and 'text' columns, plus a 'label'
    column unless `default_label` is given.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    default_label : optional
        Label applied to every row when the file has no 'label' column
        (e.g. a file that holds a single class). Ignored when the column
        exists. It is normalized like any other label value.

    Returns
    -------
    pandas.DataFrame or None
        Frame with columns 'statement' (title + " " + text, missing parts
        treated as empty strings) and 'label' ("fake" or "real"). Rows whose
        label is missing are dropped. Returns None, after printing a message,
        when `path` does not exist.

    Raises
    ------
    KeyError
        If 'title' or 'text' is missing, or if 'label' is missing and no
        `default_label` was supplied.
    """
    if not os.path.exists(path):
        print(f"Error: File not found at {path}. Please check the path and try again.")
        return None  # Signal "not loaded" to the caller instead of raising

    raw = pd.read_csv(path)

    # Combine title + text → statement
    statement = raw["title"].fillna("").astype(str) + " " + raw["text"].fillna("").astype(str)

    if "label" in raw.columns:
        labels = raw["label"]
    elif default_label is not None:
        labels = pd.Series(default_label, index=raw.index)
    else:
        raise KeyError(
            f"'label' column not found in {path}; "
            "pass default_label for files that hold a single class."
        )

    # Drop rows with a missing label BEFORE normalizing: normalizing first
    # would turn NaN into "real" and make the dropna a no-op.
    kept = pd.DataFrame({"statement": statement, "label": labels}).dropna(subset=["label"])

    # Normalize labels to "fake" / "real"
    return kept.assign(
        label=kept["label"].map(lambda v: "fake" if _is_fake_label(v) else "real")
    )

# Load both custom CSVs from the paths configured in STEP 1 (custom_ds1 / custom_ds2).
df_cust1, df_cust2 = (
    load_custom_dataset(csv_path) for csv_path in (custom_ds1, custom_ds2)
)

# Report the sizes only when both files were actually read.
if df_cust1 is None or df_cust2 is None:
    print("Failed to load one or both custom datasets due to file not found error.")
else:
    print(f"Custom Dataset 1 size: {len(df_cust1)}, Custom Dataset 2 size: {len(df_cust2)}")

Error: File not found at /content/drive/MyDrive/FakeNews/custom_dataset1.csv. Please check the path and try again.
Error: File not found at /content/drive/MyDrive/FakeNews/custom_dataset2.csv. Please check the path and try again.
Failed to load one or both custom datasets due to file not found error.


In [None]:
# =============================
# STEP 5: Merge All Datasets
# =============================
# LIAR train + validation splits, kept under their own name.
df_liar_all = pd.concat([train_liar, valid_liar], ignore_index=True)

# load_custom_dataset returns None when a file is missing. pd.concat would
# drop such entries without any message, so filter them explicitly and report
# which datasets are absent from the merge.
custom_frames = {"df_cust1": df_cust1, "df_cust2": df_cust2}
missing_custom = [name for name, frame in custom_frames.items() if frame is None]
if missing_custom:
    print("Warning: merging without custom dataset(s):", ", ".join(missing_custom))

frames_to_merge = [df_liar_all, test_liar]
frames_to_merge += [frame for frame in custom_frames.values() if frame is not None]
combined = pd.concat(frames_to_merge, ignore_index=True)

print("Dataset size:", combined.shape)
print(combined['label'].value_counts())

Dataset size: (12791, 3)
label
real    7134
fake    5657
Name: count, dtype: int64


In [None]:
# =============================
# STEP 6: Preprocess Text
# =============================
_URL_RE = re.compile(r"http\S+")
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def clean_text(text):
    """Lower-case `text`, strip URLs and remove ASCII punctuation.

    The input is converted with str() first, so non-string values are
    accepted. A URL is any run of non-whitespace characters starting with
    "http" (matched after lower-casing). Whitespace is left untouched.
    """
    lowered = str(text).lower()
    without_urls = _URL_RE.sub("", lowered)
    return without_urls.translate(_PUNCT_TABLE)

# Normalise every statement before vectorisation (overwrites the column).
combined["statement"] = combined["statement"].map(clean_text)

In [None]:
# =============================
# STEP 7: Train/Test Split
# =============================
# 80/20 split, stratified on the label so both parts keep the same
# fake/real ratio; the fixed seed makes the split repeatable.
statements = combined["statement"]
labels = combined["label"]
X_train, X_test, y_train, y_test = train_test_split(
    statements,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

In [None]:
# =============================
# STEP 8: TF-IDF + Logistic Regression
# =============================
# Unigrams + bigrams, at most 10000 features.
tfidf_options = {"max_features": 10000, "ngram_range": (1, 2)}
vectorizer = TfidfVectorizer(**tfidf_options)

# Fit the vectorizer on the training split only; the test split is only transformed.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

model = LogisticRegression(max_iter=500)
model.fit(X_train_tfidf, y_train)

In [None]:
# =============================
# STEP 9: Evaluation
# =============================
# Predict on the held-out split, then compute both summaries before printing.
y_pred = model.predict(X_test_tfidf)

report_text = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

print("Classification Report:\n", report_text)
print("Confusion Matrix:\n", matrix)

Classification Report:
               precision    recall  f1-score   support

        fake       0.60      0.44      0.51      1132
        real       0.63      0.76      0.69      1427

    accuracy                           0.62      2559
   macro avg       0.62      0.60      0.60      2559
weighted avg       0.62      0.62      0.61      2559

Confusion Matrix:
 [[ 501  631]
 [ 337 1090]]
