In [10]:
"""
Preprocessing for Fake News Dataset
-------------------------------------------
- Loads raw dataset
- Cleans text (HTML tags, punctuation, stopwords)
- Normalizes and saves processed CSV
"""

import os
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords

# Make sure stopwords are downloaded
nltk.download("stopwords")

RAW_PATH = "data/raw/fake_news.csv"
PROCESSED_PATH = "data/processed/fake_news_clean.csv"

def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = re.sub(r"<.*?>", " ", text)             # remove HTML tags
    text = re.sub(r"http\S+|www\S+", " ", text)    # remove URLs
    text = re.sub(r"[^a-zA-Z\s]", " ", text)       # remove punctuation/numbers
    text = text.lower()                            # lowercase
    stop_words = set(stopwords.words("english"))
    tokens = [word for word in text.split() if word not in stop_words]
    return " ".join(tokens)

def main():
    if not os.path.exists(RAW_PATH):
        raise FileNotFoundError(f"❌ Dataset not found at {RAW_PATH}")

    df = pd.read_csv(RAW_PATH)

    # Handle common column name variations
    possible_text_cols = [c for c in df.columns if c.lower() in ["text", "content", "article", "body"]]
    possible_label_cols = [c for c in df.columns if c.lower() in ["label", "target", "fake", "class"]]

    if not possible_text_cols or not possible_label_cols:
        raise ValueError("Could not find appropriate text/label columns.")

    text_col = possible_text_cols[0]
    label_col = possible_label_cols[0]

    df = df[[text_col, label_col]].rename(columns={text_col: "text", label_col: "label"})
    print(f"✅ Loaded {len(df)} rows. Cleaning text...")

    df["clean_text"] = df["text"].apply(clean_text)
    df = df.dropna(subset=["clean_text", "label"]).reset_index(drop=True)

    os.makedirs(os.path.dirname(PROCESSED_PATH), exist_ok=True)
    df.to_csv(PROCESSED_PATH, index=False)
    print(f"💾 Saved cleaned data → {PROCESSED_PATH}")

if __name__ == "__main__":
    main()


[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1032)>


✅ Loaded 72134 rows. Cleaning text...
💾 Saved cleaned data → data/processed/fake_news_clean.csv
