In [1]:
"""
Feature Engineering for Fake News Detection
---------------------------------------------------
- Loads cleaned data
- Converts text into dense embeddings using SentenceTransformer
- Creates train/test split with validation set
- Scales features and saves metadata
- Includes GPU support, reproducibility, and detailed logging
"""

import os
import pandas as pd
import numpy as np
import logging
import torch
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib
from datetime import datetime
import json

# ============================================================
# CONFIGURATION
# ============================================================
# CSV of cleaned articles that main() loads with pandas.
PROCESSED_PATH = "data/processed/fake_news_clean.csv"
# Default output directory for save_features().
FEATURE_DIR = "data/processed/features"
# Location of the metadata JSON inside FEATURE_DIR.
METADATA_FILE = os.path.join(FEATURE_DIR, "metadata.json")
# Default SentenceTransformer model identifier for generate_embeddings().
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# Default batch size handed to the model's encode() call.
BATCH_SIZE = 64
# Default test and validation portions used by split_data().
TEST_SIZE = 0.2
VAL_SIZE = 0.1
# Seed applied to NumPy and torch in main() and to both splits in split_data().
RANDOM_STATE = 42

# ============================================================
# LOGGING SETUP
# ============================================================
# Configure logging once at import time: INFO and above, formatted as
# "<timestamp> - <level> - <message>".
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)
# Module-level logger shared by the functions in this file.
logger = logging.getLogger(__name__)

# ============================================================
# HELPER FUNCTIONS
# ============================================================
def validate_data(df):
    """Validate the cleaned dataset and drop rows with missing required values.

    Args:
        df: DataFrame expected to contain the ``clean_text`` and ``label``
            columns.

    Returns:
        The input frame unchanged when no required value is null; otherwise
        an independent copy with the incomplete rows removed.

    Raises:
        ValueError: If the frame is empty, a required column is missing, or
            no rows remain once the rows with nulls have been dropped.
    """
    logger.info("🔍 Validating data...")
    if df.empty:
        raise ValueError("❌ Dataset is empty!")

    required_cols = ["clean_text", "label"]
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"❌ Missing columns: {missing_cols}")

    # Handle nulls
    null_counts = df[required_cols].isnull().sum()
    if null_counts.any():
        logger.warning(f"⚠️ Found null values:\n{null_counts}")
        # Copy explicitly so that later in-place column assignments operate
        # on an independent frame rather than on a derived selection.
        df = df.dropna(subset=required_cols).copy()
        # Every row may have been incomplete; fail here with a clear message
        # instead of letting the embedding/split steps fail obscurely.
        if df.empty:
            raise ValueError("❌ Dataset is empty after removing null values!")
        logger.info(f"✅ Removed null values. Remaining rows: {len(df)}")

    return df


def encode_labels(df):
    """Encode the ``label`` column to binary integers (0 = real, 1 = fake).

    Recognised fake markers are ``fake``, ``true`` and any value numerically
    equal to 1; recognised real markers are ``real``, ``false`` and any value
    numerically equal to 0. Matching ignores case and surrounding whitespace,
    so float-typed columns (``1.0``/``0.0``) and padded strings are handled.
    Any other value is still mapped to 0, as before, but is now reported in a
    warning instead of being absorbed silently.

    Args:
        df: DataFrame with a ``label`` column. The column is overwritten in
            place.

    Returns:
        The same DataFrame object, with ``label`` replaced by 0/1 integers.
    """
    logger.info("🏷️ Encoding labels...")
    logger.info(f"Original label distribution:\n{df['label'].value_counts()}")

    fake_tokens = {"fake", "1", "true"}
    real_tokens = {"real", "0", "false"}

    def _classify(value):
        """Return 1 (fake), 0 (real) or None (unrecognised) for a raw label."""
        token = str(value).strip().lower()
        if token in fake_tokens:
            return 1
        if token in real_tokens:
            return 0
        # Numeric spellings such as "1.0" (float-typed columns) land here.
        try:
            number = float(token)
        except ValueError:
            return None
        if number == 1.0:
            return 1
        if number == 0.0:
            return 0
        return None

    raw_labels = df["label"].tolist()
    codes = [_classify(value) for value in raw_labels]

    unrecognised = sorted(
        {str(value) for value, code in zip(raw_labels, codes) if code is None}
    )
    if unrecognised:
        # Keep the historical fallback (treat as real) but make it visible.
        logger.warning(
            f"⚠️ Unrecognised label values mapped to 0 (real): {unrecognised[:10]}"
        )

    df["label"] = [0 if code is None else code for code in codes]

    logger.info(f"Encoded label distribution:\n{df['label'].value_counts()}")
    return df


def generate_embeddings(texts, model_name=MODEL_NAME, batch_size=BATCH_SIZE):
    """Encode ``texts`` with a SentenceTransformer model.

    The model runs on CUDA when torch reports it as available and on the CPU
    otherwise. A failure while loading the model or while encoding is logged
    and then re-raised unchanged.

    Args:
        texts: Input passed straight to the model's ``encode`` method.
        model_name: Identifier of the SentenceTransformer model to load.
        batch_size: Batch size forwarded to ``encode``.

    Returns:
        The result of ``encode`` with ``convert_to_numpy=True``.
    """
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    logger.info(f"🧠 Using device: {device}")
    logger.info(f"⚙️ Loading model: {model_name}")

    # Stage 1: load the encoder on the selected device.
    try:
        encoder = SentenceTransformer(model_name, device=device)
    except Exception as e:
        logger.error(f"❌ Failed to load model: {e}")
        raise

    # Stage 2: run the encoder over the whole input.
    logger.info("⚙️ Generating embeddings (this may take a few minutes)...")
    encode_options = {
        "batch_size": batch_size,
        "show_progress_bar": True,
        "convert_to_numpy": True,
    }
    try:
        embeddings = encoder.encode(texts, **encode_options)
    except Exception as e:
        logger.error(f"❌ Failed to generate embeddings: {e}")
        raise

    logger.info(f"✅ Embeddings created: shape {embeddings.shape}, dtype {embeddings.dtype}")
    return embeddings


def split_data(X, y, test_size=TEST_SIZE, val_size=VAL_SIZE, random_state=RANDOM_STATE):
    """Split data into stratified train, validation, and test sets.

    The data is first split into train and a hold-out part of size
    ``test_size + val_size``; the hold-out part is then divided so that the
    validation and test sets keep the ``val_size : test_size`` proportion.

    Args:
        X: Feature rows accepted by ``train_test_split``.
        y: Labels aligned with ``X``; used for stratification. Any array-like
            (NumPy array, pandas Series, list) is accepted.
        test_size: Portion of the data reserved for the test set.
        val_size: Portion of the data reserved for the validation set.
        random_state: Seed used for both splits.

    Returns:
        Tuple ``(X_train, X_val, X_test, y_train, y_val, y_test)``.

    Raises:
        ValueError: If ``test_size`` or ``val_size`` is not positive (other
            invalid sizes are rejected by ``train_test_split`` itself).
    """
    # Fail early with a message that names the offending arguments; without
    # this guard a zero size surfaces as an unrelated sklearn error.
    if test_size <= 0 or val_size <= 0:
        raise ValueError(
            "test_size and val_size must both be positive "
            f"(got test_size={test_size}, val_size={val_size})"
        )

    logger.info("📊 Splitting data into train/validation/test sets...")

    holdout_size = test_size + val_size
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y,
        test_size=holdout_size,
        random_state=random_state,
        stratify=y
    )

    # Share of the hold-out part that becomes the validation set.
    val_ratio = val_size / holdout_size
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp,
        test_size=(1 - val_ratio),
        random_state=random_state,
        stratify=y_temp
    )

    logger.info(f"✅ Train: {len(X_train)} | Val: {len(X_val)} | Test: {len(X_test)}")
    for split_name, labels in (("Train", y_train), ("Val", y_val), ("Test", y_test)):
        # np.asarray makes the element-wise comparison work for lists too;
        # comparing a plain list with 0 yields a single bool, not a mask.
        labels = np.asarray(labels)
        real = int(np.count_nonzero(labels == 0))
        fake = int(np.count_nonzero(labels == 1))
        logger.info(f"{split_name} label dist → Real={real}, Fake={fake}")

    return X_train, X_val, X_test, y_train, y_val, y_test


def save_features(X_train, X_val, X_test, y_train, y_val, y_test, output_dir=FEATURE_DIR):
    """Scale the feature splits and persist them with the scaler and metadata.

    A ``StandardScaler`` is fitted on the training features only and then
    applied to the validation and test features. All artefacts are written
    into ``output_dir``: ``scaler.pkl``, ``features.npz``, ``train.pkl``,
    ``val.pkl``, ``test.pkl`` and ``metadata.json``.

    Args:
        X_train, X_val, X_test: 2-D feature arrays for each split.
        y_train, y_val, y_test: Label arrays aligned with the feature arrays.
        output_dir: Directory that receives every output file; created if it
            does not exist.

    Returns:
        The metadata dictionary that was written to ``metadata.json``.
    """
    os.makedirs(output_dir, exist_ok=True)
    logger.info("💾 Saving features and metadata...")

    # ============================================================
    # Scale embeddings for ML models
    # ============================================================
    logger.info("📏 Scaling embeddings using StandardScaler...")
    scaler = StandardScaler()
    # Fit on the training split only so no statistics leak from val/test.
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    scaler_file = os.path.join(output_dir, "scaler.pkl")
    joblib.dump(scaler, scaler_file)
    logger.info(f"✅ Scaler saved → {scaler_file}")

    # Save data arrays
    npz_file = os.path.join(output_dir, "features.npz")
    np.savez_compressed(
        npz_file,
        X_train=X_train, X_val=X_val, X_test=X_test,
        y_train=y_train, y_val=y_val, y_test=y_test
    )
    logger.info(f"✅ Features saved → {npz_file}")

    # Save pickled datasets
    joblib.dump((X_train, y_train), os.path.join(output_dir, "train.pkl"))
    joblib.dump((X_val, y_val), os.path.join(output_dir, "val.pkl"))
    joblib.dump((X_test, y_test), os.path.join(output_dir, "test.pkl"))
    logger.info("✅ Train/Val/Test sets saved as .pkl files")

    def _count(labels, value):
        """Return how many entries of ``labels`` equal ``value`` as an int."""
        return int(np.count_nonzero(np.asarray(labels) == value))

    # Metadata. The model name, seed and split ratios are taken from the
    # module configuration, not from the arguments of the earlier steps.
    metadata = {
        "timestamp": datetime.now().isoformat(),
        "model_name": MODEL_NAME,
        "embedding_dim": X_train.shape[1],
        "train_samples": X_train.shape[0],
        "val_samples": X_val.shape[0],
        "test_samples": X_test.shape[0],
        "total_samples": len(X_train) + len(X_val) + len(X_test),
        "random_state": RANDOM_STATE,
        "scaler_used": True,
        "val_split_ratio": VAL_SIZE,
        "test_split_ratio": TEST_SIZE,
        "train_real": _count(y_train, 0),
        "train_fake": _count(y_train, 1),
        "val_real": _count(y_val, 0),
        "val_fake": _count(y_val, 1),
        "test_real": _count(y_test, 0),
        "test_fake": _count(y_test, 1)
    }

    # Write the metadata next to the other artefacts. Using the module-level
    # default path here would ignore a caller-supplied output_dir.
    metadata_file = os.path.join(output_dir, "metadata.json")
    with open(metadata_file, "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2)

    logger.info(f"✅ Metadata saved → {metadata_file}")
    return metadata


def print_summary(metadata):
    """Log a framed, human-readable recap of the run.

    Args:
        metadata: Mapping that provides the ``model_name``,
            ``embedding_dim``, ``train_samples``, ``val_samples``,
            ``test_samples``, ``scaler_used`` and ``timestamp`` entries.
    """
    emit = logger.info
    rule = "="*60

    # Header
    emit("\n" + rule)
    emit("📈 FEATURE ENGINEERING SUMMARY")
    emit(rule)

    # Body: one line per reported field
    emit(f"Model: {metadata['model_name']}")
    emit(f"Embedding Dimension: {metadata['embedding_dim']}")
    emit(f"Train Samples: {metadata['train_samples']}")
    emit(f"Val Samples: {metadata['val_samples']}")
    emit(f"Test Samples: {metadata['test_samples']}")
    emit(f"Scaler Used: {metadata['scaler_used']}")
    emit(f"Generated On: {metadata['timestamp']}")

    # Footer
    emit(rule + "\n")


# ============================================================
# MAIN PIPELINE
# ============================================================
def main():
    """Run the feature engineering pipeline from CSV to saved features.

    Steps: seed NumPy and torch, load the cleaned CSV, validate it, encode
    the labels, embed the text, split the data, save the features and log a
    summary.

    Returns:
        True when every step succeeds; False when any step raises (the
        error is logged together with its traceback).
    """
    logger.info("🚀 Starting Feature Engineering Pipeline")
    logger.info("="*60)

    # Seed both random number generators for reproducibility
    for seed_rng in (np.random.seed, torch.manual_seed):
        seed_rng(RANDOM_STATE)
    logger.info(f"🔢 Random seed set to {RANDOM_STATE}")

    try:
        dataset_found = os.path.exists(PROCESSED_PATH)
        if not dataset_found:
            raise FileNotFoundError(f"❌ Cleaned dataset not found at {PROCESSED_PATH}")

        logger.info(f"📥 Loading data from {PROCESSED_PATH}")
        df = pd.read_csv(PROCESSED_PATH)
        logger.info(f"✅ Loaded {len(df)} rows")

        # Validate first, then encode the labels of the validated frame
        df = encode_labels(validate_data(df))

        # Embed the text column
        text_column = df["clean_text"].tolist()
        vectors = generate_embeddings(text_column)

        # Split features and labels into the three sets
        label_column = df["label"].values
        train_X, val_X, test_X, train_y, val_y, test_y = split_data(
            vectors, label_column
        )

        # Persist everything and report
        run_metadata = save_features(train_X, val_X, test_X, train_y, val_y, test_y)
        print_summary(run_metadata)

        logger.info("✅ Feature engineering completed successfully!")
    except Exception as e:
        logger.error(f"❌ Error during feature engineering: {e}", exc_info=True)
        return False

    return True


if __name__ == "__main__":
    # Run the pipeline and report the outcome through the process exit code:
    # 0 on success, 1 on failure.
    success = main()
    # The bare `exit` helper is injected by the `site` module for interactive
    # use and is not guaranteed to exist (e.g. `python -S`); raising
    # SystemExit is always available and has the same effect.
    raise SystemExit(0 if success else 1)


  from .autonotebook import tqdm as notebook_tqdm
2025-10-12 13:55:51,593 - INFO - 🚀 Starting Feature Engineering Pipeline
2025-10-12 13:55:51,596 - INFO - 🔢 Random seed set to 42
2025-10-12 13:55:51,596 - INFO - 📥 Loading data from data/processed/fake_news_clean.csv
2025-10-12 13:55:54,538 - INFO - ✅ Loaded 72134 rows
2025-10-12 13:55:54,539 - INFO - 🔍 Validating data...
clean_text    917
label           0
dtype: int64
2025-10-12 13:55:54,551 - INFO - ✅ Removed null values. Remaining rows: 71217
2025-10-12 13:55:54,553 - INFO - 🏷️ Encoding labels...
2025-10-12 13:55:54,554 - INFO - Original label distribution:
label
1    36191
0    35026
Name: count, dtype: int64
2025-10-12 13:55:54,571 - INFO - Encoded label distribution:
label
1    36191
0    35026
Name: count, dtype: int64
2025-10-12 13:55:54,572 - INFO - 🧠 Using device: cpu
2025-10-12 13:55:54,572 - INFO - ⚙️ Loading model: sentence-transformers/all-MiniLM-L6-v2
2025-10-12 13:55:54,575 - INFO - Load pretrained SentenceTransformer: