<a href="https://colab.research.google.com/github/ashandish/FakeNewsDetection/blob/main/Fakeddit_Binary_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# This script performs a full machine learning workflow for fake news detection
# using the Fakeddit dataset, BERT embeddings, and a neural network classifier.
# All data and models are saved to Google Drive for persistence across Colab sessions.

# --- Library Installations ---
# It's a good practice to include library installations at the top
# to ensure the environment is set up correctly after a session restart.
!pip install chromadb sentence-transformers

import pandas as pd
import numpy as np
import os
import chromadb
from google.colab import drive
from sentence_transformers import SentenceTransformer
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import load_model

print("Starting the Fakeddit Binary Classifier workflow...")

# 1. Attach Google Drive so datasets, embeddings and the model survive a
#    Colab session restart. A failed mount is reported but not fatal.
try:
    drive.mount('/content/drive')
except Exception as mount_error:
    print(f"Error mounting Google Drive: {mount_error}")
    # Outside Colab there is nothing to mount; carry on without persistence.
    print("Continuing without mounting, files will not be saved persistently.")
else:
    print("Google Drive mounted successfully.")

# --- Configuration ---
# Every location below lives on the mounted Google Drive.

# Folder that holds the Fakeddit TSV splits
TSV_PATH_PREFIX = '/content/drive/MyDrive/FakedditDataSet/all_samples (also includes non multimodal)-20250601T164249Z-1-001/all_samples (also includes non multimodal)'

# One TSV per split, all inside the folder above
TRAIN_DATA_PATH, TEST_DATA_PATH, VALIDATE_DATA_PATH = (
    os.path.join(TSV_PATH_PREFIX, split_file)
    for split_file in ('all_train.tsv', 'all_test_public.tsv', 'all_validate.tsv')
)

# Persistent ChromaDB store (directory + collection) for the title embeddings
CHROMA_DB_PATH = '/content/drive/MyDrive/FakedditTitleEmbedding'
CHROMA_COLLECTION_NAME = 'fakeddit_train_embeddings'

# Where the trained Keras classifier is written
MODEL_SAVE_PATH = '/content/drive/MyDrive/FakedditTextModel/model.h5'

# --- Step 1: Data Loading ---
def load_data(file_path):
    """Read a tab-separated file into a DataFrame.

    Malformed lines are skipped by the parser. Returns the DataFrame on
    success, or None (after printing a message) when the file does not exist.
    """
    try:
        frame = pd.read_csv(file_path, sep='\t', engine='python', on_bad_lines='skip')
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None
    else:
        print(f"Successfully loaded data from {file_path}. Shape: {frame.shape}")
        return frame

train_df = load_data(TRAIN_DATA_PATH)
test_df = load_data(TEST_DATA_PATH)
validate_df = load_data(VALIDATE_DATA_PATH)

# load_data returns None for a missing file. Swap in an empty frame with the
# columns the later steps read, but only for the splits that actually failed,
# so one missing file does not throw away the splits that loaded fine.
_missing_splits = [
    split_name
    for split_name, split_frame in (
        ("train", train_df),
        ("test", test_df),
        ("validate", validate_df),
    )
    if split_frame is None
]
if _missing_splits:
    print(
        f"Could not load dataset(s): {', '.join(_missing_splits)}. "
        "Continuing with empty placeholders for them."
    )
    _placeholder_columns = ['id', 'clean_title', 'title', '2_way_label']
    if train_df is None:
        train_df = pd.DataFrame(columns=_placeholder_columns)
    if test_df is None:
        test_df = pd.DataFrame(columns=_placeholder_columns)
    if validate_df is None:
        validate_df = pd.DataFrame(columns=_placeholder_columns)

# --- Step 2: Generate and Save Embeddings to ChromaDB ---
# Titles are embedded and written in fixed-size batches. Each batch is stored
# before the next one is encoded, so an interrupted session keeps everything
# already written and a rerun skips every ID that is present in the collection.
print("\nProcessing embeddings and saving to ChromaDB...")

# Rows per encode/add round trip. ChromaDB enforces a maximum size for a
# single add(); NOTE(review): 5000 is assumed to be below that limit for the
# installed version -- confirm against the client's reported max batch size.
EMBED_BATCH_SIZE = 5000

# Initialize Sentence-Transformers model for BERT embeddings
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
print("Sentence-Transformer model 'all-MiniLM-L6-v2' loaded.")

# Initialize a persistent ChromaDB client
try:
    client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
    collection = client.get_or_create_collection(name=CHROMA_COLLECTION_NAME)
    print(f"ChromaDB client initialized. Collection '{CHROMA_COLLECTION_NAME}' ready.")

    # Check how many documents are already in the collection
    existing_count = collection.count()
    print(f"Found {existing_count} existing documents in the collection.")

    # IDs already stored; include=[] asks for the IDs only
    existing_ids = set()
    if existing_count > 0:
        existing_ids = set(collection.get(include=[])['ids'])

    # Rows without an id, a title to embed, or a label cannot be stored or
    # trained on, so drop them before doing any expensive work.
    pending_df = train_df.dropna(subset=['id', 'clean_title', '2_way_label'])
    # IDs are written to ChromaDB as strings (and read back as strings), so
    # the "already stored?" comparison must use the same string form.
    pending_df = pending_df.assign(
        id=pending_df['id'].astype(str),
        clean_title=pending_df['clean_title'].astype(str),
        title=pending_df['title'].fillna('').astype(str),
    )
    # A repeated ID inside one add() call is rejected, so keep the first row.
    pending_df = pending_df.drop_duplicates(subset='id')

    # Filter out documents that are already in the database to resume saving
    train_df = pending_df[~pending_df['id'].isin(existing_ids)].copy()

    if not train_df.empty:
        added_count = 0
        for batch_start in range(0, len(train_df), EMBED_BATCH_SIZE):
            batch_df = train_df.iloc[batch_start:batch_start + EMBED_BATCH_SIZE]
            batch_titles = batch_df['clean_title'].tolist()

            # Encode and persist this batch before moving on to the next one
            collection.add(
                ids=batch_df['id'].tolist(),
                embeddings=embedding_model.encode(batch_titles).tolist(),
                documents=batch_titles,
                metadatas=batch_df[['title', '2_way_label']].to_dict('records'),
            )
            added_count += len(batch_df)

        print(f"Successfully added {added_count} new documents to ChromaDB.")
    else:
        print("All documents from the training dataset are already in ChromaDB. Skipping embedding generation.")
except Exception as e:
    print(f"Error while generating or saving embeddings to ChromaDB: {e}")
    # In a real scenario, you might want to stop the execution here.

# --- Step 3: Train a Neural Network Model ---
print("\nTraining the Neural Network Model...")

try:
    # Calling get() without ids returns every record in the collection, so
    # there is no need to fetch the full ID list first and send it back.
    chroma_data = collection.get(include=['embeddings', 'metadatas'])

    X_train = np.array(chroma_data['embeddings'], dtype=np.float32)
    y_train = np.array(
        [m['2_way_label'] for m in chroma_data['metadatas'] or []],
        dtype=np.float32,
    )

    # An empty collection yields a non-2-D array; fail with a clear message
    # instead of an IndexError from shape[1] below.
    if X_train.ndim != 2 or X_train.shape[0] == 0:
        raise ValueError("no embeddings found in ChromaDB; nothing to train on")

    # Define the Keras model: two hidden ReLU layers and one sigmoid output
    input_dim = X_train.shape[1]
    model = Sequential([
        Dense(128, activation='relu', input_dim=input_dim),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid')  # Sigmoid for binary classification
    ])

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    print("Model compiled. Training...")

    # Train the model; 20% of the samples are held out for validation
    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=1)

    # Save the trained model to Google Drive, creating the folder if needed
    os.makedirs(os.path.dirname(MODEL_SAVE_PATH), exist_ok=True)
    model.save(MODEL_SAVE_PATH)
    print(f"Model saved successfully to: {MODEL_SAVE_PATH}")
except Exception as e:
    print(f"Error during model training or saving: {e}")

# --- Step 4: Model Evaluation ---
print("\n--- Model Evaluation ---")

# Read the saved classifier back from disk. trained_model stays None when
# loading fails, which evaluate_model treats as "nothing to evaluate".
trained_model = None
try:
    trained_model = load_model(MODEL_SAVE_PATH)
except Exception as load_error:
    print(f"Error loading the trained model: {load_error}")
else:
    print("Trained model loaded for evaluation.")

def evaluate_model(df, dataset_name):
    """
    Embed the titles in a DataFrame, predict with the trained model, and
    print accuracy, precision, recall and F1.

    Rows with a missing ``clean_title`` or ``2_way_label`` are dropped before
    evaluation: a missing title is not text the encoder can embed, and a
    missing label cannot be scored.

    Args:
        df: DataFrame providing ``clean_title`` and ``2_way_label`` columns.
        dataset_name: Name of the split, used only in the printed messages.

    Returns:
        None. All results are printed.
    """
    if df.empty or trained_model is None:
        print(f"Cannot evaluate on {dataset_name}: data is empty or model not loaded.")
        return

    usable_df = df.dropna(subset=['clean_title', '2_way_label'])
    if usable_df.empty:
        # Every row was incomplete, so there is nothing left to score.
        print(f"Cannot evaluate on {dataset_name}: data is empty or model not loaded.")
        return

    print(f"\nEvaluating on {dataset_name} dataset...")
    # Generate embeddings for the evaluation dataset
    titles = usable_df['clean_title'].astype(str).tolist()
    X_eval = np.array(embedding_model.encode(titles))
    # Cast so the labels share an integer type with the thresholded predictions
    y_true = usable_df['2_way_label'].astype(int).to_numpy()

    # Predict probabilities and convert to binary labels at the 0.5 threshold
    y_pred_probs = trained_model.predict(X_eval)
    y_pred = (y_pred_probs > 0.5).astype(int).flatten()

    # Calculate and print metrics
    precision = precision_score(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    print(f"--- Metrics for {dataset_name} ---")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

# Score each held-out split in turn: test first, then validation
for split_frame, split_label in ((test_df, "Test"), (validate_df, "Validation")):
    evaluate_model(split_frame, split_label)

print("\nWorkflow complete.")

Starting the Fakeddit Binary Classifier workflow...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted successfully.
