In [1]:
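# This notebook cell runs a complete fake news detection workflow on the Fakeddit
# dataset: post titles are embedded with the Sentence-Transformers model
# 'all-MiniLM-L6-v2', the embeddings are stored in ChromaDB, and a Keras neural
# network classifier is trained on them and evaluated on the test/validation splits.
# All data and models are saved to Google Drive for persistence across Colab sessions.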

# --- Library Installations ---
# It's a good practice to include library installations at the top
# to ensure the environment is set up correctly after a session restart.
!pip install chromadb sentence-transformers

import os

import chromadb
import numpy as np
import pandas as pd
from google.colab import drive
from sentence_transformers import SentenceTransformer
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model

print("Starting the Fakeddit Binary Classifier workflow...")

# 1. Mount Google Drive so that files written below survive the Colab session.
try:
    drive.mount('/content/drive')
    print("Google Drive mounted successfully.")
except Exception as mount_error:
    # Best-effort fallback (e.g. outside Colab): report the problem and carry on.
    print(
        f"Error mounting Google Drive: {mount_error}",
        "Continuing without mounting, files will not be saved persistently.",
        sep="\n",
    )

# --- Configuration ---
# Google Drive locations of the TSV datasets and of the persisted artifacts.
TSV_PATH_PREFIX = '/content/drive/MyDrive/FakedditDataSet/all_samples (also includes non multimodal)-20250601T164249Z-1-001/all_samples (also includes non multimodal)'

# One path per split, all located directly under TSV_PATH_PREFIX.
TRAIN_DATA_PATH, TEST_DATA_PATH, VALIDATE_DATA_PATH = (
    os.path.join(TSV_PATH_PREFIX, tsv_name)
    for tsv_name in ('all_train.tsv', 'all_test_public.tsv', 'all_validate.tsv')
)

# ChromaDB storage directory and collection name used for the title embeddings.
CHROMA_DB_PATH = '/content/drive/MyDrive/FakedditTitleEmbedding'
CHROMA_COLLECTION_NAME = 'fakeddit_train_embeddings'

# File the trained Keras model is saved to (and later reloaded from).
MODEL_SAVE_PATH = '/content/drive/MyDrive/FakedditTextModel/model.h5'

# --- Step 1: Data Loading ---
def load_data(file_path):
    """Read a tab-separated file and drop the rows that have no 'clean_title'.

    Lines that cannot be parsed are skipped while reading. The shape after
    loading and the number of filtered rows are printed.

    Args:
        file_path: Path of the .tsv file to read.

    Returns:
        The filtered DataFrame (original index labels are kept), or None when
        the file does not exist.
    """
    try:
        raw_df = pd.read_csv(file_path, sep='\t', engine='python', on_bad_lines='skip')
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None

    print(f"Successfully loaded data from {file_path}. Shape: {raw_df.shape}")

    # Rows without a title cannot be embedded later, so they are removed here.
    titled_df = raw_df.dropna(subset=['clean_title'])
    dropped_rows = len(raw_df) - len(titled_df)
    print(f"Filtered out {dropped_rows} records with missing 'clean_title'. New shape: {titled_df.shape}")
    return titled_df

# Load the three dataset splits; load_data() returns None when a file is missing.
loaded_splits = {
    'train': load_data(TRAIN_DATA_PATH),
    'test': load_data(TEST_DATA_PATH),
    'validate': load_data(VALIDATE_DATA_PATH),
}

# Only the splits that failed to load are replaced by an empty frame with the
# expected columns, so later steps can detect and skip them. Splits that did
# load are kept rather than being discarded because a different file is missing.
missing_splits = [split for split, frame in loaded_splits.items() if frame is None]
if missing_splits:
    print(f"Could not load dataset(s): {', '.join(missing_splits)}. Continuing with empty DataFrames for them.")
    for split in missing_splits:
        loaded_splits[split] = pd.DataFrame(columns=['id', 'clean_title', 'title', '2_way_label'])

train_df = loaded_splits['train']
test_df = loaded_splits['test']
validate_df = loaded_splits['validate']

# --- Step 2: Generate and Save Embeddings to ChromaDB ---
# Resumable and batched: rows whose id is already stored in the collection are
# skipped; the remaining titles are embedded and added in fixed-size batches.
print("\nProcessing embeddings and saving to ChromaDB...")

# Initialize the Sentence-Transformers model used to embed the titles
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
print("Sentence-Transformer model 'all-MiniLM-L6-v2' loaded.")

# Initialize a persistent ChromaDB client
try:
    client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
    collection = client.get_or_create_collection(name=CHROMA_COLLECTION_NAME)
    print(f"ChromaDB client initialized. Collection '{CHROMA_COLLECTION_NAME}' ready.")

    # Check how many documents are already in the collection
    existing_count = collection.count()
    print(f"Found {existing_count} existing documents in the collection.")

    # Only the ids are needed to decide what to skip, so nothing else is requested.
    existing_ids = set(collection.get(include=[])['ids']) if existing_count > 0 else set()

    # Ids are written to the collection as strings (see `ids` below), so the
    # comparison is done on the string form as well; otherwise a non-string id
    # column would never match and every row would be re-added on resume.
    already_stored = train_df['id'].astype(str).isin(existing_ids)
    train_df_to_process = train_df[~already_stored].copy()
    total_to_process = len(train_df_to_process)

    if total_to_process > 0:
        batch_size = 5000
        # Ceiling division: an exact multiple of batch_size must not report an extra batch.
        total_batches = (total_to_process + batch_size - 1) // batch_size
        for i in range(0, total_to_process, batch_size):
            batch_df = train_df_to_process.iloc[i:i + batch_size]
            batch_number = i // batch_size + 1
            print(f"\nProcessing batch {batch_number} of {total_batches}...")

            # Generate embeddings for the new documents in the batch
            documents = batch_df['clean_title'].tolist()
            generated_embeddings = embedding_model.encode(documents).tolist()

            # Prepare the remaining lists for ChromaDB's add method
            ids = batch_df['id'].astype(str).tolist()
            metadatas = batch_df[['title', '2_way_label']].to_dict('records')

            # Add new embeddings to the collection
            collection.add(
                ids=ids,
                embeddings=generated_embeddings,
                documents=documents,
                metadatas=metadatas
            )
            print(f"Successfully added {len(ids)} new documents to ChromaDB.")
    else:
        print("All documents from the training dataset are already in ChromaDB. Skipping embedding generation.")
except Exception as e:
    print(f"Error while generating or saving embeddings to ChromaDB: {e}")
    # In a real scenario, you might want to stop the execution here.

# --- Step 3: Train a Neural Network Model ---
# Reads every stored embedding and its '2_way_label' back from ChromaDB, fits a
# small dense binary classifier on them and saves it to MODEL_SAVE_PATH.
print("\nTraining the Neural Network Model...")

try:
    # Retrieve all IDs to process for training in batches
    all_ids = collection.get(include=[])['ids']
    total_ids = len(all_ids)

    if total_ids == 0:
        print("No embeddings found in the database. Cannot train model.")
    else:
        X_train_batches, y_train_list = [], []
        # Retrieve data in smaller batches to avoid "too many SQL variables" error
        batch_size = 5000
        for i in range(0, total_ids, batch_size):
            batch_ids = all_ids[i:i + batch_size]
            batch_data = collection.get(ids=batch_ids, include=['embeddings', 'metadatas'])
            # Keep one compact float32 array per batch instead of growing a
            # Python list with one entry per embedding.
            X_train_batches.append(np.asarray(batch_data['embeddings'], dtype=np.float32))
            y_train_list.extend(m['2_way_label'] for m in batch_data['metadatas'])

        X_train = np.concatenate(X_train_batches, axis=0)
        y_train = np.array(y_train_list)

        # Define the Keras model. The input width is declared with an explicit
        # Input layer; passing `input_dim` to the first Dense layer is
        # deprecated and triggers a Keras warning.
        input_dim = X_train.shape[1]
        model = Sequential([
            Input(shape=(input_dim,)),
            Dense(128, activation='relu'),
            Dense(64, activation='relu'),
            Dense(1, activation='sigmoid')  # Sigmoid for binary classification
        ])

        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        print("Model compiled. Training...")

        # Train the model; 20% of the samples are held out for validation.
        model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=1)

        # Save the trained model to Google Drive
        os.makedirs(os.path.dirname(MODEL_SAVE_PATH), exist_ok=True)
        model.save(MODEL_SAVE_PATH)
        print(f"Model saved successfully to: {MODEL_SAVE_PATH}")
except Exception as e:
    print(f"Error during model training or saving: {e}")

# --- Step 4: Model Evaluation ---
print("\n--- Model Evaluation ---")

# Reload the saved model from disk. `trained_model` stays None when loading
# fails, which evaluate_model() checks before doing any work.
trained_model = None
try:
    trained_model = load_model(MODEL_SAVE_PATH)
    print("Trained model loaded for evaluation.")
except Exception as load_error:
    print(f"Error loading the trained model: {load_error}")

def evaluate_model(df, dataset_name):
    """Score the loaded classifier on one labelled dataset and print the metrics.

    The titles in ``df['clean_title']`` are embedded with the module-level
    ``embedding_model``, classified with the module-level ``trained_model``
    and compared against ``df['2_way_label']``. Accuracy, precision, recall
    and F1 are printed; nothing is returned. When ``df`` is empty or no model
    is loaded, a message is printed and the evaluation is skipped.

    Args:
        df: DataFrame providing the 'clean_title' and '2_way_label' columns.
        dataset_name: Label used in the printed messages.
    """
    # Guard clause: nothing to do without data or without a model.
    if df.empty or trained_model is None:
        print(f"Cannot evaluate on {dataset_name}: data is empty or model not loaded.")
        return

    print(f"\nEvaluating on {dataset_name} dataset...")

    # Embed the titles and collect the ground-truth labels.
    titles = df['clean_title'].tolist()
    X_eval = np.array(embedding_model.encode(titles))
    y_true = df['2_way_label'].to_numpy()

    # Predicted probabilities above 0.5 are mapped to label 1, the rest to 0.
    probabilities = trained_model.predict(X_eval)
    above_threshold = probabilities > 0.5
    y_pred = above_threshold.astype(int).flatten()

    # Compute the four classification metrics.
    precision = precision_score(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    # Report them, accuracy first.
    print(f"--- Metrics for {dataset_name} ---")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

# Report metrics for both held-out splits: the test set first, then validation.
for eval_df, eval_name in ((test_df, "Test"), (validate_df, "Validation")):
    evaluate_model(eval_df, eval_name)

print("\nWorkflow complete.")

Collecting chromadb
  Downloading chromadb-1.0.20-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.3 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.22.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.9 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.36.0-py3-none-any.whl.metadata (2.4 kB)
Collecting pypika>=0.48.9 (from chromadb)
  Downloading PyPika-0.48.9.tar.gz (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Sentence-Transformer model 'all-MiniLM-L6-v2' loaded.
ChromaDB client initialized. Collection 'fakeddit_train_embeddings' ready.
Found 802789 existing documents in the collection.
All documents from the training dataset are already in ChromaDB. Skipping embedding generation.

Training the Neural Network Model...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Model compiled. Training...
Epoch 1/10
[1m20070/20070[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 2ms/step - accuracy: 0.8005 - loss: 0.4338 - val_accuracy: 0.8262 - val_loss: 0.3903
Epoch 2/10
[1m20070/20070[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 2ms/step - accuracy: 0.8330 - loss: 0.3781 - val_accuracy: 0.8317 - val_loss: 0.3799
Epoch 3/10
[1m20070/20070[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 2ms/step - accuracy: 0.8429 - loss: 0.3601 - val_accuracy: 0.8337 - val_loss: 0.3753
Epoch 4/10
[1m20070/20070[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 2ms/step - accuracy: 0.8461 - loss: 0.3505 - val_accuracy: 0.8347 - val_loss: 0.3737
Epoch 5/10
[1m20070/20070[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 2ms/step - accuracy: 0.8516 - loss: 0.3422 - val_accuracy: 0.8341 - val_loss: 0.3752
Epoch 6/10
[1m20070/20070[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 2ms/step - accuracy: 0.8546 - loss: 0.3355 - val_accura



Model saved successfully to: /content/drive/MyDrive/FakedditTextModel/model.h5

--- Model Evaluation ---
Trained model loaded for evaluation.

Evaluating on Test dataset...
[1m2641/2641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step
--- Metrics for Test ---
Accuracy: 0.8325
Precision: 0.8366
Recall: 0.8273
F1 Score: 0.8319

Evaluating on Validation dataset...
[1m2642/2642[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step
--- Metrics for Validation ---
Accuracy: 0.8336
Precision: 0.8358
Recall: 0.8288
F1 Score: 0.8323

Workflow complete.
