# Phase 0: Setup, Data Loading, EDA, and Cleaning

This phase handles the initial setup by installing required packages, downloading the dataset from Kaggle, performing exploratory data analysis (EDA), cleaning and imputing missing values in the product and review datasets, and verifying the cleaned data structures.

In [None]:
# Install required packages
# IPython shell magic: installs the third-party libraries imported by later cells
# (chromadb, sentence_transformers, torch, kaggle, datasets).
# NOTE(review): python-dotenv, pandas, scikit-learn and tqdm are also imported below but are
# not installed here — presumably they are preinstalled in the target environment; confirm.
!pip install chromadb sentence-transformers torch kaggle datasets

In [None]:
# Setup Kaggle API and download dataset
import os
from dotenv import load_dotenv
from kaggle.api.kaggle_api_extended import KaggleApi

# Files that must all be present under ./sephora_data for the download to be skipped
expected_files = [
    'product_info.csv',
    'reviews_0-250.csv',
    'reviews_250-500.csv',
    'reviews_500-750.csv',
    'reviews_750-1250.csv',
    'reviews_1250-end.csv',
]

_dataset_present = os.path.exists('./sephora_data') and all(
    os.path.exists(f'./sephora_data/{csv_name}') for csv_name in expected_files
)

if not _dataset_present:
    # Download dataset (load_dotenv presumably supplies Kaggle credentials from a .env file)
    load_dotenv()
    api = KaggleApi()
    api.authenticate()
    api.dataset_download_files(
        'nadyinky/sephora-products-and-skincare-reviews',
        path='./sephora_data',
        unzip=True,
    )
    print("Dataset downloaded successfully!")
else:
    print("Dataset already exists, skipping download!")

In [None]:
# Import libraries and load data
import pandas as pd
import numpy as np
from transformers import EarlyStoppingCallback

In [None]:
# Initial Exploratory Data Analysis (EDA)
def initial_eda(df, name):
    """Print a per-column summary (dtype, null count, null %) and the shape of ``df``.

    Args:
        df: DataFrame to summarise.
        name: Label used in the printed header.

    Returns:
        None. The summary is written to stdout only.
    """
    # Count nulls once and reuse the result for both the count and percentage columns.
    null_counts = df.isnull().sum()
    null_percent = (null_counts / len(df) * 100).round(2)

    eda_summary = pd.DataFrame({
        'Column': df.columns,
        'Data Type': df.dtypes.values,
        'Null Count': null_counts.values,
        'Null %': null_percent.values,
    })

    n_rows, n_cols = df.shape[0], df.shape[1]
    print(f"EDA for {name}:")
    print(eda_summary.to_string(index=False))
    print(f"Shape: {n_rows} rows × {n_cols} columns\n")

# Reuse frames already held in the notebook namespace; otherwise read them from disk.
product_info_loaded = isinstance(globals().get('product_info'), pd.DataFrame)

if not product_info_loaded:
    print("Loading product_info from CSV...")
    try:
        product_info = pd.read_csv('./sephora_data/product_info.csv')
    except FileNotFoundError:
        print("Error: product_info.csv not found. Please run the data download cell first.")
        product_info = pd.DataFrame()  # Empty DataFrame to avoid error
    else:
        print("Product info loaded successfully")

reviews_loaded = isinstance(globals().get('reviews'), pd.DataFrame)

if not reviews_loaded:
    print("Loading reviews from CSV...")
    try:
        # The review data ships as five CSV shards; concatenate them into one frame.
        review_files = [
            f'./sephora_data/reviews_{shard}.csv'
            for shard in ('0-250', '250-500', '500-750', '750-1250', '1250-end')
        ]
        reviews = pd.concat([pd.read_csv(path) for path in review_files], ignore_index=True)
    except FileNotFoundError:
        print("Error: review files not found. Please run the data download cell first.")
        reviews = pd.DataFrame()  # Empty DataFrame to avoid error
    else:
        print("Reviews loaded successfully")

initial_eda(product_info, "Product Information")
initial_eda(reviews, "Product Reviews")

# describe() is only shown when product_info actually holds rows.
if not (isinstance(product_info, pd.DataFrame) and len(product_info) > 0):
    print("Cannot show describe() - product_info not loaded as DataFrame")
else:
    print(product_info.describe())

In [None]:
# Product Information Data Cleaning
product_info_clean = product_info.copy()

# Per-column defaults: medians for the numeric review stats, sentinel strings for
# descriptive text, and 0 for the child price bounds.
fill_dict = {col: product_info_clean[col].median() for col in ('rating', 'reviews')}
fill_dict.update({
    'size': 'Unknown',
    'variation_type': 'None',
    'variation_value': 'None',
    'variation_desc': 'None',
    'ingredients': 'Not Listed',
    'highlights': 'None',
    'secondary_category': 'Other',
    'tertiary_category': 'Other',
    'child_max_price': 0,
    'child_min_price': 0,
})
product_info_clean = product_info_clean.fillna(fill_dict)

# Missing value/sale prices fall back to the regular list price of the same row.
for _price_col in ('value_price_usd', 'sale_price_usd'):
    product_info_clean[_price_col] = product_info_clean[_price_col].fillna(product_info_clean['price_usd'])

_remaining_nulls = product_info_clean.isnull().sum().sum()
print("Product info cleaned, nulls:", _remaining_nulls)

In [None]:
# Product Reviews Data Cleaning
reviews_clean = reviews.copy()

# Drop the 'Unnamed: 0' column (presumably the index saved into the CSV shards) when it is
# present. errors='ignore' keeps this cell from raising KeyError when the column is absent,
# e.g. when the loading cell fell back to an empty DataFrame because the files were missing.
reviews_clean = reviews_clean.drop(columns='Unnamed: 0', errors='ignore')

# Per-column defaults for missing values: 0 for the numeric flags/scores, an empty string for
# the review body, and explicit sentinel strings for the title and reviewer attributes.
fill_dict_reviews = {
    'is_recommended': 0,
    'helpfulness': 0,
    'review_text': '',
    'review_title': 'No Title',
    'skin_tone': 'Not Specified',
    'eye_color': 'Not Specified',
    'skin_type': 'Not Specified',
    'hair_color': 'Not Specified'
}
reviews_clean = reviews_clean.fillna(fill_dict_reviews)
print("Reviews cleaned, nulls:", reviews_clean.isnull().sum().sum())

In [None]:
# Final Data Verification and Summary
print("Product Info columns:", len(product_info.columns))
print("Reviews columns:", len(reviews.columns))

# Raw frames: column names followed by shape
print("Product Info columns:", product_info.columns.tolist(), sep="\n")
print(f"\nProduct Info shape: {product_info.shape}")

print("\nReviews columns:", reviews.columns.tolist(), sep="\n")
print(f"\nReviews shape: {reviews.shape}")

# Cleaned frames: column names only
print("\nProduct Info Clean columns:", product_info_clean.columns.tolist(), sep="\n")

print("\nReviews Clean columns:", reviews_clean.columns.tolist(), sep="\n")

# Phase 1: Synthetic Dataset Creation for Fake Review Detection

This phase creates synthetic datasets for training a fake review detection model. It merges cleaned reviews with product information, generates fake reviews by replacing the text of a randomly chosen subset of reviews with review text drawn from a different product category, combines real and fake data, and splits the result into training and testing sets, using stratification so that both sets keep the same real/fake proportion.

In [None]:
# Phase 1: Synthetic Dataset Creation for Fake Review Detection

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Check if synthetic datasets already exist
import os
# Reuse previously saved splits when both CSVs are on disk.
# NOTE(review): the generation cells that follow run unconditionally and do not consult this
# check, so "skipping creation" only describes this cell's own behaviour.
if not (os.path.exists('./synthetic_train.csv') and os.path.exists('./synthetic_test.csv')):
    print("Synthetic datasets not found, proceeding with creation...")
else:
    print("Synthetic datasets already exist, skipping creation.")
    # Load existing datasets for verification
    train_df = pd.read_csv('./synthetic_train.csv')
    test_df = pd.read_csv('./synthetic_test.csv')
    print(f"Loaded existing training: {len(train_df)}, testing: {len(test_df)} samples")

In [None]:
# Merge reviews with product information
print(f"Reviews clean shape: {reviews_clean.shape}")
print(f"Product info clean shape: {product_info_clean.shape}")

# Product ids present on both sides, i.e. the keys an inner join can match on.
_shared_ids = set(reviews_clean['product_id']) & set(product_info_clean['product_id'])
print(f"Common product_ids: {len(_shared_ids)}")

# Columns that exist in both frames get '_review' / '_product' suffixes.
merged_df = reviews_clean.merge(
    product_info_clean,
    on='product_id',
    how='inner',
    suffixes=('_review', '_product'),
)
print(f"Merged dataset shape: {merged_df.shape}")
print("Merged columns:", list(merged_df.columns)[:10])  # Show first 10

In [None]:
# Create product_info concatenated column
def create_product_info(row):
    """Build a one-line product description string from a merged review/product row.

    Args:
        row: pandas Series for one merged row. Brand and price are read from the
            '_product'-suffixed column first, then from the unsuffixed one.

    Returns:
        str of the form
        "Brand: ..., Category: ..., Price: $x.xx, Ingredients: ..., Highlights: ...".
        Missing or NaN fields fall back to fixed defaults; an unparsable price becomes 0.00.
    """
    def first_present(candidates, fallback):
        # Return the first candidate column that exists in the row and is not NaN.
        for column in candidates:
            if column in row.index and pd.notna(row[column]):
                return row[column]
        return fallback

    brand = first_present(('brand_name_product', 'brand_name'), 'Unknown')
    category = first_present(('primary_category',), 'Unknown')
    raw_price = first_present(('price_usd_product', 'price_usd'), 0.0)
    ingredients = first_present(('ingredients',), 'Not Listed')
    highlights = first_present(('highlights',), 'None')

    try:
        price_val = float(raw_price)
    except Exception:
        price_val = 0.0

    return (
        f"Brand: {brand}, Category: {category}, Price: ${price_val:.2f}, "
        f"Ingredients: {ingredients}, Highlights: {highlights}"
    )

# Build the 'product_info' text column by calling create_product_info on every merged row
# (axis=1 passes each row as a Series).
merged_df['product_info'] = merged_df.apply(create_product_info, axis=1)


In [None]:
# Enhanced Synthetic Data Generation Function (Safer & Faster)
def generate_synthetic_fakes_with_product_swap(merged_df, fake_ratio=0.2, max_rows=None):
    """
    Mark a random subset of rows as fake and replace their review text with text from another category.

    The product columns of the chosen rows are left untouched; only ``review_text`` is
    overwritten, with a review taken from a randomly chosen *different* ``primary_category``
    (or from the pool of all reviews when no other category exists).

    Args:
        merged_df: DataFrame that provides at least 'primary_category' and 'review_text'.
        fake_ratio: Fraction of rows to turn into fakes. Because of ``max(1, ...)`` at least
            one row is always faked, even for very small ratios.
        max_rows: If given and smaller than ``len(merged_df)``, work on a random sample of
            that many rows (``random_state=42``) instead of the full frame.

    Returns:
        A new DataFrame with an added 'is_fake' column (1 = text replaced, 0 = untouched),
        shuffled with ``random_state=42`` and a fresh 0..n-1 index. If there is no review
        text at all, the working frame is returned with ``is_fake`` set to 0 everywhere.

    Side effects:
        Calls ``numpy.random.seed(42)``, i.e. reseeds NumPy's global random state, and
        prints progress messages / tqdm progress bars.
    """
    import numpy as _np
    from tqdm import tqdm

    # Work on a sample if requested to avoid very long runs during interactive debugging
    if max_rows is not None and len(merged_df) > max_rows:
        df = merged_df.sample(n=max_rows, random_state=42).reset_index(drop=True)
        print(f"Using sampled subset of {max_rows} rows for synthetic generation (from {len(merged_df)} total rows)")
    else:
        df = merged_df.reset_index(drop=True)

    # Deterministic runs: every _np.random call below draws from this seeded global state,
    # so the order of the calls determines the output.
    _np.random.seed(42)

    # Pre-group by category for efficient sampling
    categories = df['primary_category'].unique()
    category_groups = {cat: df[df['primary_category'] == cat]['review_text'].tolist() for cat in categories}

    # Precompute a flat list of all reviews as fallback
    all_reviews = [rev for lst in category_groups.values() for rev in lst]
    if not all_reviews:
        # If there are no reviews at all, return the frame with every row labelled real
        print("No reviews available to generate fakes from.")
        df['is_fake'] = 0
        return df

    # Positional indices of the rows to fake; valid as labels too because the index was reset above.
    num_fake = max(1, int(fake_ratio * len(df)))
    fake_indices = _np.random.choice(len(df), num_fake, replace=False)

    print(f"Generating {num_fake} fake reviews...")

    # For every fake row, pick a donor category and collect the candidate replacement texts.
    fake_categories = df.loc[fake_indices, 'primary_category'].values
    # NOTE(review): other_categories_list is filled below but never read in this function.
    other_categories_list = []
    available_reviews_list = []

    for cat in tqdm(fake_categories, desc="Processing fake categories"):
        other_cats = [c for c in categories if c != cat]
        if other_cats:
            random_cat = _np.random.choice(other_cats)
            reviews = category_groups.get(random_cat, [])
            # LIMIT REVIEW LIST SIZE FOR EFFICIENCY - sample at most 1000 reviews per category
            # NOTE(review): only one text per fake row is used in the selection step below,
            # so this per-row resampling of up to 1000 texts looks avoidable — confirm.
            if len(reviews) > 1000:
                reviews = _np.random.choice(reviews, size=1000, replace=False).tolist()
            if reviews:
                available_reviews_list.append(reviews)
                other_categories_list.append(random_cat)
            else:
                # Limit fallback list size too
                limited_all_reviews = all_reviews if len(all_reviews) <= 1000 else _np.random.choice(all_reviews, size=1000, replace=False).tolist()
                available_reviews_list.append(limited_all_reviews)
                other_categories_list.append('fallback')
        else:
            # Only one category exists: fall back to the pool of all reviews (size-limited too)
            limited_all_reviews = all_reviews if len(all_reviews) <= 1000 else _np.random.choice(all_reviews, size=1000, replace=False).tolist()
            available_reviews_list.append(limited_all_reviews)
            other_categories_list.append('fallback')

    # Pick one replacement text per fake row from its candidate list
    print(f"Selecting {len(available_reviews_list)} fake reviews...")

    # Draw one random index per candidate list in a single call (upper bound = list length)
    all_reviews_lengths = [len(reviews) for reviews in available_reviews_list]
    random_indices = _np.random.randint(0, _np.array(all_reviews_lengths))

    new_reviews = []
    for i, reviews in enumerate(tqdm(available_reviews_list, desc="Selecting fake reviews")):
        if len(reviews) > 0:
            new_reviews.append(reviews[random_indices[i]])
        else:
            # Fallback to all_reviews if category is empty
            # NOTE(review): every list appended above is non-empty (all_reviews was checked
            # earlier), so this branch looks unreachable — verify before relying on it.
            fallback_idx = _np.random.randint(0, len(all_reviews))
            new_reviews.append(all_reviews[fallback_idx])

    # Fake rows keep their product columns; only the review text and the label change.
    fake_df = df.iloc[fake_indices].copy()
    fake_df['review_text'] = new_reviews
    fake_df['is_fake'] = 1

    real_df = df.drop(index=fake_indices).copy()
    real_df['is_fake'] = 0

    # Return combined dataset (shuffled)
    combined = pd.concat([real_df, fake_df], ignore_index=True)
    combined = combined.sample(frac=1, random_state=42).reset_index(drop=True)
    return combined


In [None]:
# Generate synthetic dataset with category-aware mismatches
# Use max_rows=10000 for faster testing, remove for full dataset
combined_df = generate_synthetic_fakes_with_product_swap(merged_df, fake_ratio=0.2, max_rows=10000)

# Report how many rows carry each label.
_real_count = int((combined_df['is_fake'] == 0).sum())
_fake_count = int((combined_df['is_fake'] == 1).sum())
print(f"Real reviews: {_real_count}")
print(f"Fake reviews: {_fake_count}")
print(f"Total combined: {len(combined_df)}")

In [None]:
# Select required columns and rename
# Prefer the product-side name column produced by the merge; otherwise use (or create)
# an unsuffixed 'product_name' column.
if 'product_name_product' in combined_df.columns:
    name_col = 'product_name_product'
else:
    if 'product_name' not in combined_df.columns:
        # Fallback: create a product_name column if missing
        combined_df['product_name'] = combined_df.get('product_name_product', 'Unknown')
    name_col = 'product_name'

cols = ['product_id', name_col, 'review_text', 'is_fake', 'product_info']
final_df = combined_df[cols].copy()
final_df = final_df.rename(columns={name_col: 'product_name'})


In [None]:
# Split into training (80%) and testing (20%) with stratification
# Stratification is only attempted when the rarest label has at least two rows.
label_counts = final_df['is_fake'].value_counts()
use_stratify = label_counts.min() >= 2
stratify_col = final_df['is_fake'] if use_stratify else None
print(
    "Using stratified split (each class has >=2 samples)"
    if use_stratify
    else "Dataset too small or imbalanced for stratified split; using random split without stratification"
)

train_df, test_df = train_test_split(
    final_df,
    test_size=0.2,
    stratify=stratify_col,
    random_state=42,
)

_splits = (("Training", train_df), ("Testing", test_df))
for _split_name, _split in _splits:
    print(f"{_split_name} set: {len(_split)} samples")
for _split_name, _split in _splits:
    print(f"{_split_name} fake ratio: {_split['is_fake'].mean():.3f}")


In [None]:
# Save datasets
# Write both splits without the DataFrame index column.
for _frame, _csv_path in ((train_df, './synthetic_train.csv'), (test_df, './synthetic_test.csv')):
    _frame.to_csv(_csv_path, index=False)

print("✓ Synthetic datasets saved as 'synthetic_train.csv' and 'synthetic_test.csv'")

In [None]:
# Quick verification
# Header and the first two training rows, each on its own line.
print("\nSample of training data:", train_df.head(2), sep="\n")

# Phase 2: Feature Engineering & RAG Preparation

This phase prepares features for the RAG (Retrieval-Augmented Generation) system. It creates comprehensive product profiles, embeds them using Sentence Transformers, stores them in a ChromaDB vector database for efficient retrieval, generates training examples with explanations for fake review detection, and saves them for model fine-tuning.

In [None]:
# Phase 2: Feature Engineering & RAG Preparation

import pandas as pd
import chromadb
from sentence_transformers import SentenceTransformer

# Load synthetic datasets
# Read the train/test splits back from the CSVs written in Phase 1.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('./synthetic_train.csv', './synthetic_test.csv')
)

print(f"Loaded training: {len(train_df)}, testing: {len(test_df)} samples")

In [None]:
# Create product profiles
# Three parallel lists (same order): profile text, string id, and metadata dict per product.
product_profiles, product_ids, product_metadatas = [], [], []

for _, product in product_info_clean.iterrows():
    ingredients = product['ingredients'] if pd.notna(product['ingredients']) else 'Not Listed'
    highlights = product['highlights'] if pd.notna(product['highlights']) else 'None'

    profile_lines = [
        f"Product: {product['product_name']}",
        f"Brand: {product['brand_name']}",
        f"Category: {product['primary_category']}",
        f"Price: ${product['price_usd']:.2f}",
        f"Ingredients: {ingredients}",
        f"Highlights: {highlights}",
    ]
    # strip() trims any whitespace at either end of the assembled profile text.
    product_profiles.append("\n".join(profile_lines).strip())

    product_ids.append(str(product['product_id']))
    product_metadatas.append({
        "product_name": product['product_name'],
        "brand_name": product['brand_name'],
        "category": product['primary_category'],
        "price": float(product['price_usd']),
    })

print(f"Created {len(product_profiles)} product profiles")

In [None]:
# Setup ChromaDB
# Open (or create) an on-disk ChromaDB store under ./chroma_data and fetch the
# "product_profiles" collection, creating it on first use.
client = chromadb.PersistentClient(path="./chroma_data")
product_profile_collection = client.get_or_create_collection(
    name="product_profiles",
    # Requests cosine distance for the collection's HNSW index (ChromaDB metadata convention).
    metadata={"hnsw:space": "cosine"}
)

In [None]:
# Embed and store product profiles
# Only populate the collection when it is empty, so re-running the cell does not re-add rows.
if product_profile_collection.count() != 0:
    print("Vector database already exists.")
else:
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
    print("Generating embeddings...")
    profile_embeddings = embedding_model.encode(product_profiles, show_progress_bar=True)

    batch_size = 5000
    _total_profiles = len(product_profiles)
    print(f"Storing in batches of {batch_size}...")
    for i in range(0, _total_profiles, batch_size):
        batch_end = min(i + batch_size, _total_profiles)
        _window = slice(i, batch_end)

        # ids, embeddings, documents and metadata are sliced with the same window so they stay aligned.
        product_profile_collection.add(
            ids=product_ids[_window],
            embeddings=profile_embeddings[_window].tolist(),
            documents=product_profiles[_window],
            metadatas=product_metadatas[_window],
        )
        print(f"Added {batch_end}/{_total_profiles} profiles")
    print("Vector database initialized.")

In [None]:
# Test retrieval
# Smoke test: ask the collection for the 2 closest profiles to a sample query.
# The result is stored in test_profile but is not printed or checked here.
test_profile = product_profile_collection.query(
    query_texts=["A hydrating moisturizer for dry skin"],
    n_results=2
)
print("Test retrieval completed.")

In [None]:
# Check if training examples already exist
import os
# Reuse a previously saved training_examples.json when it exists.
# NOTE(review): the creation cells that follow run unconditionally and do not consult this check.
if not os.path.exists('./training_examples.json'):
    print("Training examples not found, proceeding with creation...")
else:
    print("Training examples already exist, skipping creation.")
    # Load existing for verification
    import json
    with open('./training_examples.json', 'r') as examples_file:
        training_examples = json.load(examples_file)
    print(f"Loaded {len(training_examples)} existing training examples")

In [None]:
# Prepare training data for examples
# Make both text columns NaN-free strings (presumably empty values come back from CSV as NaN).
for _text_col in ('review_text', 'product_info'):
    train_df[_text_col] = train_df[_text_col].fillna('').astype(str)

In [None]:
# Define heuristic explanation generation function for training examples
def generate_heuristic_explanation(row):
    """Return a templated explanation string for one labelled example.

    Args:
        row: Mapping-like object supporting ``.get`` with keys 'is_fake',
            'review_text' and 'product_info'.

    Returns:
        A fixed "authentic" sentence when ``is_fake`` is 0; otherwise the space-joined
        keyword-mismatch sentences that apply, or a generic mismatch sentence when none do.
    """
    if int(row.get('is_fake', 0)) == 0:
        return "This review matches the product information and appears authentic."

    review = str(row.get('review_text', '')).lower()
    product = str(row.get('product_info', '')).lower()

    def mismatch(review_keyword, product_keyword):
        # True when the review mentions one keyword while the product text mentions the other.
        return review_keyword in review and product_keyword in product

    reasons = []

    # Category mismatch (only one direction is reported).
    if mismatch('skincare', 'hair'):
        reasons.append("Review discusses skincare but product is for hair.")
    elif mismatch('hair', 'skincare'):
        reasons.append("Review discusses hair but product is for skincare.")

    # Ingredient claim mismatch.
    if mismatch('natural', 'chemical'):
        reasons.append("Review praises natural ingredients but product contains chemicals.")

    # Effect mismatch (only one direction is reported).
    if mismatch('drying', 'hydrating'):
        reasons.append("Review mentions drying effects but product is hydrating.")
    elif mismatch('hydrating', 'drying'):
        reasons.append("Review mentions hydrating effects but product may be drying.")

    if reasons:
        return " ".join(reasons)
    return "Review appears mismatched with product information."

In [None]:
# Create training examples
# One dict per training row: the two text inputs, the integer label, and a templated explanation.
training_examples = []
for _, row in train_df.iterrows():
    # A failure in the heuristic for one row is recorded in the explanation text
    # instead of aborting the whole loop.
    try:
        explanation = generate_heuristic_explanation(row)
    except Exception as e:
        explanation = f"No explanation available (error generating heuristic): {e}"

    training_examples.append({
        "product_info": row['product_info'],
        "review_text": row['review_text'],
        "label": int(row['is_fake']),
        "explanation_template": explanation,
    })

print(f"Created {len(training_examples)} training examples")

In [None]:
# Save training examples
import json

# Serialise first, then write the indented JSON text to disk in one call.
_examples_json = json.dumps(training_examples, indent=2)
with open('./training_examples.json', 'w') as examples_file:
    examples_file.write(_examples_json)

print("Training examples saved.")

In [None]:
# Show sample training examples
# Print the label and the first 50 characters of the explanation for the first two examples.
for sample in training_examples[:2]:
    label, preview = sample['label'], sample['explanation_template'][:50]
    print(f"Label: {label}, Explanation: {preview}...")

print("Phase 2 completed: Product profiles in ChromaDB, training examples prepared.")

In [None]:
# IPython shell magic: installs the third-party libraries needed by the Phase 3 cells.
# NOTE(review): rouge-score is installed here but no use of it is visible in this part of the notebook.
!pip install transformers torch datasets chromadb sentence-transformers rouge-score

# Phase 3: RoBERTa Binary Classification + Dual-LLM Explanations (Optimized)

This phase implements a two-stage pipeline for fake review detection with speed optimizations:

**Stage 1: RoBERTa Binary Classifier (Optimized)**
- Uses RoBERTa-base optimized for classification tasks
- Input: "Product information: [product info]\nReview: [text]\n\nIs this review authentic for this product?"
- Output: Direct class logits (0=real, 1=fake)
- **Speed Optimizations:**
  - Batch size: 64 (up from 16)
  - Mixed precision: fp16=True
  - Gradient checkpointing enabled
  - Pre-tokenized dataset caching
  - Parallel data loading (num_workers=4)
  - Reduced max_length=256 (from 512)
  - RAG queries removed from training loop

**Stage 2: GPT-2 Explanation Generation**
- Separate LLM for generating explanations post-classification
- Uses RAG context from ChromaDB for fake reviews
- Prompt: "Review: [text] Product: [info] Context: [similar products] Explain why this review might be fake:"
- Benefits: Isolated explanation task, better quality control

**Key Improvements over T5:**
- ✓ Direct logit predictions → accurate metrics
- ✓ Training time: ~30-45 min (down from 3+ hours)
- ✓ Production-ready binary classification
- ✓ Separate explanation generation for better control
- ✓ Memory efficient with gradient checkpointing

In [None]:
# Phase 3: RoBERTa Binary Classification + Dual-LLM Explanations (Optimized)

import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from datasets import Dataset, load_from_disk, DatasetDict
from torch.utils.data import DataLoader
from tqdm import tqdm
import numpy as np
import json
from datetime import datetime
import chromadb
import os

# Device
# Use the GPU when one is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

if device.type == "cuda":
    # Release cached GPU memory before the model is loaded.
    torch.cuda.empty_cache()
    print("GPU cache cleared")

# =============================================================================
# CONSOLIDATED FUNCTION DEFINITIONS FOR PHASE 3 (refactored)
# - Extracted small helpers to reduce duplication
# - Clear docstrings and return contracts added
# =============================================================================

class RobertaDatasetFormatter:
    """Build and tokenize the text inputs fed to the RoBERTa classifier.

    Attributes are set in ``__init__``:
    - tokenizer: callable used by ``tokenize_function``
    - max_input_length: ``max_length`` passed to the tokenizer (default 256)
    """

    def __init__(self, tokenizer, max_input_length=256):
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length

    def format_input(self, product_info, review_text):
        """Combine product info and review text into the model's prompt string."""
        prompt_lines = (
            f"Product information: {product_info}",
            f"Review: {review_text}",
            "",
            "Is this review authentic for this product?",
        )
        return "\n".join(prompt_lines)

    def tokenize_function(self, examples):
        """Tokenize ``examples["input_text"]`` and attach ``examples["label"]`` as "labels".

        Inputs are truncated and padded to ``max_input_length``. Returns whatever the
        tokenizer returned, with the extra "labels" entry added.
        """
        texts, targets = examples["input_text"], examples["label"]
        encoded = self.tokenizer(
            texts,
            max_length=self.max_input_length,
            truncation=True,
            padding="max_length",
        )
        encoded["labels"] = targets
        return encoded


def prepare_training_data_roberta(examples, formatter):
    """Turn training-example dicts into parallel lists of model inputs and labels.

    Args:
        examples: Sequence of dicts with 'product_info', 'review_text' and 'label'.
        formatter: Object exposing ``format_input(product_info, review_text)``.

    Returns:
        (inputs, labels): list[str] of formatted prompts and list[int] of labels.
        A progress line is printed every 100 examples.
    """
    print("Preparing training data for RoBERTa...")
    inputs, labels = [], []
    for position, example in enumerate(examples):
        if position % 100 == 0:
            print(f"  Processed {position}/{len(examples)} examples")
        inputs.append(formatter.format_input(example['product_info'], example['review_text']))
        labels.append(int(example['label']))
    return inputs, labels


def compute_metrics(eval_pred):
    """Compute binary classification metrics from ``(logits, labels)``.

    Returns a dict with float values for "accuracy", "precision", "recall" and "f1".
    "auc" is a float when it can be computed and None when its computation raises.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    scores = {
        "accuracy": float(accuracy_score(labels, preds)),
        "precision": float(precision_score(labels, preds, zero_division=0)),
        "recall": float(recall_score(labels, preds, zero_division=0)),
        "f1": float(f1_score(labels, preds, zero_division=0)),
    }

    # AUC is scored on the softmax probability of class 1.
    try:
        class1_probs = torch.softmax(torch.tensor(logits), dim=1)[:, 1].numpy()
        auc = roc_auc_score(labels, class1_probs)
    except Exception:
        auc = None

    scores["auc"] = None if auc is None else float(auc)
    return scores

# ---------- Inference helper to avoid duplicated code ----------

def _run_model_inference(tokenizer, model, input_texts, device, max_length=256):
    """Tokenize ``input_texts``, run ``model`` once on the whole list, and summarise the output.

    Args:
        tokenizer: Callable returning a mapping of tensors for the given texts.
        model: Object with ``eval()`` whose call result exposes ``.logits``.
        input_texts: list[str] to classify in a single forward pass.
        device: Device the input tensors are moved to.
        max_length: Truncation length passed to the tokenizer.

    Returns:
        (preds, confidences, logits) as NumPy arrays: the argmax class per row, the
        softmax probability of that class, and the raw logits.
    """
    encoded = tokenizer(input_texts, return_tensors="pt", max_length=max_length, truncation=True, padding=True)
    batch = {name: tensor.to(device) for name, tensor in encoded.items()}

    model.eval()
    with torch.no_grad():
        logits = model(**batch).logits.detach().cpu().numpy()

    # Probabilities are computed from the CPU copy of the logits.
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()
    preds = np.argmax(logits, axis=1)
    row_positions = np.arange(len(preds))
    confidences = probs[row_positions, preds]
    return preds, confidences, logits

# ---------- Classification wrappers (single-instance and batch) ----------

def classify_review(product_info, review_text, tokenizer, model, formatter, device):
    """Classify one review.

    Returns:
        (verdict, confidence): "FAKE" when the predicted class is 1, otherwise "REAL",
        together with the confidence of that prediction as a float.
    """
    prompt = formatter.format_input(product_info, review_text)
    labels, scores, _ = _run_model_inference(tokenizer, model, [prompt], device)
    verdict = "FAKE" if int(labels[0]) == 1 else "REAL"
    return verdict, float(scores[0])


def classify_review_roberta_batch(product_infos, review_texts, tokenizer, model, formatter, device):
    """Classify several reviews in one model call.

    ``product_infos`` and ``review_texts`` are parallel lists, paired element by element.

    Returns:
        (pred_labels, confidences) as plain Python lists.
    """
    input_texts = list(map(formatter.format_input, product_infos, review_texts))
    preds, confidences, _ = _run_model_inference(tokenizer, model, input_texts, device)
    return preds.tolist(), confidences.tolist()

# ---------- RAG and explanation generation helpers (kept as before) ----------

def get_rag_context(review_text, product_info=""):
    """Query the product profile collection with the review text and return context text.

    Three results are requested and the first two documents are joined with a space.
    ``product_info`` is accepted but not used. On any exception an
    "Error retrieving context: ..." string is returned instead of raising.
    """
    try:
        results = product_profile_collection.query(query_texts=[review_text], n_results=3)
        if not results.get('documents'):
            return ""
        top_documents = results['documents'][0][:2]
        return " ".join(top_documents)
    except Exception as e:
        return f"Error retrieving context: {str(e)}"


def generate_explanation_with_gpt2(product_info, review_text, prediction, rag_context):
    """Generate a free-text explanation from product info, review text and retrieved context.

    The prompt asks why the review might be 'fake' when ``prediction == 1`` and 'real'
    otherwise. Returns the generated continuation (prompt tokens removed, stripped), or
    "Unable to generate explanation." when that continuation is empty.
    """
    verdict = 'fake' if prediction == 1 else 'real'
    prompt = (
        f"Product: {product_info}\nReview: {review_text}\nContext: {rag_context}\n\n"
        f"Explain why this review might be {verdict}:"
    )

    encoded = gpt2_tokenizer(prompt, return_tensors='pt', max_length=512, truncation=True, padding=True)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
    prompt_length = encoded['input_ids'].shape[1]

    with torch.no_grad():
        outputs = gpt2_model.generate(
            **encoded,
            max_length=prompt_length + 100,  # allow up to 100 tokens beyond the prompt
            num_return_sequences=1,
            temperature=0.7,
            do_sample=True,
            pad_token_id=gpt2_tokenizer.eos_token_id,
        )

    # Decode only the tokens that follow the prompt.
    generated = gpt2_tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()
    return generated or "Unable to generate explanation."


def generate_explanation(review_text, prediction, confidence):
    """Generate a free-text explanation from the review text, prediction and confidence.

    ``prediction == 0`` is worded as Real/real in the prompt, anything else as Fake/fake.
    Returns the generated continuation with the prompt tokens removed, stripped of
    surrounding whitespace (possibly an empty string).
    """
    is_real = prediction == 0
    title_label = 'Real' if is_real else 'Fake'
    lower_label = 'real' if is_real else 'fake'
    prompt = (
        f"Review: {review_text}\nPrediction: {title_label}\nConfidence: {confidence:.3f}\n\n"
        f"Explain why this review is predicted as {lower_label}:"
    )

    encoded = gpt2_tokenizer(prompt, return_tensors='pt', max_length=512, truncation=True, padding=True)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
    prompt_length = encoded['input_ids'].shape[1]

    with torch.no_grad():
        outputs = gpt2_model.generate(
            **encoded,
            max_length=prompt_length + 100,  # allow up to 100 tokens beyond the prompt
            num_return_sequences=1,
            temperature=0.7,
            do_sample=True,
            pad_token_id=gpt2_tokenizer.eos_token_id,
        )

    # Decode only the tokens that follow the prompt.
    generated = gpt2_tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return generated.strip()


def explain_prediction(review_text, prediction, confidence):
    """Wrap ``generate_explanation`` so that it never raises.

    Returns the generated text, "Unable to generate explanation." when it is empty,
    or an "Error generating explanation: ..." string when generation raises.
    """
    try:
        text = generate_explanation(review_text, prediction, confidence)
    except Exception as e:
        return f"Error generating explanation: {str(e)}"
    if text:
        return text
    return "Unable to generate explanation."

# Batch processing helpers and main evaluation orchestration remain unchanged in contract

def setup_batch_evaluation(test_df, batch_size=32):
    """Create the empty accumulators for a batch evaluation run and log its size.

    Returns:
        (predictions, ground_truth, explanations, confidences, batch_size) where the
        first four items are new, independent empty lists.
    """
    predictions, ground_truth, explanations, confidences = [], [], [], []
    print(f"Setting up batch evaluation with batch size {batch_size}")
    print(f"Total test samples: {len(test_df)}")
    return predictions, ground_truth, explanations, confidences, batch_size


def get_rag_contexts_for_fake_reviews(fake_reviews):
    """Return one retrieved context string per review text, in the same order."""
    return [get_rag_context(review_text, "") for review_text in fake_reviews]


def generate_explanations_for_fake_reviews(batch_df, fake_indices, batch_rag_contexts):
    """Generate one GPT-2 explanation for each row predicted as fake.

    Args:
        batch_df: DataFrame for the current batch with 'product_info' and 'review_text'.
        fake_indices: Positional row indices within ``batch_df``.
        batch_rag_contexts: Context strings aligned with ``fake_indices`` by position.

    Returns:
        list[str] of explanations in the order of ``fake_indices``.
    """
    def explain(context_position, row_position):
        row = batch_df.iloc[row_position]
        # The prediction argument is fixed to 1 because every row handled here was flagged as fake.
        return generate_explanation_with_gpt2(
            row['product_info'],
            row['review_text'],
            1,
            batch_rag_contexts[context_position],
        )

    return [explain(position, row_idx) for position, row_idx in enumerate(fake_indices)]


def combine_batch_explanations(batch_df, batch_predictions, fake_indices, fake_explanations):
    """Build the per-row explanation list for a batch.

    Rows whose position is in ``fake_indices`` receive the next entry from
    ``fake_explanations`` (consumed in ascending row order); every other row gets a fixed
    "authentic" sentence. ``batch_predictions`` is accepted but not used.
    """
    authentic_message = "This review appears authentic and matches the product information."
    combined = []
    consumed = 0
    for position in range(len(batch_df)):
        if position not in fake_indices:
            combined.append(authentic_message)
            continue
        combined.append(fake_explanations[consumed])
        consumed += 1
    return combined


def evaluate_model(test_df, model, tokenizer, chromadb_client, formatter, batch_size=32):
    """Classify ``test_df`` in batches and attach an explanation to every row.

    Each batch is classified with ``classify_review_roberta_batch``. Rows
    predicted as class 1 get a RAG context and a generated explanation; all
    other rows get a fixed "authentic" message.

    ``chromadb_client`` is accepted but not used in this function, and
    ``device`` is read from the enclosing (notebook-global) scope.

    Returns:
        ``(predictions, ground_truth, explanations, confidences)`` as flat lists
        covering every row of ``test_df``; ``ground_truth`` comes from the
        ``is_fake`` column cast to int.
    """
    predictions, ground_truth, explanations, confidences, _ = setup_batch_evaluation(test_df, batch_size)
    print("Starting batch evaluation...")
    authentic_message = "This review appears authentic and matches the product information."
    total_rows = len(test_df)
    for batch_start in tqdm(range(0, total_rows, batch_size), desc="Processing batches"):
        batch_stop = min(batch_start + batch_size, total_rows)
        chunk = test_df.iloc[batch_start:batch_stop]

        # Fall back to empty product descriptions when the column is absent.
        if 'product_info' in chunk.columns:
            product_infos = chunk['product_info'].tolist()
        else:
            product_infos = [''] * len(chunk)
        review_texts = chunk['review_text'].tolist()

        chunk_preds, chunk_confs = classify_review_roberta_batch(
            product_infos, review_texts, tokenizer, model, formatter, device
        )

        flagged = [pos for pos, label in enumerate(chunk_preds) if label == 1]
        if not flagged:
            chunk_explanations = [authentic_message] * len(chunk)
        else:
            flagged_reviews = chunk.iloc[flagged]['review_text'].tolist()
            contexts = get_rag_contexts_for_fake_reviews(flagged_reviews)
            flagged_explanations = generate_explanations_for_fake_reviews(chunk, flagged, contexts)
            chunk_explanations = combine_batch_explanations(chunk, chunk_preds, flagged, flagged_explanations)

        predictions.extend(chunk_preds)
        confidences.extend(chunk_confs)
        explanations.extend(chunk_explanations)
        ground_truth.extend(chunk['is_fake'].astype(int).tolist())
    return predictions, ground_truth, explanations, confidences

# End of consolidated/refactored cell


In [None]:
# Enhanced compute_metrics that includes per-class precision/recall and confusion matrix
# This will help monitor whether both classes are being predicted during training
from sklearn.metrics import precision_score, recall_score, confusion_matrix

def compute_metrics(eval_pred):
    """Compute overall and per-class metrics for a binary classifier.

    Args:
        eval_pred: A ``(predictions, labels)`` pair; ``predictions`` holds
            per-class scores and is reduced with ``argmax`` over axis 1.

    Returns:
        A dict with ``accuracy``, ``precision``, ``recall`` and ``f1`` (as
        returned by the scikit-learn scorers with their default averaging),
        per-class precision/recall for classes 0 and 1, and the confusion
        matrix as a nested list (``None`` if it could not be computed).
    """
    predictions, labels = eval_pred
    preds = np.argmax(predictions, axis=1)
    # Pin the class order so per-class arrays and the confusion matrix keep a
    # fixed shape even when a class is missing from both labels and
    # predictions. Without this, a lone class 1 would be reported as class 0.
    class_ids = [0, 1]

    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, zero_division=0)
    recall = recall_score(labels, preds, zero_division=0)
    f1 = f1_score(labels, preds, zero_division=0)

    # Per-class metrics (best effort: monitoring must not abort training)
    try:
        precision_per_class = precision_score(
            labels, preds, labels=class_ids, average=None, zero_division=0
        )
        recall_per_class = recall_score(
            labels, preds, labels=class_ids, average=None, zero_division=0
        )
        prec_c0, prec_c1 = (float(v) for v in precision_per_class)
        rec_c0, rec_c1 = (float(v) for v in recall_per_class)
    except Exception:
        prec_c0 = prec_c1 = rec_c0 = rec_c1 = 0.0

    # Confusion matrix as JSON-serializable list (always 2x2 thanks to class_ids)
    try:
        cm = confusion_matrix(labels, preds, labels=class_ids).tolist()
    except Exception:
        cm = None

    return {
        "accuracy": float(accuracy),
        "precision": float(precision),
        "recall": float(recall),
        "f1": float(f1),
        "precision_class_0": prec_c0,
        "precision_class_1": prec_c1,
        "recall_class_0": rec_c0,
        "recall_class_1": rec_c1,
        "confusion_matrix": cm,
    }

In [None]:
# Data loading and preparation
# Read the JSON explicitly as UTF-8 so the result does not depend on the
# platform's default text encoding.
with open('./training_examples.json', 'r', encoding='utf-8') as f:
    training_examples = json.load(f)

print(f"Loaded {len(training_examples)} training examples")

# Open the on-disk Chroma store and fetch the existing product profile collection.
client = chromadb.PersistentClient(path="./chroma_data")
product_profile_collection = client.get_collection(name="product_profiles")

In [None]:
# Debug: peek at the first few training examples (text truncated to 100 chars)
print("Sample training examples:")
for i, ex in enumerate(training_examples[:5]):
    sample_label = ex['label']
    product_preview = ex['product_info'][:100]
    review_preview = ex['review_text'][:100]
    print(f"Example {i}: Label={sample_label}, Product={product_preview}..., Review={review_preview}...")

# Label distribution: per-class counts and the fraction labelled 1 (fake)
labels = [example['label'] for example in training_examples]
debug_class_counts = np.bincount(labels)
print(f"\nLabel distribution: {debug_class_counts} (0=real, 1=fake)")
debug_fake_ratio = np.mean(labels)
print(f"Label ratio: {debug_fake_ratio:.3f} fake")

In [None]:
# DEBUG: build a small class-balanced sample (not used for training unless the
# override at the bottom is re-enabled)
debug_banner = "=" * 50
print("\n" + debug_banner)
print("DEBUG: Creating small balanced dataset")
print(debug_banner)

debug_size = 1000  # Target size
real_examples = [ex for ex in training_examples if ex['label'] == 0]
fake_examples = [ex for ex in training_examples if ex['label'] == 1]

# The balanced size is capped by the rarer class
min_class = min(len(real_examples), len(fake_examples))
if min_class > 0:
    actual_half = min(min_class, debug_size // 2)
    np.random.seed(42)  # fixed seed so the debug sample is reproducible
    debug_real = np.random.choice(real_examples, size=actual_half, replace=False).tolist()
    debug_fake = np.random.choice(fake_examples, size=actual_half, replace=False).tolist()

    debug_training_examples = debug_real + debug_fake
    np.random.shuffle(debug_training_examples)

    print(f"Debug dataset: {len(debug_training_examples)} examples ({actual_half} real, {actual_half} fake)")

    # COMMENTED OUT: Override training_examples for debugging
    # training_examples = debug_training_examples
else:
    print("Not enough examples to create a balanced debug dataset; skipping debug sampling.")

print("Using FULL training dataset for production training...")


In [None]:
# Debug: Check what the formatted inputs look like
print("\nSample formatted inputs:")

# Ensure `tokenizer` is available. If it's missing, try to load a saved tokenizer from
# the local `./fake_review_detector_roberta` directory (present in the repo),
# otherwise fall back to the public 'roberta-base' tokenizer.
try:
    tokenizer
except NameError:
    try:
        from transformers import RobertaTokenizer
    except Exception as e:
        # Chain the original failure so the root cause stays in the traceback.
        raise ImportError("transformers not available: " + str(e)) from e
    import os
    model_dir = os.path.join('.', 'fake_review_detector_roberta')
    if os.path.isdir(model_dir):
        tokenizer = RobertaTokenizer.from_pretrained(model_dir)
        print(f"Loaded tokenizer from {model_dir}")
    else:
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        print("Loaded tokenizer 'roberta-base' as fallback")

# Instantiate the formatter (assumes the class RobertaDatasetFormatter is defined earlier in the notebook)
try:
    formatter = RobertaDatasetFormatter(tokenizer)
except NameError as e:
    raise NameError("RobertaDatasetFormatter is not defined. Please run the cell that defines this class before running this debug cell.") from e

# Show a few formatted examples and tokenization result.
# The per-example names are deliberately distinct from `product_info`: cells
# share one global namespace, and Phase 0 keeps a DataFrame under that name
# which this loop must not overwrite with a string.
for i, ex in enumerate(training_examples[:3]):
    example_product_info = ex.get('product_info', '')
    example_review_text = ex.get('review_text', '')
    input_text = formatter.format_input(example_product_info, example_review_text)
    print(f"--- Example {i} ---")
    print(input_text)
    tokens = tokenizer.tokenize(input_text)
    print("Tokens:", tokens)


In [None]:
# Model and tokenizer setup: RoBERTa-base with a 2-label classification head
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Gradient checkpointing is enabled for memory efficiency
model.gradient_checkpointing_enable()

# Place the model on the selected device and report where its parameters live
model = model.to(device)
model_device = next(model.parameters()).device
print(f"Model moved to device: {model_device}")

# Formatter bound to the freshly loaded tokenizer
formatter = RobertaDatasetFormatter(tokenizer)

print(f"\nLoaded model: {model_name}")
print(f"Model device: {model_device}")

In [None]:
# Reuse a saved tokenized dataset when allowed, otherwise build, split,
# tokenize and save it. With force_regenerate=True the saved copy is ignored.
force_regenerate = True

tokenized_dataset_path = "./tokenized_roberta_dataset"
reuse_saved_tokenization = os.path.exists(tokenized_dataset_path) and not force_regenerate
if not reuse_saved_tokenization:
    # Turn the raw examples into formatted input texts plus labels
    train_inputs, train_labels = prepare_training_data_roberta(training_examples, formatter)
    train_dataset = Dataset.from_dict({"input_text": train_inputs, "label": train_labels})
    print(f"Training dataset size: {len(train_dataset)}")

    # Train/validation split
    split_banner = "=" * 70
    print("\n" + split_banner)
    print("Creating Train/Validation Split")
    print(split_banner)

    # Stratify only when every observed class index has at least two examples
    try:
        label_counts = np.bincount(train_labels)
        use_stratify = label_counts.min() >= 2
    except Exception:
        use_stratify = False

    if use_stratify:
        print("Using stratified split for tokenization precompute")
    else:
        print("Not enough examples for stratified split; using random split")
    stratify_labels = train_labels if use_stratify else None

    # 80/20 split over row indices with a fixed seed
    train_split_indices, val_split_indices = train_test_split(
        range(len(train_dataset)),
        test_size=0.2,
        stratify=stratify_labels,
        random_state=42,
    )

    train_split = train_dataset.select(train_split_indices)
    val_split = train_dataset.select(val_split_indices)

    print(f"Training split size: {len(train_split)}")
    print(f"Validation split size: {len(val_split)}")

    # Tokenize both splits up front so training does not repeat the work
    print("\nTokenizing datasets (this may take a moment)...")
    tokenized_train_dataset = train_split.map(formatter.tokenize_function, batched=True, num_proc=1)
    tokenized_val_dataset = val_split.map(formatter.tokenize_function, batched=True, num_proc=1)

    # Persist both splits together for later runs
    tokenized_datasets = DatasetDict({
        "train": tokenized_train_dataset,
        "validation": tokenized_val_dataset,
    })
    tokenized_datasets.save_to_disk(tokenized_dataset_path)
    print(f"Tokenized datasets saved to {tokenized_dataset_path}")

    train_dataset = tokenized_train_dataset
    val_dataset = tokenized_val_dataset
else:
    print("Loading pre-tokenized dataset...")
    tokenized_datasets = load_from_disk(tokenized_dataset_path)
    train_dataset = tokenized_datasets["train"]
    val_dataset = tokenized_datasets["validation"]


In [None]:
# Training arguments and trainer setup (optimized for speed)
from sklearn.utils.class_weight import compute_class_weight

# Determine batch sizes depending on device
if device.type == "cpu":
    train_batch_size = 4
    eval_batch_size = 4
    grad_accum_steps = 4
else:
    train_batch_size = 64  # Increased from 16
    eval_batch_size = 64   # Increased from 16
    grad_accum_steps = 1   # Reduced from 2

# Compute class weights from the labels of the actual training split.
# Reading them from `train_dataset` (rather than the `train_labels` global)
# also works when the tokenized dataset was loaded from disk, where
# `train_labels` is never defined, and excludes validation rows.
try:
    weight_labels = np.asarray(list(train_dataset["label"]))
    class_weights = compute_class_weight(class_weight='balanced', classes=np.array([0, 1]), y=weight_labels)
    class_weights_dict = {int(c): float(w) for c, w in zip([0, 1], class_weights)}
    print(f"Computed class weights: {class_weights_dict}")
except Exception as e:
    # Best effort: training can still proceed with uniform weights.
    print(f"Warning: could not compute class weights (falling back to 1.0): {e}")
    class_weights = np.array([1.0, 1.0])

# Move class weights to device for loss calculation
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32).to(device)

# Use weighted loss via a custom Trainer to avoid modifying model code
from transformers import Trainer

class WeightedTrainer(Trainer):
    """Trainer whose classification loss is class-weighted cross-entropy.

    The weights come from the module-level ``class_weights_tensor``.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the weighted cross-entropy loss for one batch.

        Extra keyword arguments are accepted and ignored so the method keeps
        working when the Trainer passes framework-specific keywords such as
        ``num_items_in_batch``.

        Args:
            model: The model to run (it may be wrapped by the Trainer).
            inputs: Batch dict forwarded to the model; ``inputs["labels"]``
                holds the targets when present.
            return_outputs: When true, return ``(loss, outputs)``.

        Raises:
            ValueError: If labels are present but the model output has no
                ``logits``.
        """
        labels = inputs.get("labels")

        # Single forward pass; the outputs are reused for loss and return value.
        outputs = model(**inputs)

        if labels is None:
            # No targets: use the model's own loss if it produced one,
            # otherwise fall back to a zero tensor on the model's device.
            loss = getattr(outputs, "loss", None)
            if loss is None:
                loss = torch.tensor(0.0, device=next(model.parameters()).device)
        else:
            logits = getattr(outputs, "logits", None)
            if logits is None:
                # Re-running the model cannot produce different outputs, so
                # fail with a clear message instead of a second forward pass.
                raise ValueError("Model output has no `logits`; cannot compute weighted loss.")
            loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights_tensor.to(logits.device))
            # Take the class count from the logits themselves so this also
            # works when `model` is a wrapper without a `.config` attribute.
            loss = loss_fct(logits.view(-1, logits.shape[-1]), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

# TrainingArguments - prefer accuracy or a composite metric for early stopping when F1 may be zero early
# Mixed precision and pinned memory only make sense on CUDA; the batch-size
# cell above explicitly supports CPU, so gate both on the device type instead
# of hard-coding True.
use_cuda = device.type == "cuda"

training_args = TrainingArguments(
    output_dir="./fake_review_detector_roberta",
    num_train_epochs=3,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    gradient_accumulation_steps=grad_accum_steps,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=1,  # Changed from 50 to 1 to see training loss
    warmup_steps=200,
    weight_decay=0.01,
    logging_dir="./logs",
    seed=42,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # Use accuracy (or 'f1' once F1 becomes meaningful)
    greater_is_better=True,
    # Speed optimizations
    fp16=use_cuda,  # Mixed precision training (CUDA only)
    dataloader_num_workers=4,  # Parallel data loading
    dataloader_pin_memory=use_cuda,  # Faster GPU transfer (CUDA only)
)

# Instantiate our WeightedTrainer instead of the default Trainer
# Use a plain EarlyStoppingCallback() without keyword args to avoid mismatches across transformer versions
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback()]
)

print("Trainer initialized with weighted loss and metric_for_best_model=\"accuracy\"")

In [None]:
# Debug: report how many rows each split holds
print("Dataset sizes:")
print(f"Train dataset: {len(train_dataset)} samples")
print(f"Validation dataset: {len(val_dataset)} samples")
print(f"Test dataframe: {len(test_df)} samples")


def _row_labels(dataset):
    """Collect the 'label' field of every row, in order."""
    return [row['label'] for row in dataset]


# Per-split label lists (train_labels / val_labels are rebound to the split labels here)
train_labels = _row_labels(train_dataset)
val_labels = _row_labels(val_dataset)
test_labels = test_df['is_fake'].astype(int).tolist()

label_report = (("\nTrain", train_labels), ("Val", val_labels), ("Test", test_labels))
for split_prefix, split_labels in label_report:
    print(f"{split_prefix} labels distribution: {np.bincount(split_labels)} (0=real, 1=fake)")

In [None]:
# Check data quality: duplicates and label leakage
print("\nData Quality Checks:")

# Recover the review texts that actually ended up in each split. The split is
# shuffled, so positional slices of `training_examples` do NOT correspond to
# the train/validation rows; use the saved split indices when they are available.
have_split_indices = 'train_split_indices' in globals() and 'val_split_indices' in globals()
if have_split_indices and len(train_split_indices) + len(val_split_indices) == len(training_examples):
    # NOTE(review): assumes the formatted inputs keep one row per training
    # example, in the same order as `training_examples` — confirm.
    train_texts = [training_examples[i]['review_text'] for i in train_split_indices]
    val_texts = [training_examples[i]['review_text'] for i in val_split_indices]
else:
    print("Warning: split indices unavailable or misaligned; falling back to positional slices, "
          "which do not reflect a shuffled train/validation split.")
    train_texts = [ex['review_text'] for ex in training_examples[:len(train_dataset)]]
    val_texts = [ex['review_text'] for ex in training_examples[len(train_dataset):len(train_dataset)+len(val_dataset)]]
test_texts = test_df['review_text'].tolist()

print(f"Train duplicates: {len(train_texts) - len(set(train_texts))}")
print(f"Val duplicates: {len(val_texts) - len(set(val_texts))}")
print(f"Test duplicates: {len(test_texts) - len(set(test_texts))}")

# Check for overlap between splits (exact text matches)
train_set = set(train_texts)
val_set = set(val_texts)
test_set = set(test_texts)

train_val_overlap = len(train_set.intersection(val_set))
train_test_overlap = len(train_set.intersection(test_set))
val_test_overlap = len(val_set.intersection(test_set))

print(f"Train-Val overlap: {train_val_overlap}")
print(f"Train-Test overlap: {train_test_overlap}")
print(f"Val-Test overlap: {val_test_overlap}")

# Check label balance
print(f"\nLabel balance check:")
print(f"Train fake ratio: {np.mean([ex['label'] for ex in train_dataset]):.3f}")
print(f"Val fake ratio: {np.mean([ex['label'] for ex in val_dataset]):.3f}")
print(f"Test fake ratio: {test_df['is_fake'].mean():.3f}")

In [None]:
# Start training
import traceback
print("\nStarting RoBERTa training (optimized)...")
try:
    train_result = trainer.train()
    best_metric = getattr(trainer.state, 'best_metric', None)
    if best_metric is None:
        print("Training completed! No best_metric available in trainer.state.")
    else:
        # Prefer a 4-decimal rendering; fall back to a plain print if the
        # value cannot be formatted as a float.
        try:
            print(f"Training completed! Best metric: {best_metric:.4f}")
        except Exception:
            print("Training completed! Best metric:", best_metric)
except Exception:
    print("Training failed with exception:")
    traceback.print_exc()
    # Surface the failure to the notebook after logging the traceback
    raise


In [None]:
# Persist the trained model and its tokenizer side by side
save_dir = "./fake_review_detector_roberta"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)
print(f"RoBERTa model saved successfully to '{save_dir}'")

# Phase 4: RAG Pipeline Integration

This phase integrates the RAG (Retrieval-Augmented Generation) pipeline to enhance explainability for fake review detection. Now using RoBERTa for classification and GPT-2 for explanation generation.

**Objective:** Enable explainability for fake reviews with dual-LLM approach

**Tasks:**
- Use RoBERTa for binary classification (0=real, 1=fake)
- For fake reviews: Query ChromaDB for semantically similar products
- Pass to GPT-2: "Review: [text] Product: [info] Context: [similar products] Explain why this review might be fake:"
- Generate contextual explanations for mismatches

**Dual-LLM Benefits:**
- RoBERTa: Optimized for classification, direct logit outputs
- GPT-2: Specialized for text generation and explanations
- RAG: Provides product context for better mismatch detection
- Isolated tasks: Better performance and debugging

**Evaluation metrics:**
- Classification accuracy, precision, recall, F1-score
- ROUGE/BLEU scores for explanation quality
- Human evaluation of explanation coherence

> **Note:** The `get_rag_context()` function has moved to the consolidated definitions cell (Phase 3 imports).

## Phase 4 Summary

**Objective:** Implement end-to-end evaluation pipeline with RAG-augmented explanations

**Key Components:**
- **Batch Processing:** Efficient evaluation with configurable batch sizes (32)
- **RAG Integration:** Context retrieval for explanation generation using ChromaDB
- **GPT-2 Explanations:** Natural language explanations for detected fake reviews
- **Modular Design:** Separate functions for classification, context retrieval, and explanation generation

**Functions Created:**
- `setup_batch_evaluation()`: Initialize evaluation data structures
- `classify_review_roberta_batch()`: Classify reviews in batches
- `get_rag_contexts_for_fake_reviews()`: Retrieve RAG contexts for fake reviews
- `generate_explanations_for_fake_reviews()`: Generate GPT-2 explanations
- `combine_batch_explanations()`: Merge explanations for real and fake reviews
- `evaluate_model()`: Main orchestration function

**Performance Optimizations:**
- Batch processing reduces memory usage and improves speed
- Selective explanation generation (only for fake reviews)
- Efficient data structures for result collection

**Next Steps:** Phase 5 will focus on RoBERTa model evaluation and benchmarking.

# Phase 5: Testing & Validation

**Objective:** Evaluate end-to-end performance

**Tasks:**
- Run inference on test set - Get classifications and explanations
- Analyze failure cases - Where does the model misclassify or provide poor explanations?
- Benchmark against baselines - Compare to simple text similarity baselines (cosine similarity, TF-IDF, bag of words)
- Generate evaluation report - Document accuracy, explanation quality, computational costs

In [None]:
# Fresh accumulators for the RoBERTa evaluation run
roberta_predictions, roberta_confidences = [], []

# Number of test rows classified per batch
batch_size = 64
print(f"Starting RoBERTa evaluation with batch size {batch_size}")
print(f"Total test samples: {len(test_df)}")

In [None]:
# Execute batch evaluation
print("\n[1] RoBERTa Binary Classification - Predictions")

from tqdm import tqdm
import numpy as np

batch_size = 64  # Define batch size

# Start from empty result lists so re-running this cell does not append a
# second copy of the predictions to the previous run's results.
roberta_predictions = []
roberta_confidences = []

for start_idx in tqdm(range(0, len(test_df), batch_size), desc="Processing RoBERTa batches"):
    end_idx = min(start_idx + batch_size, len(test_df))
    eval_batch = test_df.iloc[start_idx:end_idx]  # slice once, reuse for both columns
    batch_product_infos = eval_batch['product_info'].tolist()
    batch_review_texts = eval_batch['review_text'].tolist()

    # Pass tokenizer, model, formatter, device per refactor
    batch_preds, batch_confs = classify_review_roberta_batch(batch_product_infos, batch_review_texts, tokenizer, model, formatter, device)
    roberta_predictions.extend(batch_preds)
    roberta_confidences.extend(batch_confs)

print(f"Completed RoBERTa evaluation: {len(roberta_predictions)} predictions generated")

# Ground truth is needed by the metrics cell as well, so define it unconditionally.
ground_truth = test_df['is_fake'].astype(int).tolist()

# `calculate_metrics` is defined in the following cell; on a fresh top-to-bottom
# run it does not exist yet, so skip the summary here instead of failing with a
# NameError after all predictions have been computed.
if 'calculate_metrics' in globals():
    roberta_metrics = calculate_metrics(np.array(roberta_predictions), np.array(ground_truth), np.array(roberta_confidences))

    print("\nRoBERTa Evaluation Metrics:")
    for metric, value in roberta_metrics.items():
        if value is not None:
            print(f"{metric.capitalize()}: {value:.4f}")
        else:
            print(f"{metric.capitalize()}: N/A")
else:
    print("\ncalculate_metrics is not defined yet; run the next cell to compute and print the metrics.")

# Debug: Check distributions
print(f"\nPredictions distribution: {np.bincount(roberta_predictions)} (0=real, 1=fake)")
print(f"Ground truth distribution: {np.bincount(ground_truth)} (0=real, 1=fake)")
print(f"Sample predictions: {roberta_predictions[:10]}")
print(f"Sample ground truth: {ground_truth[:10]}")


In [None]:
# Define calculate_metrics and print evaluation summary (added to resolve NameError from earlier cell)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def calculate_metrics(predictions, labels, confidences):
    """Summarise classification quality as a plain dict of floats.

    Accepts numpy arrays or lists. Precision, recall and F1 use weighted
    averaging with ``zero_division=0``. ``auc`` is ``None`` whenever
    ``roc_auc_score`` raises.

    Returns:
        Dict with keys ``accuracy``, ``precision``, ``recall``, ``f1_score``
        and ``auc``, in that order.
    """
    weighted = {'average': 'weighted', 'zero_division': 0}
    metrics = {
        'accuracy': float(accuracy_score(labels, predictions)),
        'precision': float(precision_score(labels, predictions, **weighted)),
        'recall': float(recall_score(labels, predictions, **weighted)),
        'f1_score': float(f1_score(labels, predictions, **weighted)),
    }
    # NOTE(review): roc_auc_score expects a score for the positive class; if
    # `confidences` is the probability of the *predicted* class instead, this
    # AUC is not meaningful — confirm what the classifier returns.
    try:
        metrics['auc'] = float(roc_auc_score(labels, confidences))
    except Exception:
        metrics['auc'] = None
    return metrics

# Recompute the metrics from results already in the kernel and print a summary;
# any failure is reported rather than raised.
try:
    roberta_metrics = calculate_metrics(
        np.array(roberta_predictions),
        np.array(ground_truth),
        np.array(roberta_confidences),
    )
    print("\nRoBERTa Evaluation Metrics:")
    for metric, value in roberta_metrics.items():
        if value is None:
            print(f"{metric.capitalize()}: N/A")
        else:
            print(f"{metric.capitalize()}: {value:.4f}")
    print(f"\nPredictions distribution: {np.bincount(roberta_predictions)} (0=real, 1=fake)")
    print(f"Ground truth distribution: {np.bincount(ground_truth)} (0=real, 1=fake)")
except Exception as e:
    print('Error computing/printing metrics:', e)


## Phase 5 Summary

**Objective:** Evaluate RoBERTa model performance on test data

**Key Components:**
- **Batch Inference:** Optimized RoBERTa classification with batch processing
- **Confidence Scores:** Probability-based confidence metrics for predictions
- **Performance Tracking:** Progress monitoring with tqdm for large datasets

**Functions Created:**
- `classify_review_roberta_batch()`: Batch classification using RoBERTa model
- Batch processing loop with configurable batch size (64)

**Technical Details:**
- **Input Formatting:** Uses `RobertaDatasetFormatter` for consistent input formatting
- **GPU Acceleration:** Leverages CUDA device for fast inference
- **Memory Efficient:** Processes data in batches to manage memory usage
- **Output:** Predictions (0=real, 1=fake) and confidence scores

**Results:**
- `roberta_predictions`: Binary classification results
- `roberta_confidences`: Confidence scores for each prediction
- Ready for comparison with ground truth labels

**Next Steps:** Compare RoBERTa performance against baseline methods and analyze results.

In [None]:
# Diagnostics cell: run multiple leakage and training checks (overlap, label dist, preds, chroma, correlations)
import hashlib
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.metrics import confusion_matrix, classification_report

print('Starting diagnostics checks...')

# helper
def hash_series_texts(s):
    """Map each non-null entry of a Series to the MD5 hex digest of its stripped text.

    Null entries are dropped first; the remaining values are converted to
    ``str``, stripped of surrounding whitespace and UTF-8 encoded before hashing.
    """
    def _digest(text):
        return hashlib.md5(text.strip().encode('utf-8')).hexdigest()

    return s.dropna().astype(str).map(_digest)

results = {}

# 1) Overlap checks using pandas DataFrames if available
try:
    if 'train_df' in globals() and 'val_df' in globals():
        h_train = set(hash_series_texts(train_df['review_text']))
        h_val = set(hash_series_texts(val_df['review_text']))
        txt_overlap = len(h_train & h_val)
        results['train_val_text_overlap'] = txt_overlap
        print('train/val exact text overlap:', txt_overlap)
    else:
        print('train_df or val_df not in globals(); skipping pandas text overlap check')
except Exception as e:
    print('Error computing pandas train/val text overlap:', e)

# If datasets Arrow splits exist (train_split, val_split)
try:
    if 'train_split' in globals() and 'val_split' in globals():
        def ds_text_hashes(ds, text_col=None):
            """Return the set of MD5 digests of the stripped texts in one column of `ds`.

            When `text_col` is not given, the first of 'review_text' /
            'input_text' present in the dataset is used: the splits built
            earlier in this notebook carry only 'input_text' and 'label', so
            hard-coding 'review_text' made this check fail every time.
            """
            if text_col is None:
                text_col = next((c for c in ('review_text', 'input_text') if c in ds.column_names), None)
            if text_col is None:
                raise KeyError(f"no text column found; available columns: {ds.column_names}")
            seen = set()
            for x in ds[text_col]:
                if x is None:
                    continue
                seen.add(hashlib.md5(str(x).strip().encode('utf-8')).hexdigest())
            return seen
        s1 = ds_text_hashes(train_split)
        s2 = ds_text_hashes(val_split)
        results['ds_train_val_text_overlap'] = len(s1 & s2)
        print('arrow train/val text overlap:', results['ds_train_val_text_overlap'])
    else:
        print('train_split/val_split not present; skipping Arrow overlap check')
except Exception as e:
    print('Error computing Arrow overlap:', e)

# Print existing precomputed overlap variables if present
for v in ['train_val_overlap','train_test_overlap','val_test_overlap']:
    if v in globals():
        print(f'{v} (existing variable) =', globals()[v])

# 2) ID overlaps: compare every train_df column whose name mentions 'id' or
# 'product' against the same column of val_df
try:
    if not ('train_df' in globals() and 'val_df' in globals()):
        print('train_df/val_df not available for ID overlap checks')
    else:
        id_cols = [c for c in train_df.columns if 'id' in c.lower() or 'product' in c.lower()]
        for c in id_cols:
            train_values = set(train_df[c].dropna().astype(str))
            val_values = set(val_df[c].dropna().astype(str))
            ov = len(train_values.intersection(val_values))
            print(f'ID overlap on {c}:', ov)
            results[f'id_overlap_{c}'] = ov
except Exception as e:
    print('Error computing ID overlaps:', e)

# 3) Label distributions
try:
    def print_label_dist_from_df(df, name='df'):
        """Print label counts and proportions for a DataFrame.

        Uses the 'label' column when present and otherwise 'is_fake' (the
        column the test DataFrame is read by elsewhere in this notebook), so a
        missing 'label' column no longer aborts the whole section.
        """
        label_col = next((c for c in ('label', 'is_fake') if c in df.columns), None)
        if label_col is None:
            print(f"{name}: no 'label' or 'is_fake' column; skipping label distribution")
            return
        vc = df[label_col].value_counts(dropna=False)
        print(f"{name} label counts:\n", vc.to_dict())
        print(f"{name} proportions:\n", (vc/vc.sum()).round(3).to_dict())
    if 'train_df' in globals():
        print_label_dist_from_df(train_df, 'train')
    if 'val_df' in globals():
        print_label_dist_from_df(val_df, 'val')
    if 'test_df' in globals():
        print_label_dist_from_df(test_df, 'test')
    # datasets
    if 'train_split' in globals():
        try:
            print('train_split label distribution (arrow):', Counter(train_split['label']))
        except Exception as e:
            # Report instead of silently hiding why the distribution is missing.
            print('Could not read train_split labels:', e)
    if 'val_split' in globals():
        try:
            print('val_split label distribution (arrow):', Counter(val_split['label']))
        except Exception as e:
            print('Could not read val_split labels:', e)
except Exception as e:
    print('Error printing label distributions:', e)

# 4) Prediction behavior: collapse to majority or balanced?
try:
    y_true = None
    y_pred = None
    aligned = None
    # possible sources of predictions
    if 'roberta_predictions' in globals() and len(roberta_predictions) > 0:
        y_pred = np.array(roberta_predictions)
    elif 'roberta_confidences' in globals() and len(roberta_confidences) > 0:
        arr = np.array(roberta_confidences)
        if arr.ndim == 2:
            y_pred = arr.argmax(axis=1)

    # Validation labels stay the default y_true (the majority-baseline check
    # further down falls back to them).
    if 'val_labels' in globals() and len(val_labels) > 0:
        y_true = np.array(val_labels)
    elif 'val_dataset' in globals():
        try:
            y_true = np.array(list(val_dataset['label']))
        except Exception as e:
            print('Could not read labels from val_dataset:', e)

    if y_pred is not None:
        # The predictions in this notebook are produced over test_df, so pair
        # them with test labels first and only use a label set whose length
        # matches; comparing against a different split is meaningless.
        label_candidates = []
        if 'ground_truth' in globals():
            label_candidates.append(('ground_truth', ground_truth))
        if 'test_df' in globals() and 'is_fake' in test_df.columns:
            label_candidates.append(("test_df['is_fake']", test_df['is_fake'].astype(int).tolist()))
        if y_true is not None:
            label_candidates.append(('validation labels', y_true))
        aligned = next(
            ((src, np.array(vals)) for src, vals in label_candidates if len(vals) == len(y_pred)),
            None,
        )

    if y_pred is not None and aligned is not None:
        label_source, y_true = aligned
        print('true labels taken from:', label_source)
        # minlength=2 keeps a zero count visible for a class that is never predicted
        pred_counts = np.bincount(y_pred, minlength=2)
        print('pred dist:', pred_counts)
        print('true dist:', np.bincount(y_true, minlength=2))
        print('confusion matrix:\n', confusion_matrix(y_true, y_pred))
        print(classification_report(y_true, y_pred, digits=4))
        # Without minlength, an all-class-0 prediction vector yields a
        # one-element bincount and the collapse went undetected.
        results['prediction_collapse'] = bool(pred_counts.min() == 0)
    elif y_pred is not None:
        print('predictions available but no label set of matching length; skipping prediction diagnostics')
    else:
        print('predictions or true labels not available for prediction diagnostics')
except Exception as e:
    print('Error computing prediction diagnostics:', e)

# 5) Chroma / retrieval DB sanity: count metadata ids in the collection and
# see how many coincide with val_df product ids
try:
    if 'product_profile_collection' not in globals():
        print('product_profile_collection not in globals(); skipping Chroma checks')
    else:
        coll = product_profile_collection
        try:
            # Pull everything and keep the 'id' entry of each metadata dict
            info = coll.get()
            metadatas = info.get('metadatas', [])
            coll_ids = {
                str(md['id'])
                for md in metadatas
                if isinstance(md, dict) and 'id' in md
            }
            print('Chroma collection metadata ids found:', len(coll_ids))
            # compare with val ids if exist
            if 'val_df' in globals():
                if 'product_id' in val_df.columns:
                    val_ids = set(val_df['product_id'].dropna().astype(str))
                else:
                    val_ids = set()
                print('overlap of Chroma collection with val_df product_id:', len(coll_ids & val_ids))
        except Exception as e:
            print('Could not access collection.get() result directly, trying safer paths:', e)
except Exception as e:
    print('Error during Chroma checks:', e)

# 6) Feature correlation checks: flag numeric train_df columns whose
# correlation with 'label' exceeds 0.6 in absolute value
try:
    if 'train_df' not in globals():
        print('train_df not available for correlation checks')
    else:
        num_cols = train_df.select_dtypes(include=['int','float']).columns.tolist()
        suspicious = []
        for c in num_cols:
            if c != 'label':
                corr = train_df[c].corr(train_df['label'])
                if pd.notna(corr) and abs(corr) > 0.6:
                    suspicious.append((c, corr))
        if not suspicious:
            print('No numeric features with correlation > 0.6 found in train_df')
        else:
            print('Highly correlated numeric features (possible leakage):', suspicious)
except Exception as e:
    print('Error computing feature correlations:', e)

# 7) Majority baseline: accuracy of always predicting the most frequent class
try:
    if 'val_df' in globals() and 'label' in val_df.columns:
        val_label_column = val_df['label']
        maj = val_label_column.mode()[0]
        maj_acc = (val_label_column == maj).mean()
        print('majority class in val:', maj, 'majority baseline accuracy:', round(maj_acc,4))
    elif y_true is not None:
        vals, counts = np.unique(y_true, return_counts=True)
        top = np.argmax(counts)
        maj = vals[top]
        maj_acc = counts[top] / counts.sum()
        print('majority class baseline (from y_true):', maj, maj_acc)
except Exception as e:
    print('Error computing majority baseline:', e)

# 8) Quick check for tokenization/preprocessing leakage: did tokenization use full dataset?
try:
    # Only reports which splits exist; it cannot prove or disprove leakage
    if 'tokenized_datasets' not in globals():
        print('tokenized_datasets not present; cannot check tokenization source programmatically')
    else:
        print('tokenized_datasets keys:', list(tokenized_datasets.keys()))
        print('Note: if tokenization/feature stats were computed on full corpus before splitting, that can leak. Check code cells where tokenizer or vectorizer is fit.')
except Exception as e:
    print('Error checking tokenization:', e)

# Turn the collected check results into a short list of findings
diag = []

text_overlap_found = (
    results.get('train_val_text_overlap', 0) > 0
    or results.get('ds_train_val_text_overlap', 0) > 0
)
if text_overlap_found:
    diag.append('DATA LEAKAGE: exact text overlap between train and validation detected.')

id_overlap_keys = [k for k in list(results.keys()) if k.startswith('id_overlap_')]
if any(results.get(k, 0) > 0 for k in id_overlap_keys):
    diag.append('POTENTIAL LEAKAGE: shared ids between train and validation (see id overlap counts).')

# prediction collapse check
if results.get('prediction_collapse', False):
    diag.append('MODEL COLLAPSE: predictions collapse to a single class (majority). Likely severe class imbalance or improper loss/labels.')

# class imbalance check (skipped silently when train_df has no usable labels)
try:
    if 'train_df' in globals():
        vc = train_df['label'].value_counts(normalize=True)
        min_frac = vc.min()
        if min_frac < 0.1:
            diag.append('CLASS IMBALANCE: minority class < 10% in training set. Use class weights or sampling.')
except Exception:
    pass

if len(diag) == 0:
    diag.append('No smoking-gun leakage detected by these automated checks. Next steps: check manual code cells for uses of validation/test in preprocessing or Chroma indexing. Also try class-weighted training and lower LR.')

print('\n=== CONCISE DIAGNOSIS ===')
for d in diag:
    print('-', d)

print('\nDiagnostics completed.')

In [None]:
# Follow-up evidence cell: print sizes, sample overlaps and prediction lengths to pinpoint leakage source
import hashlib
from itertools import islice

print('--- Basic sizes and types ---')
names = ['train_df','val_df','test_df','train_split','val_split','train_split_indices','val_split_indices','train_texts','val_texts','train_labels','val_labels','roberta_predictions','roberta_confidences','tokenized_datasets']
for n in names:
    if n in globals():
        v = globals()[n]
        try:
            l = len(v)
        except Exception:
            l = type(v)
        print(f"{n}: type={type(v)}, len={l}")
    else:
        print(f"{n}: MISSING")

# If there are explicit index lists/sets, check intersections
try:
    if 'train_split_indices' in globals() and 'val_split_indices' in globals():
        s1 = set(train_split_indices)
        s2 = set(val_split_indices)
        inter = s1 & s2
        print('train_split_indices & val_split_indices intersection count:', len(inter))
        if len(inter) > 0:
            print('sample overlapping indices (up to 10):', list(islice(inter, 10)))
except Exception as e:
    print('Could not check index intersections:', e)

# If text arrays/lists exist, show sample overlapping texts
try:
    if 'train_texts' in globals() and 'val_texts' in globals():
        h1 = {hashlib.md5(t.strip().encode('utf-8')).hexdigest():t for t in train_texts if t}
        h2 = {hashlib.md5(t.strip().encode('utf-8')).hexdigest():t for t in val_texts if t}
        common = set(h1.keys()) & set(h2.keys())
        print('text-hash overlap count between train_texts and val_texts:', len(common))
        if len(common)>0:
            print('Examples of overlapping texts (up to 5):')
            for hh in list(common)[:5]:
                print('-', h1[hh][:200].replace('\n',' '))
except Exception as e:
    print('Could not compute overlapping texts sample:', e)

# Predictions vs validation labels
try:
    if 'roberta_predictions' in globals():
        print('len(roberta_predictions)=', len(roberta_predictions))
    if 'val_labels' in globals():
        print('len(val_labels)=', len(val_labels))
    if 'roberta_confidences' in globals():
        import numpy as np
        a = np.array(roberta_confidences)
        print('roberta_confidences shape:', a.shape)
except Exception as e:
    print('Error checking predictions lengths:', e)

# Print precomputed overlap variables
for v in ['train_val_overlap','train_test_overlap','val_test_overlap','train_test_overlap']:
    if v in globals():
        print(f'{v} =', globals()[v])

print('\nFollow-up check complete. If any overlaps > 0 above, you have data-split leakage and should rebuild splits to ensure uniqueness.')

In [None]:
# Rebuild balanced splits robustly and retrain (safer fallback behavior)
import random
import re
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
import torch
import torch.nn as nn
from transformers import TrainingArguments, Trainer
from sklearn.metrics import classification_report, confusion_matrix

print('Start: rebuild balanced dataset and retrain (robust)')

# 1) locate source DataFrame: the first non-empty candidate found in the kernel wins
df_candidates = ['final_df','combined_df','merged_df','reviews_clean','train_df']
src_df = None
for c in df_candidates:
    v = globals().get(c)
    if isinstance(v, pd.DataFrame) and len(v) > 0:
        src_df = v.copy()  # work on a copy so the kernel's DataFrame is not mutated
        src_name = c
        break
if src_df is None:
    raise RuntimeError('No suitable source DataFrame found among: ' + ','.join(df_candidates))
print('Using', src_name, 'with', len(src_df), 'rows')

# 2) infer text column and label column, prefer explicit names
text_col = None
label_col = None
if 'review_text' in src_df.columns:
    text_col = 'review_text'
else:
    text_cols = [c for c in src_df.columns if 'review' in c.lower() or 'text' in c.lower() or 'body' in c.lower()]
    text_col = text_cols[0] if text_cols else None

if 'is_fake' in src_df.columns:
    label_col = 'is_fake'
else:
    # Fall back to a conventional label name, or any column whose non-null values are all 0/1.
    label_cols = [c for c in src_df.columns if c.lower() in ('label','is_fake','fake','target','y') or set(src_df[c].dropna().unique()).issubset({0,1})]
    label_col = label_cols[0] if label_cols else None

if text_col is None or label_col is None:
    raise RuntimeError(f'Could not infer text or label columns. text_col={text_col}, label_col={label_col}')

print('Inferred text col =', text_col, 'label col =', label_col)

# Normalize labels to 0/1 where possible: string labels mentioning a fake-like keyword become 1, all others 0
if src_df[label_col].dtype == object:
    src_df[label_col] = src_df[label_col].astype(str).map(lambda s: 1 if re.search('fake|fraud|synthetic|bot', s, re.I) else 0)

# 3) drop rows without usable text, then deduplicate by text to avoid leakage.
# Missing/blank texts would otherwise collapse into one row during dedup and
# later be handed to the tokenizer as a non-string value.
src_df = src_df[src_df[text_col].notna()].copy()
src_df[text_col] = src_df[text_col].astype(str)
src_df = src_df[src_df[text_col].str.strip() != ''].copy()
print('After dropping missing/empty texts:', len(src_df))

import hashlib
src_df['text_hash'] = src_df[text_col].map(lambda s: hashlib.md5(s.strip().encode('utf-8')).hexdigest())
src_df = src_df.drop_duplicates(subset=['text_hash']).reset_index(drop=True)
print('After dedup by text_hash:', len(src_df))

# 4) ensure there are both classes — if not, synthesize fake examples by augmenting real texts
counts = src_df[label_col].value_counts()
print('Counts before synthetic balancing:\n', counts)
if len(counts) < 2 or counts.min() == 0:
    # Synthesize fake examples if missing
    print('Warning: only one class present or minority class missing; synthesizing fake examples from real texts')
    real_df = src_df[src_df[label_col]==0].copy()
    if real_df.empty:
        raise RuntimeError('No real examples to synthesize from')
    needed = max(1, int(0.1 * len(real_df)))
    # Dedicated, seeded RNG so the synthetic texts are reproducible like the
    # seeded sampling/splitting below, without touching the global `random` state.
    rng = random.Random(42)

    def augment_text_simple_local(s):
        """Return a lightly perturbed copy of `s` (sentence shuffle, word swap, or appended stock phrase)."""
        if not isinstance(s, str) or len(s.strip())==0:
            return s
        sentences = re.split(r'(?<=[.!?]) +', s)
        if len(sentences) > 1 and rng.random() < 0.5:
            rng.shuffle(sentences)
            return ' '.join(sentences)
        words = s.split()
        if len(words) > 6 and rng.random() < 0.5:
            i, j = rng.sample(range(len(words)), 2)
            words[i], words[j] = words[j], words[i]
            return ' '.join(words)
        return s + ' ' + rng.choice(['Great product.', 'Would buy again.', 'Works as expected.'])

    synth_texts = [augment_text_simple_local(t) for t in real_df[text_col].sample(n=needed, replace=True, random_state=42).tolist()]
    synth_df = pd.DataFrame({text_col: synth_texts, label_col: [1]*len(synth_texts)})
    src_df = pd.concat([src_df, synth_df], ignore_index=True)
    print('After synthesis, class counts:', src_df[label_col].value_counts().to_dict())

# 5) build pool and split into train/val (80/20) with stratify
pool_df = src_df[[text_col, label_col]].rename(columns={text_col:'text', label_col:'label'}).sample(frac=1, random_state=42).reset_index(drop=True)
print('Pool size:', len(pool_df), 'class counts:', pool_df['label'].value_counts().to_dict())
from sklearn.model_selection import train_test_split
train_pool, val_pool = train_test_split(pool_df, test_size=0.2, stratify=pool_df['label'], random_state=42)
print('Train pool size:', len(train_pool), 'Val pool size:', len(val_pool))

# 6) Turn both pools into HuggingFace Datasets and tokenize them
if 'tokenizer' not in globals():
    raise RuntimeError('No tokenizer found in notebook environment. Please load tokenizer before running this cell.')


def _encode_batch(batch):
    """Tokenize a batch of texts, truncated and padded to 256 tokens."""
    return tokenizer(batch['text'], truncation=True, padding='max_length', max_length=256)


# The pools already use the column names 'text' and 'label'.
hf_train = Dataset.from_pandas(train_pool).map(_encode_batch, batched=True)
hf_val = Dataset.from_pandas(val_pool).map(_encode_batch, batched=True)

# Expose only the tensors the trainer consumes.
_model_columns = ['input_ids', 'attention_mask', 'label']
hf_train.set_format(type='torch', columns=_model_columns)
hf_val.set_format(type='torch', columns=_model_columns)

# 7) class weights derived from the training labels ('balanced' heuristic)
y_train = np.array(hf_train['label'])
classes = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=classes, y=y_train)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)
print('Class weights:', {cls: w for cls, w in zip(classes.tolist(), class_weights.tolist())})

# 8) Weighted Trainer (compat accepts extra kwargs and handles labels under 'labels' or 'label')
class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the kernel-level `class_weights_tensor`."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: model called with `input_ids` and `attention_mask` only.
            inputs: batch mapping; labels are read from 'labels' or, failing
                that, from 'label'.
            return_outputs: when True, return `(loss, outputs)`.
            **kwargs: extra arguments passed by the caller; accepted and ignored.

        Returns:
            The loss tensor, or `(loss, outputs)` when `return_outputs` is True.
        """
        labels = inputs.get('labels') if 'labels' in inputs else inputs.get('label')
        outputs = model(input_ids=inputs.get('input_ids'), attention_mask=inputs.get('attention_mask'))
        logits = outputs.logits
        if labels is None:
            # Fallback to a model-provided loss. The model is called without
            # labels above, so `outputs.loss` may exist but be None; in that
            # case getattr's default would never apply, hence the explicit check.
            loss = getattr(outputs, 'loss', None)
            if loss is None:
                loss = torch.tensor(0.0, device=logits.device)
        else:
            loss_fct = nn.CrossEntropyLoss(weight=class_weights_tensor.to(logits.device))
            loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# 9) training arguments: only basic options, for a short one-epoch check run
_balanced_check_settings = {
    'output_dir': './roberta_balanced_check',
    'learning_rate': 1e-5,
    'per_device_train_batch_size': 8,
    'per_device_eval_batch_size': 32,
    'num_train_epochs': 1,
    'weight_decay': 0.01,
    'logging_steps': 20,
}
training_args = TrainingArguments(**_balanced_check_settings)

from sklearn.metrics import f1_score, precision_score, recall_score

def compute_metrics_small(eval_pred):
    """Return binary F1, precision and recall for a `(logits, labels)` pair.

    Predictions are the argmax over the last axis of `logits`. All three
    metrics use `zero_division=0`, so an evaluation with no positive
    predictions yields 0 for each of them without emitting a warning.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        'f1': f1_score(labels, preds, average='binary', zero_division=0),
        'precision': precision_score(labels, preds, zero_division=0),
        'recall': recall_score(labels, preds, zero_division=0),
    }

# 10) reuse the kernel's `model` if there is one, otherwise load it from `model_dir`
if 'model' not in globals():
    if 'model_dir' not in globals():
        raise RuntimeError('No model or model_dir found in the notebook environment.')
    from transformers import RobertaForSequenceClassification
    model = RobertaForSequenceClassification.from_pretrained(model_dir, num_labels=2)

_trainer_kwargs = {
    'model': model,
    'args': training_args,
    'train_dataset': hf_train,
    'eval_dataset': hf_val,
    'compute_metrics': compute_metrics_small,
}
trainer = WeightedTrainer(**_trainer_kwargs)

print('Starting short retraining (1 epoch) for balanced check...')
train_result = trainer.train()
print('Training completed. Metrics:')
print(train_result.metrics)

# Evaluate on the validation split: argmax over the class axis gives the predicted label
eval_out = trainer.predict(hf_val)
logits, labels = eval_out.predictions, eval_out.label_ids
preds = np.argmax(logits, axis=1)
_val_confusion = confusion_matrix(labels, preds)
print('Confusion matrix:\n', _val_confusion)
_val_report = classification_report(labels, preds, digits=4)
print('Classification report:\n', _val_report)

# Persist both pools next to the notebook
for _pool, _csv_path in ((train_pool, './balanced_train_pool.csv'), (val_pool, './balanced_val_pool.csv')):
    _pool.to_csv(_csv_path, index=False)
print('Saved balanced_train_pool.csv and balanced_val_pool.csv')

print('Retrain cell finished (robust path)')


In [None]:
# Quick inspect cell: show columns, sample rows, and sizes of fake/real lists to choose correct text/label columns
import pandas as pd
import inspect

# DataFrames that may exist in the kernel; each one found is summarised below.
candidates = ['combined_df','merged_df','final_df','reviews_clean','train_df']
_label_like_names = ('label','is_fake','fake','target','y')
for name in candidates:
    if name not in globals():
        continue
    df = globals()[name]
    if not isinstance(df, pd.DataFrame):
        continue
    print('---', name, '---')
    print('shape:', df.shape)
    print('columns:', df.columns.tolist())
    # preview: the first five object-dtype columns, with 'label' put first when it exists
    obj_cols = [c for c in df.columns if df[c].dtype == object]
    sample_cols = obj_cols[:5]
    if 'label' in df.columns:
        sample_cols = list(dict.fromkeys(['label'] + sample_cols))
    print('showing first 5 rows for cols:', sample_cols)
    display(df[sample_cols].head(5))
    # a column counts as a likely label if its name is conventional or its non-null values are all 0/1
    possible_label_cols = [
        c for c in df.columns
        if c.lower() in _label_like_names or set(df[c].dropna().unique()).issubset({0,1})
    ]
    for lc in possible_label_cols:
        print('value counts for', lc, '\n', df[lc].value_counts(dropna=False).to_dict())

# in-memory example lists, if the kernel has any of them
for list_name in ['fake_examples','real_examples','fake_examples_list','debug_fake','debug_real']:
    if list_name not in globals():
        continue
    v = globals()[list_name]
    try:
        print(list_name, 'len =', len(v))
        print('sample (up to 3):', v[:3])
    except Exception as e:
        print('Could not print', list_name, e)

# tokenizer summary, if one is loaded
if 'tokenizer' in globals():
    try:
        print('Tokenizer type:', type(tokenizer))
        if hasattr(tokenizer, 'vocab_size'):
            print('vocab_size:', tokenizer.vocab_size)
    except Exception as e:
        print('Could not inspect tokenizer:', e)

# model class and trainer arguments, when present
if 'model' in globals():
    print('Model present:', model.__class__)
if 'trainer' in globals():
    print('Trainer present with args:', getattr(trainer, 'args', None))

print('\nQuick inspect complete. Use these outputs to pick correct text/label columns for rebuilding.')

In [None]:
# Rebuild splits explicitly using `final_df` (review_text, is_fake), create 50/50 training set, 80/20 realistic validation, tokenize, and retrain
import random
import re
import hashlib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import torch
import torch.nn as nn
from transformers import TrainingArguments, Trainer

print('Starting explicit rebuild + retrain using final_df')
# 1) load final_df (work on a copy so the kernel's DataFrame is left untouched)
if 'final_df' not in globals():
    raise RuntimeError('`final_df` not found in notebook namespace. It must contain `review_text` and `is_fake`.')
fd = final_df.copy()
print('final_df shape:', fd.shape)
if 'review_text' not in fd.columns or 'is_fake' not in fd.columns:
    raise RuntimeError('final_df must contain columns `review_text` and `is_fake`')

# 2) basic cleaning: drop NaN/blank texts.
# fillna must run BEFORE astype(str): casting first turns NaN into the literal
# string 'nan', which would then survive the empty-text filter below.
fd['review_text'] = fd['review_text'].fillna('').astype(str).str.strip()
fd = fd[fd['review_text'].str.len() > 0].reset_index(drop=True)
print('after dropping empty texts:', len(fd))

# 3) validation set: about 20% of the data, aiming for 80% real / 20% fake
total = len(fd)
val_size = int(0.2 * total)
print('target val size:', val_size)
real_pool = fd.loc[fd['is_fake'] == 0].copy()
fake_pool = fd.loc[fd['is_fake'] == 1].copy()
print('real_pool:', len(real_pool), 'fake_pool:', len(fake_pool))

# per-class targets; each sample is capped by what its pool can supply
val_real_target = int(round(val_size * 0.8))
val_fake_target = val_size - val_real_target
n_val_real = min(val_real_target, len(real_pool))
n_val_fake = min(val_fake_target, len(fake_pool))
val_real = real_pool.sample(n=n_val_real, random_state=42)
val_fake = fake_pool.sample(n=n_val_fake, random_state=42)
# When the fake pool cannot supply its target, the real sample is cut by the
# same number of rows (provided it has more rows than that), so the resulting
# validation set is smaller than the target.
# NOTE(review): this does not hold the size or the 80/20 ratio fixed — confirm the intent.
if len(val_fake) < val_fake_target:
    shortage = val_fake_target - len(val_fake)
    print('Warning: not enough fake examples for desired val composition; shortage:', shortage)
    if len(val_real) > shortage:
        val_real = val_real.sample(n=(len(val_real) - shortage), random_state=42)

val_df = pd.concat([val_real, val_fake]).sample(frac=1, random_state=42).reset_index(drop=True)
print('Actual val composition:', val_df['is_fake'].value_counts().to_dict())


def _md5_hex(text):
    """Hex MD5 digest of `text` encoded as UTF-8."""
    return hashlib.md5(text.encode('utf-8')).hexdigest()


# 4) everything not in validation becomes the training pool
# val_df was re-indexed above, so these are positions within val_df rather than
# rows of fd; the set is not used for the removal below.
val_idx = set(val_df.index)
# Rows are excluded by text hash, so every copy of a validation text leaves the pool.
val_hashes = set(val_df['review_text'].map(_md5_hex))
fd['text_hash'] = fd['review_text'].map(_md5_hex)
_in_validation = fd['text_hash'].isin(val_hashes)
train_pool_df = fd[~_in_validation].copy().reset_index(drop=True)
print('Train pool after removing val hashes:', len(train_pool_df), 'class counts:', train_pool_df['is_fake'].value_counts().to_dict())

# 5) create balanced training set 50/50 by oversampling minority with light augmentation
real_train = train_pool_df.loc[train_pool_df['is_fake'] == 0].copy()
fake_train = train_pool_df.loc[train_pool_df['is_fake'] == 1].copy()
print('Available for train - real:', len(real_train), 'fake:', len(fake_train))

def augment_text_simple(s):
    """Return a lightly perturbed variant of the review text `s`.

    Non-string or blank input is returned unchanged. Otherwise, in order:
    a multi-sentence text has its sentences shuffled with probability 0.4;
    failing that, a text of more than six words has two randomly chosen
    word positions swapped with probability 0.4; failing that, a single
    trailing space is appended. Randomness comes from the global `random`
    module.
    """
    if not isinstance(s, str) or not s.strip():
        return s

    # Sentence-level perturbation: split after ., ! or ? followed by spaces.
    parts = re.split(r'(?<=[.!?]) +', s)
    if len(parts) > 1 and random.random() < 0.4:
        random.shuffle(parts)
        return ' '.join(parts)

    # Word-level perturbation: swap two randomly chosen positions.
    tokens = s.split()
    if len(tokens) > 6 and random.random() < 0.4:
        first, second = random.sample(range(len(tokens)), 2)
        tokens[first], tokens[second] = tokens[second], tokens[first]
        return ' '.join(tokens)

    # Neither perturbation applied: append one space so the copy is not byte-identical.
    return s + ' '

# `target` is the size of the larger class; below it only guards against an empty pool.
target = max(len(real_train), len(fake_train))
if target == 0:
    raise RuntimeError('No data available to form training set after removing validation.')
if len(real_train) == 0:
    # Previously this case left `real_balanced`/`fake_balanced` undefined
    # (NameError, or silently reusing stale kernel variables).
    raise RuntimeError('No real examples available for training after removing validation; cannot build a balanced training set.')

# Seed the global RNG used by augment_text_simple so the augmented copies are
# reproducible, in line with the random_state=42 sampling used throughout.
random.seed(42)


def _oversample_with_augmentation(frame, needed):
    """Return `needed` rows drawn with replacement from `frame`, each with an augmented `review_text`.

    Augmenting the copies reduces the number of byte-identical duplicates.
    """
    extra = frame.sample(n=needed, replace=True, random_state=42).copy()
    extra['review_text'] = extra['review_text'].map(augment_text_simple)
    return extra


# build balanced sets: start from the raw pools and top up whichever class is smaller
real_balanced = real_train
fake_balanced = fake_train
if len(real_train) > len(fake_train):
    needed = len(real_train) - len(fake_train)
    if len(fake_train) > 0:
        # oversample fake to match real
        fake_balanced = pd.concat([fake_train, _oversample_with_augmentation(fake_train, needed)], ignore_index=True)
    else:
        # no fake rows at all: synthesize fake examples from real_train by simple augment
        synth_texts = [augment_text_simple(t) for t in real_train['review_text'].sample(n=len(real_train), replace=True, random_state=42).tolist()]
        fake_balanced = pd.DataFrame({'review_text': synth_texts, 'is_fake': 1})
elif len(fake_train) > len(real_train):
    # oversample real to match fake (real_train is non-empty thanks to the guard above)
    needed = len(fake_train) - len(real_train)
    real_balanced = pd.concat([real_train, _oversample_with_augmentation(real_train, needed)], ignore_index=True)

train_df = pd.concat([real_balanced[['review_text','is_fake']], fake_balanced[['review_text','is_fake']]], ignore_index=True).sample(frac=1, random_state=42).reset_index(drop=True)
print('Final train counts:', train_df['is_fake'].value_counts().to_dict())

# 6) Tokenize using existing tokenizer
if 'tokenizer' not in globals():
    raise RuntimeError('tokenizer not found in notebook namespace; please load tokenizer before running this cell')

from datasets import Dataset


def tok_batch(examples):
    """Tokenize a batch of texts, truncated and padded to 256 tokens."""
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=256)


# Only the text and label columns are handed to the datasets. `val_df` is built
# from copies of `final_df` and still carries all of its other columns, which
# would otherwise be converted along with the two columns that are needed.
_split_columns = ['review_text', 'is_fake']
_hf_column_names = {'review_text': 'text', 'is_fake': 'label'}
hf_train = Dataset.from_pandas(train_df[_split_columns].rename(columns=_hf_column_names))
hf_val = Dataset.from_pandas(val_df[_split_columns].rename(columns=_hf_column_names))

hf_train = hf_train.map(tok_batch, batched=True)
hf_val = hf_val.map(tok_batch, batched=True)

hf_train.set_format(type='torch', columns=['input_ids','attention_mask','label'])
hf_val.set_format(type='torch', columns=['input_ids','attention_mask','label'])

# 7) compute class weights (based on training distribution)
y_train = np.array(hf_train['label'])
classes = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=classes, y=y_train)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)
print('Computed class weights:', dict(zip(classes.tolist(), class_weights.tolist())))

# 8) Weighted Trainer subclass
class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the kernel-level `class_weights_tensor`."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: model called with `input_ids` and `attention_mask` only.
            inputs: batch mapping; labels are read from 'labels' or, failing
                that, from 'label', since the batch may carry them under
                either key.
            return_outputs: when True, return `(loss, outputs)`.
            **kwargs: extra arguments passed by the caller; accepted and
                ignored so the override also works with Trainer versions that
                pass additional keyword arguments.

        Returns:
            The loss tensor, or `(loss, outputs)` when `return_outputs` is True.

        Raises:
            ValueError: if the batch has no labels under either key.
        """
        labels = inputs.get('labels') if 'labels' in inputs else inputs.get('label')
        if labels is None:
            raise ValueError("Batch contains neither 'labels' nor 'label'; cannot compute the weighted loss.")
        outputs = model(input_ids=inputs.get('input_ids'), attention_mask=inputs.get('attention_mask'))
        logits = outputs.logits
        loss_fct = nn.CrossEntropyLoss(weight=class_weights_tensor.to(logits.device))
        loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# 9) Prepare model and training args
# Reuse the kernel's `model` if present; otherwise load it from `model_dir`.
if 'model' not in globals():
    if 'model_dir' in globals():
        from transformers import RobertaForSequenceClassification
        model = RobertaForSequenceClassification.from_pretrained(model_dir, num_labels=2)
    else:
        raise RuntimeError('No model or model_dir present in environment')

# The per-epoch evaluation option is called `evaluation_strategy` in older
# transformers releases and `eval_strategy` in newer ones. Pick whichever name
# the installed TrainingArguments accepts instead of failing on an unknown kwarg.
import inspect
_supported_args = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = 'eval_strategy' if 'eval_strategy' in _supported_args else 'evaluation_strategy'

training_args = TrainingArguments(
    output_dir='./roberta_balanced_finetune',
    save_strategy='no',
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=32,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_steps=50,
    load_best_model_at_end=False,
    **{_eval_strategy_key: 'epoch'},
)

# 10) compute_metrics
from sklearn.metrics import precision_score, recall_score

def compute_metrics(eval_pred):
    """Return binary F1, precision and recall for a `(logits, labels)` pair.

    Predictions are the argmax over the last axis of `logits`. All three
    metrics use `zero_division=0`, so an evaluation with no positive
    predictions yields 0 for each of them without emitting a warning.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        'f1': f1_score(labels, preds, average='binary', zero_division=0),
        'precision': precision_score(labels, preds, zero_division=0),
        'recall': recall_score(labels, preds, zero_division=0)
    }

# 11) build the weighted trainer from the kernel objects and run training
_trainer_kwargs = {
    'model': model,
    'args': training_args,
    'train_dataset': hf_train,
    'eval_dataset': hf_val,
    'compute_metrics': compute_metrics,
}
trainer = WeightedTrainer(**_trainer_kwargs)

print('Begin training...')
train_result = trainer.train()
print('Training finished, metrics:', train_result.metrics)

# 12) predict on the validation set and print confusion matrix + per-class report
pred_out = trainer.predict(hf_val)
logits, labels = pred_out.predictions, pred_out.label_ids
preds = np.argmax(logits, axis=1)
_val_confusion = confusion_matrix(labels, preds)
print('Validation confusion matrix:\n', _val_confusion)
_val_report = classification_report(labels, preds, digits=4)
print('Validation classification report:\n', _val_report)

# 13) expose the rebuilt splits under stable names for later cells
final_train_df, final_val_df = train_df, val_df

print('Retrain cell complete')

In [None]:
# Compatibility cell: create TrainingArguments without unsupported kwargs and run training (uses hf_train, hf_val, model, class_weights_tensor from kernel)
from transformers import TrainingArguments
print('Creating simpler TrainingArguments for compatibility...')
# Basic options only: no evaluation/save strategy and no best-model reloading.
_compat_settings = {
    'output_dir': './roberta_balanced_finetune',
    'learning_rate': 1e-5,
    'per_device_train_batch_size': 8,
    'per_device_eval_batch_size': 32,
    'num_train_epochs': 4,
    'weight_decay': 0.01,
    'logging_steps': 50,
}
training_args = TrainingArguments(**_compat_settings)

# Reuse compute_metrics from the kernel when it exists; otherwise define it here.
if 'compute_metrics' not in globals():
    from sklearn.metrics import f1_score, precision_score, recall_score

    def compute_metrics(eval_pred):
        """Binary F1, precision and recall from a `(logits, labels)` pair."""
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=-1)
        scores = {}
        scores['f1'] = f1_score(labels, preds, average='binary')
        scores['precision'] = precision_score(labels, preds, zero_division=0)
        scores['recall'] = recall_score(labels, preds, zero_division=0)
        return scores

# Build the trainer from whatever WeightedTrainer class is defined in the kernel.
_trainer_kwargs = {
    'model': model,
    'args': training_args,
    'train_dataset': hf_train,
    'eval_dataset': hf_val,
    'compute_metrics': compute_metrics,
}
trainer = WeightedTrainer(**_trainer_kwargs)
print('Starting training (compatibility run)...')
train_result = trainer.train()
print('Training done. Metrics:', train_result.metrics)

# Validation predictions -> confusion matrix and classification report
pred_out = trainer.predict(hf_val)
logits, labels = pred_out.predictions, pred_out.label_ids
preds = np.argmax(logits, axis=1)
_val_confusion = confusion_matrix(labels, preds)
print('Validation confusion matrix:\n', _val_confusion)
_val_report = classification_report(labels, preds, digits=4)
print('Validation classification report:\n', _val_report)

In [None]:
# Fix: define a Trainer compute_loss compatible with extra kwargs passed by Trainer (accept **kwargs)
class WeightedTrainerCompat(Trainer):
    """Trainer with class-weighted cross-entropy whose `compute_loss` tolerates extra keyword arguments."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: model called with `input_ids` and `attention_mask` only.
            inputs: batch mapping; labels are read from 'labels' or, failing
                that, from 'label', since the batch may carry them under
                either key.
            return_outputs: when True, return `(loss, outputs)`.
            **kwargs: extra arguments passed by the caller; accepted and ignored.

        Returns:
            The loss tensor, or `(loss, outputs)` when `return_outputs` is True.

        Raises:
            ValueError: if the batch has no labels under either key.
        """
        labels = inputs.get('labels') if 'labels' in inputs else inputs.get('label')
        if labels is None:
            raise ValueError("Batch contains neither 'labels' nor 'label'; cannot compute the weighted loss.")
        outputs = model(input_ids=inputs.get('input_ids'), attention_mask=inputs.get('attention_mask'))
        logits = outputs.logits
        # Weights come from the kernel-level `class_weights_tensor`, moved to the logits' device.
        loss_fct = nn.CrossEntropyLoss(weight=class_weights_tensor.to(logits.device))
        loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Train with the kwargs-tolerant trainer, reusing model, args and datasets from the kernel
_trainer_kwargs = {
    'model': model,
    'args': training_args,
    'train_dataset': hf_train,
    'eval_dataset': hf_val,
    'compute_metrics': compute_metrics,
}
trainer = WeightedTrainerCompat(**_trainer_kwargs)
print('Starting compatible training run...')
train_result = trainer.train()
print('Training finished, metrics:', train_result.metrics)

# Validation predictions -> confusion matrix and classification report
pred_out = trainer.predict(hf_val)
logits, labels = pred_out.predictions, pred_out.label_ids
preds = np.argmax(logits, axis=1)
_val_confusion = confusion_matrix(labels, preds)
print('Validation confusion matrix:\n', _val_confusion)
_val_report = classification_report(labels, preds, digits=4)
print('Validation classification report:\n', _val_report)

In [None]:
# Fix 2: robust compute_loss that accepts either 'labels' or 'label' and handles None
class WeightedTrainerCompat2(Trainer):
    """Trainer with class-weighted cross-entropy that accepts labels under 'labels' or 'label'."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: model called with `input_ids` and `attention_mask` only.
            inputs: batch mapping; labels are read from 'labels' or, failing
                that, from 'label'.
            return_outputs: when True, return `(loss, outputs)`.
            **kwargs: extra arguments passed by the caller; accepted and ignored.

        Returns:
            The loss tensor, or `(loss, outputs)` when `return_outputs` is True.

        Raises:
            ValueError: if the batch has no labels and the model output
                carries no loss of its own.
        """
        # get labels under either key
        labels = inputs.get('labels') if 'labels' in inputs else inputs.get('label')
        outputs = model(input_ids=inputs.get('input_ids'), attention_mask=inputs.get('attention_mask'))
        if labels is None:
            # Fall back to the model's own loss. The model was called without
            # labels, so that loss may well be unset; fail with a clear message
            # rather than handing None back to the training loop.
            loss = getattr(outputs, 'loss', None)
            if loss is None:
                raise ValueError("Batch contains neither 'labels' nor 'label' and the model returned no loss.")
        else:
            logits = outputs.logits
            loss_fct = nn.CrossEntropyLoss(weight=class_weights_tensor.to(logits.device))
            loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))
        # Honour `return_outputs` on every path (the fallback used to return a bare loss).
        return (loss, outputs) if return_outputs else loss

# Train with the label-key-tolerant trainer, reusing model, args and datasets from the kernel
_trainer_kwargs = {
    'model': model,
    'args': training_args,
    'train_dataset': hf_train,
    'eval_dataset': hf_val,
    'compute_metrics': compute_metrics,
}
trainer = WeightedTrainerCompat2(**_trainer_kwargs)
print('Starting robust compatible training run...')
train_result = trainer.train()
print('Training finished, metrics:', train_result.metrics)

# Validation predictions -> confusion matrix and classification report
pred_out = trainer.predict(hf_val)
logits, labels = pred_out.predictions, pred_out.label_ids
preds = np.argmax(logits, axis=1)
_val_confusion = confusion_matrix(labels, preds)
print('Validation confusion matrix:\n', _val_confusion)
_val_report = classification_report(labels, preds, digits=4)
print('Validation classification report:\n', _val_report)