In [None]:
# Configuration Cell - Add this at the top of each notebook
#
# Detects the runtime environment (Kaggle, Colab, or local), picks the matching
# input/working roots, and creates the standard output directories under the
# working root. All path constants defined here are plain strings.
import os
import sys
from pathlib import Path

# Detect environment
# Treated as Kaggle when /kaggle/input exists, and as Colab when the
# google.colab module is already loaded in this kernel.
IS_KAGGLE = os.path.exists('/kaggle/input')
IS_COLAB = 'google.colab' in sys.modules

# Set base directories based on environment
if IS_KAGGLE:
    INPUT_ROOT = "/kaggle/input"
    WORK_DIR = "/kaggle/working"
elif IS_COLAB:
    INPUT_ROOT = "/content/input"
    WORK_DIR = "/content/working"
else:
    # Local environment: keep everything under the current working directory.
    # Converted to str so INPUT_ROOT / WORK_DIR have the same type in every
    # branch (the Kaggle and Colab branches already use str).
    INPUT_ROOT = str(Path.cwd() / "input")
    WORK_DIR = str(Path.cwd() / "working")

# Create standard directories
OUT_DIR = os.path.join(WORK_DIR, "data")
EXPERIMENTS_DIR = os.path.join(WORK_DIR, "experiments")
SCRIPTS_DIR = os.path.join(WORK_DIR, "scripts")

# Create all directories (parents included; no error if they already exist)
for directory in [OUT_DIR, EXPERIMENTS_DIR, SCRIPTS_DIR]:
    Path(directory).mkdir(parents=True, exist_ok=True)

print(f"Environment: {'Kaggle' if IS_KAGGLE else 'Colab' if IS_COLAB else 'Local'}")
print(f"Input directory: {INPUT_ROOT}")
print(f"Working directory: {WORK_DIR}")
print(f"Data directory: {OUT_DIR}")
print(f"Experiments directory: {EXPERIMENTS_DIR}")

In [None]:
# Master Experiment Runner for CS483 BiasBreakers Project
# This notebook orchestrates all experiments and generates results for the final report

import os
import subprocess
from pathlib import Path

# Configuration
# Reuse the working root selected by the configuration cell (WORK_DIR) so the
# runner also works off Kaggle. If that cell has not been run in this kernel,
# fall back to the Kaggle layout that was previously hard-coded here.
try:
    _WORK_ROOT = str(WORK_DIR)
except NameError:
    _WORK_ROOT = "/kaggle/working"

DATA_DIR = os.path.join(_WORK_ROOT, "data")
EXPERIMENTS_DIR = os.path.join(_WORK_ROOT, "experiments")
SCRIPTS_DIR = os.path.join(_WORK_ROOT, "scripts")

# Create directories
# parents=True so a missing working root does not raise FileNotFoundError.
for _dir in (DATA_DIR, EXPERIMENTS_DIR):
    Path(_dir).mkdir(parents=True, exist_ok=True)

print("="*80)
print("BiasBreakers: Master Experiment Runner")
print("="*80)
print(f"\nData directory: {DATA_DIR}")
print(f"Experiments directory: {EXPERIMENTS_DIR}")
print(f"Scripts directory: {SCRIPTS_DIR}")

In [None]:
# STEP 1: Verify data preprocessing is complete
#
# Checks that every expected preprocessed CSV exists under DATA_DIR, prints its
# size, and collects the names of missing files in `missing_files`. Missing
# files only produce a warning; the cell does not raise.
print("\n" + "="*80)
print("STEP 1: Verifying Data Files")
print("="*80 + "\n")

from pathlib import Path

required_files = [
    "jigsaw_train.csv", "jigsaw_val.csv", "jigsaw_test.csv",
    "jigsaw_train_full.csv", "jigsaw_val_full.csv", "jigsaw_test_full.csv",
    "civil_train.csv", "civil_val.csv", "civil_test.csv",
    "civil_train_full.csv", "civil_val_full.csv", "civil_test_full.csv",
    "hatexplain_train.csv", "hatexplain_val.csv", "hatexplain_test.csv",
]

# Status glyphs are written as escapes (check mark, ballot x, warning sign) so
# the source stays ASCII and the symbols cannot be garbled by a re-encoding.
missing_files = []
for fname in required_files:
    path = Path(DATA_DIR) / fname
    if path.exists():
        size_mb = path.stat().st_size / (1024*1024)
        print(f"\u2713 {fname} ({size_mb:.1f} MB)")
    else:
        print(f"\u2717 {fname} - MISSING")
        missing_files.append(fname)

if missing_files:
    print(f"\n\u26a0\ufe0f  WARNING: {len(missing_files)} files missing!")
    print("Please run preprocessing notebooks first:")
    print("  - cs483data.ipynb")
    print("  - civildata.ipynb")
    print("  - hatexplaindata.ipynb")
else:
    print(f"\n\u2713 All {len(required_files)} data files found!")

In [None]:
# STEP 2: Run TF-IDF Baselines
#
# Trains the TF-IDF baseline with Jigsaw as the source dataset and Civil /
# HateXplain as targets. The script is imported from SCRIPTS_DIR when possible;
# if the import fails it is run as a subprocess instead, in which case
# `results_tfidf` is set to None.
print("\n" + "="*80)
print("STEP 2: Training TF-IDF Baselines")
print("="*80 + "\n")

import subprocess
import sys
from pathlib import Path

# Add scripts directory to the import path. Guarded so that re-running this
# cell does not push a duplicate entry onto sys.path each time.
if str(SCRIPTS_DIR) not in sys.path:
    sys.path.insert(0, str(SCRIPTS_DIR))

# Verify script exists before importing
tfidf_script = Path(SCRIPTS_DIR) / "run_tfidf_baselines.py"
if not tfidf_script.exists():
    print(f"\u26a0\ufe0f  ERROR: Script not found: {tfidf_script}")
    print("Please ensure run_tfidf_baselines.py is in the scripts directory")
    raise FileNotFoundError(f"Missing script: {tfidf_script}")

# Import and run TF-IDF baseline
try:
    from run_tfidf_baselines import train_and_evaluate_tfidf
except ImportError as e:
    print(f"\u26a0\ufe0f  Import Error: {e}")
    print("Trying subprocess approach instead...")

    # Fallback: run the script with the kernel's own interpreter.
    tfidf_cmd = [
        sys.executable,
        str(tfidf_script),
        "--source_dataset", "jigsaw",
        "--target_datasets", "civil", "hatexplain",
        "--model", "logreg",
        "--seed", "42",
        "--data_dir", str(DATA_DIR),
        "--save_preds",
    ]
    result = subprocess.run(tfidf_cmd, capture_output=True, text=True)

    print(result.stdout)
    if result.returncode != 0:
        # Surface the failure explicitly; later steps depend on this output.
        print(f"\u26a0\ufe0f  TF-IDF subprocess exited with code {result.returncode}")
        print("STDERR:", result.stderr)

    # No in-process results are available on the subprocess path
    results_tfidf = None
else:
    # Experiment 2.1: Jigsaw -> Civil & HateXplain
    print("Running: TF-IDF Logistic Regression (Jigsaw \u2192 Civil, HateXplain)")
    results_tfidf = train_and_evaluate_tfidf(
        source_dataset="jigsaw",
        target_datasets=["civil", "hatexplain"],
        model_type="logreg",
        seed=42,
        data_dir=DATA_DIR,
        save_preds=True,
    )

    print("\n\u2713 TF-IDF baseline complete!")
    # Assumes the returned mapping exposes ['in_domain_test']['f1'] - defined
    # by run_tfidf_baselines.py, which is not visible from this notebook.
    print(f"In-domain test F1: {results_tfidf['in_domain_test']['f1']:.4f}")

In [None]:
# STEP 3: Run RoBERTa Models
#
# Fine-tunes roberta-base on Jigsaw and evaluates on Civil / HateXplain through
# run_roberta.train_and_evaluate, storing the result in `results_roberta`.
print("\n" + "="*80)
print("STEP 3: Training RoBERTa Models")
print("="*80 + "\n")

import sys

# run_roberta is imported from SCRIPTS_DIR. Make sure that directory is on the
# import path here as well, so this cell does not depend on STEP 2 having run.
if str(SCRIPTS_DIR) not in sys.path:
    sys.path.insert(0, str(SCRIPTS_DIR))

from run_roberta import train_and_evaluate
import torch

# Check GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
if device == "cuda":
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

# Experiment 3.1: Basic RoBERTa with calibration
print("\n--- Experiment 3.1: RoBERTa with Isotonic Calibration ---")
results_roberta = train_and_evaluate(
    source_dataset="jigsaw",
    target_datasets=["civil", "hatexplain"],
    model_name="roberta-base",
    epochs=3,
    batch_size=16,
    lr=2e-5,
    max_len=128,
    seed=42,
    data_dir=DATA_DIR,
    calibration="isotonic",
    early_stop=True,
    patience=2,
    tune_threshold=True,
    save_preds=True,
)

print("\n\u2713 RoBERTa training complete!")
# Assumes the returned mapping exposes ['in_domain']['test']['f1'] - defined by
# run_roberta.py, which is not visible from this notebook.
print(f"In-domain test F1: {results_roberta['in_domain']['test']['f1']:.4f}")

In [None]:
# STEP 4: Compute Fairness Metrics
#
# Joins the saved Jigsaw -> Civil predictions with the full Civil test set on
# "id", runs compute_group_fairness over every column whose name starts with
# "g_", and writes the summary and per-group tables to EXPERIMENTS_DIR.
print("\n" + "="*80)
print("STEP 4: Computing Fairness Metrics")
print("="*80 + "\n")

import sys
from pathlib import Path

# `scripts.fairness_metrics` resolves relative to the parent of SCRIPTS_DIR.
# Appended (not inserted first) so an already-working resolution is unchanged.
_scripts_parent = str(Path(SCRIPTS_DIR).parent)
if _scripts_parent not in sys.path:
    sys.path.append(_scripts_parent)

from scripts.fairness_metrics import compute_group_fairness
import pandas as pd

# Experiment 4.1: Cross-domain fairness (Jigsaw -> Civil)
print("Computing fairness for: Jigsaw \u2192 Civil")

# Load predictions and full data
pred_file = Path(EXPERIMENTS_DIR) / "preds_jigsaw_to_civil.csv"
full_data_file = Path(DATA_DIR) / "civil_test_full.csv"

if pred_file.exists() and full_data_file.exists():
    pred_df = pd.read_csv(pred_file)
    full_df = pd.read_csv(full_data_file)

    # Merge on ID; columns that exist in both frames keep the prediction-side
    # name and the full-data copy gets a "_full" suffix.
    merged_df = pred_df.merge(full_df, on="id", how="inner", suffixes=("", "_full"))
    if "label_full" in merged_df.columns:
        merged_df = merged_df.drop(columns=["label_full"])

    # An inner join silently drops predictions whose id is absent from the
    # full data; report it so the fairness numbers are not misread.
    n_unmatched = len(pred_df) - len(merged_df)
    if n_unmatched > 0:
        print(f"\u26a0\ufe0f  {n_unmatched} predictions had no matching id in {full_data_file.name}")

    # Find group columns
    group_cols = [c for c in merged_df.columns if c.startswith("g_")]

    print(f"Found {len(group_cols)} identity groups")
    print(f"Analyzing {len(merged_df)} predictions")

    if merged_df.empty or not group_cols:
        # Nothing to analyse: skip instead of failing inside the metric code.
        print("\u26a0\ufe0f  No merged rows or no 'g_' group columns; skipping fairness computation.")
    else:
        # Compute fairness
        summary_df, per_group_df = compute_group_fairness(
            merged_df,
            group_cols=group_cols,
            label_col="label",
            pred_col="pred",
        )

        # Save results
        summary_df.to_csv(Path(EXPERIMENTS_DIR) / "fairness_jigsaw_to_civil_summary.csv", index=False)
        per_group_df.to_csv(Path(EXPERIMENTS_DIR) / "fairness_jigsaw_to_civil_per_group.csv", index=False)

        # Print top fairness violations
        # Assumes summary_df has group_col / dp_diff / eop_diff / eo_diff
        # columns, as produced by compute_group_fairness (not visible here).
        print("\nTop 5 groups by Demographic Parity difference:")
        print(summary_df.nlargest(5, "dp_diff")[["group_col", "dp_diff", "eop_diff", "eo_diff"]])

        print("\n\u2713 Fairness analysis complete!")
else:
    print("\u26a0\ufe0f  Required files not found. Ensure predictions and full data exist.")

In [None]:
# STEP 5: Generate Summary Statistics
#
# Reads every summary_*.csv in EXPERIMENTS_DIR into `all_summaries` (keyed by
# the file stem without the "summary_" prefix), prints each table, and writes
# a combined model_comparison.csv with metrics formatted to four decimals.
print("\n" + "="*80)
print("STEP 5: Summary Statistics")
print("="*80 + "\n")

from pathlib import Path

import pandas as pd  # imported here so the cell does not rely on STEP 4

_SUMMARY_PREFIX = "summary_"
# Source column -> header used in the comparison table.
_METRIC_HEADERS = {"accuracy": "Accuracy", "f1": "F1", "auroc": "AUROC", "pr_auc": "PR-AUC"}

# Load all summary CSVs (sorted so the report order is stable across runs)
summary_files = sorted(Path(EXPERIMENTS_DIR).glob(f"{_SUMMARY_PREFIX}*.csv"))
print(f"Found {len(summary_files)} summary files:\n")

all_summaries = {}
for summary_path in summary_files:
    df = pd.read_csv(summary_path)
    # The glob guarantees the prefix; slice it off rather than str.replace,
    # which would also strip later occurrences of "summary_" in the name.
    model_name = summary_path.stem[len(_SUMMARY_PREFIX):]
    all_summaries[model_name] = df

    # Show only the columns that are present, matching the tolerant
    # row.get(...) lookups used for the comparison table below.
    shown_cols = [c for c in ["split", *_METRIC_HEADERS] if c in df.columns]
    print(f"--- {model_name} ---")
    print(df[shown_cols].to_string(index=False))
    print()

# Create comparison table
comparison_rows = []
for model_name, df in all_summaries.items():
    for _, row in df.iterrows():
        record = {"Model": model_name, "Split": row.get("split", "")}
        # A metric column missing from the CSV is reported as 0.0000.
        for metric_col, header in _METRIC_HEADERS.items():
            record[header] = f"{row.get(metric_col, 0):.4f}"
        comparison_rows.append(record)

# Explicit columns keep the CSV header intact even when no summaries exist.
comparison_df = pd.DataFrame(
    comparison_rows,
    columns=["Model", "Split", *_METRIC_HEADERS.values()],
)
comparison_df.to_csv(Path(EXPERIMENTS_DIR) / "model_comparison.csv", index=False)
print("\u2713 Model comparison saved to: model_comparison.csv")

In [None]:
# STEP 6: Quick Visualization Preview
#
# Draws a grouped bar chart of F1 per split for the models listed in
# `models_to_plot` (looked up in `all_summaries`) and saves it to
# EXPERIMENTS_DIR/plots/quick_comparison.png.
print("\n" + "="*80)
print("STEP 6: Quick Visualization Preview")
print("="*80 + "\n")

from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("whitegrid")

# Plot 1: Cross-domain performance comparison
fig, ax = plt.subplots(figsize=(12, 6))

models_to_plot = ["jigsaw", "tfidf_jigsaw_logreg"]
colors = ["steelblue", "coral"]
bar_width = 0.35

# Split names across the available models, in first-seen order. Built up
# front so the axis labels exist even if only one (or no) model is available.
splits = []
for model_name in models_to_plot:
    if model_name in all_summaries:
        for split_name in all_summaries[model_name]["split"]:
            if split_name not in splits:
                splits.append(split_name)
split_slot = {name: pos for pos, name in enumerate(splits)}

n_plotted = 0
for idx, (model_name, color) in enumerate(zip(models_to_plot, colors)):
    if model_name not in all_summaries:
        continue
    df = all_summaries[model_name]
    f1_scores = df["f1"].values

    # Place each bar by split name, not row position, so models whose
    # summaries list splits in a different order still line up.
    x_pos = [split_slot[name] + idx * bar_width for name in df["split"]]

    ax.bar(x_pos, f1_scores, width=bar_width, label=model_name, color=color, alpha=0.7)
    n_plotted += 1

ax.set_xlabel("Dataset Split", fontsize=12)
ax.set_ylabel("F1 Score", fontsize=12)
ax.set_title("Cross-Domain Performance Comparison", fontsize=14, fontweight="bold")
# Centre each tick between the two bars of its group.
ax.set_xticks([slot + bar_width / 2 for slot in range(len(splits))])
ax.set_xticklabels(splits, rotation=45, ha="right")
if n_plotted:
    ax.legend()
else:
    print("\u26a0\ufe0f  None of the expected summaries were found; the plot is empty.")
ax.grid(True, alpha=0.3, axis="y")

plt.tight_layout()
# savefig does not create missing directories, so make sure plots/ exists.
plots_dir = Path(EXPERIMENTS_DIR) / "plots"
plots_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(plots_dir / "quick_comparison.png", dpi=150, bbox_inches="tight")
plt.show()

print("\n\u2713 Quick visualization complete!")
print("For full analysis, run: scripts/analysis_plots.ipynb")

In [None]:
# FINAL: Experiment Summary Report
#
# Prints a fixed checklist of the pipeline stages, counts the artifacts found
# under EXPERIMENTS_DIR, and lists the follow-up steps. The checklist is
# static text: it does not verify that each stage actually succeeded.
print("\n" + "="*80)
print("EXPERIMENT SUMMARY REPORT")
print("="*80 + "\n")

from pathlib import Path

print("\U0001F4CA COMPLETED EXPERIMENTS:")
print("  \u2713 TF-IDF Baseline (Logistic Regression)")
print("  \u2713 RoBERTa with Calibration")
print("  \u2713 Cross-Domain Evaluation (Civil, HateXplain)")
print("  \u2713 Fairness Analysis")
print("  \u2713 Summary Statistics\n")

print("\U0001F4C1 OUTPUT FILES:")
experiments_root = Path(EXPERIMENTS_DIR)
# Count regular files only, recursing into sub-directories, so that plots/
# contributes its files and is not itself counted as a generated file.
output_files = [p for p in experiments_root.rglob("*") if p.is_file()]
print(f"  Total files generated: {len(output_files)}")
for label, pattern in (
    ("Summary CSVs", "summary_*.csv"),
    ("Prediction CSVs", "preds_*.csv"),
    ("Fairness CSVs", "fairness_*.csv"),
):
    print(f"  {label}: {len(list(experiments_root.glob(pattern)))}")

print("\n\U0001F4C8 NEXT STEPS:")
print("  1. Run scripts/analysis_plots.ipynb to generate all visualizations")
print("  2. Review fairness metrics in fairness_*_summary.csv")
print("  3. Copy key plots from experiments/plots/ to your report")
print("  4. Use model_comparison.csv for quantitative results table")

print("\n" + "="*80)
print("All experiments complete! Ready for report generation.")
print("="*80)