In [None]:
import gc
from pathlib import Path

import pandas as pd
import torch

from src.evaluate import Evaluator
from src.final_models import PivotBaseline, ZeroShotNeutralizer, SyntheticNeutralizer

# 1. Load the Gold Test Data
# Ensure you ran 'src/data_collection/create_pseudo_gold.py' first!
test_df = pd.read_csv("data/processed/test_chinese_gold.csv")
inputs = test_df["Chinese_Biased"].tolist()
gold_refs = test_df["Chinese_Neutral_Gold"].tolist()

print(f"Loaded {len(inputs)} test cases.")

# Dictionary to store outputs from each model, keyed by the model's display name.
model_outputs = {}

# Define the models we want to test.
# We keep classes (not instances) so each model can be instantiated -> run -> released
# one at a time instead of holding all of them in memory together.
model_classes = [
    ("Baseline (Pivot)", PivotBaseline),
    ("Model 2 (ZeroShot)", ZeroShotNeutralizer),
    ("Model 3 (Synthetic)", SyntheticNeutralizer),
]

for name, ModelClass in model_classes:
    print(f"\n🚀 Running Inference for: {name}...")

    # Bind the name before `try` so the cleanup in `finally` is valid even when
    # the constructor itself raises.
    model_instance = None
    try:
        # A. Initialize Model
        model_instance = ModelClass()

        # B. Generate: one debias() call per input, preserving input order.
        outputs = [model_instance.debias(text) for text in inputs]
        model_outputs[name] = outputs

    except Exception as e:
        print(f"❌ Failed to run {name}: {e}")
        # Sentinel list stays aligned with `inputs`; the evaluation cell skips it.
        model_outputs[name] = ["Error"] * len(inputs)

    finally:
        # C. Cleanup (CRITICAL for GPU Memory)
        # Runs on success AND failure, so a model that crashed mid-inference does not
        # stay resident while the next one is loaded.
        del model_instance
        # Collect first: objects kept alive only by reference cycles must be freed
        # before the CUDA caching allocator can hand their blocks back.
        gc.collect()
        torch.cuda.empty_cache()

print("\n✅ Inference Complete. Starting Evaluation...")

In [None]:
# 1. Initialize Evaluator (Loads BERT Judge + GPT2)
evaluator = Evaluator()

# One dict per successfully evaluated model; values are pre-formatted strings for display.
results_table = []

for model_name, predictions in model_outputs.items():
    # Skip models with no usable output. The inference cell stores a list made up
    # entirely of "Error" sentinels when a model fails, so require *every* entry to be
    # the sentinel rather than only the first one, and say so instead of dropping the
    # model from the table silently.
    if not predictions or all(pred == "Error" for pred in predictions):
        print(f"\nSkipping {model_name}: no usable predictions (inference failed or was empty).")
        continue

    print(f"\n📊 Evaluating {model_name}...")

    # A. Style Accuracy (Did it remove bias?)
    acc = evaluator.get_style_accuracy(predictions)

    # B. Content Preservation (Did it keep meaning?)
    # Compare Prediction vs Original Biased Input (or Gold Neutral if you prefer).
    # Comparing to Input checks if we kept the *topic*.
    # Comparing to Gold checks if we matched the *reference*.
    # Standard Style Transfer usually compares to INPUT for preservation.
    bert_score = evaluator.get_bert_score(predictions, inputs)

    # C. Fluency (Is it natural Chinese?)
    ppl = evaluator.get_perplexity(predictions)

    # D. Composite Score (Geometric Mean)
    # Per the original author's note the formula is (Acc * BERT * (100/PPL))^(1/3), with
    # the 100/PPL scaling only there to keep the number readable in the table
    # (e.g., 0.45 instead of 0.004).
    # NOTE(review): the computation lives in Evaluator.calculate_composite_score and is
    # not visible from this notebook; confirm the formula there.
    composite = evaluator.calculate_composite_score(acc, bert_score, ppl)

    results_table.append(
        {
            "Model": model_name,
            "Style Acc (↑)": f"{acc:.2%}",
            "Content Sim (↑)": f"{bert_score:.3f}",
            "Fluency PPL (↓)": f"{ppl:.2f}",
            "Composite Score": f"{composite:.3f}",
        }
    )

In [None]:
# ==========================================
# PART 3: FINAL REPORT
# ==========================================
print("\n" + "=" * 40)
print(" FINAL RESEARCH RESULTS ")
print("=" * 40)
df_results = pd.DataFrame(results_table)
display(df_results)

# Save for your paper.
# DataFrame.to_csv does not create missing directories, so make sure `results/`
# exists before writing (a no-op when it is already there).
results_path = Path("results/final_metrics_table.csv")
results_path.parent.mkdir(parents=True, exist_ok=True)
df_results.to_csv(results_path, index=False)

# Optional: Show a few qualitative examples.
# Bound the count by the number of test cases so a test set with fewer than three
# rows does not raise IndexError.
print("\n--- Qualitative Analysis (First 3 Examples) ---")
for i in range(min(3, len(inputs))):
    print(f"\nInput: {inputs[i]}")
    for model_name, preds in model_outputs.items():
        print(f"[{model_name}]: {preds[i]}")