In [None]:
# ==============================================================================
# SECTION 0: SETUP & INSTRUCTIONS
# ==============================================================================
# ---
# ## Project Setup Instructions
#
# 1.  **Dependencies:** Ensure you have installed all required libraries by running `pip install -r requirements.txt` from the project's root directory.
# 2.  **Prerequisites:** This notebook requires:
#     a. The "golden dataset" from Phase 1 (`*.parquet` file in `output/high_purity_golden_datasets/`).
#     b. The trained prompt classifier model from Phase 2 (the `roberta-prompt-classifier` folder in `output/models/`).
# ---

print("--- [0] Installing and Importing Libraries ---")
# The installation is now expected to be done via requirements.txt
# !pip install pandas transformers datasets scikit-learn tqdm torch lightgbm sentence-transformers spacy nltk textstat bert-score radon -q
# !python -m spacy download en_core_web_lg -q

import ast
import glob
import json
import os
import re

import lightgbm as lgb
import nltk
import numpy as np
import pandas as pd
import spacy
import torch
from bert_score import BERTScorer
from bert_score import score as bert_score
from nltk.tokenize import word_tokenize, sent_tokenize
from radon.visitors import ComplexityVisitor
from sentence_transformers import SentenceTransformer, util, CrossEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GroupKFold
from textstat import flesch_kincaid_grade
from tqdm.auto import tqdm
from transformers import pipeline, AutoTokenizer

print("Libraries imported successfully.")


In [None]:

# ==============================================================================
# SECTION 1: CONFIGURATION & MODEL LOADING
# ==============================================================================
print("\n--- [1] Setting up configuration, paths, and loading models ---")

# --- Use Relative Paths for Portability ---
ROOT_DIR = '..' # Go up one level from 'notebooks/'
OUTPUT_DIR = os.path.join(ROOT_DIR, 'output')

CLASSIFIER_PATH = os.path.join(OUTPUT_DIR, 'models/roberta-prompt-classifier/')
GOLDEN_DATA_DIR = os.path.join(OUTPUT_DIR, 'high_purity_golden_datasets/')

print(f"Classifier model path: {os.path.abspath(CLASSIFIER_PATH)}")
print(f"Golden dataset directory: {os.path.abspath(GOLDEN_DATA_DIR)}")

# --- Load All NLP Models (Done once for efficiency) ---
print("\nLoading all NLP and feature models into memory...")
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

SPACY_MODEL_NAME = "en_core_web_lg"
try:
    spacy_nlp = spacy.load(SPACY_MODEL_NAME)
except OSError:
    print("Spacy model 'en_core_web_lg' not found. Downloading...")
    # Use spaCy's own downloader rather than shelling out to a bare `python`:
    # it installs into the interpreter running this kernel and fails loudly
    # instead of silently returning a non-zero exit status.
    spacy.cli.download(SPACY_MODEL_NAME)
    spacy_nlp = spacy.load(SPACY_MODEL_NAME)

# word_tokenize / sent_tokenize need the Punkt sentence model. Newer NLTK
# releases look it up under 'punkt_tab' while older ones use 'punkt', so both
# are requested; a resource that is already present is not downloaded again.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource, quiet=True)

# Embedding / ranking models shared by the feature extractor in Section 3.
sbert_model = SentenceTransformer('all-mpnet-base-v2', device=device)
bge_model = SentenceTransformer('BAAI/bge-large-en-v1.5', device=device)
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', device=device)

print("✅ All feature models loaded successfully.")

Mounted at /content/drive
Paths configured.
Prompt categories defined.
Loading all NLP and feature models into memory...
✅ All models loaded successfully.


In [None]:

# ==============================================================================
# SECTION 2: DATA LOADING AND ON-THE-FLY CLASSIFICATION
# ==============================================================================
print(f"\n--- [2] Loading Data (Full Pipeline Mode) ---")

# --- Tunables for this cell ---
N_SAMPLES = 500              # Rows to sample for fast development cycles; set to None for a full run.
RANDOM_SEED = 42             # Seed for the development sample.
N_SPLITS = 5                 # GroupKFold folds; the first fold is used as the test split.
CLASSIFIER_MAX_LENGTH = 256  # The classifier was trained on max_length 256, so we ensure consistency.

print("Loading prompt classifier...")
if not os.path.exists(os.path.join(CLASSIFIER_PATH, 'config.json')):
    raise RuntimeError(f"Could not load the classifier from {CLASSIFIER_PATH}. Ensure Phase 2 was run successfully.")
prompt_classifier = pipeline("text-classification", model=CLASSIFIER_PATH, device=0 if torch.cuda.is_available() else -1)

list_of_files = glob.glob(os.path.join(GOLDEN_DATA_DIR, '*.parquet'))
if not list_of_files:
    raise FileNotFoundError(f"No golden data files found in {GOLDEN_DATA_DIR}. Please run Phase 1 first.")

# Modification time identifies the most recently written dataset on every
# platform; ctime is "metadata change" on Unix but "creation" on Windows.
latest_file = max(list_of_files, key=os.path.getmtime)
print(f"Loading latest golden dataset: {latest_file}")
golden_df = pd.read_parquet(latest_file)

# Fail fast: check for the label column before spending time on classification.
if 'winner' not in golden_df.columns:
    raise ValueError("'winner' column not found in the golden dataset. Pipeline cannot proceed.")

# Build `df` from the raw frame without mutating it, so re-running this cell
# always starts from the same data.
df = golden_df.dropna(subset=['prompt', 'response_a', 'response_b']).reset_index(drop=True)

# Sample the data for faster development cycles. For a full run, set N_SAMPLES = None.
if N_SAMPLES is not None:
    df = df.sample(n=min(len(df), N_SAMPLES), random_state=RANDOM_SEED).reset_index(drop=True)
print(f"Sampled {len(df)} records for this run.")

if df['prompt'].nunique() < N_SPLITS:
    raise ValueError(
        f"Need at least {N_SPLITS} unique prompts for GroupKFold, found {df['prompt'].nunique()}."
    )

print("Classifying prompts on-the-fly...")
prompts = df['prompt'].tolist()
class_results = prompt_classifier(prompts, batch_size=32, truncation=True, max_length=CLASSIFIER_MAX_LENGTH)
df['category'] = [res['label'] for res in class_results]

# Create one-hot winner columns from the unified 'winner' column.
# Any value other than 'model_a' / 'model_b' leaves both flags at 0.
df['winner_model_a'] = (df['winner'] == 'model_a').astype(int)
df['winner_model_b'] = (df['winner'] == 'model_b').astype(int)

# --- Data Splitting with GroupKFold ---
print("\nSplitting data using GroupKFold to prevent data leakage...")
gkf = GroupKFold(n_splits=N_SPLITS)
# `df` has a fresh RangeIndex and no NaN prompts (dropped above), so the
# positional indices returned by split() are also valid .loc labels.
train_indices, test_indices = next(gkf.split(df, groups=df['prompt']))

train_df = df.loc[train_indices].copy()
test_df = df.loc[test_indices].copy()

print(f"Data split: {len(train_df)} training samples, {len(test_df)} testing samples.")
assert len(set(train_df['prompt']).intersection(set(test_df['prompt']))) == 0, "Data leakage detected!"
print("✅ No prompt overlap between train and test sets.")



In [None]:
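# NOTE: Disabled scratch cell, kept for reference only. Section 2 now samples the
# data itself, so nothing below runs; the output shown under this cell is stale.
# # Assuming df is already loaded as per the preceding code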
# initial_shape = df.shape
# print(f"Initial DataFrame shape: {initial_shape}")

# # Calculate the number of samples to drop
# n_samples_to_drop = 1500

# # Ensure we don't try to drop more rows than exist
# if n_samples_to_drop >= len(df):
#     print(f"Warning: Requested to drop {n_samples_to_drop} samples, but only {len(df)} exist.")
#     print("Dropping all samples.")
#     df = df.sample(0) # Drop all rows, results in an empty DataFrame
# else:
#     # Drop a random sample of rows
#     df = df.sample(n=len(df) - n_samples_to_drop, random_state=42).reset_index(drop=True)
#     print(f"Dropped {n_samples_to_drop} samples.")

# final_shape = df.shape
# print(f"Final DataFrame shape after dropping samples: {final_shape}")

Initial DataFrame shape: (2000, 8)
Dropped 1500 samples.
Final DataFrame shape after dropping samples: (500, 8)


In [None]:
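# NOTE: Disabled duplicate of the GroupKFold split performed at the end of
# Section 2, kept for reference only; the output shown under this cell is stale.
# gkf = GroupKFold(n_splits=5)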
# train_indices, test_indices = next(gkf.split(df, groups=df['prompt']))

# train_df = df.loc[train_indices].copy()
# test_df = df.loc[test_indices].copy()

# print(f"Data split: {len(train_df)} training samples, {len(test_df)} testing samples.")
# print(f"Number of unique prompts in train: {train_df['prompt'].nunique()}")
# print(f"Number of unique prompts in test: {test_df['prompt'].nunique()}")
# assert len(set(train_df['prompt']).intersection(set(test_df['prompt']))) == 0, "Data leakage detected!"
# print("✅ No prompt overlap between train and test sets.")



Data split: 400 training samples, 100 testing samples.
Number of unique prompts in train: 400
Number of unique prompts in test: 100
✅ No prompt overlap between train and test sets.


In [None]:

# ==============================================================================
# SECTION 3: ADVANCED FEATURE EXTRACTOR CLASS
# ==============================================================================
print("\n--- [3] Defining Advanced Feature Extractor ---")

class AdvancedFeatureExtractor:
    """Computes per-response features for comparing two candidate responses.

    The extractor wraps the models passed to the constructor (a spaCy pipeline,
    two SentenceTransformer encoders and a CrossEncoder) and exposes one method
    per feature family plus ``extract_all`` which merges them into one dict.
    """

    # Fence info-strings accepted as Python. The empty tag is an untagged
    # ``` fence, which has always been treated as (potential) Python here.
    _PYTHON_FENCE_TAGS = frozenset({'', 'python', 'py', 'python3'})

    def __init__(self, spacy_model, sbert, bge, cross_enc):
        """Store the shared models and precompile the patterns used later."""
        self.nlp = spacy_model
        self.sbert = sbert
        self.bge = bge
        self.cross_encoder = cross_enc
        # Created lazily on first use so that constructing the extractor stays cheap.
        self._bert_scorer = None
        # Matches every fenced block regardless of its language tag, so that
        # opening/closing fences are always consumed in pairs. Group 1 is the
        # tag, group 2 the body without its final newline.
        self.code_block_re = re.compile(r"```([^\n`]*)\n(.*?)\n?```", re.DOTALL)
        self.discourse_markers = {
            'contrast': {'however', 'but', 'whereas', 'on the other hand'},
            'causation': {'therefore', 'consequently', 'as a result', 'because'},
            'elaboration': {'for example', 'for instance', 'in other words'}
        }

    def _get_code_from_response(self, text):
        """Return the Python code found in fenced blocks of ``text``.

        Bodies of all untagged or Python-tagged fences are joined with
        newlines. Fences tagged with another language are skipped. Returns
        ``None`` when no matching block exists.
        """
        snippets = [
            body for tag, body in self.code_block_re.findall(text)
            if tag.strip().lower() in self._PYTHON_FENCE_TAGS
        ]
        return "\n".join(snippets) if snippets else None

    def get_code_features(self, response):
        """Return static-analysis features for the Python code in ``response``.

        All features are 0 when there is no code. When code is present but
        cannot be parsed, the numeric features are -1 and ``has_code`` is 1.
        """
        code = self._get_code_from_response(response)
        if not code:
            return {'cyclomatic_complexity': 0, 'import_count': 0, 'function_count': 0, 'has_code': 0}
        try:
            visitor = ComplexityVisitor.from_code(code)
            nodes = list(ast.walk(ast.parse(code)))
            return {
                'cyclomatic_complexity': sum(f.complexity for f in visitor.functions),
                'import_count': sum(1 for n in nodes if isinstance(n, (ast.Import, ast.ImportFrom))),
                # `async def` is a separate AST node type; count it as a function too.
                'function_count': sum(1 for n in nodes if isinstance(n, (ast.FunctionDef, ast.AsyncFunctionDef))),
                'has_code': 1
            }
        except (SyntaxError, ValueError):
            return {'cyclomatic_complexity': -1, 'import_count': -1, 'function_count': -1, 'has_code': 1}

    def get_universal_features(self, text):
        """Return length and readability statistics for ``text``."""
        return {
            'word_count': len(word_tokenize(text)),
            'sentence_count': len(sent_tokenize(text)),
            'readability_fk_grade': flesch_kincaid_grade(text),
        }

    def _get_bert_scorer(self):
        """Return the shared BERTScorer, creating it on first use.

        Calling ``bert_score.score`` directly builds a fresh model on every
        call; keeping one scorer avoids reloading the weights per response.
        Uses the module-level ``device`` string.
        """
        scorer = getattr(self, '_bert_scorer', None)
        if scorer is None:
            scorer = BERTScorer(lang='en', device=device)
            self._bert_scorer = scorer
        return scorer

    def get_semantic_features(self, prompt, response):
        """Return prompt/response similarity scores from the wrapped models."""
        p_emb_sbert, r_emb_sbert = self.sbert.encode([prompt, response], convert_to_tensor=True)
        p_emb_bge, r_emb_bge = self.bge.encode([prompt, response], convert_to_tensor=True)
        # Same argument order as before: the response is the candidate, the prompt the reference.
        _, _, f1 = self._get_bert_scorer().score([response], [prompt], verbose=False)
        # Pass an explicit one-element batch and unwrap it to a plain float.
        cross_scores = self.cross_encoder.predict([(prompt, response)])
        return {
            'sbert_sim': util.pytorch_cos_sim(p_emb_sbert, r_emb_sbert).item(),
            'bge_sim': util.pytorch_cos_sim(p_emb_bge, r_emb_bge).item(),
            'cross_encoder_score': float(cross_scores[0]),
            'bert_score_f1': f1.item()
        }

    @staticmethod
    def _count_marker_hits(tokens, markers):
        """Count how many distinct ``markers`` occur in the token list.

        Single-word markers are looked up directly. Multi-word markers are
        matched as a run of consecutive tokens, which a plain set
        intersection on single tokens can never do.
        """
        token_set = set(tokens)
        hits = 0
        for marker in markers:
            parts = marker.split()
            if len(parts) == 1:
                hits += parts[0] in token_set
            else:
                width = len(parts)
                hits += any(tokens[i:i + width] == parts for i in range(len(tokens) - width + 1))
        return hits

    def get_argument_features(self, text):
        """Return, per discourse type, the number of distinct markers present in ``text``."""
        tokens = word_tokenize(text.lower())
        return {
            f'discourse_{f_type}_count': self._count_marker_hits(tokens, markers)
            for f_type, markers in self.discourse_markers.items()
        }

    def get_entity_features(self, text):
        """Return the set of lower-cased entity strings spaCy finds in ``text``."""
        return {'entities': {ent.text.lower() for ent in self.nlp(text).ents}}

    def extract_all(self, prompt, response, category):
        """Return every feature for one (prompt, response) pair as a single dict.

        Code features are only added when ``category`` is exactly
        'Code & Programming'. The 'entities' entry is a set, not a number.
        """
        features = {}
        features.update(self.get_universal_features(response))
        features.update(self.get_semantic_features(prompt, response))
        features.update(self.get_argument_features(response))
        features.update(self.get_entity_features(response))
        if category == 'Code & Programming':
            features.update(self.get_code_features(response))
        return features

# Single shared extractor instance, built from the spaCy pipeline, the two
# SentenceTransformer encoders and the CrossEncoder loaded in Section 1.
feature_extractor = AdvancedFeatureExtractor(spacy_nlp, sbert_model, bge_model, cross_encoder)
print("✅ AdvancedFeatureExtractor is ready.")



--- [3] Defining Advanced Feature Extractor ---
✅ AdvancedFeatureExtractor is ready.


In [None]:

# ==============================================================================
# SECTION 4: FEATURE COMPUTATION & DELTA GENERATION
# ==============================================================================
print("\n--- [4] Computing Features for All Data ---")

# Label columns must never be turned into model features.
LABEL_COLS = ['winner_model_a', 'winner_model_b']
# Columns copied unchanged into the modeling frame.
META_COLS = ['id', 'prompt', 'category', *LABEL_COLS, 'entity_jaccard']

all_features = []
# Loop over the entire dataframe (train + test) to compute features
for idx, row in tqdm(df.iterrows(), total=len(df), desc="Computing Features"):
    features_a = feature_extractor.extract_all(row['prompt'], row['response_a'], row['category'])
    features_b = feature_extractor.extract_all(row['prompt'], row['response_b'], row['category'])
    # Entity sets are not numeric; collapse them into one Jaccard overlap score.
    entities_a, entities_b = features_a.pop('entities'), features_b.pop('entities')
    intersection = len(entities_a.intersection(entities_b))
    union = len(entities_a.union(entities_b))
    all_features.append({
        'id': row.get('id', idx), 'prompt': row['prompt'], 'category': row['category'],
        'winner_model_a': row['winner_model_a'], 'winner_model_b': row['winner_model_b'],
        'entity_jaccard': intersection / union if union > 0 else 0,
        **{f'{k}_a': v for k, v in features_a.items()},
        **{f'{k}_b': v for k, v in features_b.items()}
    })

# Code features exist only for some rows; the missing ones become 0.
df_features = pd.DataFrame(all_features).fillna(0)

# Base feature names: strip only the trailing '_a' (not every '_a' substring),
# require a matching '_b' column, and skip the label columns. Without the last
# check 'winner_model_a' would yield 'delta_winner_model' = label A - label B,
# i.e. the target itself leaking into the features.
feature_cols = [
    c[:-2] for c in df_features.columns
    if c.endswith('_a') and c not in LABEL_COLS and f'{c[:-2]}_b' in df_features.columns
]

df_deltas = df_features[META_COLS].copy()
for col in feature_cols:
    df_deltas[f'delta_{col}'] = df_features[f'{col}_a'] - df_features[f'{col}_b']

assert len(df_deltas) == len(df), "Feature frame is not aligned with the source data."
assert not any(c.startswith('delta_') and 'winner' in c for c in df_deltas.columns), \
    "Label leakage: a delta feature was derived from the winner columns."

# Map target variable for 3-class classification: {0: B wins, 1: Tie, 2: A wins}
conditions = [
    df_deltas['winner_model_b'] == 1,
    (df_deltas['winner_model_a'] == 0) & (df_deltas['winner_model_b'] == 0),
    df_deltas['winner_model_a'] == 1
]
df_deltas['winner_mapped'] = np.select(conditions, [0, 1, 2], default=1)
print("✅ Feature computation and delta generation complete.")
print("Modeling DataFrame shape:", df_deltas.shape)



--- [4] Computing Features for All Data ---


Computing Features:   0%|          | 0/500 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You sho

✅ Feature computation and delta generation complete.
Modeling DataFrame shape: (500, 18)


In [None]:

# ==============================================================================
# SECTION 5: ENSEMBLE MODEL TRAINING
# ==============================================================================
print("\n--- [5] Training Ensemble of Expert Models ---")

# Independent copies, so later cells can add columns without pandas
# chained-assignment warnings and without touching df_deltas.
X_train_full = df_deltas.loc[train_indices].copy()
y_train_full = X_train_full['winner_mapped']
X_test_full = df_deltas.loc[test_indices].copy()
y_test_full = X_test_full['winner_mapped']

# Substrings identifying the code-specific delta features.
CODE_FEATURE_KEYS = ['cyclomatic', 'import_count', 'function_count', 'has_code']
MIN_EXPERT_SAMPLES = 10
N_CLASSES = 3

# Only delta_* columns are model inputs. Anything derived from the winner
# labels is excluded explicitly: training on it would be target leakage.
# NOTE(review): 'entity_jaccard' is computed in Section 4 but is not a delta_*
# column, so it is not used by any model here — confirm whether that is intended.
delta_cols = [c for c in df_deltas.columns if c.startswith('delta_') and 'winner' not in c]
universal_features = [c for c in delta_cols if not any(k in c for k in CODE_FEATURE_KEYS)]
code_expert_features = delta_cols

print("Training Universal Model...")
universal_model = lgb.LGBMClassifier(objective='multiclass', num_class=N_CLASSES, random_state=42)
universal_model.fit(X_train_full[universal_features], y_train_full)

print("Training Expert Models...")
expert_models = {}
print("  - Training Code expert...")
cat_train_df = X_train_full[X_train_full['category'] == 'Code & Programming']
if len(cat_train_df) <= MIN_EXPERT_SAMPLES:
    print("    - Skipping Code expert, not enough data in training split.")
elif cat_train_df['winner_mapped'].nunique() < N_CLASSES:
    # The expert's probabilities are blended with the universal model's
    # column by column in Section 6, so both must cover the same classes.
    print("    - Skipping Code expert, not all outcome classes are present in its training split.")
else:
    code_model = lgb.LGBMClassifier(objective='multiclass', num_class=N_CLASSES, random_state=42, n_estimators=150)
    code_model.fit(cat_train_df[code_expert_features], cat_train_df['winner_mapped'])
    expert_models['Code & Programming'] = code_model

print("✅ Model training complete.")


--- [5] Training Ensemble of Expert Models ---
Training Universal Model...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000309 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 847
[LightGBM] [Info] Number of data points in the train set: 400, number of used features: 10
[LightGBM] [Info] Start training from score -1.071484
[LightGBM] [Info] Start training from score -1.155183
[LightGBM] [Info] Start training from score -1.071484
Training Expert Models...
  - Training Code expert...
    - Skipping Code expert, not enough data in training split.
✅ Model training complete.


In [None]:

# ==============================================================================
# SECTION 6: ENSEMBLE PREDICTION & ENHANCED EVALUATION
# ==============================================================================
print("\n--- [6] Evaluating Ensemble on Test Set ---")

EXPERT_WEIGHT = 0.7
UNIVERSAL_WEIGHT = 0.3

# Work on an independent copy so that adding the 'prediction' column below
# neither warns about writing into a slice nor alters df_deltas.
X_test_full = X_test_full.copy()

# Score the whole test set in one call. Passing the DataFrame (instead of
# row-by-row object arrays) keeps numeric dtypes and feature names intact.
final_proba = universal_model.predict_proba(X_test_full[universal_features])

# Blend in each expert for the rows of its category; all other rows keep the
# universal model's probabilities.
for category, expert_model in expert_models.items():
    category_mask = (X_test_full['category'] == category).to_numpy()
    if not category_mask.any():
        continue
    # Probability columns follow each model's classes_; blending column by
    # column is only meaningful when both models share the same class order.
    if not np.array_equal(expert_model.classes_, universal_model.classes_):
        print(f"  - Skipping '{category}' expert: its classes differ from the universal model's.")
        continue
    expert_proba = expert_model.predict_proba(X_test_full.loc[category_mask, code_expert_features])
    final_proba[category_mask] = (final_proba[category_mask] * UNIVERSAL_WEIGHT) + (expert_proba * EXPERT_WEIGHT)

# Translate the winning column index back into the actual class label.
predictions = universal_model.classes_[np.argmax(final_proba, axis=1)]

accuracy = accuracy_score(y_test_full, predictions)
print("\n" + "="*50)
print(f"FINAL OVERALL ACCURACY ON TEST SET: {accuracy:.4f}")
print("="*50)

print("\n--- DETAILED CLASSIFICATION REPORT ---")
target_names = ['B Wins', 'Tie', 'A Wins']
# Explicit labels keep the report valid even if a class is absent from this fold.
print(classification_report(y_test_full, predictions, labels=[0, 1, 2], target_names=target_names, zero_division=0))
print("="*50)

print("\n--- SLICED PERFORMANCE BY CATEGORY ---")
X_test_full['prediction'] = predictions
# Per-category accuracy is the mean of a boolean "correct" flag; this avoids
# groupby.apply, which emits a deprecation warning on recent pandas.
is_correct = (X_test_full['winner_mapped'] == X_test_full['prediction']).rename('correct')
category_accuracy = is_correct.groupby(X_test_full['category']).mean()
category_counts = X_test_full['category'].value_counts()
report_df = pd.DataFrame({'Accuracy': category_accuracy, 'Test_Sample_Count': category_counts}).sort_values(by='Test_Sample_Count', ascending=False)
print(report_df)
print("="*50)


--- [6] Evaluating Ensemble on Test Set ---


Making Predictions:   0%|          | 0/100 [00:00<?, ?it/s]


FINAL OVERALL ACCURACY ON TEST SET: 1.0000

--- DETAILED CLASSIFICATION REPORT ---
              precision    recall  f1-score   support

      B Wins       1.00      1.00      1.00        36
         Tie       1.00      1.00      1.00        32
      A Wins       1.00      1.00      1.00        32

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100


--- SLICED PERFORMANCE BY CATEGORY ---
                     Accuracy  Test_Sample_Count
category                                        
Factual Information       1.0                  7
Technical Inquiry         1.0                 93


  category_accuracy = X_test_full.groupby('category').apply(
