In [2]:
import os
import sys
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datasets import load_dataset
from sklearn.neural_network import MLPClassifier
from tqdm import tqdm

# NOTE: "__file__" is a quoted string literal, not the module attribute, so
# os.path.dirname() returns '' and project_root resolves to two directories
# above the current working directory of the kernel.
project_root = os.path.abspath(os.path.join(os.path.dirname("__file__"), '..', '..'))
# Extend the import search path before the project import on the next line
# (presumably `baseline` lives under project_root -- TODO confirm).
sys.path.append(project_root)
from baseline.utils.setup import setup_experiment_dir
# exp_dir is the directory that later cells write metrics.csv and plots.png into.
exp_dir = setup_experiment_dir('global_mlp')

In [3]:
def prepare_data():
    """Load the judge / reward-model dataset and flatten it to one row per sample.

    Feature columns are discovered from the first record: every key starting
    with ``judge_`` is a judge column, and every key ending in ``_scores`` that
    does not start with ``judge_`` is a score column.

    Missing judge verdicts (``None``) are replaced by that column's most
    frequent verdict over the whole split. Score columns are rescaled per
    problem by ``normalize_scores``.

    Returns:
        tuple: ``(df, feature_columns)`` where ``df`` has the columns
        ``problem_idx``, ``solution_idx``, ``is_correct`` plus one column per
        feature, and ``feature_columns`` lists judge columns followed by score
        columns.

    Raises:
        ValueError: if a judge column contains no non-None verdict at all, so
            no fill value can be derived for it.
    """
    dataset = load_dataset("hazyresearch/MATH_with_LM_Judges_and_Reward_Model_Results_V2")
    data = dataset['data']

    # The schema is read off the first record only.
    column_names = list(data[0].keys())
    judge_columns = [name for name in column_names if name.startswith('judge_')]
    score_columns = [name for name in column_names
                     if name.endswith('_scores') and not name.startswith('judge_')]
    feature_columns = judge_columns + score_columns

    # First pass: tally verdicts per judge column. Counters are updated in a
    # streaming fashion so the raw verdicts never have to be held in memory.
    verdict_counts = {column: Counter() for column in judge_columns}
    for problem in data:
        for column in judge_columns:
            verdict_counts[column].update(
                verdict for verdict in problem[column] if verdict is not None
            )

    columns_without_verdicts = [column for column, counts in verdict_counts.items()
                                if not counts]
    if columns_without_verdicts:
        raise ValueError(
            "Cannot compute a fill value for judge columns with no verdicts: "
            f"{columns_without_verdicts}"
        )

    # most_common(1) returns the most frequent verdict; on an exact tie the
    # verdict that was encountered first wins.
    modes = {column: counts.most_common(1)[0][0]
             for column, counts in verdict_counts.items()}

    # Second pass: emit one flat record per (problem, sample) pair.
    data_rows = []
    for idx, problem in enumerate(data):
        normalized_scores = normalize_scores(problem, score_columns)

        for i in range(len(problem['samples'])):
            row = {
                'problem_idx': idx,
                'solution_idx': i,
                'is_correct': problem['answer_correct'][i],
            }
            for column in judge_columns:
                verdict = problem[column][i]
                # Fill missing verdicts with the column-wide mode.
                row[column] = modes[column] if verdict is None else verdict
            for column in score_columns:
                row[column] = normalized_scores[column][i]
            data_rows.append(row)

    df = pd.DataFrame(data_rows)
    return df, feature_columns

def normalize_scores(problem, score_columns):
    """Min-max normalize each score column within a single problem.

    Args:
        problem: mapping from column name to a sequence of numeric scores for
            one problem; entries may be ``None`` / NaN when a score is missing.
        score_columns: names of the columns in ``problem`` to normalize.

    Returns:
        dict: column name -> float ``numpy.ndarray`` of the same length as the
        input column. Observed values are scaled to ``[0, 1]`` using the
        minimum and maximum of the observed values; if all observed values are
        equal they become ``0.5``. Missing entries stay NaN, and an empty or
        fully-missing column is returned unchanged (as a float array).
    """
    normalized_scores = {}
    for score_col in score_columns:
        # dtype=float turns None into NaN so missing scores can be masked out.
        scores = np.asarray(problem[score_col], dtype=float)
        missing = np.isnan(scores)
        observed = scores[~missing]

        if observed.size == 0:
            # Nothing to scale against: empty column or every entry missing.
            normalized_scores[score_col] = scores.copy()
            continue

        min_score = observed.min()
        score_range = observed.max() - min_score
        if score_range == 0:
            # Constant column: use the midpoint, but keep missing entries NaN.
            normalized_scores[score_col] = np.where(missing, np.nan, 0.5)
        else:
            normalized_scores[score_col] = (scores - min_score) / score_range
    return normalized_scores

In [4]:
df, feature_columns = prepare_data()
print(f"Dataset shape: {df.shape}")
print(f"Number of features: {len(feature_columns)}")
print(f"\nFeature columns: {feature_columns}")

Dataset shape: (100000, 22)
Number of features: 19

Feature columns: ['judge_qwen2-72b-instruct_verdicts', 'judge_qwen2.5-72b-instruct-turbo_verdicts', 'judge_qwq-32b-preview_verdicts', 'judge_nous-hermes-2-mixtral-8x7b-dpo_verdicts', 'judge_llama-3.1-nemotron-70b-instruct-hf_verdicts', 'judge_meta-llama-3.1-405b-instruct-turbo_verdicts', 'judge_gemma-2-27b-it_verdicts', 'judge_claude-3-5-sonnet-latest_verdicts', 'judge_llama-3.3-70b-instruct-turbo_verdicts', 'judge_gpt-4o_verdicts', 'internlm_scores', 'gpm_scores', 'offset_bias_scores', 'grm_llama32_scores', 'qrm_scores', 'grm_scores', 'urm_scores', 'skyworks_scores', 'grm_gemma_scores']


In [5]:
def calculate_metrics(df_test, y_pred, y_pred_proba):
    """Compute per-problem selection metrics and per-sample generation accuracy.

    For every distinct ``problem_idx`` the sample with the highest predicted
    probability is "selected" and compared against ``is_correct``.

    Args:
        df_test: DataFrame with ``problem_idx`` and ``is_correct`` columns,
            one row per sample.
        y_pred: predicted labels, positionally aligned with ``df_test`` rows.
        y_pred_proba: predicted positive-class probabilities, positionally
            aligned with ``df_test`` rows.

    Returns:
        dict with ``selection_accuracy``, ``selection_precision``,
        ``selection_recall``, ``selection_f1``, the raw ``selection_tp`` /
        ``selection_tn`` / ``selection_fp`` / ``selection_fn`` counts and
        ``generation_accuracy`` (fraction of samples whose predicted label
        equals ``is_correct``). Every ratio is 0 when its denominator is 0,
        so an empty ``df_test`` yields all-zero metrics instead of raising.
    """
    labels = df_test['is_correct'].to_numpy()
    problem_ids = df_test['problem_idx'].to_numpy()
    predictions = np.asarray(y_pred)
    probabilities = np.asarray(y_pred_proba)

    # Generation accuracy is a plain per-sample accuracy over the whole split.
    total_predictions = len(labels)
    total_correct_predictions = int((predictions == labels).sum())

    unique_problems = df_test['problem_idx'].unique()
    n_problems = len(unique_problems)

    tp = fp = fn = tn = 0
    for prob_idx in unique_problems:
        in_problem = problem_ids == prob_idx
        prob_labels = labels[in_problem]

        # argmax picks the first sample on ties, in row order.
        selected_correct = bool(prob_labels[np.argmax(probabilities[in_problem])])
        has_correct = bool(prob_labels.any())

        # NOTE: every problem counts as either tp or fp, so tp + fp equals
        # n_problems and selection_precision equals selection_accuracy. A
        # problem with no correct sample counts as both fp and tn.
        tp += int(selected_correct)
        fp += int(not selected_correct)
        fn += int(has_correct and not selected_correct)
        tn += int(not has_correct)

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0

    metrics = {
        'selection_accuracy': tp / n_problems if n_problems > 0 else 0,
        'selection_precision': precision,
        'selection_recall': recall,
        'selection_tp': tp,
        'selection_tn': tn,
        'selection_fp': fp,
        'selection_fn': fn,
        'generation_accuracy': (
            total_correct_predictions / total_predictions
            if total_predictions > 0 else 0
        ),
    }

    metrics['selection_f1'] = (
        2 * precision * recall / (precision + recall)
        if (precision + recall) > 0 else 0
    )

    return metrics

In [6]:
def train_global_mlp(df, feature_columns, train_percentage):
    """Train one MLP across all problems and evaluate it on the held-out samples.

    The split is made inside every problem: the first ``k`` rows of each
    ``problem_idx`` group (in DataFrame order) go to training and the remaining
    rows to testing, where ``k`` is ``train_percentage`` times the average
    number of samples per problem, rounded down.

    Args:
        df: DataFrame with ``problem_idx``, ``is_correct`` and the feature
            columns, one row per sample.
        feature_columns: names of the columns used as model inputs.
        train_percentage: fraction of each problem's samples used for training.

    Returns:
        dict: the output of ``calculate_metrics`` on the test rows, plus a
        ``train_percentage`` entry.

    Raises:
        ValueError: if the split leaves the training set or the test set empty.
    """
    X = df[feature_columns]
    y = df['is_correct']

    samples_per_problem = len(df) / len(df['problem_idx'].unique())
    # The small epsilon keeps float products that land just below an integer
    # (e.g. 100 * 0.29 == 28.999999999999996) from being truncated one short.
    k = int(samples_per_problem * train_percentage + 1e-9)
    train_mask = df.groupby('problem_idx').cumcount() < k

    # Fail early with a clear message rather than inside scikit-learn.
    if not train_mask.any():
        raise ValueError(
            f"train_percentage={train_percentage} leaves no training samples "
            f"(k={k} per problem)"
        )
    if train_mask.all():
        raise ValueError(
            f"train_percentage={train_percentage} leaves no test samples "
            f"(k={k} per problem)"
        )

    X_train = X[train_mask]
    y_train = y[train_mask]
    X_test = X[~train_mask]
    df_test = df[~train_mask]

    # Fixed architecture and seed so runs differ only in the training fraction.
    mlp = MLPClassifier(
        hidden_layer_sizes=(100, 50),
        learning_rate_init=0.001,
        max_iter=1000,
        random_state=42
    )

    mlp.fit(X_train, y_train)

    test_pred = mlp.predict(X_test)
    # Column 1 of predict_proba is the probability of the second entry of mlp.classes_.
    test_pred_proba = mlp.predict_proba(X_test)[:, 1]

    metrics = calculate_metrics(df_test, test_pred, test_pred_proba)
    metrics['train_percentage'] = train_percentage

    return metrics

In [7]:
def plot_results(results_df: pd.DataFrame, model_type: str):
    """Save a two-panel figure of accuracy against training-set size.

    The left panel shows ``selection_accuracy`` and the right panel
    ``generation_accuracy``, both plotted against ``train_percentage`` scaled
    to percent. The figure is written to ``plots.png`` inside ``exp_dir`` and
    then closed.

    Args:
        results_df: one row per run, with ``train_percentage``,
            ``selection_accuracy`` and ``generation_accuracy`` columns.
        model_type: prefix for the legend label (``"<model_type> MLP"``).
    """
    # (metric column, y-axis label, panel title), left to right.
    panels = (
        ('selection_accuracy', 'Selection@1',
         'Selection@1 vs Training Data Size'),
        ('generation_accuracy', 'Generation Accuracy',
         'Generation Accuracy vs Training Data Size'),
    )
    curve_label = f'{model_type} MLP'

    fig, axes = plt.subplots(1, 2, figsize=(10, 5))
    for ax, (metric, y_label, title) in zip(axes, panels):
        ax.plot(results_df['train_percentage'] * 100,
                results_df[metric],
                '-o', label=curve_label)
        ax.set_xlabel('Percentage of Training Data')
        ax.set_ylabel(y_label)
        ax.set_title(title)
        ax.grid(True)
        ax.set_ylim(0, 1)
        ax.legend()

    fig.tight_layout()
    fig.savefig(os.path.join(exp_dir, 'plots.png'))
    plt.close(fig)

In [8]:
# Fractions of each problem's samples used for training, one run per entry.
train_percentages = [0.001, 0.01, 0.1, 0.3, 0.5, 0.7]
# One metrics dict per run, in the same order as train_percentages.
results = []

for train_percentage in tqdm(train_percentages):
    print(f"\nTraining with {train_percentage*100}% of data")
    # Each run fits a fresh model on the prepared frame from the earlier cell.
    metrics = train_global_mlp(df, feature_columns, train_percentage)
    results.append(metrics)
    print(f"Selection accuracy: {metrics['selection_accuracy']:.3f}")
    print(f"Generation Accuracy: {metrics['generation_accuracy']:.3f}")

# Persist the per-run metrics and the summary figure into the experiment directory.
results_df = pd.DataFrame(results)
results_df.to_csv(os.path.join(exp_dir, 'metrics.csv'), index=False)  
plot_results(results_df, model_type='Global')  

  0%|          | 0/6 [00:00<?, ?it/s]


Training with 0.1% of data


 17%|█▋        | 1/6 [00:00<00:01,  2.87it/s]

Selection accuracy: 0.460
Generation Accuracy: 0.693

Training with 1.0% of data


 33%|███▎      | 2/6 [00:01<00:03,  1.24it/s]

Selection accuracy: 0.490
Generation Accuracy: 0.744

Training with 10.0% of data


 50%|█████     | 3/6 [00:07<00:10,  3.35s/it]

Selection accuracy: 0.570
Generation Accuracy: 0.781

Training with 30.0% of data


 67%|██████▋   | 4/6 [00:24<00:17,  8.75s/it]

Selection accuracy: 0.550
Generation Accuracy: 0.805

Training with 50.0% of data


 83%|████████▎ | 5/6 [01:07<00:20, 20.98s/it]

Selection accuracy: 0.620
Generation Accuracy: 0.815

Training with 70.0% of data


100%|██████████| 6/6 [01:58<00:00, 19.72s/it]

Selection accuracy: 0.630
Generation Accuracy: 0.815



