In [2]:
import sys
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from datasets import load_dataset
from typing import Dict, List, Tuple
from joblib import Parallel, delayed
from tqdm import tqdm
import os

project_root = os.path.abspath(os.path.join(os.path.dirname("__file__"), '..', '..'))
sys.path.append(project_root)
from baseline.utils.setup import setup_experiment_dir
exp_dir = setup_experiment_dir('local_rf')

In [3]:
def prepare_data():
    """Prepare dataset for training and testing."""
    dataset = load_dataset("hazyresearch/MATH_with_LM_Judges_and_Reward_Model_Results_V2")
    data = dataset['data']
    
    # Get columns
    first_item = data[0]
    judge_columns = [k for k in first_item.keys() if k.startswith('judge_')]
    score_columns = [k for k in first_item.keys() 
                    if k.endswith('_scores') and not k.startswith('judge_')]
    feature_columns = judge_columns + score_columns
    
    # First pass: collect modes for judge columns
    verdict_values = {col: [] for col in judge_columns}
    for problem in data:
        for col in judge_columns:
            verdict_values[col].extend([v for v in problem[col] if v is not None])
    
    modes = {col: max(set(values), key=values.count) 
            for col, values in verdict_values.items()}
    
    data_rows = []
    for idx in range(len(data)):
        problem = data[idx]
        normalized_scores = normalize_scores(problem, score_columns)
        
        for i in range(len(problem['samples'])):
            row = {
                'problem_idx': idx,
                'solution_idx': i,
                'is_correct': problem['answer_correct'][i],
                **{column: modes[column] if problem[column][i] is None else problem[column][i] 
                   for column in judge_columns},
                **{column: normalized_scores[column][i] for column in score_columns}
            }
            data_rows.append(row)
            
    df = pd.DataFrame(data_rows)
    return df, feature_columns

def normalize_scores(problem, score_columns):
    """Normalize scores within each problem"""
    normalized_scores = {}
    for score_col in score_columns:
        scores = np.array(problem[score_col])
        min_score = np.min(scores)
        max_score = np.max(scores)
        score_range = max_score - min_score
        if score_range == 0:
            normalized_scores[score_col] = [0.5] * len(scores)
        else:
            normalized_scores[score_col] = (scores - min_score) / score_range
    return normalized_scores


def get_feature_thresholds(rf_model, feature_columns) -> Dict:
    """Get median thresholds for each feature from the RF model"""
    if rf_model is None:
        return {}

    feature_thresholds = {}
    for tree in rf_model.estimators_:
        for feature_idx, threshold in zip(tree.tree_.feature, tree.tree_.threshold):
            if feature_idx >= 0:
                feature_name = feature_columns[feature_idx]
                if feature_name not in feature_thresholds:
                    feature_thresholds[feature_name] = []
                feature_thresholds[feature_name].append(threshold)

    median_thresholds = {
        feature: np.median(thresholds)
        for feature, thresholds in feature_thresholds.items()
    }

    return median_thresholds

In [4]:
def calculate_metrics(df_test, y_pred, y_pred_proba, rf_model, feature_columns):
    """Calculate metrics for a single problem's local RF model"""
    # Generation accuracy
    generation_accuracy = accuracy_score(df_test['is_correct'], y_pred)
    
    # Selection metrics
    prob_proba = y_pred_proba
    prob_labels = df_test['is_correct'].values
    
    # Select solution with highest probability
    selected_idx = np.argmax(prob_proba)
    
    # Calculate base metrics
    tp = 1 if prob_labels[selected_idx] else 0
    fp = 1 if not prob_labels[selected_idx] else 0
    fn = 1 if sum(prob_labels) > 0 and not prob_labels[selected_idx] else 0
    tn = 1 if sum(prob_labels) == 0 and not prob_labels[selected_idx] else 0
    
    # Calculate derived metrics
    selection_accuracy = tp  # Will be 1 or 0 for a single problem
    selection_precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    selection_recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    selection_f1 = 2 * (selection_precision * selection_recall) / (
        selection_precision + selection_recall) if (selection_precision + selection_recall) > 0 else 0
    
    return {
        'problem_idx': df_test['problem_idx'].iloc[0],
        'generation_accuracy': generation_accuracy,
        'selection_accuracy': selection_accuracy,
        'selection_precision': selection_precision,
        'selection_recall': selection_recall,
        'selection_f1': selection_f1,
        'selection_tp': tp,
        'selection_tn': tn,
        'selection_fp': fp,
        'selection_fn': fn,
        'feature_thresholds': get_feature_thresholds(rf_model, feature_columns)
    }

In [5]:
def train_problem_rf(problem_data: Tuple[pd.DataFrame, float, List[str]]) -> Dict:
    """Train and evaluate RF model for a single problem"""
    df_prob, train_percentage, feature_columns = problem_data
    
    X = df_prob[feature_columns].copy()
    y = df_prob['is_correct']
    
    n_positive = y.sum()
    
    # Case 1: insufficient positive examples - predict majority class
    if n_positive <= 3:
        majority_class = int(y.mean() >= 0.5)
        y_pred = np.full(len(y), majority_class)
        y_pred_proba = np.full(len(y), float(majority_class))
        metrics = calculate_metrics(df_prob, y_pred, y_pred_proba, None, feature_columns)
        metrics['insufficient_data'] = True
        return metrics
    
    # Case 2: sufficient data for training
    skf = StratifiedKFold(n_splits=max(2, int(np.ceil(1/train_percentage))),
                         shuffle=True, random_state=42)
    train_idx, test_idx = next(skf.split(X, y))
    
    X_train = X.iloc[train_idx]
    y_train = y.iloc[train_idx]
    X_test = X.iloc[test_idx]
    y_test = y.iloc[test_idx]
    df_test = df_prob.iloc[test_idx]
    
    # Fixed RF parameters
    rf = RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        min_samples_split=10,
        min_samples_leaf=5,
        max_features='sqrt',
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    )
    
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    y_pred_proba = rf.predict_proba(X_test)[:, 1]
    
    metrics = calculate_metrics(df_test, y_pred, y_pred_proba, rf, feature_columns)
    metrics['insufficient_data'] = False
    return metrics

In [6]:
def aggregate_metrics(problem_metrics: List[Dict]) -> Dict:
    """Aggregate metrics across all problems"""
    return {
        'generation_accuracy': np.mean([m['generation_accuracy'] for m in problem_metrics]),
        'selection_accuracy': np.mean([m['selection_accuracy'] for m in problem_metrics]),
        'selection_precision': np.mean([m['selection_precision'] for m in problem_metrics]),
        'selection_recall': np.mean([m['selection_recall'] for m in problem_metrics]),
        'selection_f1': np.mean([m['selection_f1'] for m in problem_metrics]),
        'selection_tp': sum(m['selection_tp'] for m in problem_metrics),
        'selection_tn': sum(m['selection_tn'] for m in problem_metrics),
        'selection_fp': sum(m['selection_fp'] for m in problem_metrics),
        'selection_fn': sum(m['selection_fn'] for m in problem_metrics),
        'total_problems': len(problem_metrics),
        'insufficient_data_problems': sum(1 for m in problem_metrics if m.get('insufficient_data', False))
    }

In [7]:
def train_local_rfs(train_percentage: float):
    """Train and evaluate local RF models in parallel"""
    print(f"\nTraining with {train_percentage*100}% of problems")
    
    df, feature_columns = prepare_data()
    
    problem_groups = [
        (group, train_percentage, feature_columns)
        for _, group in df.groupby('problem_idx')
    ]
    
    print("Training models for each problem...")
    problem_metrics = Parallel(n_jobs=-1, verbose=1)(
        delayed(train_problem_rf)(problem_data)
        for problem_data in problem_groups
    )
    
    aggregated_metrics = aggregate_metrics(problem_metrics)
    
    problem_metrics_df = pd.DataFrame(problem_metrics)
    problem_metrics_df.to_csv(
        os.path.join(exp_dir, 'per_problem', f'metrics_{train_percentage}.csv'),
        index=False
    )
    
    return aggregated_metrics

In [8]:
def plot_results(results_df):
    """Plot accuracy metrics"""
    plt.figure(figsize=(10, 5))
    
    # Selection@1 plot
    plt.subplot(1, 2, 1)
    plt.plot(results_df['train_percentage'] * 100,
             results_df['selection_accuracy'],
             '-o', label='Local RF')
    plt.xlabel('Percentage of Training Data')
    plt.ylabel('Selection@1')
    plt.title('Selection@1 vs Training Data Size')
    plt.grid(True)
    plt.ylim(0, 1)
    plt.legend()

    # Generation Accuracy plot
    plt.subplot(1, 2, 2)
    plt.plot(results_df['train_percentage'] * 100,
             results_df['generation_accuracy'],
             '-o', label='Local RF')
    plt.xlabel('Percentage of Training Data')
    plt.ylabel('Generation Accuracy')
    plt.title('Generation Accuracy vs Training Data Size')
    plt.grid(True)
    plt.ylim(0, 1)
    plt.legend()
    
    plt.tight_layout()
    plt.savefig(os.path.join(exp_dir, 'plots.png'))
    plt.close()

print("Starting local RF training pipeline...")
train_percentages = [0.1, 0.3, 0.5, 0.7]
results = []

for train_percentage in train_percentages:
    metrics = train_local_rfs(train_percentage)
    metrics['train_percentage'] = train_percentage
    results.append(metrics)
    
    print(f"\nResults for {train_percentage*100}% training data:")
    print(f"Selection accuracy: {metrics['selection_accuracy']:.3f}")
    print(f"Generation accuracy: {metrics['generation_accuracy']:.3f}")
    print(f"Selection F1: {metrics['selection_f1']:.3f}")
    print(f"Insufficient data problems: {metrics['insufficient_data_problems']}")

results_df = pd.DataFrame(results)
results_df.to_csv(os.path.join(exp_dir, 'metrics.csv'), index=False)

plot_results(results_df)

Starting local RF training pipeline...

Training with 10.0% of problems
Training models for each problem...


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    4.5s finished



Results for 10.0% training data:
Selection accuracy: 0.740
Generation accuracy: 0.921
Selection F1: 0.740
Insufficient data problems: 4

Training with 30.0% of problems
Training models for each problem...


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.0s finished



Results for 30.0% training data:
Selection accuracy: 0.780
Generation accuracy: 0.920
Selection F1: 0.780
Insufficient data problems: 4

Training with 50.0% of problems
Training models for each problem...


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.3s finished



Results for 50.0% training data:
Selection accuracy: 0.780
Generation accuracy: 0.916
Selection F1: 0.780
Insufficient data problems: 4

Training with 70.0% of problems
Training models for each problem...


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    1.0s



Results for 70.0% training data:
Selection accuracy: 0.780
Generation accuracy: 0.916
Selection F1: 0.780
Insufficient data problems: 4


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.1s finished
