In [1]:
import sys
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from datasets import load_dataset
from typing import Dict
import os

# Notebooks do not define __file__, so the project root is resolved relative to the
# kernel's current working directory: two levels up from wherever the notebook runs.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
# Guarded so that re-running this cell does not keep appending duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)
# Project-local helper; it only becomes importable once project_root is on sys.path.
from baseline.utils.setup import setup_experiment_dir
# Later cells join file names onto exp_dir, so it is treated as a directory path
# (presumably created by the helper -- its implementation is not shown here).
exp_dir = setup_experiment_dir('global_rf')

In [2]:
def _first_value_is_numeric(values):
    """Return True when ``values[0]`` exists and is an int, float or NumPy number."""
    try:
        first = values[0]
    except (TypeError, IndexError, KeyError):
        # Scalars, None and empty sequences cannot hold per-sample feature values.
        return False
    return isinstance(first, (int, float, np.number))


def prepare_data(dataset_name):
    """Load a dataset and flatten it into one row per (problem, sample) pair.

    Columns are classified by inspecting the first record of ``dataset['data']``:

    * judge columns   -- name starts with ``judge_``; values are kept as-is and
      ``None`` entries are replaced by the column's most common non-None value;
    * score columns   -- numeric columns whose name ends in ``_score``/``_scores``;
    * other numerics  -- every remaining numeric column.

    Score and other numeric columns are min-max normalised within each problem
    via ``normalize_scores``. A column is used for a given problem only when its
    length equals the number of entries in that problem's ``samples``.

    Column order follows the key order of the first record rather than a ``set``,
    so the returned feature list (and therefore the model input layout) is the
    same on every run instead of depending on string hash randomisation.

    Args:
        dataset_name: Identifier passed straight to ``load_dataset``.

    Returns:
        ``(df, final_feature_columns)`` where ``df`` holds ``problem_idx``,
        ``solution_idx``, ``is_correct`` plus the feature columns, and
        ``final_feature_columns`` lists the feature columns present in ``df``
        (judge columns first, then numeric columns).
    """
    from collections import Counter  # local import: only this function needs it

    dataset = load_dataset(dataset_name)
    data = dataset['data']

    METADATA_COLUMNS = {
        'problem', 'samples', 'solution', 'instruction', 'type', 'level', 'answer_correct', 'extracted_answers',
        'problem_idx', 'solution_idx'
    }

    # The first row (problem) is used to discover which columns exist.
    first_item = data[0]
    # A list, not a set: keeps the record's own ordering so results are reproducible.
    all_columns = list(first_item.keys())

    # Judge columns are a special case (verdict values, never normalised).
    judge_columns = [col for col in all_columns
                     if col.startswith('judge_') and col not in METADATA_COLUMNS]

    # Numeric feature columns, excluding metadata and judge columns.
    numeric_columns = [col for col in all_columns
                       if col not in METADATA_COLUMNS
                       and col not in judge_columns
                       and _first_value_is_numeric(first_item[col])]

    # 1st pass: tally the non-None verdicts of every judge column.
    verdict_counts = {col: Counter() for col in judge_columns}
    for problem in data:
        num_samples = len(problem['samples'])
        for col in judge_columns:
            # Only count verdicts that line up one-to-one with the samples.
            if len(problem[col]) == num_samples:
                verdict_counts[col].update(v for v in problem[col] if v is not None)

    # Most common verdict per judge column, used to fill None entries.
    # Ties resolve to the value seen first; columns with no verdicts default to False.
    modes = {
        col: (counts.most_common(1)[0][0] if counts else False)
        for col, counts in verdict_counts.items()
    }

    # Score columns are numeric columns that end with "_score(s)".
    score_columns = [col for col in numeric_columns
                     if col.endswith('_score') or col.endswith('_scores')]

    # Other numerical columns (e.g. rewards, steps).
    other_num_columns = [col for col in numeric_columns
                         if col not in score_columns]

    data_rows = []
    for idx, problem in enumerate(data):
        num_samples = len(problem['samples'])

        # Keep only the columns whose length matches the number of samples/generations.
        valid_judge_columns = [col for col in judge_columns if len(problem[col]) == num_samples]
        valid_score_columns = [col for col in score_columns if len(problem[col]) == num_samples]
        valid_other_columns = [col for col in other_num_columns if len(problem[col]) == num_samples]

        normalized_scores = normalize_scores(problem, valid_score_columns)
        normalized_nums = normalize_scores(problem, valid_other_columns)

        for i in range(num_samples):
            # Skip samples that have no corresponding correctness label.
            if i >= len(problem['answer_correct']):
                continue
            row = {
                'problem_idx': idx,
                'solution_idx': i,
                'is_correct': problem['answer_correct'][i],
                # Judge verdicts: no normalisation, missing values take the column mode.
                **{column: modes[column] if problem[column][i] is None
                   else problem[column][i] for column in valid_judge_columns},
                # Numeric columns: normalised within the problem.
                **{column: normalized_scores[column][i] for column in valid_score_columns},
                **{column: normalized_nums[column][i] for column in valid_other_columns}
            }
            data_rows.append(row)

    df = pd.DataFrame(data_rows)
    # A column that was never valid for any problem does not appear in df; drop it here.
    final_feature_columns = [col for col in (judge_columns + numeric_columns)
                             if col in df.columns]

    return df, final_feature_columns

def normalize_scores(problem, columns):
    """Min-max normalise the given columns within a single problem.

    Each column is read from ``problem[col]``, converted to a float array and
    rescaled to ``[0, 1]`` using the minimum and maximum of its non-missing
    entries. Missing entries (``None`` or NaN) are ignored when computing the
    range and stay NaN in the output, so one missing score does not wipe out
    the whole column.

    Args:
        problem: Mapping from column name to a sequence of per-sample values.
        columns: Names of the columns to normalise.

    Returns:
        Dict mapping each column name to a float ``numpy.ndarray`` with the same
        length as the input sequence. A column whose non-missing values are all
        equal maps to 0.5; an empty or entirely missing column is returned
        unchanged (empty, or all NaN).
    """
    normalized = {}
    for col in columns:
        # dtype=float turns None into NaN and booleans into 0.0/1.0, so the
        # arithmetic below is always well defined.
        values = np.asarray(problem[col], dtype=float)
        missing = np.isnan(values)
        present = values[~missing]

        if present.size == 0:
            # Nothing to scale against (empty column or only missing values).
            normalized[col] = values
            continue

        min_val = present.min()
        score_range = present.max() - min_val
        if score_range == 0:
            # Constant column: place every known value at the midpoint.
            normalized[col] = np.where(missing, np.nan, 0.5)
        else:
            normalized[col] = (values - min_val) / score_range
    return normalized

def get_feature_thresholds(rf_model, feature_columns) -> Dict:
    """Summarise, per feature, the split thresholds used across the forest.

    Walks every estimator in ``rf_model.estimators_`` and pairs up the entries of
    its ``tree_.feature`` and ``tree_.threshold`` arrays. Entries with a negative
    feature index are skipped; the rest are grouped by feature name, looked up in
    ``feature_columns`` by index.

    Args:
        rf_model: Fitted forest exposing ``estimators_``, or ``None``.
        feature_columns: Feature names in the column order the model was fitted on.

    Returns:
        Dict mapping feature name to the median of its collected thresholds.
        Features that never appear in a split are absent; ``None`` yields ``{}``.
    """
    if rf_model is None:
        return {}

    collected = {}
    for estimator in rf_model.estimators_:
        node_pairs = zip(estimator.tree_.feature, estimator.tree_.threshold)
        for node_feature, node_threshold in node_pairs:
            if node_feature < 0:
                # Negative index: this entry does not split on any feature.
                continue
            collected.setdefault(feature_columns[node_feature], []).append(node_threshold)

    return {name: np.median(values) for name, values in collected.items()}

In [3]:
def calculate_metrics(df_test, y_pred, y_pred_proba, rf_model, feature_columns):
    """Score the global RF on held-out rows, both per sample and per problem.

    Per sample ("generation accuracy"): the fraction of rows where ``y_pred``
    equals ``is_correct``.

    Per problem ("selection"): the row with the highest ``y_pred_proba`` is
    picked and the problem is counted as
      * true positive  -- the picked row is correct;
      * false positive -- the picked row is not correct;
      * false negative -- the picked row is not correct but some row is;
      * true negative  -- no row of the problem is correct.
    A wrong pick is therefore counted as a false positive and additionally as a
    false negative or a true negative.

    Args:
        df_test: Frame with ``problem_idx`` and ``is_correct`` columns.
        y_pred: Predicted labels, positionally aligned with ``df_test``.
        y_pred_proba: Predicted positive-class probabilities, same alignment.
        rf_model: Model forwarded to ``get_feature_thresholds``.
        feature_columns: Feature names forwarded to ``get_feature_thresholds``.

    Returns:
        Dict with selection accuracy/precision/recall/F1, the four selection
        counts, ``generation_accuracy`` and ``feature_thresholds``.
    """
    outcomes = []
    n_correct = 0
    n_scored = 0

    for problem_id in df_test['problem_idx'].unique():
        rows_mask = df_test['problem_idx'] == problem_id
        labels = df_test[rows_mask]['is_correct']
        probabilities = y_pred_proba[rows_mask]
        predictions = y_pred[rows_mask]

        # Sample-level bookkeeping for generation accuracy.
        n_scored += len(predictions)
        n_correct += (predictions == labels).sum()

        # Problem-level outcome of picking the highest-probability sample.
        label_values = labels.values
        n_positive = sum(label_values)
        if label_values[np.argmax(probabilities)]:
            outcome = {
                'true_positive': 1,
                'false_positive': 0,
                'false_negative': 0,
                'true_negative': 0
            }
        else:
            outcome = {
                'true_positive': 0,
                'false_positive': 1,
                'false_negative': 1 if n_positive > 0 else 0,
                'true_negative': 1 if n_positive == 0 else 0
            }
        outcomes.append(outcome)

    # Sum the per-problem indicators.
    n_problems = len(outcomes)
    tp, fp, fn, tn = (
        sum(outcome[key] for outcome in outcomes)
        for key in ('true_positive', 'false_positive', 'false_negative', 'true_negative')
    )

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0

    metrics = {
        'selection_accuracy': tp / n_problems,
        'selection_precision': precision,
        'selection_recall': recall,
        'selection_tp': tp,
        'selection_tn': tn,
        'selection_fp': fp,
        'selection_fn': fn,
        'generation_accuracy': n_correct / n_scored,
        'feature_thresholds': get_feature_thresholds(rf_model, feature_columns)
    }

    # F1 is the harmonic mean of precision and recall, defined as 0 when both are 0.
    pr_total = precision + recall
    if pr_total > 0:
        metrics['selection_f1'] = 2 * precision * recall / pr_total
    else:
        metrics['selection_f1'] = 0

    return metrics

In [4]:
def train_global_rf(df, feature_columns, train_percentage):
    """Train one random forest on a per-problem prefix of samples and evaluate it.

    The first ``k`` rows of every problem (in frame order) form the training
    set and all remaining rows form the test set, where
    ``k = int(average rows per problem * train_percentage)``. Train and test
    rows therefore come from the same problems.

    Args:
        df: Frame with ``problem_idx``, ``is_correct`` and the feature columns.
        feature_columns: Names of the columns used as model inputs.
        train_percentage: Fraction (0-1) of each problem's rows used for training.

    Returns:
        ``(metrics, feature_importance)``: the dict produced by
        ``calculate_metrics`` with ``train_percentage`` added, and a DataFrame of
        ``feature``/``importance`` sorted by descending importance.

    Raises:
        ValueError: If the split leaves no training rows or no test rows. This
            is raised before fitting, instead of surfacing as an opaque error
            from inside scikit-learn.
    """
    X = df[feature_columns].copy()
    y = df['is_correct']

    # Average number of rows (samples) per problem sets the per-problem cut-off k.
    samples_per_problem = len(df) / len(df['problem_idx'].unique())
    k = int(samples_per_problem * train_percentage)
    # cumcount() numbers rows 0, 1, 2, ... within each problem, in frame order.
    train_mask = df.groupby('problem_idx').cumcount() < k

    n_train = int(train_mask.sum())
    if n_train == 0:
        raise ValueError(
            f"train_percentage={train_percentage} selects no training rows "
            f"(k={k} with about {samples_per_problem:.1f} samples per problem)"
        )
    if n_train == len(df):
        raise ValueError(
            f"train_percentage={train_percentage} leaves no test rows to evaluate "
            f"(k={k} with about {samples_per_problem:.1f} samples per problem)"
        )

    X_train = X[train_mask]
    y_train = y[train_mask]
    X_test = X[~train_mask]
    df_test = df[~train_mask]

    # Fixed architecture and seed, so runs differ only in the training share.
    rf = RandomForestClassifier(
        n_estimators=200,
        max_depth=10,
        min_samples_split=10,
        min_samples_leaf=5,
        max_features='sqrt',
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    )

    rf.fit(X_train, y_train)

    # Hard predictions plus the probability of the second class in rf.classes_.
    y_pred = rf.predict(X_test)
    y_pred_proba = rf.predict_proba(X_test)[:, 1]

    metrics = calculate_metrics(df_test, y_pred, y_pred_proba, rf, feature_columns)
    metrics['train_percentage'] = train_percentage

    feature_importance = pd.DataFrame({
        'feature': feature_columns,
        'importance': rf.feature_importances_
    }).sort_values('importance', ascending=False)

    return metrics, feature_importance

In [5]:
def plot_results(results_df):
    """Save a two-panel figure of accuracy against training share.

    The left panel shows ``selection_accuracy`` and the right panel
    ``generation_accuracy``, each plotted against ``train_percentage`` scaled to
    percent. The figure is written to ``plots.png`` inside the module-level
    ``exp_dir`` and then closed; nothing is returned.
    """
    # (results_df column, y-axis label, panel title), left to right.
    panels = [
        ('selection_accuracy', 'Selection@1', 'Selection@1 vs Training Data Size'),
        ('generation_accuracy', 'Generation Accuracy', 'Generation Accuracy vs Training Data Size'),
    ]

    fig, axes = plt.subplots(1, 2, figsize=(10, 5))
    for ax, (metric, y_label, title) in zip(axes, panels):
        ax.plot(results_df['train_percentage'] * 100,
                results_df[metric],
                '-o', label='Global RF')
        ax.set_xlabel('Percentage of Training Data')
        ax.set_ylabel(y_label)
        ax.set_title(title)
        ax.grid(True)
        # Both metrics are fractions, so a shared 0-1 axis keeps the panels comparable.
        ax.set_ylim(0, 1)
        ax.legend()

    fig.tight_layout()
    fig.savefig(os.path.join(exp_dir, 'plots.png'))
    plt.close(fig)

In [6]:
print("Preparing data...")
df, feature_columns = prepare_data("hazyresearch/CodeContests_Llama_70B_with_LM_Judges_and_RMs_v1")

# Fractions of each problem's samples used for training, one model per entry.
train_percentages = [0.001, 0.01, 0.1, 0.3, 0.5, 0.7]
results = []
feature_importances = []

for train_percentage in train_percentages:
    print(f"\nTraining with {train_percentage*100}% of data")
    metrics, importance_df = train_global_rf(df, feature_columns, train_percentage)
    results.append(metrics)
    feature_importances.append({
        'train_percentage': train_percentage,
        'importance_df': importance_df
    })

    print(f"Selection accuracy: {metrics['selection_accuracy']:.3f}")
    print(f"Generation accuracy: {metrics['generation_accuracy']:.3f}")

results_df = pd.DataFrame(results)
results_df.to_csv(os.path.join(exp_dir, 'metrics.csv'), index=False)

# Save one feature-importance file per training percentage.
importance_dir = os.path.join(exp_dir, 'feature_importance')
os.makedirs(importance_dir, exist_ok=True)
for result in feature_importances:
    # Name each file after the percentage stored with this result. The training
    # loop's variable still holds its final value here, and using it would send
    # every frame to the same path, overwriting all but the last.
    result['importance_df'].to_csv(
        os.path.join(importance_dir, f"feature_importance_{result['train_percentage']}.csv"),
        index=False
    )

plot_results(results_df)

Preparing data...

Training with 0.1% of data
Selection accuracy: 0.150
Generation accuracy: 0.950

Training with 1.0% of data
Selection accuracy: 0.157
Generation accuracy: 0.965

Training with 10.0% of data
Selection accuracy: 0.171
Generation accuracy: 0.976

Training with 30.0% of data
Selection accuracy: 0.179
Generation accuracy: 0.976

Training with 50.0% of data
Selection accuracy: 0.171
Generation accuracy: 0.975

Training with 70.0% of data
Selection accuracy: 0.171
Generation accuracy: 0.976
