# Model Performance Analysis

This notebook evaluates and compares the performance of the different LLM prediction models, both with and without PPO adjustments.

## 1. Import Libraries

In [21]:
import pandas as pd
import numpy as np
import json

print("Libraries imported.")

Libraries imported.


## 2. Load Prediction Data

In [22]:
# Load every prediction artifact in one guarded step. If a file is missing the
# cell reports it and stops loading; names assigned before the failure remain
# defined, which is why later cells test for each name before using it.
try:
    # With PPO adjustments
    inference_ppo_df = pd.read_csv('../results/test_predictions_with_ppo.csv')
    justification_ppo_df = pd.read_csv('../results/test_predictions_justification_ppo.csv')
    cot_ppo_df = pd.read_csv('../results/test_predictions_cot_ppo.csv')

    # Without PPO adjustments (raw checkpoints).
    # The encoding is pinned to UTF-8 so the read does not depend on the
    # platform's default locale encoding.
    with open('../results/llm_predictions_checkpoint.json', 'r', encoding='utf-8') as f:
        inference_raw = json.load(f)
    with open('../results/llm_predictions_justification_checkpoint.json', 'r', encoding='utf-8') as f:
        justification_raw = json.load(f)
    with open('../results/llm_predictions_cot_checkpoint.json', 'r', encoding='utf-8') as f:
        cot_raw = json.load(f)

    print("All prediction files loaded successfully.")

except FileNotFoundError as e:
    print(f"Error loading data: {e}. Please ensure all inference notebooks have been run.")

All prediction files loaded successfully.


## 3. Calculate and Compare MAE

In [23]:
def calculate_mae_from_df(df, pred_col, actual_col='actual_price'):
    """Return the mean absolute error between two columns of ``df``.

    Args:
        df: Table holding both the predicted and the actual values.
        pred_col: Name of the column with the predictions.
        actual_col: Name of the column with the ground-truth values
            (defaults to ``'actual_price'``).

    Returns:
        The mean of ``|df[pred_col] - df[actual_col]|``.
    """
    signed_errors = df[pred_col] - df[actual_col]
    absolute_errors = np.abs(signed_errors)
    return np.mean(absolute_errors)

def calculate_mae_from_raw(raw_data):
    """Calculates the Mean Absolute Error from raw checkpoint data.

    Predictions and actual prices are paired by position; when the two lists
    differ in length, only the overlapping prefix is used. A pair is skipped
    when the prediction is missing or None, when the actual price is None, or
    when the result entry is not a dict, so that incomplete records do not
    abort the whole evaluation.

    Args:
        raw_data: Mapping with an ``'llm_results'`` list (entries are dicts
            that may carry a ``'predicted_close'`` value) and an
            ``'actual_prices'`` list. Either key may be absent.

    Returns:
        The mean absolute error over the usable pairs, or ``np.nan`` when no
        usable pair exists.
    """
    llm_results = raw_data.get('llm_results', [])
    actual_prices = raw_data.get('actual_prices', [])

    valid_predictions = []
    valid_actuals = []

    # zip stops at the shorter list, which gives the overlapping prefix.
    for result, actual in zip(llm_results, actual_prices):
        # A non-dict entry (e.g. None) is treated like a missing prediction.
        prediction = result.get('predicted_close') if isinstance(result, dict) else None

        # Both sides must be present; float(None) would raise otherwise.
        if prediction is None or actual is None:
            continue

        valid_predictions.append(float(prediction))
        valid_actuals.append(float(actual))

    if not valid_predictions:
        return np.nan

    return np.mean(np.abs(np.array(valid_predictions) - np.array(valid_actuals)))

# Build one MAE entry per model variant. A variant is included only when the
# variable holding its data exists in the notebook namespace, i.e. when the
# loading cell managed to assign it.
results = {}
loaded_names = locals()

# PPO models (DataFrames with a 'ppo_adjusted_prediction' column)
for label, frame_name in [
    ('Inference (PPO)', 'inference_ppo_df'),
    ('Justification (PPO)', 'justification_ppo_df'),
    ('CoT (PPO)', 'cot_ppo_df'),
]:
    if frame_name in loaded_names:
        results[label] = {'MAE': calculate_mae_from_df(loaded_names[frame_name], 'ppo_adjusted_prediction')}

# Raw LLM models (checkpoint dictionaries)
for label, checkpoint_name in [
    ('Inference (LLM only)', 'inference_raw'),
    ('Justification (LLM only)', 'justification_raw'),
    ('CoT (LLM only)', 'cot_raw'),
]:
    if checkpoint_name in loaded_names:
        results[label] = {'MAE': calculate_mae_from_raw(loaded_names[checkpoint_name])}

if not results:
    print("No data to compare. Please check the file loading step.")
else:
    # One row per model, best (lowest) MAE first.
    results_df = pd.DataFrame(results).T.sort_values(by='MAE')

    print("Model Performance Comparison (MAE):")
    display(results_df)

Model Performance Comparison (MAE):


Unnamed: 0,MAE
Justification (LLM only),10.379693
CoT (LLM only),10.971941
Justification (PPO),17.024346
CoT (PPO),17.742438
Inference (LLM only),62.115208
Inference (PPO),64.599908


## 4. Calculate and Compare MAPE

In [24]:
def calculate_mape_from_df(df, pred_col, actual_col='actual_price'):
    """Calculates the Mean Absolute Percentage Error from a DataFrame.

    Rows whose actual value is exactly zero are left out, because their
    percentage error is undefined and would otherwise turn the whole result
    into ``inf``/``nan``. This matches ``calculate_mape_from_raw``, which also
    skips zero actuals.

    Args:
        df: Table holding both the predicted and the actual values.
        pred_col: Name of the column with the predictions.
        actual_col: Name of the column with the ground-truth values
            (defaults to ``'actual_price'``).

    Returns:
        The MAPE in percent over the rows with a non-zero actual value, or
        ``np.nan`` when no such row exists.
    """
    actual = df[actual_col]
    predicted = df[pred_col]

    # Division by a zero actual is undefined; keep only the rows that can be scored.
    scorable = actual != 0
    if not scorable.any():
        return np.nan

    relative_errors = (actual[scorable] - predicted[scorable]) / actual[scorable]
    return np.mean(np.abs(relative_errors)) * 100

def calculate_mape_from_raw(raw_data):
    """Return the MAPE (in percent) computed from a raw checkpoint mapping.

    Predictions from ``raw_data['llm_results']`` are paired by position with
    ``raw_data['actual_prices']``; when the lists differ in length only the
    overlapping prefix is used. A pair is ignored when the prediction is
    missing or None, or when the actual price is None or zero.

    Returns:
        The mean absolute percentage error over the remaining pairs, or
        ``np.nan`` when no pair remains.
    """
    results_list = raw_data.get('llm_results', [])
    prices_list = raw_data.get('actual_prices', [])

    kept_predictions, kept_actuals = [], []
    # zip stops at the shorter list, i.e. at the overlapping prefix.
    for entry, actual in zip(results_list, prices_list):
        prediction = entry.get('predicted_close')
        if prediction is None or actual is None or actual == 0:
            continue
        kept_predictions.append(float(prediction))
        kept_actuals.append(float(actual))

    if len(kept_predictions) == 0:
        return np.nan

    actual_arr = np.array(kept_actuals)
    predicted_arr = np.array(kept_predictions)
    return np.mean(np.abs((actual_arr - predicted_arr) / actual_arr)) * 100

# Build one MAPE entry per model variant. A variant is included only when the
# variable holding its data exists in the notebook namespace, i.e. when the
# loading cell managed to assign it.
mape_results = {}
available_names = locals()

# PPO models (DataFrames with a 'ppo_adjusted_prediction' column)
for model_label, frame_name in [
    ('Inference (PPO)', 'inference_ppo_df'),
    ('Justification (PPO)', 'justification_ppo_df'),
    ('CoT (PPO)', 'cot_ppo_df'),
]:
    if frame_name in available_names:
        mape_results[model_label] = {'MAPE (%)': calculate_mape_from_df(available_names[frame_name], 'ppo_adjusted_prediction')}

# Raw LLM models (checkpoint dictionaries)
for model_label, checkpoint_name in [
    ('Inference (LLM only)', 'inference_raw'),
    ('Justification (LLM only)', 'justification_raw'),
    ('CoT (LLM only)', 'cot_raw'),
]:
    if checkpoint_name in available_names:
        mape_results[model_label] = {'MAPE (%)': calculate_mape_from_raw(available_names[checkpoint_name])}

if not mape_results:
    print("No data to compare. Please check the file loading step.")
else:
    # One row per model, best (lowest) MAPE first.
    mape_df = pd.DataFrame(mape_results).T.sort_values(by='MAPE (%)')

    print("Model Performance Comparison (MAPE %):")
    display(mape_df)

Model Performance Comparison (MAPE %):


Unnamed: 0,MAPE (%)
Justification (LLM only),1.329401
CoT (LLM only),1.36538
Justification (PPO),2.347263
CoT (PPO),2.417415
Inference (LLM only),6.789956
Inference (PPO),7.320734
