In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import os

In [3]:
# Global plotting defaults for every figure produced below. Order matters:
# the matplotlib style sheet is applied first, then seaborn's palette and
# context are set on top of it.
# NOTE(review): the 'seaborn-v0_8-*' style names are, as far as I know, only
# shipped with matplotlib 3.6 and later - confirm against the pinned version.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("colorblind")
sns.set_context("notebook", font_scale=1.2)

In [4]:
# Ensure the output folders exist before any figure or table is written;
# exist_ok=True keeps the cell safe to re-run.
for _results_dir in ('results/visualizations', 'results/tables'):
    os.makedirs(_results_dir, exist_ok=True)

In [5]:
# Load the detailed results table. create_summary() groups this frame by
# 'System' and 'Dataset' and reads the LR_* / DL_* metric columns from it.
df_detailed = pd.read_csv(filepath_or_buffer='results/data/all_results_detailed.csv')

In [6]:
def create_summary(df=None):
    """Summarise LR vs. DL metrics for every (System, Dataset) group.

    For each group the summary row contains:

    * ``{LR,DL}_{metric}_mean`` / ``_std`` for each of MAPE, MAE, RMSE and R2
      whose column is present in the input frame;
    * ``{metric}_improvement`` for the error metrics: the relative reduction
      ``(LR - DL) / LR * 100`` of the group means (positive = DL is better).
      NaN when a mean is unavailable or the LR mean is zero;
    * ``R2_improvement``: relative gain in percent when the LR mean R2 is
      positive, otherwise the plain difference ``DL - LR``.
      NOTE(review): the two branches use different units (percent vs.
      absolute R2 points) - kept as in the original, confirm this is intended;
    * ``{metric}_p_value`` / ``{metric}_significant`` from a one-sided paired
      Wilcoxon signed-rank test that DL beats LR (alpha = 0.05).

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame with 'System' and 'Dataset' columns plus ``LR_*`` / ``DL_*``
        metric columns. Defaults to the notebook-level ``df_detailed`` so
        existing ``create_summary()`` calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per (System, Dataset) group.
    """
    if df is None:
        # Backward-compatible default: use the frame loaded earlier in the notebook.
        df = df_detailed

    all_metrics = ['MAPE', 'MAE', 'RMSE', 'R2']
    error_metrics = ['MAPE', 'MAE', 'RMSE']

    summary_data = []
    for (system, dataset), group in df.groupby(['System', 'Dataset']):
        row = {'System': system, 'Dataset': dataset}

        # Per-model mean / standard deviation of every available metric.
        for prefix in ['LR', 'DL']:
            for metric in all_metrics:
                col = f"{prefix}_{metric}"
                if col in df.columns:
                    row[f"{col}_mean"] = group[col].mean()
                    row[f"{col}_std"] = group[col].std()

        # Error metrics: lower is better, so improvement is the relative drop.
        # Metric columns are optional above, so look the means up with .get()
        # instead of raising KeyError when one is absent.
        for metric in error_metrics:
            lr_mean = row.get(f"LR_{metric}_mean", np.nan)
            dl_mean = row.get(f"DL_{metric}_mean", np.nan)
            if pd.isna(lr_mean) or pd.isna(dl_mean) or lr_mean == 0:
                # A zero baseline would give +/-inf, which is meaningless as a
                # percentage and breaks the downstream bar chart.
                row[f"{metric}_improvement"] = np.nan
            else:
                row[f"{metric}_improvement"] = ((lr_mean - dl_mean) / lr_mean) * 100

        # R2: higher is better. A NaN baseline compares False and falls
        # through to the difference branch, which then yields NaN.
        lr_r2 = row.get('LR_R2_mean', np.nan)
        dl_r2 = row.get('DL_R2_mean', np.nan)
        if lr_r2 > 0:
            row['R2_improvement'] = ((dl_r2 - lr_r2) / lr_r2) * 100
        else:
            row['R2_improvement'] = dl_r2 - lr_r2

        # Statistical significance of "DL is better than LR".
        for metric in all_metrics:
            lr_col = f"LR_{metric}"
            dl_col = f"DL_{metric}"
            if lr_col not in df.columns or dl_col not in df.columns:
                continue

            # The test is one-sided "first sample > second sample": for R2 the
            # DL scores should be larger, for error metrics the LR errors.
            if metric == 'R2':
                larger, smaller = group[dl_col], group[lr_col]
            else:
                larger, smaller = group[lr_col], group[dl_col]

            try:
                _, p_value = stats.wilcoxon(larger, smaller, alternative='greater')
            except ValueError:
                # The test cannot be computed for this group (for example,
                # some SciPy versions raise when all paired differences are
                # zero). Record "no result" instead of aborting the summary.
                p_value = np.nan

            row[f"{metric}_p_value"] = p_value
            # NaN compares False, so an uncomputable test is never significant.
            row[f"{metric}_significant"] = bool(p_value < 0.05)

        summary_data.append(row)

    return pd.DataFrame(summary_data)

In [7]:
def _aggregate_system_metric(system_rows, metric):
    """Collapse one system's summary rows into (lr_mean, lr_std, dl_mean, dl_std).

    Means are averaged across the rows; the standard deviations are combined
    in quadrature and divided by the number of rows.
    """
    n_rows = len(system_rows)
    lr_mean = system_rows[f'LR_{metric}_mean'].mean()
    lr_std = np.sqrt(np.sum(system_rows[f'LR_{metric}_std']**2)) / n_rows
    dl_mean = system_rows[f'DL_{metric}_mean'].mean()
    dl_std = np.sqrt(np.sum(system_rows[f'DL_{metric}_std']**2)) / n_rows
    return lr_mean, lr_std, dl_mean, dl_std


def _draw_model_bars(ax, x_positions, lr_means, lr_stds, dl_means, dl_stds, bar_width):
    """Draw side-by-side LR / DL bars with error bars and value labels on ``ax``."""
    series = (
        (-bar_width / 2, lr_means, lr_stds, 'Linear Regression'),
        (+bar_width / 2, dl_means, dl_stds, 'Deep Learning'),
    )
    for offset, means, stds, label in series:
        bars = ax.bar(x_positions + offset, means, bar_width,
                      yerr=stds, label=label, alpha=0.7, capsize=5)
        for bar, mean, std in zip(bars, means, stds):
            if not np.isfinite(mean):
                # Nothing sensible to print, and a NaN y-position would give
                # the text a non-finite anchor.
                continue
            # Place the label just above the error-bar tip; fall back to the
            # bar top when the std is undefined.
            whisker = std if np.isfinite(std) else 0.0
            ax.text(bar.get_x() + bar.get_width()/2., mean + whisker + 0.01,
                    f'{mean:.3f}', ha='center', va='bottom', fontsize=8)


def create_metric_plots(df_summary, output_dir='results/visualizations'):
    """Save LR-vs-DL comparison figures built from the summary table.

    For each of MAPE, MAE, RMSE and R2 a two-panel figure is written to
    ``{output_dir}/{metric}_all_systems.png``: the left panel holds every
    system except 'h2', the right panel holds 'h2' on its own y-axis (the
    panel is hidden when there is no 'h2' system). A final grouped bar chart
    of the ``{metric}_improvement`` columns is written to
    ``{output_dir}/all_improvements.png``.

    Parameters
    ----------
    df_summary : pandas.DataFrame
        Output of ``create_summary``: needs a 'System' column plus the
        ``{LR,DL}_{metric}_{mean,std}`` and ``{metric}_improvement`` columns.
    output_dir : str, optional
        Folder the PNG files are written to; created if missing.

    Returns
    -------
    None
    """
    metrics = ['MAPE', 'MAE', 'RMSE', 'R2']
    bar_width = 0.35

    os.makedirs(output_dir, exist_ok=True)

    # Resolved once, up front: the improvement chart after the metric loop
    # needs the system list too, so it must not depend on a loop-local binding.
    all_systems = df_summary['System'].unique()
    other_systems = [s for s in all_systems if s != 'h2']
    has_h2 = 'h2' in all_systems

    for metric in metrics:
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 8),
                                       gridspec_kw={'width_ratios': [3, 1]})

        # Left panel: every system except 'h2'.
        lr_means, lr_stds, dl_means, dl_stds = [], [], [], []
        for system in other_systems:
            system_rows = df_summary[df_summary['System'] == system]
            lr_mean, lr_std, dl_mean, dl_std = _aggregate_system_metric(system_rows, metric)
            lr_means.append(lr_mean)
            lr_stds.append(lr_std)
            dl_means.append(dl_mean)
            dl_stds.append(dl_std)

        x_positions_main = np.arange(len(other_systems))
        _draw_model_bars(ax1, x_positions_main, lr_means, lr_stds,
                         dl_means, dl_stds, bar_width)

        ax1.set_xlabel('System')
        ax1.set_ylabel(metric)
        ax1.set_title(f'{metric} Comparison Across Systems')
        ax1.set_xticks(x_positions_main)
        ax1.set_xticklabels(other_systems, rotation=45)
        ax1.legend(loc='best')

        # Right panel: 'h2' alone, on its own y-axis.
        if has_h2:
            h2_rows = df_summary[df_summary['System'] == 'h2']
            lr_mean, lr_std, dl_mean, dl_std = _aggregate_system_metric(h2_rows, metric)
            _draw_model_bars(ax2, np.array([0]), [lr_mean], [lr_std],
                             [dl_mean], [dl_std], bar_width)

            ax2.set_xlabel('System')
            ax2.set_title('h2 System')
            ax2.set_xticks([0])
            ax2.set_xticklabels(['h2'])
        else:
            # Avoid leaving an empty, unlabeled frame in the saved figure.
            ax2.set_visible(False)

        # Note which direction is better for this metric.
        direction_note = "Higher is better" if metric == 'R2' else "Lower is better"
        fig.text(0.01, 0.01, direction_note, style='italic', fontsize=10)

        fig.tight_layout()
        fig.savefig(os.path.join(output_dir, f'{metric}_all_systems.png'),
                    dpi=300, bbox_inches='tight')
        # Close explicitly so figures do not accumulate across the loop.
        plt.close(fig)

    # Improvement percentages across all systems and metrics.
    improvement_data = [
        {
            'System': system,
            'Metric': metric,
            'Improvement (%)': df_summary.loc[
                df_summary['System'] == system, f'{metric}_improvement'
            ].mean(),
        }
        for system in all_systems
        for metric in metrics
    ]
    imp_df = pd.DataFrame(improvement_data)

    fig, ax = plt.subplots(figsize=(14, 8))
    sns.barplot(x='System', y='Improvement (%)', hue='Metric', data=imp_df, ax=ax)

    # Label the bars via ax.containers rather than ax.patches so that only
    # real data bars are annotated (ax.patches can also hold non-data
    # rectangles, e.g. legend proxies added by some seaborn versions).
    for container in ax.containers:
        for patch in container:
            height = patch.get_height()
            if not np.isfinite(height):
                continue
            ax.text(patch.get_x() + patch.get_width()/2.,
                    height + (0.5 if height >= 0 else -2.5),
                    f'{height:.1f}%',
                    ha='center', fontsize=8)

    ax.axhline(y=0, color='black', linestyle='-', alpha=0.3)
    ax.set_title('Improvement Percentages Across Systems and Metrics')
    ax.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()
    fig.savefig(os.path.join(output_dir, 'all_improvements.png'),
                dpi=300, bbox_inches='tight')
    plt.close(fig)

In [8]:
# Build the per-(System, Dataset) summary table, then render and save the
# figures from it. The order matters: the plots read df_summary.
df_summary = create_summary()
create_metric_plots(df_summary)