In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import os

In [7]:
# Global plot styling for the figures produced by this notebook.
# The calls are kept in this order because each one writes Matplotlib rcParams and a
# later call may override values set by an earlier one.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require Matplotlib >= 3.6 — confirm
# against the pinned environment.
plt.style.use('seaborn-v0_8-whitegrid')
# Colour-blind-safe default colour cycle.
sns.set_palette("colorblind")
# "notebook" context with fonts scaled up by 20%.
sns.set_context("notebook", font_scale=1.2)

In [8]:
# Make sure both output folders exist before any figure or table is written;
# exist_ok=True keeps the cell safe to re-run.
for output_dir in ('results/visualizations', 'results/tables'):
    os.makedirs(output_dir, exist_ok=True)

In [9]:
# Detailed results for both models (presumably one row per experimental run — TODO confirm).
# The cells below read the 'System' and 'Dataset' columns plus 'LR_<metric>' / 'DL_<metric>'
# for MAPE, MAE, RMSE and R2 from this frame.
df_detailed = pd.read_csv('results/data/all_results_detailed.csv')

# Median and IQR summary

In [5]:
def create_summary(df=None):
    """Summarise LR vs. DL per (System, Dataset) using medians and IQRs.

    NOTE: a later cell in this notebook defines another ``create_summary``
    (mean/std based) that replaces this one once it has been executed.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Detailed results containing 'System', 'Dataset' and any of the
        'LR_<metric>' / 'DL_<metric>' columns for MAPE, MAE, RMSE and R2.
        Defaults to the notebook-level ``df_detailed``.

    Returns
    -------
    pandas.DataFrame
        One row per (System, Dataset) group. For every metric whose LR and DL
        columns are present the row holds:

        * ``<prefix>_<metric>_median`` and ``<prefix>_<metric>_iqr`` (Q3 - Q1);
        * ``<metric>_improvement``: percentage by which DL improves on LR,
          computed from the medians (NaN when the LR median is 0);
        * ``<metric>_p_value`` / ``<metric>_significant``: one-sided Wilcoxon
          signed-rank test of "DL is better than LR" at alpha = 0.05.
    """
    if df is None:
        df = df_detailed

    metrics = ['MAPE', 'MAE', 'RMSE', 'R2']
    summary_data = []

    for (system, dataset), group in df.groupby(['System', 'Dataset']):
        row = {'System': system, 'Dataset': dataset}

        # Robust location/spread per model and metric.
        for prefix in ['LR', 'DL']:
            for metric in metrics:
                col = f"{prefix}_{metric}"
                if col in df.columns:
                    row[f"{col}_median"] = group[col].median()
                    row[f"{col}_iqr"] = group[col].quantile(0.75) - group[col].quantile(0.25)

        # Error metrics: lower is better, so improvement = relative reduction vs. LR.
        for metric in ['MAPE', 'MAE', 'RMSE']:
            lr_key = f"LR_{metric}_median"
            dl_key = f"DL_{metric}_median"
            if lr_key not in row or dl_key not in row:
                # Column missing from the input; skip instead of raising KeyError.
                continue
            lr_median = row[lr_key]
            dl_median = row[dl_key]
            if lr_median == 0:
                # A relative improvement is undefined against a zero baseline.
                row[f"{metric}_improvement"] = np.nan
            else:
                row[f"{metric}_improvement"] = ((lr_median - dl_median) / lr_median) * 100

        # R2: higher is better.
        if 'LR_R2_median' in row and 'DL_R2_median' in row:
            lr_r2 = row['LR_R2_median']
            dl_r2 = row['DL_R2_median']
            if lr_r2 > 0:
                row['R2_improvement'] = ((dl_r2 - lr_r2) / abs(lr_r2)) * 100
            else:
                # With a non-positive baseline a percentage is meaningless, so the
                # absolute difference is stored instead (NOT a percentage).
                row['R2_improvement'] = dl_r2 - lr_r2

        # One-sided paired test; the argument order encodes "DL is better".
        for metric in metrics:
            lr_col = f"LR_{metric}"
            dl_col = f"DL_{metric}"
            if lr_col in df.columns and dl_col in df.columns:
                try:
                    if metric == 'R2':
                        _, p_value = stats.wilcoxon(group[dl_col], group[lr_col], alternative='greater')
                    else:
                        _, p_value = stats.wilcoxon(group[lr_col], group[dl_col], alternative='greater')
                except Exception:
                    # Best effort: the test can fail on degenerate groups (e.g. all
                    # differences zero). Unlike a bare ``except`` this does not
                    # swallow KeyboardInterrupt / SystemExit.
                    p_value = np.nan

                row[f"{metric}_p_value"] = p_value
                # NaN compares False, so failed tests are reported as not significant.
                row[f"{metric}_significant"] = bool(p_value < 0.05)

        summary_data.append(row)

    return pd.DataFrame(summary_data)

In [6]:
def _aggregate_system_metric(df_summary, system, metric):
    """Pool one system's per-dataset rows into (lr_median, lr_iqr, dl_median, dl_iqr)."""
    system_data = df_summary[df_summary['System'] == system]
    # Medians are pooled as the median of medians; IQRs as the mean IQR.
    return (
        np.median(system_data[f'LR_{metric}_median']),
        system_data[f'LR_{metric}_iqr'].mean(),
        np.median(system_data[f'DL_{metric}_median']),
        system_data[f'DL_{metric}_iqr'].mean(),
    )


def _draw_bar_pairs(ax, x_positions, lr_medians, lr_iqrs, dl_medians, dl_iqrs, bar_width):
    """Draw side-by-side LR/DL bars with +/- IQR/2 whiskers and a value label above each."""
    lr_err = [iqr / 2 for iqr in lr_iqrs]
    dl_err = [iqr / 2 for iqr in dl_iqrs]

    lr_bars = ax.bar(x_positions - bar_width / 2, lr_medians, bar_width,
                     yerr=lr_err, label='Linear Regression', alpha=0.7, capsize=5)
    dl_bars = ax.bar(x_positions + bar_width / 2, dl_medians, bar_width,
                     yerr=dl_err, label='Deep Learning', alpha=0.7, capsize=5)

    for bars, errs in ((lr_bars, lr_err), (dl_bars, dl_err)):
        for bar, err in zip(bars, errs):
            height = bar.get_height()
            # Place the label just above the upper whisker.
            ax.text(bar.get_x() + bar.get_width() / 2., height + err + 0.01,
                    f'{height:.3f}', ha='center', va='bottom', fontsize=8)


def create_metric_plots(df_summary):
    """Save per-metric LR-vs-DL bar charts and an improvement chart as PNG files.

    Parameters
    ----------
    df_summary : pandas.DataFrame
        Median/IQR summary with a 'System' column and, for each metric in
        MAPE, MAE, RMSE and R2, the columns 'LR_<metric>_median',
        'LR_<metric>_iqr', 'DL_<metric>_median', 'DL_<metric>_iqr' and
        '<metric>_improvement'.

    Side effects
    ------------
    Writes 'results/visualizations/<metric>_all_systems_median_iqr.png' for each
    metric and 'results/visualizations/all_improvements_median.png'. The system
    named 'h2' is drawn in its own, narrower panel so it has its own y-axis.
    Returns None.
    """
    metrics = ['MAPE', 'MAE', 'RMSE', 'R2']
    bar_width = 0.35

    # Loop-invariant: computed once, and also needed after the per-metric loop.
    all_systems = df_summary['System'].unique()
    other_systems = [s for s in all_systems if s != 'h2']
    x_positions_main = np.arange(len(other_systems))

    for metric in metrics:
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 8),
                                       gridspec_kw={'width_ratios': [3, 1]})

        # Left panel: every system except 'h2'.
        pooled = [_aggregate_system_metric(df_summary, s, metric) for s in other_systems]
        _draw_bar_pairs(
            ax1, x_positions_main,
            [p[0] for p in pooled], [p[1] for p in pooled],
            [p[2] for p in pooled], [p[3] for p in pooled],
            bar_width,
        )
        ax1.set_xlabel('System')
        ax1.set_ylabel(metric)
        ax1.set_title(f'{metric} Comparison Across Systems (Median with IQR)')
        ax1.set_xticks(x_positions_main)
        ax1.set_xticklabels(other_systems, rotation=45)
        ax1.legend(loc='best')

        # Right panel: 'h2' only.
        if 'h2' in all_systems:
            lr_median_h2, lr_iqr_h2, dl_median_h2, dl_iqr_h2 = _aggregate_system_metric(
                df_summary, 'h2', metric)
            _draw_bar_pairs(ax2, np.array([0]),
                            [lr_median_h2], [lr_iqr_h2],
                            [dl_median_h2], [dl_iqr_h2],
                            bar_width)
            ax2.set_xlabel('System')
            ax2.set_title('h2 System')
            ax2.set_xticks([0])
            ax2.set_xticklabels(['h2'])
        else:
            # No 'h2' rows: hide the panel instead of showing an empty grid.
            ax2.set_visible(False)

        # Remind the reader which direction is better for this metric.
        direction_note = "Higher is better" if metric == 'R2' else "Lower is better"
        fig.text(0.01, 0.01, direction_note, style='italic', fontsize=10)

        fig.tight_layout()
        fig.savefig(f'results/visualizations/{metric}_all_systems_median_iqr.png',
                    dpi=300, bbox_inches='tight')
        plt.close(fig)

    # Improvement percentages: median over each system's datasets, per metric.
    improvement_data = [
        {
            'System': system,
            'Metric': metric,
            'Improvement (%)': df_summary.loc[
                df_summary['System'] == system, f'{metric}_improvement'
            ].median(),
        }
        for system in all_systems
        for metric in metrics
    ]
    imp_df = pd.DataFrame(improvement_data)

    fig, ax = plt.subplots(figsize=(14, 8))
    sns.barplot(x='System', y='Improvement (%)', hue='Metric', data=imp_df, ax=ax)

    # Label only the real bars. Iterating ax.patches can also pick up zero-size
    # legend handles, which would be labelled "0.0%".
    for container in ax.containers:
        ax.bar_label(container, fmt='%.1f%%', fontsize=8, padding=2)

    ax.axhline(y=0, color='black', linestyle='-', alpha=0.3)
    ax.set_title('Improvement Percentages Across Systems and Metrics (Based on Median Values)')
    ax.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()
    fig.savefig('results/visualizations/all_improvements_median.png', dpi=300, bbox_inches='tight')
    plt.close(fig)

In [7]:
# Build the median/IQR summary and render the per-metric and improvement figures.
# create_metric_plots reads the '<col>_median' / '<col>_iqr' columns, so it needs the
# median-based create_summary defined above, not the mean/std one defined further down.
df_summary = create_summary()
create_metric_plots(df_summary)

# Mean and standard deviation summary

In [11]:
def create_summary(df=None):
    """Summarise LR vs. DL per (System, Dataset) using means and standard deviations.

    NOTE: this definition shadows the median/IQR ``create_summary`` defined
    earlier in the notebook. Its output has '<col>_mean' / '<col>_std' columns,
    whereas ``create_metric_plots`` reads '<col>_median' / '<col>_iqr'.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Detailed results containing 'System', 'Dataset' and any of the
        'LR_<metric>' / 'DL_<metric>' columns for MAPE, MAE, RMSE and R2.
        Defaults to the notebook-level ``df_detailed``.

    Returns
    -------
    pandas.DataFrame
        One row per (System, Dataset) group. For every metric whose LR and DL
        columns are present the row holds:

        * ``<prefix>_<metric>_mean`` and ``<prefix>_<metric>_std``;
        * ``<metric>_improvement``: percentage by which DL improves on LR,
          computed from the means (NaN when the LR mean is 0);
        * ``<metric>_p_value`` / ``<metric>_significant``: one-sided Wilcoxon
          signed-rank test of "DL is better than LR" at alpha = 0.05.
    """
    if df is None:
        df = df_detailed

    metrics = ['MAPE', 'MAE', 'RMSE', 'R2']
    summary_data = []

    for (system, dataset), group in df.groupby(['System', 'Dataset']):
        row = {'System': system, 'Dataset': dataset}

        # Location/spread per model and metric.
        for prefix in ['LR', 'DL']:
            for metric in metrics:
                col = f"{prefix}_{metric}"
                if col in df.columns:
                    row[f"{col}_mean"] = group[col].mean()
                    row[f"{col}_std"] = group[col].std()

        # Error metrics: lower is better, so improvement = relative reduction vs. LR.
        for metric in ['MAPE', 'MAE', 'RMSE']:
            lr_key = f"LR_{metric}_mean"
            dl_key = f"DL_{metric}_mean"
            if lr_key not in row or dl_key not in row:
                # Column missing from the input; skip instead of raising KeyError.
                continue
            lr_mean = row[lr_key]
            dl_mean = row[dl_key]
            if lr_mean == 0:
                # A relative improvement is undefined against a zero baseline.
                row[f"{metric}_improvement"] = np.nan
            else:
                row[f"{metric}_improvement"] = ((lr_mean - dl_mean) / lr_mean) * 100

        # R2: higher is better.
        if 'LR_R2_mean' in row and 'DL_R2_mean' in row:
            lr_r2 = row['LR_R2_mean']
            dl_r2 = row['DL_R2_mean']
            if lr_r2 > 0:
                row['R2_improvement'] = ((dl_r2 - lr_r2) / abs(lr_r2)) * 100
            else:
                # With a non-positive baseline a percentage is meaningless, so the
                # absolute difference is stored instead (NOT a percentage).
                row['R2_improvement'] = dl_r2 - lr_r2

        # One-sided paired test; the argument order encodes "DL is better".
        for metric in metrics:
            lr_col = f"LR_{metric}"
            dl_col = f"DL_{metric}"
            if lr_col in df.columns and dl_col in df.columns:
                try:
                    if metric == 'R2':
                        _, p_value = stats.wilcoxon(group[dl_col], group[lr_col], alternative='greater')
                    else:
                        _, p_value = stats.wilcoxon(group[lr_col], group[dl_col], alternative='greater')
                except Exception:
                    # Best effort: the test can fail on degenerate groups (e.g. all
                    # differences zero). Unlike a bare ``except`` this does not
                    # swallow KeyboardInterrupt / SystemExit.
                    p_value = np.nan

                row[f"{metric}_p_value"] = p_value
                # NaN compares False, so failed tests are reported as not significant.
                row[f"{metric}_significant"] = bool(p_value < 0.05)

        summary_data.append(row)

    return pd.DataFrame(summary_data)

# Rebuild df_summary with the mean/std columns; this overwrites the median/IQR
# summary that was assigned to the same name earlier in the notebook.
df_summary = create_summary()

In [14]:
def create_system_summary_table(df_summary, output_path='results/tables/simple_summary.csv'):
    """Create a per-system table of mean metrics, improvements and significance counts.

    Parameters
    ----------
    df_summary : pandas.DataFrame
        Mean/std summary with a 'System' column and, for each metric in MAPE,
        MAE, RMSE and R2, the columns 'LR_<metric>_mean', 'DL_<metric>_mean',
        'LR_<metric>_std', 'DL_<metric>_std', '<metric>_improvement' and
        '<metric>_significant'.
    output_path : str, optional
        CSV file to write. The parent directory is created if it is missing.

    Returns
    -------
    pandas.DataFrame
        One row per system plus a final 'Overall' row. Per metric the columns
        are 'LR_<metric>' and 'DL_<metric>' ("mean ± std"; the Overall row
        shows the mean only), '<metric>_Improvement' (percent string),
        '<metric>_Significant' ("k/n" datasets) and '<metric>_Better' ("Yes"
        when the improvement is positive).
    """
    metrics = ['MAPE', 'MAE', 'RMSE', 'R2']

    table_data = []
    # Unrounded per-system values, kept alongside the formatted rows so that the
    # Overall row is not computed from strings already rounded to two decimals.
    numeric_rows = []

    for system in df_summary['System'].unique():
        system_data = df_summary[df_summary['System'] == system]
        total_count = len(system_data)

        row = {'System': system}
        numeric = {}

        for metric in metrics:
            # Average over this system's datasets.
            lr_mean = system_data[f'LR_{metric}_mean'].mean()
            dl_mean = system_data[f'DL_{metric}_mean'].mean()
            lr_std = system_data[f'LR_{metric}_std'].mean()
            dl_std = system_data[f'DL_{metric}_std'].mean()
            imp = system_data[f'{metric}_improvement'].mean()
            sig_count = int(system_data[f'{metric}_significant'].sum())

            row[f'LR_{metric}'] = f"{lr_mean:.2f} ± {lr_std:.2f}"
            row[f'DL_{metric}'] = f"{dl_mean:.2f} ± {dl_std:.2f}"
            row[f'{metric}_Improvement'] = f"{imp:.2f}%"
            row[f'{metric}_Significant'] = f"{sig_count}/{total_count}"
            row[f'{metric}_Better'] = "Yes" if imp > 0 else "No"

            numeric[metric] = {
                'lr': lr_mean,
                'dl': dl_mean,
                'imp': imp,
                'sig': sig_count,
                'total': total_count,
            }

        table_data.append(row)
        numeric_rows.append(numeric)

    # Overall row: unweighted mean over systems of the unrounded values.
    overall_row = {'System': 'Overall'}
    for metric in metrics:
        overall_lr = np.mean([n[metric]['lr'] for n in numeric_rows])
        overall_dl = np.mean([n[metric]['dl'] for n in numeric_rows])
        overall_imp = np.mean([n[metric]['imp'] for n in numeric_rows])
        sig_total = sum(n[metric]['sig'] for n in numeric_rows)
        count_total = sum(n[metric]['total'] for n in numeric_rows)

        overall_row[f'LR_{metric}'] = f"{overall_lr:.2f}"
        overall_row[f'DL_{metric}'] = f"{overall_dl:.2f}"
        overall_row[f'{metric}_Improvement'] = f"{overall_imp:.2f}%"
        overall_row[f'{metric}_Significant'] = f"{sig_total}/{count_total}"
        overall_row[f'{metric}_Better'] = "Yes" if overall_imp > 0 else "No"

    table_data.append(overall_row)

    summary_table = pd.DataFrame(table_data)

    # Do not depend on an earlier cell having created the output directory.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    summary_table.to_csv(output_path, index=False)

    print(f"Table created at '{output_path}'")

    return summary_table

# Call this function after creating df_summary with the mean/std create_summary:
# it reads the '<col>_mean' / '<col>_std' columns and writes the table to
# 'results/tables/simple_summary.csv'.
simple_summary = create_system_summary_table(df_summary)

Table created at 'results/tables/simple_summary.csv'


# Significance Test

In [24]:
import pandas as pd
from scipy.stats import wilcoxon
import numpy as np

df = pd.read_csv('results/data/all_results_detailed.csv')

# Define metrics and whether higher or lower values are better
metrics = {
    'MAPE': 'lower',  # Lower MAPE is better
    'MAE': 'lower',   # Lower MAE is better
    'RMSE': 'lower',  # Lower RMSE is better
    'R2': 'higher'    # Higher R2 is better
}

results_path = 'results/tables/wilcoxon_test_results.csv'


def _dl_better_wilcoxon(dl_values, lr_values, better):
    """One-sided Wilcoxon signed-rank test of the hypothesis "DL is better than LR".

    The argument order is flipped for lower-is-better metrics so that
    alternative='greater' always means "DL wins". Returns (statistic, p_value).
    """
    if better == 'lower':
        return wilcoxon(lr_values, dl_values, alternative='greater')
    return wilcoxon(dl_values, lr_values, alternative='greater')


def _better_on_average(dl_values, lr_values, better):
    """Return 'DL' or 'LR', whichever has the better mean for this metric direction."""
    dl_mean = np.mean(dl_values)
    lr_mean = np.mean(lr_values)
    if better == 'lower':
        return 'DL' if dl_mean < lr_mean else 'LR'
    return 'DL' if dl_mean > lr_mean else 'LR'


# Store results for table
results = []

print("Wilcoxon Signed-Rank Test Results:")
print("=================================")

for metric, better in metrics.items():
    dl_values = df[f'DL_{metric}']
    lr_values = df[f'LR_{metric}']

    dl_mean = np.mean(dl_values)
    lr_mean = np.mean(lr_values)
    mean_difference = np.mean(dl_values - lr_values)

    # Descriptive only: which model has the better mean.
    avg_better = _better_on_average(dl_values, lr_values, better)

    # Inferential: one-sided test that DL is better than LR.
    statistic, p_value = _dl_better_wilcoxon(dl_values, lr_values, better)
    is_significant = p_value < 0.05

    # Store the results
    results.append({
        'Metric': metric,
        'DL Mean': dl_mean,
        'LR Mean': lr_mean,
        'Difference': mean_difference,
        'Better Model': avg_better,
        'P-value': p_value,
        'Significant': is_significant
    })

    # Print results
    print(f"\nMetric: {metric} (better if {better})")
    print(f"  DL mean: {dl_mean:.4f}")
    print(f"  LR mean: {lr_mean:.4f}")
    print(f"  Difference: {mean_difference:.4f}")
    print(f"  Better on average: {avg_better}")
    print(f"  Wilcoxon statistic: {statistic}")
    print(f"  P-value: {p_value:.4f}")
    print(f"  Statistically significant: {'Yes' if is_significant else 'No'}")

    # The test is one-sided in favour of DL, so a significant result can only
    # support "DL is better". A non-significant result is not evidence of "no
    # difference", and it never supports "LR is significantly better".
    if is_significant:
        conclusion = f"DL is significantly better for {metric}"
    else:
        conclusion = f"No significant evidence that DL is better than LR for {metric}"
    print(f"  Conclusion: {conclusion}")


# Create a results table
results_df = pd.DataFrame(results)

# Save results to CSV
results_df.to_csv(results_path, index=False)
print(f"\nResults saved to {results_path}")

# System-level analysis
print("\n\nSystem-Level Analysis:")
print("======================")

systems = df['System'].unique()
for system in systems:
    system_df = df[df['System'] == system]

    print(f"\nSystem: {system}")
    for metric, better in metrics.items():
        system_dl = system_df[f'DL_{metric}']
        system_lr = system_df[f'LR_{metric}']

        # Perform the test; report a failure for this system/metric and carry on.
        try:
            statistic, p_value = _dl_better_wilcoxon(system_dl, system_lr, better)
            is_significant = p_value < 0.05
            avg_better = _better_on_average(system_dl, system_lr, better)

            # The p-value belongs to the one-sided "DL better" test, not to avg_better.
            print(f"  {metric}: {avg_better} better on average "
                  f"(DL-better p={p_value:.4f}, {'significant' if is_significant else 'not significant'})")
        except Exception as e:
            print(f"  {metric}: Error in test - {str(e)}")

Wilcoxon Signed-Rank Test Results:

Metric: MAPE (better if lower)
  DL mean: 0.2689
  LR mean: 1.7680
  Difference: -1.4991
  Better on average: DL
  Wilcoxon statistic: 423970.0
  P-value: 0.0000
  Statistically significant: Yes
  Conclusion: DL is significantly better for MAPE

Metric: MAE (better if lower)
  DL mean: 93.1182
  LR mean: 120.1139
  Difference: -26.9957
  Better on average: DL
  Wilcoxon statistic: 410963.0
  P-value: 0.0000
  Statistically significant: Yes
  Conclusion: DL is significantly better for MAE

Metric: RMSE (better if lower)
  DL mean: 117.6779
  LR mean: 142.5734
  Difference: -24.8956
  Better on average: DL
  Wilcoxon statistic: 344675.0
  P-value: 0.0000
  Statistically significant: Yes
  Conclusion: DL is significantly better for RMSE

Metric: R2 (better if higher)
  DL mean: 0.7184
  LR mean: 0.5865
  Difference: 0.1319
  Better on average: DL
  Wilcoxon statistic: 360442.0
  P-value: 0.0000
  Statistically significant: Yes
  Conclusion: DL is signif