# 02 - Correlation Analysis

Compute Pearson, Spearman, and Kendall correlations between metrics and human simplicity scores.
All correlations are reported as **absolute values**, since we care about the strength of the relationship rather than its direction.

In [None]:
import sys
from pathlib import Path

# Treat the parent of the current working directory as the project root.
# NOTE(review): this assumes the notebook is launched from a direct
# subdirectory of the project (e.g. notebooks/) — confirm.
project_root = Path.cwd().parent
# Put the project root first on sys.path so the `src.*` imports below resolve.
sys.path.insert(0, str(project_root))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from src.data_loader import load_all_datasets, get_metric_names
from src.statistics import compute_all_correlations, compute_metric_correlations

# Apply the white-grid style sheet to every figure drawn in this notebook.
plt.style.use('seaborn-v0_8-whitegrid')

# Output locations; the figures directory (and any missing parents) is
# created up front so later savefig calls have somewhere to write.
results_dir = project_root.joinpath('results')
figures_dir = results_dir.joinpath('figures')
figures_dir.mkdir(exist_ok=True, parents=True)

print('Setup complete!')

## 1. Load Datasets

In [None]:
# Load every dataset through the project's loader. Later cells iterate
# `datasets.items()` and read the 'human_scores', 'metric_scores' and
# 'n_samples' entries of each value.
datasets = load_all_datasets()

## 2. Compute All Correlations (Pearson, Spearman, Kendall)

In [None]:
# Per-dataset correlation tables (Pearson, Spearman, Kendall), keyed by
# dataset name. Each table is whatever compute_all_correlations returns.
correlation_results = {}

for dataset_name, data in datasets.items():
    print(f'\n{dataset_name}:')

    human = data['human_scores']
    # Every entry except the one named 'human' is treated as a metric.
    metrics = {
        name: scores
        for name, scores in data['metric_scores'].items()
        if name != 'human'
    }

    correlation_results[dataset_name] = compute_all_correlations(metrics, human)
    corr_df = correlation_results[dataset_name]

    print(f'  Samples: {data["n_samples"]}')
    print(f'  Metrics: {len(corr_df)}')
    if len(corr_df) == 0:
        continue

    # NOTE(review): the first row is reported as "best"; this presumes
    # compute_all_correlations returns rows sorted by avg_abs, descending
    # — confirm against src.statistics.
    best = corr_df.iloc[0]
    print(f'  Best (avg |r|): {best["metric"]} ({best["avg_abs"]:.3f})')

## 3. Correlation Tables by Dataset

Showing absolute correlations (|r|) for Pearson, Spearman, and Kendall's tau.

In [None]:
# Show the top rows of the absolute-correlation table for a few key datasets.
# Maps the raw result columns to the headers used for display.
column_labels = {
    'metric': 'Metric',
    'P_abs': '|Pearson|',
    'S_abs': '|Spearman|',
    'K_abs': '|Kendall|',
    'avg_abs': 'Average',
    'n': 'N',
}

for dataset_name in ('ARTS3k', 'SDA', 'ST-sent', 'D-Wiki'):
    # Datasets that were not loaded are skipped silently.
    if dataset_name not in correlation_results:
        continue

    print(f'\n{dataset_name} - Absolute Correlations:')
    print('=' * 80)

    # Work on a copy so the stored results stay untouched.
    df = correlation_results[dataset_name].copy()

    display_df = df[list(column_labels)].rename(columns=column_labels)
    display_df = display_df.round(3)

    # Only the first 15 rows are shown.
    display(display_df.head(15))

## 4. Cross-Dataset Comparison Matrix

Average absolute correlation across Pearson, Spearman, and Kendall.

In [None]:
# Build comparison matrix (metrics x datasets) using average absolute correlation.

# Union of metric names over all datasets (reported by the summary cell).
all_metrics = set()
for df in correlation_results.values():
    all_metrics.update(df['metric'].tolist())

# One {metric: avg_abs} mapping per dataset; pandas aligns them on the metric
# name, leaving NaN where a metric is missing from a dataset.
comparison_data = {}
for dataset_name, df in correlation_results.items():
    metric_to_corr = dict(zip(df['metric'], df['avg_abs']))
    comparison_data[dataset_name] = metric_to_corr

comparison_df = pd.DataFrame(comparison_data)

# Reorder columns: known datasets first, in the preferred presentation order.
preferred_order = [
    'ARTS94', 'ARTS300', 'ARTS3k',
    'LR-ARTS94', 'LR-ARTS300', 'LR-ARTS3k',
    'SDA', 'ST-sent', 'ST-para', 'D-Wiki',
]
col_order = [c for c in preferred_order if c in comparison_df.columns]
# Datasets missing from the preferred list are appended (in load order)
# rather than silently dropped, so the matrix, the heatmap and the ranking
# cover the same datasets that the summary reports as analyzed.
col_order += [c for c in comparison_df.columns if c not in col_order]
comparison_df = comparison_df[col_order]

# Sort by average. The row mean skips NaN, so each metric is averaged over
# the datasets in which it actually appears.
comparison_df['Average'] = comparison_df.mean(axis=1)
comparison_df = comparison_df.sort_values('Average', ascending=False)

print('Average Absolute Correlation |r| (Metric vs Human):')
display(comparison_df.round(3))

## 5. Correlation Heatmap

In [None]:
# Heatmap of absolute correlations (metrics as rows, datasets as columns).
# Only dataset columns belong in the plot. 'Count' is added to comparison_df
# by the ranking cell further down, so it must be excluded too; otherwise
# re-running this cell after the ranking cell would plot it as a dataset.
# errors='ignore' keeps the cell working whether or not 'Count' exists yet.
plot_df = comparison_df.drop(columns=['Average', 'Count'], errors='ignore')

fig, ax = plt.subplots(figsize=(12, 10))

sns.heatmap(
    plot_df,
    annot=True,
    fmt='.2f',
    cmap='YlGn',
    # Absolute correlations lie in [0, 1]; fixing the colour scale keeps
    # figures comparable across runs.
    vmin=0,
    vmax=1,
    ax=ax,
    linewidths=0.5,
    cbar_kws={'label': 'Absolute Correlation |r|'},
)

ax.set_title('Average Absolute Correlation with Human Scores\n(Pearson, Spearman, Kendall)', fontsize=14)
ax.set_xlabel('Dataset')
ax.set_ylabel('Metric')

plt.tight_layout()
# Save a raster and a vector copy before showing the figure.
plt.savefig(figures_dir / 'correlation_heatmap.png', dpi=150, bbox_inches='tight')
plt.savefig(figures_dir / 'correlation_heatmap.pdf', bbox_inches='tight')
plt.show()
print('Saved: correlation_heatmap.png/pdf')

## 6. Correlation Method Comparison

Compare Pearson vs Spearman vs Kendall for each metric.

In [None]:
# For ARTS3k, compare the three methods
dataset_name = 'ARTS3k'

if dataset_name not in correlation_results:
    # Same availability check the per-dataset table cell performs; avoids a
    # bare KeyError when this dataset was not loaded.
    print(f'{dataset_name} not available - skipping correlation method comparison.')
else:
    df = correlation_results[dataset_name]

    print(f'{dataset_name} - Correlation Method Comparison:')
    print('=' * 70)

    method_df = df[['metric', 'P_abs', 'S_abs', 'K_abs']].copy()
    method_df.columns = ['Metric', 'Pearson', 'Spearman', 'Kendall']

    # Per-metric spread between the strongest and weakest of the three
    # absolute coefficients.
    method_cols = ['Pearson', 'Spearman', 'Kendall']
    method_df['Max'] = method_df[method_cols].max(axis=1)
    method_df['Min'] = method_df[method_cols].min(axis=1)
    method_df['Range'] = method_df['Max'] - method_df['Min']

    display(method_df.round(3))

    print(f'\nAverage range across methods: {method_df["Range"].mean():.3f}')
    print('(Lower range = more consistent across correlation types)')

## 7. Ranking by Average Absolute Correlation

In [None]:
# Final ranking
# Count, per metric, the datasets that actually have a correlation value.
# Both helper columns are excluded explicitly so that re-running this cell
# does not count a previously added 'Count' column as if it were a dataset
# (which would inflate every count by one).
dataset_cols = comparison_df.drop(columns=['Average', 'Count'], errors='ignore')
comparison_df['Count'] = dataset_cols.notna().sum(axis=1)
ranking = comparison_df[['Average', 'Count']].sort_values('Average', ascending=False)

print('Average Absolute Correlation Across All Datasets:')
print('=' * 50)
display(ranking.round(3))

## 8. Summary

In [None]:
# Headline numbers for the whole analysis.
print('CORRELATION ANALYSIS SUMMARY')
print('=' * 60)
print(f'Datasets analyzed: {len(correlation_results)}')
print(f'Unique metrics: {len(all_metrics)}')
print('\nCorrelation methods: Pearson, Spearman, Kendall')
print('All values reported as absolute correlations |r|')

# First row of each per-dataset table.
# NOTE(review): reported as "best" on the presumption that the tables are
# sorted by avg_abs, descending — confirm against src.statistics.
print('\nBest Metric per Dataset (by avg |r|):')
for dataset_name, df in correlation_results.items():
    if len(df) == 0:
        continue
    best = df.iloc[0]
    print(f'  {dataset_name}: {best["metric"]} (|r|={best["avg_abs"]:.3f})')

# First five rows of the cross-dataset ranking, numbered from 1.
print('\nTop 5 by Average Absolute Correlation:')
top_rows = ranking.head(5)
top_entries = zip(top_rows.index, top_rows['Average'], top_rows['Count'])
for i, (metric, average, count) in enumerate(top_entries, 1):
    print(f'  {i}. {metric}: |r|={average:.3f} (in {int(count)} datasets)')