# Correlation analysis
Date: 27.10.2025

In this notebook I do the correlation analyisis to look for what internal variable correlates the best with the chi-values.

Steps:
1. Normalize the data
2. Compute Spearman, Mutual Information, and Random Forest Importance
3. Analyze micro and macro variables
4. Produce comparision charts

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr, pearsonr
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import gaussian_kde


import sys
import os

In [2]:
def kde_mi(x, y, gridsize=100, bw_method=None):
    """
    KDE-based mutual information matching ISOKANN.jl implementation
    """
    # Create grid
    xg = np.linspace(x.min(), x.max(), gridsize)
    yg = np.linspace(y.min(), y.max(), gridsize)
    dx = xg[1] - xg[0]
    dy = yg[1] - yg[0]
    
    # Estimate joint density
    xy = np.vstack([x, y])
    kde_joint = gaussian_kde(xy, bw_method=bw_method)
    xg_mesh, yg_mesh = np.meshgrid(xg, yg)
    positions = np.vstack([xg_mesh.ravel(), yg_mesh.ravel()])
    pxy = kde_joint(positions).reshape(gridsize, gridsize).T
    
    # Estimate marginals
    px = np.sum(pxy, axis=1, keepdims=True) * dy
    py = np.sum(pxy, axis=0, keepdims=True) * dx
    
    # Compute MI (avoid log(0))
    px_py = px @ py
    with np.errstate(divide='ignore', invalid='ignore'):
        integrand = pxy * np.log(pxy / px_py)
        integrand[~np.isfinite(integrand)] = 0
    
    return np.sum(integrand) * dx * dy

In [3]:
#1 Load data
# Get the absolute path of the parent directory (project root)
project_root = os.path.abspath(os.path.join(os.getcwd(), "..",'..'))

# Add it to the Python path
if project_root not in sys.path:
    sys.path.append(project_root)

#Load data formatted for ISOKANN.jl from ABM simulations
# Create the path and ensure the directory exists for saving the data
data_path = os.path.join('data', 'processed', 'isokann', '001_results')
data_dir =  os.path.join(project_root, data_path,'')
# Read data
states_data = np.load(data_dir + '20225-11-05-Simulation_data_for_validation.npz')
xs = states_data['xs'] #xs.shape (n_dim, n_samples)

# # # --- Load chi function ---
chivals_path = os.path.join('data', 'processed', 'isokann', '001_results', 'chi_vals', '')
chivals_dir = os.path.join(project_root, chivals_path)
chi = np.load(data_dir + '/chi_vals/chi_values_rand_init_capitals.npz')

print("data loaded!")

data loaded!


In [4]:
# 2 Normalize the microvariables
n_micro = 200
n_macro = 12

xs_micro = xs[:n_micro,:]
xs_macro = xs[-n_macro:, :]

scaler = StandardScaler()
xs_micro_scaled = scaler.fit_transform(xs_micro)

In [None]:
# 2 Normalize the microvariables
n_micro = 200
n_macro = 12

xs_micro = xs[:n_micro,:]
xs_macro = xs[-n_macro:, :]

scaler = StandardScaler()
xs_micro_scaled = scaler.fit_transform(xs_micro)


# Combine micro + macro into single DataFrame for analysis
all_features = np.vstack((xs_micro_scaled, xs_macro))
#ll_features = xs
#labels = [f"agent_{i}" for i in range(n_micro)] + [f"macro_{i}" for i in range(n_macro)]
labels = []
for i in range(100):
    n = i 
    labels.append(f"agent_{i} PB")
    labels.append(f"agent_{i} PG")

labels += [f"macro_{i}" for i in range(n_macro)]

df = pd.DataFrame(all_features.T, columns=labels)
df["chi"] = chi

In [6]:
df.columns

Index(['agent_0 PB', 'agent_0 PG', 'agent_1 PB', 'agent_1 PG', 'agent_2 PB',
       'agent_2 PG', 'agent_3 PB', 'agent_3 PG', 'agent_4 PB', 'agent_4 PG',
       ...
       'macro_3', 'macro_4', 'macro_5', 'macro_6', 'macro_7', 'macro_8',
       'macro_9', 'macro_10', 'macro_11', 'chi'],
      dtype='object', length=213)

In [7]:
# =====================================================
# === 1. Spearman Correlation =========================
# =====================================================
print("Computing Spearman correlations...")
spearman_corr = []
for var in labels:
    rho, _ = spearmanr(df[var], df["chi"])
    spearman_corr.append(rho)

# =====================================================
# === 2. Mutual Information ===========================
# =====================================================
print("Computing Mutual Information...")
mi = mutual_info_regression(df[labels], df["chi"], random_state=42)

# =====================================================
# === 3.  KDE-based Mutual Information  ===============
# =====================================================
print("\nComputing KDE-based Mutual Information...")
feature_columns = [col for col in df.columns if col != "chi"]
mi_kde = [kde_mi(df["chi"].values, df[col].values) for col in feature_columns]


# # =====================================================
# # === 3. Random Forest Feature Importance =============
# # =====================================================
# print("Training Random Forest for feature importance...")
# rf = RandomForestRegressor(
#     n_estimators=100,
#     max_depth=10,
#     random_state=42,
#     n_jobs=-1
# )
# rf.fit(df[labels], df["chi"])
# rf_importance = rf.feature_importances_

# =====================================================
# === 4. Summarize Results ============================
# =====================================================
results = pd.DataFrame({
    "variable": labels,
    "spearman": spearman_corr,
    "mutual_info": mi,
    "mi_kde": mi_kde
})

results["abs_spearman"] = results["spearman"].abs()
results.sort_values("abs_spearman", ascending=False, inplace=True)

# Print top 10
print("\nTop 10 χ-correlated variables:")
print(results.head(10))

Computing Spearman correlations...
Computing Mutual Information...

Computing KDE-based Mutual Information...

Top 10 χ-correlated variables:
        variable  spearman  mutual_info    mi_kde  abs_spearman
211     macro_11  0.763439     0.540429  0.469620      0.763439
208      macro_8 -0.763439     0.540362  0.469620      0.763439
209      macro_9  0.678735     0.414105  0.394057      0.678735
79   agent_39 PG  0.671946     0.424694  0.344978      0.671946
197  agent_98 PG  0.671078     0.424295  0.344772      0.671078
195  agent_97 PG  0.670927     0.423278  0.344799      0.670927
43   agent_21 PG  0.669838     0.419656  0.340368      0.669838
173  agent_86 PG  0.669766     0.422844  0.341790      0.669766
165  agent_82 PG  0.669727     0.421471  0.345656      0.669727
181  agent_90 PG  0.669105     0.421418  0.342945      0.669105


In [29]:
results["mi_kde_clipped"] = results["mi_kde"].clip(lower=0)

macro_vars = results[results["variable"].str.contains("macro")]


print(macro_vars[['variable', 'spearman', 'mi_kde_clipped']].style.format(precision=3).hide(axis='index').to_latex()
        )

\begin{tabular}{lrr}
variable & spearman & mi_kde_clipped \\
macro_11 & 0.763 & 0.470 \\
macro_8 & -0.763 & 0.470 \\
macro_9 & 0.679 & 0.394 \\
macro_10 & 0.632 & 0.352 \\
macro_6 & 0.136 & 0.000 \\
macro_0 & 0.128 & 0.000 \\
macro_4 & 0.111 & 0.000 \\
macro_2 & 0.073 & 0.000 \\
macro_1 & 0.068 & 0.000 \\
macro_5 & 0.059 & 0.000 \\
macro_3 & 0.054 & 0.000 \\
macro_7 & 0.053 & 0.000 \\
\end{tabular}



In [None]:
# #Save results so analysis don#t have to be done everytime
# results.to_csv('results_correlation_validation.csv', index=False)

In [None]:
# #read results
# results_kde = pd.read_csv('results_kde.csv')
# results_kde.reset_index(drop=True, inplace=True)