# Correlation analysis for configuration: voter dynamics in small-world network with three interacting agents
Date: 26.11.2025

In this notebook I do the correlation analyisis to look for what internal variable correlates the best with the chi-values.

Steps:
1. Normalize the data
2. Compute Spearman, Mutual Information, and Random Forest Importance
3. Analyze micro and macro variables
4. Produce comparision charts

In [8]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr, pearsonr
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import gaussian_kde


import sys
import os

In [9]:
def kde_mi(x, y, gridsize=100, bw_method=None):
    """
    KDE-based mutual information matching ISOKANN.jl implementation
    """
    # Create grid
    xg = np.linspace(x.min(), x.max(), gridsize)
    yg = np.linspace(y.min(), y.max(), gridsize)
    dx = xg[1] - xg[0]
    dy = yg[1] - yg[0]
    
    # Estimate joint density
    xy = np.vstack([x, y])
    kde_joint = gaussian_kde(xy, bw_method=bw_method)
    xg_mesh, yg_mesh = np.meshgrid(xg, yg)
    positions = np.vstack([xg_mesh.ravel(), yg_mesh.ravel()])
    pxy = kde_joint(positions).reshape(gridsize, gridsize).T
    
    # Estimate marginals
    px = np.sum(pxy, axis=1, keepdims=True) * dy
    py = np.sum(pxy, axis=0, keepdims=True) * dx
    
    # Compute MI (avoid log(0))
    px_py = px @ py
    with np.errstate(divide='ignore', invalid='ignore'):
        integrand = pxy * np.log(pxy / px_py)
        integrand[~np.isfinite(integrand)] = 0
    
    return np.sum(integrand) * dx * dy

In [11]:
#1 Load data
# Get the absolute path of the parent directory (project root)
project_root = os.path.abspath(os.path.join(os.getcwd(), '..','..','..'))

# Add it to the Python path
if project_root not in sys.path:
    sys.path.append(project_root)

sims_path = os.path.join('ABa-KiTo', 'voter_dynamics_SW', 'data', 'simulations','')
data_dir =  os.path.join(project_root, sims_path)

#Load data formatted for ISOKANN.jl from ABM simulations
# Read data
states_data = np.load(data_dir + '2025-11-26-dataCorrelation_VD_SW_3InteractingAgents.npz')
xs = states_data['xs'] #xs.shape (n_dim, n_samples)


# # # --- Load chi function ---
chivals_path = os.path.join('ABa-KiTo', 'voter_dynamics_SW','data',  'chi_vals', '')
chivals_dir = os.path.join(project_root, chivals_path)
chi0 = np.load(chivals_dir + 'chi_values_VD_SW.npz')

print("data loaded!")

data loaded!


In [12]:
# 2 Normalize the microvariables
n_micro = 200
n_macro = 12

xs_micro = xs[:n_micro,:]
xs_macro = xs[-n_macro:, :]

scaler = StandardScaler()
xs_micro_scaled = scaler.fit_transform(xs_micro)

In [13]:
# Combine micro + macro into single DataFrame for analysis
all_features = np.vstack((xs_micro_scaled, xs_macro))
#ll_features = xs
#labels = [f"agent_{i}" for i in range(n_micro)] + [f"macro_{i}" for i in range(n_macro)]
labels = []
for i in range(100):
    n = i 
    labels.append(f"agent_{i} PB")
    labels.append(f"agent_{i} PG")

labels += [f"macro_{i}" for i in range(n_macro)]

df = pd.DataFrame(all_features.T, columns=labels)
df["chi"] = chi0

In [14]:
df.columns

Index(['agent_0 PB', 'agent_0 PG', 'agent_1 PB', 'agent_1 PG', 'agent_2 PB',
       'agent_2 PG', 'agent_3 PB', 'agent_3 PG', 'agent_4 PB', 'agent_4 PG',
       ...
       'macro_3', 'macro_4', 'macro_5', 'macro_6', 'macro_7', 'macro_8',
       'macro_9', 'macro_10', 'macro_11', 'chi'],
      dtype='object', length=213)

In [15]:
# =====================================================
# === 1. Spearman Correlation =========================
# =====================================================
print("Computing Spearman correlations...")
spearman_corr = []
for var in labels:
    rho, _ = spearmanr(df[var], df["chi"])
    spearman_corr.append(rho)

# =====================================================
# === 2. Mutual Information ===========================
# =====================================================
print("Computing Mutual Information...")
mi = mutual_info_regression(df[labels], df["chi"], random_state=42)

# =====================================================
# === 3.  KDE-based Mutual Information  ===============
# =====================================================
print("\nComputing KDE-based Mutual Information...")
feature_columns = [col for col in df.columns if col != "chi"]
mi_kde = [kde_mi(df["chi"].values, df[col].values) for col in feature_columns]


# # =====================================================
# # === 3. Random Forest Feature Importance =============
# # =====================================================
# print("Training Random Forest for feature importance...")
# rf = RandomForestRegressor(
#     n_estimators=100,
#     max_depth=10,
#     random_state=42,
#     n_jobs=-1
# )
# rf.fit(df[labels], df["chi"])
# rf_importance = rf.feature_importances_

# =====================================================
# === 4. Summarize Results ============================
# =====================================================
results = pd.DataFrame({
    "variable": labels,
    "spearman": spearman_corr,
    "mutual_info": mi,
    "mi_kde": mi_kde
})

results["abs_spearman"] = results["spearman"].abs()
results.sort_values("abs_spearman", ascending=False, inplace=True)

# Print top 10
print("\nTop 10 χ-correlated variables:")
print(results.head(10))

Computing Spearman correlations...
Computing Mutual Information...

Computing KDE-based Mutual Information...

Top 10 χ-correlated variables:
        variable  spearman  mutual_info    mi_kde  abs_spearman
211     macro_11  0.596632     0.390633  0.279110      0.596632
208      macro_8 -0.596632     0.390357  0.279110      0.596632
113  agent_56 PG  0.566618     0.330814  0.235432      0.566618
127  agent_63 PG  0.565345     0.333854  0.234814      0.565345
85   agent_42 PG  0.557994     0.322085  0.225314      0.557994
3     agent_1 PG  0.557212     0.325033  0.227033      0.557212
167  agent_83 PG  0.556097     0.325605  0.225476      0.556097
5     agent_2 PG  0.555728     0.324056  0.225090      0.555728
129  agent_64 PG  0.555566     0.323636  0.225969      0.555566
117  agent_58 PG  0.554620     0.321735  0.224784      0.554620


In [16]:
results["mi_kde_clipped"] = results["mi_kde"].clip(lower=0)

macro_vars = results[results["variable"].str.contains("macro")]


print(macro_vars[['variable', 'spearman', 'mi_kde_clipped']].style.format(precision=3).hide(axis='index').to_latex()
        )

\begin{tabular}{lrr}
variable & spearman & mi_kde_clipped \\
macro_11 & 0.597 & 0.279 \\
macro_8 & -0.597 & 0.279 \\
macro_9 & 0.515 & 0.297 \\
macro_10 & 0.471 & 0.241 \\
macro_6 & 0.087 & 0.000 \\
macro_0 & 0.081 & 0.000 \\
macro_4 & 0.068 & 0.000 \\
macro_2 & 0.037 & 0.000 \\
macro_1 & 0.033 & 0.000 \\
macro_5 & 0.025 & 0.000 \\
macro_3 & 0.023 & 0.000 \\
macro_7 & 0.023 & 0.000 \\
\end{tabular}



In [None]:
# from scipy.stats import gaussian_kde
# import numpy as np

# def kde_mi(x, y, gridsize=100, bw_method=None):
#     """
#     KDE-based mutual information matching ISOKANN.jl implementation
#     """
#     # Create grid
#     xg = np.linspace(x.min(), x.max(), gridsize)
#     yg = np.linspace(y.min(), y.max(), gridsize)
#     dx = xg[1] - xg[0]
#     dy = yg[1] - yg[0]
    
#     # Estimate joint density
#     xy = np.vstack([x, y])
#     kde_joint = gaussian_kde(xy, bw_method=bw_method)
#     xg_mesh, yg_mesh = np.meshgrid(xg, yg)
#     positions = np.vstack([xg_mesh.ravel(), yg_mesh.ravel()])
#     pxy = kde_joint(positions).reshape(gridsize, gridsize).T
    
#     # Estimate marginals
#     px = np.sum(pxy, axis=1, keepdims=True) * dy
#     py = np.sum(pxy, axis=0, keepdims=True) * dx
    
#     # Compute MI (avoid log(0))
#     px_py = px @ py
#     with np.errstate(divide='ignore', invalid='ignore'):
#         integrand = pxy * np.log(pxy / px_py)
#         integrand[~np.isfinite(integrand)] = 0
    
#     return np.sum(integrand) * dx * dy

# # Check what columns you actually have
# print("DataFrame columns:", df.columns.tolist())
# print("\nLabels list:", labels[:5])  # Show first 5 labels

# # Use the CORRECT column names from your DataFrame
# # These should match what you created earlier
# feature_columns = [col for col in df.columns if col != "chi"]

# # Compute KDE-based MI for each feature
# print("\nComputing KDE-based Mutual Information...")
# mi_kde = [kde_mi(df["chi"].values, df[col].values) for col in feature_columns]

# # Add to your results
# results_kde = pd.DataFrame({
#     "variable": feature_columns,
#     "mi_kde": mi_kde
# })

# print("\nTop 10 by KDE MI:")
# results_kde.sort_values("mi_kde", ascending=False, inplace=True)
# print(results_kde.head(10))

# # Optional: merge with your existing results
# # results = results.merge(results_kde, on="variable", how="left")

In [None]:
# #Save results so analysis don#t have to be done everytime
# results.to_csv('results_correlation_validation.csv', index=False)

In [None]:
# #read results
# results_kde = pd.read_csv('results_kde.csv')
# results_kde.reset_index(drop=True, inplace=True)