# Correlation analysis

In [None]:
import pickle
import scipy
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from scipy.stats import pearsonr
from scipy.stats import ttest_ind
from scipy.stats import ttest_rel

import warnings
warnings.filterwarnings("ignore")

import sys

## Load data

In [None]:
# Name of the data split this run targets; it is interpolated into the pickle
# path when the results table is loaded.
dataset = 'train'
# True only when a non-training split is selected. The previous expression
# (`False if ... else False`) evaluated to False for every value of `dataset`.
test = dataset != 'train'

Loading training data

In [None]:
# Read the pickled results table for the split named by `dataset`.
results_ern_crn_lat_demo_df = pd.read_pickle(
    f"../data/models_pickles_new_dass/ern_crn_cov_fal_models_{dataset}_id_clean.pkl"
)

Loading testing data

In [None]:
# Read the pickled results table for the test split. The path is fixed, so a
# plain string literal is used (the former f-string had no placeholders).
results_ern_crn_lat_demo_df_test = pd.read_pickle(
    "../data/models_pickles_new_dass/ern_crn_cov_fal_models_test_id_clean.pkl"
)

Drop unnecessary columns

In [None]:
# Gather both loaded tables in one list (index 0: the `dataset` split,
# index 1: the test split) so later steps can process them in turn.
datasets = [
    results_ern_crn_lat_demo_df,
    results_ern_crn_lat_demo_df_test,
]

In [None]:
# Remove the 'Sex' and 'id' columns from every table. `drop` returns a new
# DataFrame, so the list is rebuilt rather than modified in place.
datasets = [frame.drop(columns=['Sex', 'id']) for frame in datasets]

## Create correlation tables

In [None]:
# Maps the raw column names of the results tables to the short labels used
# when the frames are renamed for the correlation tables and the heatmap.
# Each key appears exactly once (a repeated 'performance' entry was removed;
# the resulting dict is unchanged).
control_mapping = {
    "RRQ": "RRQ",
    "DASS-21 Stress": "STR",
    "DASS-21 Dep": "DEP",
    "STAI-T": "ANX",
    "STAI-S Diff": 'AFL',
    "BIS": "BIS",
    "OBSESS": "OBS",
    "HOARD": "HRD",
    "ORD": "ORD",
    "CHECK": "CHK",
    "WBSI": "TSU",
    "IUS-P": "PIU",
    "IUS-I": "IIU",
    "SES": "SES",
    'BAS_D': "DRV",
    'BAS_PRZY': "FUN",
    'BAS_NAG': "RER",
    'INDEC_F': "IND",
    'PUN': "PUN",
    'REW': "REW",
    'HARM': "HRM",
    'T-CTR': "TCR",
    "OT": "THR",
    'OB_PERF': "PER",
    'PS': "STN",
    'G_SE': "GLT",
    'AMB': "AMB",
    'PRED': "NPR",
    'STAND': "STD",
    "Age": "AGE",
    "Handness": "HND",
    'e_ERN': "ERN",
    'e_LT_F': "LT-E",
    'performance': "PRF",
    'e_CRN': "CRN",
    'e_LT_F2_C': "LT-C",
    'Sex': 'SEX',
}

In [None]:
def corr_with_p(df, mapping):
    """Build a correlation table annotated with significance stars.

    Args:
        df: DataFrame whose columns are correlated pairwise.
        mapping: Dict passed to ``DataFrame.rename(columns=...)`` before the
            correlations are computed.

    Returns:
        A tuple ``(annotated, rho)``. ``rho`` is the matrix returned by
        ``DataFrame.corr()``. ``annotated`` contains the same coefficients
        rounded to two decimals and converted to strings, with one ``*``
        appended for each of the thresholds 0.1, 0.05 and 0.01 that the
        ``pearsonr`` p-value does not exceed (so up to ``***``).
    """
    df = df.rename(columns=mapping)
    rho = df.corr()
    # With a callable method pandas puts 1.0 on the diagonal regardless of
    # what the callable returns; subtracting the identity turns those entries
    # into 0, so the diagonal is always rendered with the full three stars.
    pval = df.corr(method=lambda x, y: pearsonr(x, y)[1]) - np.eye(*rho.shape)

    def _stars(p_value):
        # NaN compares False against every threshold and yields no stars.
        return ''.join('*' for threshold in (0.01, 0.05, 0.1) if p_value <= threshold)

    # Column-wise Series.map replaces DataFrame.applymap, which is deprecated
    # since pandas 2.1; the element-wise result is the same.
    stars = pval.apply(lambda column: column.map(_stars))
    return rho.round(2).astype(str) + stars, rho

In [None]:
# Annotated correlation tables, one per entry of `datasets`, in the same order.
corr_p_tables = []

for index, this_dataset in enumerate(datasets):
    print(index)
    corr_p, corr = corr_with_p(this_dataset, control_mapping)
    # Keep the table for later use; the list above was created for this but
    # was previously never filled.
    corr_p_tables.append(corr_p)
    # One CSV per dataset, numbered by its position in `datasets`.
    corr_p.to_csv(f'../new_results/correlation_matrixes/correlation_matrix_{index}.csv')
    display(corr_p)

### Plot results

In [None]:
# First entry of `datasets`, relabelled with the short variable codes from
# `control_mapping`; this is the frame the heatmap is drawn from.
df = datasets[0].rename(control_mapping, axis='columns')

In [None]:
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from scipy.stats import kendalltau
import textwrap

# Figure geometry: matplotlib sizes are given in inches, `cm` converts from
# centimetres.
cm = 1/2.54
dpi = 500

sns.set_style("white")
plt.rcParams['figure.dpi'] = dpi
plt.rcParams['figure.figsize'] = [15*cm, 15*cm]
plt.rcParams["font.size"] = 4

# Correlation matrix of the numeric columns (DataFrame.corr defaults to
# Pearson).
corr = df.corr(numeric_only=True)

# Hide the strict upper triangle of the heatmap; the diagonal stays visible.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
np.fill_diagonal(mask, False)

fig, ax = plt.subplots(figsize=(15*cm, 15*cm))
fix = fig  # alias kept for code that still refers to the original (misspelled) name

# Generate heatmap
heatmap = sns.heatmap(
    corr,
    annot=True,
    annot_kws={"fontsize": 3.5},
    fmt='.2f',
    linewidths=0.5,
    cmap='RdBu',
    cbar=False,
    mask=mask,
    ax=ax,
)

# Pairwise p-values, filled for i < j only.
# The columns are taken from `corr` so that positions match even when
# `numeric_only=True` dropped non-numeric columns from `df`.
numeric_df = df[corr.columns]
n_vars = corr.shape[0]
p_values = np.full((n_vars, n_vars), np.nan)
for i in range(n_vars):
    for j in range(i + 1, n_vars):
        x = numeric_df.iloc[:, i]
        y = numeric_df.iloc[:, j]
        # Use only the rows where both variables are present.
        valid = ~np.logical_or(np.isnan(x), np.isnan(y))
        # pearsonr needs at least two observations.
        if np.sum(valid) > 1:
            # Pearson p-value, matching the Pearson coefficients shown in the
            # heatmap (switch both together if another method is wanted).
            p_values[i, j] = pearsonr(x[valid], y[valid])[1]

# Create a dataframe object for p_values
p_values = pd.DataFrame(p_values, columns=corr.columns, index=corr.index)

# Only the strict upper triangle of `p_values` holds data.
mask_pvalues = np.triu(np.ones_like(p_values), k=1)

# Extremes of the correlation matrix, used to choose a readable text colour.
max_corr = np.max(corr.max())
min_corr = np.min(corr.min())

# Annotate every visible cell with its p-value, plus asterisks for
# significance: '**' for p <= 0.01, '*' for p <= 0.05.
for i in range(p_values.shape[0]):
    for j in range(p_values.shape[1]):
        if not mask_pvalues[i, j]:
            continue
        p_value = p_values.iloc[i, j]
        if np.isnan(p_value):
            continue

        correlation_value = corr.iloc[i, j]
        # White text on cells whose coefficient is within 0.4 of either
        # extreme of the matrix, black text elsewhere.
        near_extreme = (correlation_value >= (max_corr - 0.4)
                        or correlation_value <= (min_corr + 0.4))
        text_color = 'white' if near_extreme else 'black'

        if p_value <= 0.01:
            stars = '**'
        elif p_value <= 0.05:
            stars = '*'
        else:
            stars = ''

        # x = i and y = j with i < j addresses the cell in row j, column i,
        # i.e. the mirrored position in the visible lower triangle.
        ax.text(i + 0.5, j + 0.8, f'[{p_value:.2f}]{stars}',
                horizontalalignment='center',
                verticalalignment='center',
                fontsize=3,
                color=text_color)

# Customize x-axis labels
x_labels = [textwrap.fill(label.get_text(), 4) for label in ax.get_xticklabels()]
ax.set_xticklabels(x_labels, rotation=90, ha="center")

# Customize y-axis labels
y_labels = [textwrap.fill(label.get_text(), 4) for label in ax.get_yticklabels()]
ax.set_yticklabels(y_labels, rotation=0, ha="right")

# Save before showing: with some backends the figure is torn down once
# plt.show() returns, which would leave an empty image on disk.
fig.savefig("../new_results/correlation_matrixes/correlation_matrix_train.png", bbox_inches='tight')

# Display the plot
plt.show()