# Correlation analysis

In [None]:
import pickle
import scipy
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from scipy.stats import pearsonr
from scipy.stats import ttest_ind
from scipy.stats import ttest_rel

import warnings
warnings.filterwarnings("ignore")

import sys

## Load data

In [None]:
# Name of the data split to analyse; it is interpolated into the input pickle
# paths and into the output directory of the correlation tables.
dataset = 'test'
# True for every split other than 'train'. The previous expression yielded
# False on both branches, so the test frames could never be selected.
test = dataset != 'train'

Loading training data (the files read are the ones for the split named in `dataset`)

In [None]:
# Read the ERN and CRN result frames for the split named by ``dataset``.
# Order matters downstream: index 0 is the ERN frame, index 1 the CRN frame.
train_datasets = [
    pd.read_pickle(f"../data/models_pickles_new_dass/ern_cov_fal_models_{dataset}.pkl"),
    pd.read_pickle(f"../data/models_pickles_new_dass/crn_cov_fal2_models_{dataset}.pkl"),
]

# Keep the individual frames reachable under their own names as well.
results_ern_lat_demo_df, results_crn_lat_demo_df = train_datasets

Loading testing data

In [None]:
# Read the ERN and CRN result frames of the test split.
results_ern_lat_demo_test_df, results_crn_lat_demo_test_df = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        f"../data/models_pickles_new_dass/ern_cov_fal_models_test.pkl",
        f"../data/models_pickles_new_dass/crn_cov_fal2_models_test.pkl",
    )
)

# Missing values in the 'e_LT_F2_C' column are replaced by that column's mean.
results_crn_lat_demo_test_df = results_crn_lat_demo_test_df.fillna(
    {'e_LT_F2_C': results_crn_lat_demo_test_df['e_LT_F2_C'].mean()}
)

# Order matters downstream: index 0 is the ERN frame, index 1 the CRN frame.
test_datasets = [results_ern_lat_demo_test_df, results_crn_lat_demo_test_df]

### Create delta ERN - CRN measure

In [None]:
# Select the frames to analyse: the test frames when ``test`` is set,
# otherwise the frames loaded for the ``dataset`` split.
datasets = test_datasets if test else train_datasets

ern_values = datasets[0]['e_ERN'].to_numpy()
crn_values = datasets[1]['e_CRN'].to_numpy()

# Element-wise ERN minus CRN. The subtraction is positional, so it assumes
# both frames list the same rows in the same order -- TODO confirm.
ern_crn_diff_values = ern_values - crn_values
ern_crn_diff_df = pd.DataFrame(
    ern_crn_diff_values.reshape(-1, 1),
    # Raw string: same column name as before, without the invalid "\C" escape.
    columns=[r'ERN\CRN delta'],
    # Reuse the ERN frame's index so the column-wise concat below lines rows up
    # even when that index is not a default 0..n-1 range.
    index=datasets[0].index,
)

# Put the delta column in front of the ERN frame's columns; the delta replaces
# the raw 'e_ERN' column.
ern_delta_data_df = pd.concat([ern_crn_diff_df, datasets[0]], axis=1).drop(columns='e_ERN')

# NOTE: ``datasets`` is the same list object as train_datasets/test_datasets,
# so this append grows that list too; re-running this cell appends again.
datasets.append(ern_delta_data_df)

Drop unnecessary columns

In [None]:
# Rebuild the list with the 'Sex', 'WASH' and 'NEU' columns removed from every
# frame; ``drop`` returns new frames, the originals are left as they were.
datasets = [
    frame.drop(labels=['Sex', 'WASH', 'NEU'], axis='columns')
    for frame in datasets
]

## Create correlation tables

In [None]:
# Short codes used as column labels in the correlation tables.
# Codes ending in "Q" are numbered 1..29 in the order the names are listed
# below; the remaining columns get explicit "C" / "E" codes.
mapping = {
    **{
        scale_name: f"{position}Q"
        for position, scale_name in enumerate(
            [
                "RRQ",
                "DASS-21 Stress",
                "DASS-21 Dep",
                "STAI-T",
                "STAI-S Diff",
                "BIS",
                "OBSESS",
                "HOARD",
                "ORD",
                "CHECK",
                "WBSI",
                "IUS-P",
                "IUS-I",
                "SES",
                "BAS_D",
                "BAS_PRZY",
                "BAS_NAG",
                "INDEC_F",
                "PUN",
                "REW",
                "HARM",
                "T-CTR",
                "OT",
                "OB_PERF",
                "PS",
                "G_SE",
                "AMB",
                "PRED",
                "STAND",
            ],
            start=1,
        )
    },
    "Age": "1C",
    "Handness": "2C",
    "e_ERN": "1E",
    # 'e_LT_F' and 'e_LT_F2_C' share the code "4C".
    "e_LT_F": "4C",
    "performance": "3C",
    "e_CRN": "2E",
    "e_LT_F2_C": "4C",
}

In [None]:
def corr_with_p(df, column_mapping=None):
    """Build a Pearson correlation table annotated with significance stars.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are correlated pairwise.
    column_mapping : dict, optional
        Old-name -> new-name mapping applied to the columns before the
        correlations are computed. Defaults to the module-level ``mapping``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        First the annotated table: each cell is the correlation rounded to two
        decimals, as text, followed by one ``*`` for every threshold among
        0.1, 0.05 and 0.01 that the p-value does not exceed. Second the
        unrounded correlation matrix.
    """
    if column_mapping is None:
        column_mapping = mapping

    df = df.rename(columns=column_mapping)
    rho = df.corr()
    # With a callable ``method`` pandas puts 1.0 on the diagonal instead of
    # calling the function; subtracting the identity turns that into p = 0,
    # so diagonal cells always carry three stars.
    pval = df.corr(method=lambda x, y: pearsonr(x, y)[1]) - np.eye(*rho.shape)

    def _stars(p_value):
        # NaN compares False against every threshold and yields no stars.
        return ''.join('*' for threshold in (0.01, 0.05, 0.1) if p_value <= threshold)

    # Per-column Series.map instead of the deprecated DataFrame.applymap; it
    # behaves the same on old and new pandas versions.
    p = pval.apply(lambda column: column.map(_stars))
    return rho.round(2).astype(str) + p, rho

In [None]:
# Annotated correlation table of every frame, in the order of ``datasets``.
corr_p_tables = []

for index, this_dataset in enumerate(datasets):
    print(index)
    corr_p, corr = corr_with_p(this_dataset)
    # Collect the table; without this append the list above stays empty.
    corr_p_tables.append(corr_p)
    # One CSV per frame, written under the directory named after ``dataset``.
    corr_p.to_csv(f'../data/correlation_matrixes/{dataset}/correlation_matrix_{index}.csv')
    display(corr_p)