## Correlate online ratings with offline annotations

Zizhuang Miao

This script is used to calculate the correlations between online ratings and offline annotations of social interactions, and online ratings of ToM and offline ratings of ToM demands. Significance of the correlations will be tested using non-parametric methods (phase randomization) because of the autocorrelation of time series data.

In [6]:
import pandas as pd
import numpy as np
from os.path import join
from nltools.stats import phase_randomize

### Social interactions

In [18]:
dataDir = 'C:\\'
outputDir = 'C:\\'

runDict = {1: [7, 8], 2: [5, 6], 3: [3, 4], 4: [1, 2]}
nBoot = 10000

allData = pd.DataFrame()

corr_by_run = pd.DataFrame(columns=['run', 'r', 'p'])

for run in runDict:
    runData = pd.DataFrame()
    corr_by_run.loc[run-1, 'run'] = run

    # get the data for the run
    for n in runDict[run]:
        data = pd.read_csv(join(dataDir, f'narrative{n}_median.csv'))
        data['narrative'] = n
        if n%2 == 0:
            data['time'] += runData['time'].max() + 0.23
        runData = pd.concat([runData, data], axis=0)
        runData = runData.reset_index(drop=True)

    # estimate a null distribution the correlation between labels and medians by phase randomization
    nullDist = np.zeros(nBoot)
    corrData = runData[['labels', 'median']].dropna()
    for i in range(nBoot):
        nullDist[i] = np.corrcoef(phase_randomize(corrData['median']), corrData['labels'])[0, 1]
    
    pd.DataFrame({'correlation': nullDist}).to_csv(join(outputDir, 'nullDist', f'phasRandom_social_conti_v_cons_median_run{run}.csv'), index=False)

    # calculate the correlation between labels and medians
    r = np.corrcoef(corrData['median'], corrData['labels'])[0, 1]
    p = np.sum(nullDist > r) * 2 / nBoot
    corr_by_run.loc[run-1, 'r'] = r
    corr_by_run.loc[run-1, 'p'] = p

    allData = pd.concat([allData, runData], axis=0)

# for all data across runs
allData = allData.reset_index(drop=True)
nullDist = np.zeros(nBoot)
corrData = allData[['labels', 'median']].dropna()
for i in range(nBoot):
    nullDist[i] = np.corrcoef(phase_randomize(corrData['median']), corrData['labels'])[0, 1]
pd.DataFrame({'correlation': nullDist}).to_csv(join(outputDir, 'nullDist', f'phasRandom_social_conti_v_cons_median_all.csv'), index=False)

r = np.corrcoef(corrData['median'], corrData['labels'])[0, 1]
p = np.sum(nullDist > r) * 2 / nBoot
corr_by_run.loc[4, 'run'] = 'all'
corr_by_run.loc[4, 'r'] = r
corr_by_run.loc[4, 'p'] = p

print(corr_by_run)
corr_by_run.to_csv(join(outputDir, 'corr_social_conti_v_cons_median.csv'), index=False)

   run         r       p
0    1  0.481458  0.0008
1    2  0.489046  0.0006
2    3   0.63388     0.0
3    4  0.587763     0.0
4  all  0.578997     0.0


### ToM

In [16]:
dataDir = 'C:\\'
outputDir = 'C:\\'

modalityDict = {'audio': [5, 6, 7, 8], 'text': [1, 2, 3, 4]}
nBoot = 10000

allData = pd.DataFrame()

corr_by_run = pd.DataFrame(columns=['modality', 'r', 'p'])

for i, mod in enumerate(modalityDict):
    runData = pd.DataFrame()
    corr_by_run.loc[i, 'modality'] = mod

    # get the data for the run
    for n in modalityDict[mod]:
        data = pd.read_csv(join(dataDir, f'narrative{n}_median.csv'))
        data['narrative'] = n
        if n%4 != 1:
            data['time'] += runData['time'].max() + 0.23
        runData = pd.concat([runData, data], axis=0)
        runData = runData.reset_index(drop=True)

    # estimate a null distribution the correlation between labels and medians by phase randomization
    nullDist = np.zeros(nBoot)
    corrData = runData[['labels', 'median']].dropna()
    for j in range(nBoot):
        nullDist[j] = np.corrcoef(phase_randomize(corrData['median']), corrData['labels'])[0, 1]
    
    pd.DataFrame({'correlation': nullDist}).to_csv(join(outputDir, 'nullDist', f'phasRandom_tom_conti_v_cons_median_{mod}.csv'), index=False)

    # calculate the correlation between labels and medians
    r = np.corrcoef(corrData['median'], corrData['labels'])[0, 1]
    p = np.sum(nullDist > r) * 2 / nBoot
    corr_by_run.loc[i, 'r'] = r
    corr_by_run.loc[i, 'p'] = p

    allData = pd.concat([allData, runData], axis=0)

# for all data across runs
allData = allData.reset_index(drop=True)
nullDist = np.zeros(nBoot)
corrData = allData[['labels', 'median']].dropna()
for j in range(nBoot):
    nullDist[j] = np.corrcoef(phase_randomize(corrData['median']), corrData['labels'])[0, 1]
pd.DataFrame({'correlation': nullDist}).to_csv(join(outputDir, 'nullDist', f'phasRandom_tom_conti_v_cons_median_all.csv'), index=False)

r = np.corrcoef(corrData['median'], corrData['labels'])[0, 1]
p = np.sum(nullDist > r) * 2 / nBoot
corr_by_run.loc[2, 'modality'] = 'all'
corr_by_run.loc[2, 'r'] = r
corr_by_run.loc[2, 'p'] = p

print(corr_by_run)
corr_by_run.to_csv(join(outputDir, 'corr_tom_conti_v_cons_median_modality.csv'), index=False)


  modality         r       p
0    audio  0.191432  0.0462
1     text  0.140896  0.1092
2      all  0.152662  0.0214
