# Reliability Analysis Notebook

### 1. Process manual codes

In [None]:
from glob import glob
from emocodes import CodeTimeSeries

import os

# Directory holding the raw manual-code exports for every episode.
raw_dir = '/Users/catcamacho/Box/CCP/EmoCodes_project/reliability_data/raw/'

# Episode to process; must be one of the keys of movie_settings below.
movie = 'MLP'

# Per-episode settings: glob pattern for the raw code files, the episode video
# passed to the processor, and the directory the processed output is saved to.
movie_settings = {
    'AHKJ': {
        'raw_pattern': raw_dir + 'AHKJ*_subjective_char*',
        'video': '/Users/catcamacho/Box/CCP/EmoCodes_project/reliability_data/episodes/AHKJ_S1E2.mp4',
        'preproc_dir': '/Users/catcamacho/Box/CCP/EmoCodes_project/reliability_data/processed/AHKJ_S1E2/',
    },
    'MLP': {
        'raw_pattern': raw_dir + 'MLP*_subjective_char*',
        'video': '/Users/catcamacho/Box/CCP/EmoCodes_project/reliability_data/episodes/MLP_S8E3_20.mp4',
        'preproc_dir': '/Users/catcamacho/Box/CCP/EmoCodes_project/reliability_data/processed/MLP_S8E3/',
    },
}

# Fail here with a clear message instead of hitting a NameError on raw_files
# further down when `movie` does not match any configured episode.
if movie not in movie_settings:
    raise ValueError('Unknown movie {0!r}; expected one of {1}'.format(movie, sorted(movie_settings)))

settings = movie_settings[movie]
raw_files = glob(settings['raw_pattern'])
video = settings['video']
preproc_dir = settings['preproc_dir']

cts = CodeTimeSeries(sampling_rate=10)

name_marker = 'codes_'
for file in raw_files:
    # Parse the file name only, so directory components can never match the markers.
    file_name = os.path.basename(file)

    # Rater initials: the two characters right before the last four characters
    # of the name (presumably a 4-character extension such as ".csv").
    initials = file_name[-6:-4]

    # The character name sits between 'codes_' and the trailing '_<initials>'.
    marker_pos = file_name.find(name_marker)
    name_end = file_name.rfind('_' + initials)
    if marker_pos == -1 or name_end < marker_pos + len(name_marker):
        raise ValueError('Cannot parse character name and rater initials from {0!r}'.format(file_name))
    name_start = marker_pos + len(name_marker)
    char_name = file_name[name_start:name_end].lower().capitalize()

    print(initials)
    print(char_name)
    cts.proc_codes_file(codes_file=file, video_file=video, save_file_name=preproc_dir+'subjective_character_{1}_timeseries_{0}'.format(initials, char_name))

### 2. Pull in processed data and combine into 1 dataframe

In [None]:
# set the processed-data directory and list the processed files to analyze
from glob import glob
# Root folder of the processed time series; each episode has its own subfolder.
preproc_dir = '/Users/catcamacho/Box/CCP/EmoCodes_project/reliability_data/processed/'

# Episode subfolder to read from.
episode = 'AHKJ_S1E2'

# Collect every file in the episode folder whose name contains "Julien",
# in sorted order so the load order is reproducible.
files = glob(preproc_dir + '{0}/*Julien*'.format(episode))
files.sort()

In [None]:
# pull in data and combine into 1 dataframe with a rater column
import pandas as pd

def _load_rater_frame(path):
    """Read one processed ratings file and return it averaged to 1-second bins.

    Rows where 'on_screen' is 0 have every column except 'time' set to 0
    before averaging. The returned frame is indexed 0..n-1 ('onset_seconds')
    and carries a 'rater' column taken from a fixed position in the file path.
    """
    ratings = pd.read_csv(path, index_col=None)

    if 'on_screen' in ratings.columns:
        code_columns = ratings.columns.tolist()
        code_columns.remove('time')
        off_screen = ratings['on_screen'] == 0
        ratings.loc[off_screen, code_columns] = 0

    # 'time' is interpreted as milliseconds so the frame can be resampled by time.
    ratings.index = pd.to_datetime(ratings['time'], unit='ms')
    ratings.index.name = 'timeseries'

    per_second = ratings.resample('1s').mean().round(0)
    # The rater label is sliced from a fixed offset from the end of the path.
    per_second['rater'] = path[-22:-20]
    per_second.index = range(len(per_second))
    per_second.index.name = 'onset_seconds'
    return per_second


# One frame per file, stacked into a single long table with a rater column.
dfs = [_load_rater_frame(path) for path in files]

data = pd.concat(dfs)

### 3. Compute ICCs across the entire video

In [None]:
# compute ICCs
import pingouin as pg

def _consistency_label(icc_value):
    """Return the qualitative label for an ICC value, or None if it is NaN.

    Cut-offs as coded in this notebook: < 0.50 'poor', < 0.75 'moderate',
    < 0.90 'good', otherwise 'excellent'.
    """
    if icc_value < 0.50:
        return 'poor'
    if icc_value < 0.75:
        return 'moderate'
    if icc_value < 0.90:
        return 'good'
    if icc_value >= 0.90:
        return 'excellent'
    # Every comparison is False for NaN; leave the cell unset in that case.
    return None


# Every column except the rater label and the time stamp is a rated variable.
# (Uses `data`, the combined frame built earlier; `data1hz` is not defined
# anywhere in this notebook.)
variables = data.columns.to_list()
variables.remove('rater')
variables.remove('time')
icc_df = pd.DataFrame(columns=['instance_level_ICC', 'instance_level_consistency','overall_mean_ICC'])

for x in variables:
    icc = pg.intraclass_corr(data=data, targets='time',raters='rater',ratings=x, nan_policy='omit').round(3)

    # NOTE(review): rows 1 and 4 are selected by position in pingouin's result
    # table; presumably the single-rater and average-rater ICC (ICC2 / ICC2k in
    # pingouin's documented row order) - confirm against the installed version.
    instance_icc = icc.loc[1, 'ICC']
    icc_df.loc[x, 'instance_level_ICC'] = instance_icc
    icc_df.loc[x, 'overall_mean_ICC'] = icc.loc[4, 'ICC']

    # evaluate item-level ICCs
    label = _consistency_label(instance_icc)
    if label is not None:
        icc_df.loc[x, 'instance_level_consistency'] = label

# NOTE(review): the file name says "mlp" while the files loaded earlier in this
# notebook are from AHKJ_S1E2 - confirm the intended output name.
icc_df.to_csv('iccs_mlp_subjectivechar.csv')

### 4. Compute ICCs for 3 minute windows

In [None]:
# compute ICCs within sliding windows across the video
import pingouin as pg
import numpy as np

# Every column except the rater label and the time stamp is a rated variable.
variables = data.columns.to_list()
variables.remove('rater')
variables.remove('time')
wind_icc_df = pd.DataFrame(columns=['start', 'end', 'variable', 'instance_level_ICC', 'overall_mean_ICC'])

# Window length in index units (the index of `data` counts 1-second bins).
window_size = 180
# Despite its name, `overlap` is the step between consecutive window starts,
# so neighbouring windows share window_size - overlap index units.
overlap = 20
start = 0
end = start + window_size

# Use `data` (the combined frame); `data1hz` is not defined in this notebook.
last_onset = data.index.max()

while end < last_onset:
    # Both bounds are inclusive, so each window spans window_size + 1 index values.
    temp = data[(data.index >= start) & (data.index <= end)]
    for x in variables:
        row_key = '{0}_{1}_{2}'.format(x, start, end)
        wind_icc_df.loc[row_key, 'start'] = start
        wind_icc_df.loc[row_key, 'end'] = end
        wind_icc_df.loc[row_key, 'variable'] = x

        icc = pg.intraclass_corr(data=temp, targets='time',raters='rater',ratings=x, nan_policy='omit').round(3)
        # Same result-table rows as the whole-video analysis (positions 1 and 4).
        wind_icc_df.loc[row_key, 'instance_level_ICC'] = icc.loc[1, 'ICC']
        wind_icc_df.loc[row_key, 'overall_mean_ICC'] = icc.loc[4, 'ICC']

    start = start + overlap
    end = start + window_size

# Save the full table first; the masking below only affects the summaries.
wind_icc_df.to_csv('iccs_windows_ahkj_subjectivechar.csv')

# Blank out whole rows whose instance-level ICC is <= 0 (this also blanks
# 'variable', so those rows drop out of the groupby below).
wind_icc_df[wind_icc_df['instance_level_ICC']<=0] = np.nan

# Keep the summaries as DataFrames: to_csv(path) returns None, so chaining it
# onto the assignment would leave these names bound to None.
wind_icc_df_min = wind_icc_df.groupby('variable').min()
wind_icc_df_min.to_csv('iccs_windows_ahkj_subjectivechar_min.csv')
wind_icc_df_max = wind_icc_df.groupby('variable').max()
wind_icc_df_max.to_csv('iccs_windows_ahkj_subjectivechar_max.csv')

### 5. Average codes across raters and plot

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(context='talk', style='white')

# Average the ratings across raters for each 1-second bin.
# numeric_only=True skips the string 'rater' column; without it, pandas >= 2.0
# raises a TypeError when asked to average strings.
data_mean = data.groupby('onset_seconds').mean(numeric_only=True)
data_mean = data_mean.drop(columns='time')
data_mean.to_csv('AHKJ_subjective_ratings_Julien.csv')

# Columns to show, one subplot per column.
plot_columns = ['on_screen', 'char_intensity', 'char_valence_negative', 'char_valence_positive',
                'c_fear_body', 'c_fear_face', 'c_fear_verbal']
data_mean[plot_columns].plot(subplots=True, figsize=(12,12), xlim=(0,data.index[-1]))
plt.savefig('AHKJ_subjective_all.svg')