In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import seaborn as sns
from valid_funct import *
import importlib
from collections import Counter
from pathlib import Path

# load data

In [None]:
# Locate the project folders relative to the notebook's working directory:
# the CSV files are read from a `data` folder that sits in the parent of the
# current working directory.
path = os.getcwd()
parent_dir = Path().resolve().parent
data_dir = os.path.join(parent_dir, 'data')

# Every column is read as text, and only '' and 'NA' are treated as missing
# (pandas' default NA markers are switched off via keep_default_na=False).
survey_csv = os.path.join(data_dir, 'Wave1-16_paper_release.csv')
data = pd.read_csv(
    survey_csv,
    encoding='ISO-8859-1',
    dtype=str,
    keep_default_na=False,
    na_values=['', 'NA'],
    low_memory=False,
)

# init output data

In [None]:
# Output frame: one row per row of `data`, identified by (CVDID, wave).
# Each quality check below is left-merged onto it on those two keys.
valid_data = data.loc[:, ['CVDID', 'wave']].copy()


# data quality checks

Attention Check Questions

In [None]:
# Run the attention-check helper once per wave and stack the per-wave results.
# The frames are collected first and concatenated in a single call: growing a
# frame with pd.concat inside the loop re-copies all previous rows on every
# iteration, and concatenating onto an empty DataFrame can change dtypes
# (and is warned about by recent pandas versions).
n_w = list(data['wave'].unique())
attDf = pd.concat([attent_check(data, wave) for wave in n_w],
                  ignore_index=True, sort=False)
valid_data = valid_data.merge(attDf, on = ['CVDID', 'wave'], how = 'left')

# sanity check of the merged frame against the raw data (helper defined outside this notebook)
check_for_errors(data, valid_data, 'attention questions')
del attDf


Wave completed 

In [None]:
# `w_completed` is a helper function that is not defined in this notebook
# (presumably it comes from the `valid_funct` star import -- TODO confirm).
# Its result is stored under a different name: binding the result to
# `w_completed` and deleting it afterwards removed the function itself from
# the namespace, so this cell raised a NameError when it was run a second time.
completed_flags = w_completed(data)
valid_data = valid_data.merge(completed_flags, on = ['CVDID', 'wave'], how = 'left')

# sanity check of the merged frame against the raw data
check_for_errors(data, valid_data, 'wave completed')
del completed_flags


Free response questions: at least one noun or one verb

In [None]:
# Free-text check: the helper's per-(CVDID, wave) result is attached to the
# validation frame; the intermediate frame is released afterwards.
free_text_counts = nounVerb_count(data)
valid_data = valid_data.merge(free_text_counts, how='left', on=['CVDID', 'wave'])

check_for_errors(data, valid_data, 'noun verb count')
del free_text_counts


IQR completion duration

In [None]:
# add completion time
# V3 and V4 are parsed as timestamps; their difference (V4 - V3) is used as
# the completion duration.
delta_t = pd.to_datetime(data['V4']) - pd.to_datetime(data['V3'])
# Store seconds, as the column name promises. The previous `/60` silently
# stored minutes in `duration_in_s`.
# NOTE(review): this assumes interq_analysis only compares values relative to
# the column's own quartiles (scale-invariant), so the flags do not change -- confirm.
data['duration_in_s'] = delta_t.dt.total_seconds()
iqr_time = interq_analysis(data[['CVDID','wave','duration_in_s']],3)
iqr_time = iqr_time.rename(columns = {'duration_in_s': 'duration_outlier'})
# only the value 3.0 returned by interq_analysis is treated as an outlier
iqr_time = iqr_time.replace({3.0: True, 2.0: False, 1.0: False, 0.0: False})
valid_data = valid_data.merge(iqr_time, how = 'left', on = ['CVDID','wave'])

IQR of mean response string length 

In [None]:
long_string_df = data[['CVDID', 'wave']].copy()

# Maps a questionnaire label to the item columns that belong to it.
# extract_long_string is called once per entry; the resulting columns are
# selected below by their 'meanLongString' prefix (e.g. 'meanLongString_PANA').
# NOTE(review): construct_vars is defined outside this notebook; it presumably
# builds the column names prefix+1..n (+suffix), leaving out `skip` -- confirm.
# NOTE(review): the EPII_work block uses suffixes _1/_2/_4/_5 where all other
# EPII blocks use _1.._4 -- left unchanged, verify against the codebook.
string_col_dict = {'PANA': data.loc[:,'PANA1_1':'PANA1_20'].columns,
                'STAI_state': construct_vars('AnxS1_', 20),
                'disgust_COVID': construct_vars('DISG1.2_', 22, skip=[13, 16]),
                'fear_COVID': construct_vars('Fear2_', 7),
                'EMSB': construct_vars('EMSB1_', 9),
                'EMSC': construct_vars('EMSC1_', 9),
                'NIHE':  construct_vars('NIHE1_', 8),
                'NIHL': construct_vars('NIHL1_', 5),
                'BDI': construct_vars('BDI', 21),
                'CD_RISK': construct_vars('RISC1_', 10),
                'consensBsl': construct_vars('cons1_', 35),
                'consensCvd':construct_vars('cons2_', 20),
                'EPII_infectHist_you':construct_vars('EPII10_', 8, '_1', skip=[7]),
                'EPII_infectHist_inHome':construct_vars('EPII10_', 8, '_2', skip=[7]),
                'EPII_infectHist_no':construct_vars('EPII10_', 8, '_3', skip=[7]),
                'EPII_infectHist_NA': construct_vars('EPII10_', 8, '_4', skip=[7]),
                'EPII_posChange_you': construct_vars('EPII11_', 18, '_1'),
                'EPII_posChange_inHome': construct_vars('EPII11_', 18, '_2'),
                'EPII_posChange_no': construct_vars('EPII11_', 18, '_3'),
                'EPII_posChange_NA': construct_vars('EPII11_', 18, '_4'),
                'EPII_work_you': construct_vars('EPII2_', 11, '_1'),
                'EPII_work_inHome': construct_vars('EPII2_', 11, '_2'),
                'EPII_work_no': construct_vars('EPII2_', 11, '_4'),
                'EPII_work_NA': construct_vars('EPII2_', 11, '_5'),
                'EPII_home_you': construct_vars('EPII4_', 13, '_1'),
                'EPII_home_inHome': construct_vars('EPII4_', 13, '_2'),
                'EPII_home_no': construct_vars('EPII4_', 13, '_3'),
                'EPII_home_NA': construct_vars('EPII4_', 13, '_4'),
                'EPII_socAct_you': construct_vars('EPII5_', 10, '_1'),
                'EPII_socAct_inHome':construct_vars('EPII5_', 10, '_2'),
                'EPII_socAct_no': construct_vars('EPII5_', 10, '_3'),
                'EPII_socAct_NA': construct_vars('EPII5_', 10, '_4'),
                'EPII_econ_you': construct_vars('EPII6_', 5, '_1'),
                'EPII_econ_inHome': construct_vars('EPII6_', 5, '_2'),
                'EPII_econ_no': construct_vars('EPII6_', 5, '_3'),
                'EPII_econ_NA':  construct_vars('EPII6_', 5, '_4'),
                'EPII_emo_you': construct_vars('EPII7_', 8, '_1', skip=[1,2]),
                'EPII_emo_inHome': construct_vars('EPII7_', 8, '_2', skip=[1,2]),
                'EPII_emo_no': construct_vars('EPII7_', 8, '_3'),
                # was a second 'EPII_emo_no' key, which overwrote the '_3' entry above
                'EPII_emo_NA': construct_vars('EPII7_', 8, '_4'),
                'EPII_phys_you': construct_vars('EPII8_', 8, '_1'),
                'EPII_phys_inHome': construct_vars('EPII8_', 8, '_2'),
                'EPII_phys_no': construct_vars('EPII8_', 8, '_3'),
                'EPII_phys_NA': construct_vars('EPII8_', 8, '_4'),
                'EPII_dist_you': construct_vars('EPII9_', 8, '_1', skip=[8]),
                'EPII_dist_inHome': construct_vars('EPII9_', 8, '_2', skip=[8]),
                'EPII_dist_no': construct_vars('EPII9_', 8, '_3'),
                'EPII_dist_NA': construct_vars('EPII9_', 8, '_4'),
                'trustPolit': construct_vars('RW6_1_', 11,skip=[7]),
                'CovidImpact_inHouse': construct_vars('RW21_1_', 6),
                'CovidImpact_inHouse_v2': construct_vars('RW21v2_1_', 6),
                'CovidImpact_hasCvd' : construct_vars('RW21_2_', 6) ,
                'CovidImpact_hasCvd_v2': construct_vars('RW21v2_2_', 6),
                'CovidImpact_posTest':  construct_vars('RW21_3_', 6),
                'CovidImpact_posTest_v2': construct_vars('RW21v2_3_', 6),
                'CovidImpact_hospital':  construct_vars('RW21_4_', 6),
                'CovidImpact_hospital_v2':  construct_vars('RW21v2_4_', 6),
                'CovidImpact_deceased':  construct_vars('RW21_5_', 6),
                'CovidImpact_deceased_v2': construct_vars('RW21v2_5_', 6),
                'CovidImpact_NA':  construct_vars('RW21_6_', 6),
                'CovidImpact_NA_v2': construct_vars('RW21v2_6_', 6),
                'approve': construct_vars('RW25_', 8),
                'CvdPrevent': construct_vars('RW26_', 18),
                'protest2': construct_vars('GFPS2_', 11, skip=[6]),
                'protest2_v2': construct_vars('GFPS2v2_', 10, skip=[6]),
                'protest4': construct_vars('GFPS4_', 7, skip=[3,5]),
                'protest4_v2': construct_vars('GFPS4v2_', 7, skip=[3,5]),
                'protest5': construct_vars('GFPS5_', 7, skip=[6]),
                'protest5_v2': construct_vars('GFPS5v2_', 7, skip=[6]),
                'protest6_happened': construct_vars('GFPS6_', 12, '_1', skip=[1,2,3,5,6,8]),
                'protest6_happened_v2': construct_vars('GFPS6v2_', 12, '_1', skip=[1,2,3,5,6,8]),
                'protest6_witnessed': construct_vars('GFPS6_', 12, '_3', skip=[1,2,3,5,6,8]),
                'protest6_witnessed_v2': construct_vars('GFPS6v2_', 12, '_3', skip=[1,2,3,5,6,8]),
                'protest6_learned': construct_vars('GFPS6_', 12, '_4', skip=[1,2,3,5,6,8]),
                'protest6_learned_v2': construct_vars('GFPS6v2_', 12, '_4', skip=[1,2,3,5,6,8]),
                'protest6_no': construct_vars('GFPS6_', 12, '_5', skip=[1,2,3,5,6,8]),
                'protest6_no_v2': construct_vars('GFPS6v2_', 12, '_5', skip=[1,2,3,5,6,8]),
                'protest6_noDisclose': construct_vars('GFPS6_', 12, '_6', skip=[1,2,3,5,6,8]),
                'protest6_noDisclose_v2': construct_vars('GFPS6v2_', 12, '_6', skip=[1,2,3,5,6,8]),
                'protest6_NA': construct_vars('GFPS6_', 12, '_7', skip=[1,2,3,5,6,8]),
                'protest6_NA_v2': construct_vars('GFPS6v2_', 12, '_7', skip=[1,2,3,5,6,8]),
                'protest9': construct_vars('GFPS9_', 7),
                'protest10': construct_vars('GFPS10_', 8),
                'protest11':construct_vars('GFPS11_', 7),
                'protest12':construct_vars('GFPS12_', 8),
                'protest12_v2':construct_vars('GFPS12v2_', 8),
                'protest13': construct_vars('GFPS13_', 7),
                'protest14': construct_vars('GFPS14_', 8),
                'protest15': construct_vars('GFPS15_', 7),
                'protest16': construct_vars('GFPS16_', 8),
                'protest17': construct_vars('GFPS17_', 12, skip=[4,5,7]),
                'protest17_v2': construct_vars('GFPS17v2_', 12, skip=[4,5,7]),
                'protest33': construct_vars('GFPS33_', 5),
                'EES': construct_vars('EES1_', 31),
                'Dscr': construct_vars('Dscr1_', 9),
                'FWI': construct_vars('FWI1_', 15),
                'hum': construct_vars('Hum1_', 10),
                'PC': construct_vars('PC5.2_', 5),
                'PP​K'.replace('\u200b', ''): construct_vars('CvPP1_', 12),
                'ResSe': construct_vars('ReSe1_', 22),
                'SPS_city': construct_vars('City_', 12),
                'SPS_state': construct_vars('State_', 12),
                'SPS_fed': construct_vars('Fed_', 12),
                'STAI_trait':  construct_vars('AnxT_', 20),
                'VSA': construct_vars('VSA1_', 6),
                'NEO1': construct_vars('NEO1_', 10),
                'NEO2': construct_vars('NEO2_', 10),
                'NEO3': construct_vars('NEO3_', 10),
                'NEO4': construct_vars('NEO4_', 10),
                'NEO5': construct_vars('NEO5_', 10),
                'NEO6': construct_vars('NEO6_', 10),
                # LEC: plain labels use the 'LEC1_' columns, '_May' labels use 'LEC1_May_'.
                # The witness/learned entries had the two prefixes swapped.
                'LEC_me': construct_vars('LEC1_', 17, '_1'),
                'LEC_me_May': construct_vars('LEC1_May_', 17, '_1'),
                'LEC_witness': construct_vars('LEC1_', 17, '_2'),
                'LEC_witness_May': construct_vars('LEC1_May_', 17, '_2'),
                'LEC_learned': construct_vars('LEC1_', 17, '_3'),
                'LEC_learned_May': construct_vars('LEC1_May_', 17, '_3'),
                'LEC_job': construct_vars('LEC1_', 17, '_4'),
                'LEC_job_May': construct_vars('LEC1_May_', 17, '_4'),
                'LEC_notSure': construct_vars('LEC1_', 17, '_5'),
                'LEC_notSure_May': construct_vars('LEC1_May_', 17, '_5'),
                'LEC_NA': construct_vars('LEC1_', 17, '_6'),
                'LEC_NA_May': construct_vars('LEC1_May_', 17, '_6'),
                'Cnsp': data.loc[:,'Cnsp1_1':'Cnsp4_8'].columns}

# one extract_long_string call per questionnaire; each call returns the updated frame
for name, input_vars in string_col_dict.items():
    long_string_df = extract_long_string(long_string_df, data, name, input_vars)

# all questionnaires with >=4 questions
all_rs_questionnaires = long_string_df.columns[long_string_df.columns.str.startswith('meanLongString')]

iqr_rs_mean_cols = ['CVDID', 'wave'] + list(all_rs_questionnaires)
iqr_rs_mean = long_string_df[iqr_rs_mean_cols]
iqr_rs_mean = interq_analysis(iqr_rs_mean, 3)
# only the value 3.0 returned by interq_analysis is treated as an outlier
iqr_rs_mean = iqr_rs_mean.replace({3.0: True, 2.0: False, 1.0: False, 0.0: False})

# core questionnaires
core_rs_questionnaires = ['meanLongString_PANA', 'meanLongString_STAI_state', 'meanLongString_STAI_trait',
                          'meanLongString_NEO1', 'meanLongString_NEO2', 'meanLongString_NEO3', 'meanLongString_NEO4',
                          'meanLongString_NEO5', 'meanLongString_NEO6', 'meanLongString_VSA', 'meanLongString_EES']

# A row is flagged when at least half of its non-missing questionnaire flags are True
# (rows whose flags are all missing get a NaN mean and therefore False).
iqr_rs_mean['string_outlier_core'] = iqr_rs_mean[core_rs_questionnaires].mean(axis=1, skipna = True)>=0.5
iqr_rs_mean['string_outlier_all'] = iqr_rs_mean[all_rs_questionnaires].mean(axis=1, skipna = True)>=0.5

valid_data = valid_data.merge(iqr_rs_mean[['CVDID','wave','string_outlier_core', 'string_outlier_all']], how = 'left', on = ['CVDID','wave'])

check_for_errors(data, valid_data, 'questionnaire string outlier')
del iqr_rs_mean


IQR of within-questionnaire consistency: absolute mean differences between pos/neg items and between regular and reverse-scored items

In [None]:
def _item_names(prefix, numbers):
    """Return the column names `prefix + str(n)` for every n in `numbers`."""
    return [prefix + str(number) for number in numbers]


coherence_df = data[['CVDID', 'wave']].copy()

# Item groups that are compared against each other within a questionnaire.
PANAS_pos_idx = _item_names('PANA1_', (1, 3, 5, 9, 10, 12, 14, 16, 17, 19))
PANAS_neg_idx = _item_names('PANA1_', (2, 4, 6, 7, 8, 11, 13, 15, 18, 20))

PSS_idx = _item_names('PSS', range(1, 11))

STAI_rev = _item_names('AnxS1_', (1, 2, 5, 8, 10, 11, 15, 16, 19, 20))
STAI_reg = _item_names('AnxS1_', (3, 4, 6, 7, 9, 12, 13, 14, 17, 18))

EES_empFeelExp_reg = _item_names('EES1_', (3, 9, 11, 12, 13, 14, 15, 18, 22, 23, 26, 30))
EES_empFeelExp_rev = _item_names('EES1_', (16, 17, 21))

EES_empPers_reg = _item_names('EES1_', (4, 6, 19))
EES_empPers_rev = _item_names('EES1_', (2, 28, 29, 31))

Disg_reg = (_item_names('DISG1.1_', (2, 3, 4, 5, 7, 8, 9, 12, 14, 16))
            + _item_names('DISG1.2_', (1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14)))
Disg_rev = _item_names('DISG1.1_', (1, 6, 10, 13)) + ['DISG1.2_2']

# (output column, first item group, second item group):
# each score is |row mean of the first group - row mean of the second group|.
coherence_pairs = [
    ('PANAS_diff', PANAS_neg_idx, PANAS_pos_idx),
    ('PANASpos_PSS_diff', PSS_idx, PANAS_pos_idx),
    ('STAI_diff', STAI_rev, STAI_reg),
    ('EES_empFeelExp_diff', EES_empFeelExp_rev, EES_empFeelExp_reg),
    ('EES_empPers_diff', EES_empPers_rev, EES_empPers_reg),
    ('Disg_diff', Disg_rev, Disg_reg),
]
for diff_col, first_items, second_items in coherence_pairs:
    first_mean = data[first_items].astype(float).mean(axis=1)
    second_mean = data[second_items].astype(float).mean(axis=1)
    coherence_df[diff_col] = (first_mean - second_mean).abs()

iqr_coherence = interq_analysis(coherence_df, 3)
# only the value 3.0 returned by interq_analysis is treated as an outlier
iqr_coherence = iqr_coherence.replace({3.0: True, 2.0: False,1.0:  False, 0.0: False})

# True when at least half of a row's non-missing difference scores are flagged
coherence_cols = [pair[0] for pair in coherence_pairs]
flagged_share = iqr_coherence[coherence_cols].mean(axis=1, skipna = True)
iqr_coherence['response_consistency'] = flagged_share>=0.5

valid_data = valid_data.merge(iqr_coherence[['CVDID','wave','response_consistency']], how = 'left', on = ['CVDID','wave'])

check_for_errors(data, valid_data, 'questionnaire coherence')
del iqr_coherence


Frequency of nan responses

In [None]:
nanResp_df = data[['CVDID', 'wave']].copy()

# (output column, item columns, response code of the non-committal option)
na_option_sets = [
    # trust opinions -> not sure
    ('NaN_Freq1',
     ['RW6_1_1','RW6_1_2', 'RW6_1_3', 'RW6_1_4', 'RW6_1_5', 'RW6_1_6', 'RW6_1_8', 'RW6_1_9', 'RW6_1_10', 'RW6_1_11'],
     '7.0'),
    # important to prevent the spread of Covid -> not sure
    ('NaN_Freq2',
     ['RW25_1', 'RW25_2', 'RW25_3', 'RW25_4', 'RW25_5', 'RW25_6', 'RW25_7', 'RW25_8'],
     '7.0'),
    # Public Policy knowledge -> I don't know
    ('NaN_Freq3',
     ['CvPP1_1','CvPP1_2','CvPP1_3','CvPP1_4','CvPP1_5','CvPP1_6','CvPP1_7','CvPP1_8','CvPP1_9','CvPP1_10','CvPP1_11','CvPP1_12'],
     '4.0'),
]

for freq_col, item_cols, na_code in na_option_sets:
    answers = data[item_cols]
    # 1.0 = option chosen, 0.0 = other answer, NaN = item not answered.
    # Float flags with .where() replace the earlier "write NaN into a boolean
    # frame" step, which relied on an implicit dtype upcast.
    chose_na_option = (answers == na_code).astype(float).where(answers.notna())
    # share of answered items on which the non-committal option was chosen
    nanResp_df[freq_col] = chose_na_option.mean(axis = 1)

iqr_nanResp = interq_analysis(nanResp_df, 3)
# only the value 3.0 returned by interq_analysis is treated as an outlier
iqr_nanResp = iqr_nanResp.replace({3.0: True, 2.0: False,1.0:  False, 0.0: False})
# flagged when at least half of the row's non-missing frequency measures are outliers
iqr_nanResp['freq_NAresp'] = iqr_nanResp[['NaN_Freq1','NaN_Freq2','NaN_Freq3']].mean(axis=1, skipna = True)>=0.5

# add to validation data
valid_data = valid_data.merge(iqr_nanResp[['CVDID','wave','freq_NAresp']], how = 'left', on = ['CVDID','wave'])

check_for_errors(data, valid_data, 'NaN resps')
del iqr_nanResp



# overview over NA-responses per wave

In [None]:
def _na_summary_by_wave(frame, wave_index):
    """Summarise the non-committal answers in `frame` per wave.

    `frame` needs the columns 'wave', 'sum_NA_resp', 'N_overX' and 'n_NA_resp'.
    The result has one row per entry of `wave_index` (in that order); waves
    that do not occur in `frame` are left as NaN.
    """
    by_wave = frame.groupby(by='wave')
    summary = pd.DataFrame(index = wave_index)
    summary['mean'] = by_wave['sum_NA_resp'].mean()
    summary['var'] = by_wave['sum_NA_resp'].var()
    summary['max'] = by_wave['sum_NA_resp'].max()
    summary['>half'] = by_wave['N_overX'].sum()
    summary['N'] = by_wave['n_NA_resp'].max()
    return summary


nanResp_df = data[['CVDID', 'wave']].copy()

# (item columns, response code of the "not sure" / "I don't know" option)
na_option_items = [
    (['RW6_1_1','RW6_1_2', 'RW6_1_3', 'RW6_1_4', 'RW6_1_5',
      'RW6_1_6', 'RW6_1_8', 'RW6_1_9', 'RW6_1_10', 'RW6_1_11'], '7.0'),
    (['RW25_1', 'RW25_2', 'RW25_3', 'RW25_4', 'RW25_5', 'RW25_6', 'RW25_7',
      'RW25_8'], '7.0'),
    (['CvPP1_1','CvPP1_2','CvPP1_3','CvPP1_4','CvPP1_5','CvPP1_6',
      'CvPP1_7','CvPP1_8','CvPP1_9','CvPP1_10','CvPP1_11','CvPP1_12'], '4.0'),
]

na_item_cols = []
for item_cols, na_code in na_option_items:
    answers = data[item_cols]
    # 1.0 = option chosen, 0.0 = other answer, NaN = item not answered.
    # Float flags keep the columns numeric for the sums and group statistics
    # below (writing NaN into boolean columns left them as object dtype).
    nanResp_df[item_cols] = (answers == na_code).astype(float).where(answers.notna())
    na_item_cols.extend(item_cols)

# number of non-committal answers, number of answered items, and a marker
# (1, otherwise NaN) for rows where more than half of the answered items are non-committal
nanResp_df['sum_NA_resp'] = nanResp_df[na_item_cols].sum(axis = 1)
nanResp_df['n_NA_resp'] = nanResp_df[na_item_cols].notna().sum(axis = 1)
nanResp_df.loc[nanResp_df['sum_NA_resp'] > nanResp_df['n_NA_resp']/2,'N_overX'] = 1

# both summaries are indexed by every wave present before the core filter
all_waves = nanResp_df.wave.unique()

# NA summary all subjects
nanResp_summary_all = _na_summary_by_wave(nanResp_df, all_waves)

# NA summary core sample: keep rows of every CVDID that has sample == 'core' in `data`
nanResp_df =  nanResp_df.loc[nanResp_df.CVDID.isin(data.loc[data['sample'] == 'core', 'CVDID'])]
nanResp_summary_core = _na_summary_by_wave(nanResp_df, all_waves)

# only the all-subject summary is displayed; nanResp_summary_core stays available for inspection
nanResp_summary_all

# Task Validation

response string length

In [None]:
#% run ProcessTaskData.ipynb
# task quality measures are read from task_data/task_qual.csv (all columns as text)
task_qual_csv = os.path.join(data_dir, 'task_data', 'task_qual.csv')
task_data = pd.read_csv(task_qual_csv, dtype=str, low_memory=False)

# response-string measures: every column whose name ends in '_meanLongString'
long_string_cols = [col for col in task_data.columns if col.endswith('_meanLongString')]
task_data[long_string_cols] = task_data[long_string_cols].astype(float)
rs_cols = ['CVDID', 'wave'] + long_string_cols

# only the value 3.0 returned by interq_analysis is treated as an outlier
outlier_col_names = {
    "tr_1s_meanLongString": "string_outlier_tr",
    "amp_meanLongString": "string_outlier_amp",
    "altt_meanLongString": "string_outlier_altt",
    "cvd_consp_meanLongString": "string_outlier_cvd_consp",
}
iqr_task_rs = (
    interq_analysis(task_data[rs_cols], 3)
    .replace({3.0: True, 2.0: False, 1.0: False, 0.0: False})
    .rename(columns=outlier_col_names)
)

valid_data = valid_data.merge(iqr_task_rs, on=['CVDID', 'wave'], how='left')

check_for_errors(data, valid_data, 'task rs')



other task quality measures: RT outliers, iat/biat exclusion, tasks missing 

In [None]:
# a task run is flagged when its share of bad reaction times reaches this cutoff
pctl_bad_rt_cutoff = 0.1

task_valid = task_data[['CVDID','wave']].copy()

# read in additional task quality measures:
# share of bad RTs -> True (>= cutoff) / False (< cutoff) / NaN (missing)
for rt_col in ['tr_1s_rt_pctlt_300', 'altt_rt_pctlt_300', 'cvd_consp_rt_pctlt_300', 'amp_pct_bad_rts']:
    bad_rt_share = task_data[rt_col].astype(float)
    task_valid.loc[bad_rt_share >= pctl_bad_rt_cutoff, rt_col] = True
    task_valid.loc[bad_rt_share < pctl_bad_rt_cutoff, rt_col] = False
    task_valid.loc[task_data[rt_col].isnull(), rt_col] = np.nan

# coded flags: (source column, output column, code that sets the flag to True);
# any other value gives False, a missing value gives NaN
coded_flags = [
    ('tr_1s_noVar', 'tr_1s_noVar', '1.0'),
    ('iat_include', 'iat_exclude', '0.0'),
    ('biat_include', 'biat_exclude', '0.0'),
]
for source_col, flag_col, flag_code in coded_flags:
    task_valid.loc[task_data[source_col] == flag_code, flag_col] = True
    task_valid.loc[task_data[source_col] != flag_code, flag_col] = False
    task_valid.loc[task_data[source_col].isnull(), flag_col] = np.nan

# copy the per-task 'missing' and 'administered' columns unchanged
for suffix in ['_missing', '_administered']:
    for task in ['biat', 'altt', 'tr_1s', 'iat', 'pgg', 'amp', 'cvd_consp']:
        task_valid[task + suffix] = task_data[task + suffix]

# rename rt columns for consistency
task_valid = task_valid.rename(columns={"tr_1s_rt_pctlt_300": "tr_pct_bad_rts",
                                        "altt_rt_pctlt_300": "altt_pct_bad_rts",
                                        "cvd_consp_rt_pctlt_300": "cvd_consp_pct_bad_rts"})

# task missing/ administered
# a task counts as administered in a wave if any row of that wave says 'True'
tasks = ['altt','pgg','tr_1s','iat','amp','biat', 'cvd_consp']
for w in task_valid.wave.unique():
    in_wave = task_valid.wave == w
    for task in tasks:
        admin_col = task + '_administered'
        any_administered = (task_valid.loc[in_wave, admin_col] == 'True').any()
        task_valid.loc[in_wave, admin_col] = 'True' if any_administered else 'False'

# fill empty 'missing' entries: 'True' if the task was administered in that wave, else 'False'
for task in tasks:
    missing_col = task + '_missing'
    admin_col = task + '_administered'
    not_recorded = task_valid[missing_col].isnull()
    task_valid.loc[not_recorded & (task_valid[admin_col] == 'True'), missing_col] = 'True'
    task_valid.loc[not_recorded & (task_valid[admin_col] == 'False'), missing_col] = 'False'

# merge
valid_data = valid_data.merge(task_valid, how = 'left', on = ['CVDID','wave'])

# percentage of completed waves

In [None]:
def _completion_counts(panel):
    """Count the waves per CVDID in `panel`.

    Returns one row per CVDID with 'nCompleted' (number of rows with a
    non-missing wave) and 'perc_completed' (nCompleted divided by the largest
    nCompleted found in this panel). CVDID is always returned as a regular
    column so both panels are merged the same way.
    """
    counts = panel[['CVDID','wave']].groupby('CVDID').count()
    counts = counts.rename(columns={'wave': 'nCompleted'}).reset_index()
    counts['perc_completed'] = counts.nCompleted/counts.nCompleted.max()
    return counts


# NOTE(review): `max_wave_prolific` and `w_conte` are not defined in this
# notebook; presumably they come from the `valid_funct` star import -- confirm.
w_prlfc = [str(w) for w in range(1, max_wave_prolific + 1)]

# split the validation data into the two panels by wave label
valid_data_prlfc = valid_data.loc[valid_data['wave'].isin(w_prlfc),:].copy()
valid_data_conte = valid_data.loc[valid_data['wave'].isin(w_conte),:].copy()
del valid_data

wave_count_prlfc = _completion_counts(valid_data_prlfc)
valid_data_prlfc = valid_data_prlfc.merge(wave_count_prlfc, on = 'CVDID', how = 'left')

wave_count_conte = _completion_counts(valid_data_conte)
valid_data_conte = valid_data_conte.merge(wave_count_conte, on = 'CVDID', how = 'left')

# flag subjects who completed fewer than half as many waves as the most complete subject
valid_data_prlfc['low_compl'] = valid_data_prlfc.perc_completed<0.5
valid_data_conte['low_compl'] = valid_data_conte.perc_completed<0.5

# weekly subject count based on (cumulative) quality criteria

In [None]:

# Columns averaged into the share of failed checks (True counts as a failure).
# NOTE(review): the names 'free_text_resp_valid_stress' / 'free_text_resp_valid_news'
# suggest True may mean "valid"; if so they are counted as failures here --
# confirm against nounVerb_count.
validation_criteria =  ['low_compl', 'more_than_1_attQ_failed', 'string_outlier_all', 'string_outlier_core','duration_outlier',
                            'string_outlier_amp', 'string_outlier_altt', 'tr_pct_bad_rts','response_consistency',
                            'altt_pct_bad_rts', 'amp_pct_bad_rts', 'iat_exclude', 'biat_exclude','free_text_resp_valid_stress',
                            'free_text_resp_valid_news','freq_NAresp']

# drop all rows of incomplete qualtrics data
# (.copy() so the column assignments below write to independent frames instead
# of to a slice of the unfiltered data)
valid_data_prlfc = valid_data_prlfc.loc[valid_data_prlfc['completed']=='1',:].copy()
valid_data_conte = valid_data_conte.loc[valid_data_conte['completed']=='1',:].copy()

# share and number of failed criteria per row; missing criteria are skipped
valid_data_prlfc['perc_valid_failed'] = valid_data_prlfc[validation_criteria].mean(axis = 1, skipna = True)
valid_data_conte['perc_valid_failed'] = valid_data_conte[validation_criteria].mean(axis = 1, skipna = True)
valid_data_prlfc['N_valid_failed'] =  valid_data_prlfc[validation_criteria].sum(axis = 1, skipna = True)
valid_data_conte['N_valid_failed'] = valid_data_conte[validation_criteria].sum(axis = 1, skipna = True)


# wave-by-wave summary
week_sub_prlfc = pd.DataFrame(index = valid_data_prlfc.wave.unique())
for wave in list(week_sub_prlfc.index):
    wave_rows = valid_data_prlfc.loc[valid_data_prlfc.wave == wave,:]
    week_sub_prlfc.loc[wave, 'N sub'] = len(wave_rows)
    # number of criteria that have at least one non-missing value in this wave
    n_missing_per_criterion = wave_rows[validation_criteria].isnull().sum(axis = 0)
    week_sub_prlfc.loc[wave, 'N QC'] = int((n_missing_per_criterion != len(wave_rows)).sum())

# Pass thresholds are iterated as whole percentages (100, 90, ..., 30).
# Building them as `1 - perc` from np.arange(1, 0.2, -0.1) produced values such
# as 0.09999999999999998, so a row failing exactly 10% of its checks was not
# counted as ">= 90% passed". The small tolerance guards the remaining
# float comparison.
share_tolerance = 1e-9
for wave in list(week_sub_prlfc.index):
    failed_share = valid_data_prlfc.loc[valid_data_prlfc.wave == wave,'perc_valid_failed']
    for pct in range(100, 20, -10):
        allowed_failed_share = (100 - pct)/100
        n_pass = int((failed_share <= allowed_failed_share + share_tolerance).sum())
        perc_pass = str(round(n_pass/len(failed_share)*100,2))+'%'
        count_pass = ' (' + str(n_pass) + ')'
        week_sub_prlfc.loc[wave,'> ' + str(pct)+'% QC passed'] = perc_pass +count_pass

# only the prolific panel is written to disk in this cell
valid_data_prlfc.to_csv(os.path.join(data_dir, 'validation_passCriterion_perSub_perWave_w1-'+str(max_wave_prolific) + '_prlfc.csv'),index=False)

week_sub_prlfc.to_csv(os.path.join(data_dir, 'validation_proportion_prct_pass_perWave_w1-'+str(max_wave_prolific) + '_prlfc.csv'))






In [None]:
# quick look at the NA-response frame; only the first rows are shown so that
# the saved notebook does not embed the whole per-subject table
nanResp_df.head()