In [1]:
import os
import pandas as pd
from glob import glob
import numpy
import fnmatch
import pickle
from deepdiff import DeepDiff


In [2]:
# Root folder of the raw VRABES behavior data. gen_vrabes_paths() walks it as
# <root>/<subject>/<session>/<entry>.
# NOTE(review): absolute, machine-specific path - adjust before running on another machine.
raw_beh_path = os.path.abspath(r'C:\Users\USER\Guro_Psy_KJH Dropbox\1.Projects\1_anxiety_VR\3_Data\1_Behavior\0_raw')

In [3]:
def gen_vrabes_paths (raw_beh_path):
    """List the entries found under every <subject>/<session> folder.

    Reference: ref1 - folder naming rules.

    Parameters
    ----------
    raw_beh_path : str
        Local root folder of the raw VRABES behavior data, laid out as
        <root>/<subject>/<session>/<entry>.

    Returns
    -------
    dict
        Key is '<subject>_<session>' (e.g. 'sub-0001_ses-01'); value is the
        sorted list of entry names found directly inside that session folder.
    """
    dic_path = {}
    # Listings are sorted so the result does not depend on the order in which
    # the file system happens to return entries.
    for sub in sorted(os.listdir(raw_beh_path)):
        sub_dir = os.path.join(raw_beh_path, sub)
        # Stray files next to the subject folders (e.g. desktop.ini) would make
        # the nested os.listdir() raise NotADirectoryError - skip them.
        if not os.path.isdir(sub_dir):
            continue
        for ses in sorted(os.listdir(sub_dir)):
            ses_dir = os.path.join(sub_dir, ses)
            if not os.path.isdir(ses_dir):
                continue
            dic_path['_'.join([sub, ses])] = sorted(os.listdir(ses_dir))

    return dic_path

In [4]:
def gen_vrabes_flist (raw_path=None):
    """Collect the .csv and .txt files of every subject/session.

    Reference: ref2 - information and location by file.

    Parameters
    ----------
    raw_path : str, optional
        Root folder of the raw behavior data. Defaults to the module-level
        ``raw_beh_path`` so existing zero-argument calls keep working.

    Returns
    -------
    dict
        Key is '<subject>_<session>' as produced by gen_vrabes_paths(); value
        is the list of file paths found one level below the session folder,
        per entry: the .csv files first, then the .txt files.
    """
    if raw_path is None:
        raw_path = raw_beh_path

    dic_file = {}
    for key, entries in gen_vrabes_paths(raw_path).items():
        # The key was built as '<subject>_<session>'; split only once so a
        # session folder name containing '_' is not truncated.
        sub, ses = key.split('_', 1)
        f_list = []
        for entry in entries:
            folder = os.path.join(raw_path, sub, ses, entry)
            # Entries that are plain files simply match nothing here.
            f_list.extend(sorted(glob(os.path.join(folder, '*.csv'))))
            f_list.extend(sorted(glob(os.path.join(folder, '*.txt'))))
        dic_file[key] = f_list
    return dic_file

In [5]:
# Under consideration - still working out how to accommodate experiments that
# are added later.
def old_vrabes_ex():
    """Extract, per key, the distinct name prefixes stored in the saved pickle.

    Loads 'vrabes_beh_dict.pkl' from the current working directory, treats it
    as a mapping of mappings, strips the last '_'-separated token from every
    inner key and keeps each resulting prefix once, in first-seen order.

    Returns
    -------
    dict
        Outer key -> list of unique prefixes.
    """
    # pickle.load can execute arbitrary code from the file - only load a
    # pickle that this project wrote itself.
    # NOTE(review): the notebook cell that writes this file pickles the merged
    # DataFrame, while this reader splits the inner keys as strings - confirm
    # both sides agree on the stored format.
    with open('vrabes_beh_dict.pkl','rb') as pkl_file:
        stored = pickle.load(pkl_file)

    old_vrabes_dict = {}
    for outer_key in stored.keys():
        prefixes = ('_'.join(inner.split('_')[:-1]) for inner in stored[outer_key].keys())
        # dict.fromkeys drops repeats while keeping the first-seen order.
        old_vrabes_dict[outer_key] = list(dict.fromkeys(prefixes))
    return old_vrabes_dict

In [6]:
# Under consideration - still working out how to accommodate experiments that
# are added later, so that the main function only runs where there is a difference.
def diff_dic ():
    """Compare the files currently on disk with the names stored in the pickle.

    Builds, for every '<subject>_<session>' key returned by gen_vrabes_flist(),
    the list of file names without folder and extension, and diffs it against
    the mapping returned by old_vrabes_ex().

    Reference: ref4 - DeepDiff.

    Returns
    -------
    DeepDiff
        Result of DeepDiff(old, new, verbose_level=2).
    """
    new_dict = gen_vrabes_flist()
    # basename/splitext instead of splitting on '\\' and cutting 4 characters,
    # so the file stem is extracted correctly whatever the path separator is.
    ex_new_dict = {
        key: [os.path.splitext(os.path.basename(fpath))[0] for fpath in fpaths]
        for key, fpaths in new_dict.items()
    }

    old_dict = old_vrabes_ex()
    return DeepDiff(old_dict, ex_new_dict, verbose_level=2)
    
    
        

In [7]:
def hrv_r_slicer (x):
    """Split one 'name=value' style line from an HRV_Result file.

    Reference: ref3 - information and location by file.

    Parameters
    ----------
    x : str
        One line of text.

    Returns
    -------
    tuple of (str, str)
        name : everything before the first '=' (whitespace is kept).
        val  : the text starting two characters after the '=' and ending just
               before the first '(' that follows it, or at the end of the line
               when there is no such '('.
    """
    eq_pos = x.find('=')
    label = x[:eq_pos]
    # One character right after '=' is always skipped.
    # NOTE(review): presumably a separating space - confirm against the file format.
    val_start = eq_pos + 2
    paren_pos = x.find('(', eq_pos + 1)
    if paren_pos == -1:
        return label, x[val_start:]
    return label, x[val_start:paren_pos]

In [8]:
def gen_vrabes_dict ():
    """Build one table row per '<subject>_<session>' from the raw files.

    For every file returned by gen_vrabes_flist():

    * '<name>HRV.csv' - read with the second line as header; the first header
      cell becomes the column suffix and the second header cell the value.
    * any other .csv  - read with the second line as header; each column
      contributes the value of its first data row.
    * .txt            - every line of the first comma-separated field is split
      into name/value by hrv_r_slicer().

    Column names are '<file stem>_<name>'.

    Returns
    -------
    pandas.DataFrame
        Index is the '<subject>_<session>' key, one column per collected value.
        Empty when no session was found.
    """
    f_path = gen_vrabes_flist()
    result_dic = {}
    for key, files in f_path.items():
        dic_df = {}
        for fpath in files:
            # Portable replacement for splitting on '\\' and slicing off the
            # last four characters.
            stem, ext = os.path.splitext(os.path.basename(fpath))
            ext = ext.lower()
            if ext == '.csv':
                r_df = pd.read_csv(fpath, header=1)
                if stem.endswith('HRV'):
                    # The header row itself carries the name/value pair.
                    dic_df['_'.join([stem, r_df.columns[0]])] = r_df.columns[1]
                else:
                    for col in r_df.columns:
                        dic_df['_'.join([stem, col])] = r_df[col][0]
            elif ext == '.txt':
                r_df = pd.read_csv(fpath, header=None)
                for line in r_df.loc[:, 0]:
                    name, val = hrv_r_slicer(line)
                    dic_df['_'.join([stem, name])] = val
        result_dic[key] = dic_df

    # Build the table once, after the loop: building it per iteration repeated
    # the work for every session and left the result undefined when there
    # were no sessions at all.
    return pd.DataFrame(result_dic).transpose()
 

In [9]:
# Folder holding the CRF and demographic Excel workbooks that gen_total() reads.
# NOTE(review): absolute, machine-specific path - adjust before running on another machine.
raw_crf_path = os.path.abspath(r'C:\Users\USER\Guro_Psy_KJH Dropbox\1.Projects\1_anxiety_VR\3_Data\4_CRF')
# Workbook and sheet names. gen_total() spells these out as literals, so the two
# lists are not referenced by the functions visible in this part of the notebook.
filename = ['VRABES_CRF_form.xlsx', 'VRABES_Demographic.xlsx']
sheet = ['baseline','normal','anxiety','Demographic']
# Column labels that crf_pre() removes from every sheet.
drop = ['no','code','name']
# gen_total() reads every sheet with header = 3, index_col = 'no'

In [10]:
def crf_pre (data):
    """Clean one CRF / demographic sheet in place and return it.

    Steps, all applied to ``data`` itself:

    1. drop every column whose label contains 'Unnamed';
    2. drop every column whose label is listed in the module-level ``drop``;
    3. drop every row that has no value in the 'ID' column.

    Parameters
    ----------
    data : pandas.DataFrame
        Sheet to clean; must have string column labels and an 'ID' column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    unnamed_cols = data.columns[data.columns.str.contains('Unnamed')]
    data.drop(columns=unnamed_cols, inplace=True)

    listed_cols = []
    for col in data.columns:
        if any(x for x in drop if x == col):
            listed_cols.append(col)
    data.drop(columns=listed_cols, inplace=True)

    data.dropna(subset=['ID'], inplace=True)
    return data

In [11]:
def _tag_session (frame, ses, id_from_first_col):
    """Return a copy of `frame` with '<ses>_'-prefixed columns and an 'ID' column of the form '<id>_<ses>'."""
    prefix = ses + '_'
    tagged = frame.rename(columns=lambda col: prefix + col)
    id_col = prefix + 'ID'
    # CRF sheets take the subject ID from their first column; the demographic
    # sheet takes it from the column that is explicitly named 'ID'.
    source = tagged.iloc[:, 0] if id_from_first_col else tagged.loc[:, id_col]
    tagged[id_col] = source.apply(lambda sub_id: sub_id + '_' + ses)
    return tagged.rename(columns={id_col: 'ID'})


def gen_total (raw_crf_path):
    """Merge the behavior table with the CRF and demographic sheets.

    Parameters
    ----------
    raw_crf_path : str
        Folder containing 'VRABES_CRF_form.xlsx' (sheets 'baseline', 'normal',
        'anxiety') and 'VRABES_Demographic.xlsx' (sheet 'Demographic').

    Returns
    -------
    pandas.DataFrame
        The table from gen_vrabes_dict() with an added 'ID' column
        ('<subject>_<session>'), left-merged on 'ID' with, in this order:
        baseline (ses-01), demographic (ses-01), normal (ses-02), anxiety
        ses-02, demographic (ses-02), anxiety ses-03, demographic (ses-03),
        anxiety ses-04, demographic (ses-04). Every merged column carries its
        session prefix. Frames of the same session share that prefix, so
        column names that still coincide receive pandas' default '_x'/'_y'
        merge suffixes.
    """
    raw_df = gen_vrabes_dict ()

    read_opts = dict(header=3, index_col='no')
    demo = crf_pre(pd.read_excel(os.path.join(raw_crf_path,'VRABES_Demographic.xlsx'),
                                 sheet_name='Demographic', **read_opts))
    # A list of sheet names returns a dict of frames, so the CRF workbook is
    # opened once instead of once per sheet.
    crf_sheets = pd.read_excel(os.path.join(raw_crf_path,'VRABES_CRF_form.xlsx'),
                               sheet_name=['baseline','normal','anxiety'], **read_opts)
    ses_01_all = crf_pre(crf_sheets['baseline'])
    ses_02_nor = crf_pre(crf_sheets['normal'])
    anx = crf_pre(crf_sheets['anxiety'])

    # The anxiety sheet holds visits 2-4 side by side; cut it at the columns
    # that open visit 3 and visit 4.
    # NOTE(review): the ses-02 slice starts at column 1, not at 'visit_2_d',
    # so any columns between the ID and 'visit_2_d' go to ses-02 - confirm.
    i_ses03 = anx.columns.get_loc('visit_3_d')
    i_ses04 = anx.columns.get_loc('visit_4_d')
    anx_id = anx.iloc[:, 0]
    ses_02_anx = pd.concat([anx_id, anx.iloc[:, 1:i_ses03]], axis=1)
    ses_03_anx = pd.concat([anx_id, anx.iloc[:, i_ses03:i_ses04]], axis=1)
    ses_04_anx = pd.concat([anx_id, anx.iloc[:, i_ses04:]], axis=1)

    # _tag_session() never modifies its input, so the single demographic frame
    # can be reused for every session.
    session_frames = [
        _tag_session(ses_01_all, 'ses-01', True),
        _tag_session(demo, 'ses-01', False),
        _tag_session(ses_02_nor, 'ses-02', True),
        _tag_session(ses_02_anx, 'ses-02', True),
        _tag_session(demo, 'ses-02', False),
        _tag_session(ses_03_anx, 'ses-03', True),
        _tag_session(demo, 'ses-03', False),
        _tag_session(ses_04_anx, 'ses-04', True),
        _tag_session(demo, 'ses-04', False),
    ]

    raw_df['ID'] = raw_df.index
    total = raw_df
    for frame in session_frames:
        total = pd.merge(total, frame, how='left', on='ID')

    return total

In [12]:
# Build the merged table and store it on disk as a pickle in the current
# working directory.
# NOTE(review): this pickles a DataFrame under the same file name that
# old_vrabes_ex() loads and walks as nested mappings with string keys -
# confirm the reader still matches this format.
total = gen_total(raw_crf_path)
with open('vrabes_beh_dict.pkl','wb') as f:
    pickle.dump(total,f)   

  ses_02_nor = pd.read_excel(os.path.join(raw_crf_path,'VRABES_CRF_form.xlsx'), sheet_name= 'normal', header = 3, index_col= 'no')


In [13]:
# Inspect the merge key of the merged table (values look like 'sub-xxxx_ses-xx').
total['ID']

0      sub-0001_ses-01
1      sub-0001_ses-02
2      sub-0002_ses-01
3      sub-0002_ses-02
4      sub-0003_ses-01
            ...       
161    sub-0060_ses-03
162    sub-0061_ses-01
163    sub-0062_ses-01
164    sub-0062_ses-02
165    sub-0063_ses-01
Name: ID, Length: 166, dtype: object