In [43]:
import os
import pandas as pd
import numpy
import pickle
import scipy as sp
import matplotlib.pylab as plt
import seaborn as sns
import statsmodels.api as sm
import missingno as msno
import sklearn as sk
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Integer key of each table inside a session -> short name of the table type.
type_dic = dict(enumerate(['demo', 'as', 'hrv', 'time', 'crf']))
# Session labels 'ses-01' .. 'ses-04'.
session = ['ses-{:02d}'.format(number) for number in range(1, 5)]

In [3]:

# Load the previously extracted tables from disk.
# NOTE(review): pickle.load can execute arbitrary code, so only open files you trust.
with open('vrabes_extracted.pkl', 'rb') as pkl_in:
    extracted = pickle.load(pkl_in)

In [4]:
# Unpack the five tables of every session into flat names.
# Keys 0-4 are read in order: demo, as, hrv, time, crf.
ses_01_demo, ses_01_as, ses_01_hrv, ses_01_time, ses_01_crf = (
    extracted['ses-01'][key] for key in range(5)
)

ses_02_demo, ses_02_as, ses_02_hrv, ses_02_time, ses_02_crf = (
    extracted['ses-02'][key] for key in range(5)
)

ses_03_demo, ses_03_as, ses_03_hrv, ses_03_time, ses_03_crf = (
    extracted['ses-03'][key] for key in range(5)
)

ses_04_demo, ses_04_as, ses_04_hrv, ses_04_time, ses_04_crf = (
    extracted['ses-04'][key] for key in range(5)
)

In [5]:
def missing_finder (extracted):
    """Collect the rows that contain missing values from every session table.

    Parameters
    ----------
    extracted : mapping
        Indexed first by session label ('ses-01' .. 'ses-04') and then by the
        integer table key (0-4); each entry is a pandas DataFrame.

    Returns
    -------
    dict
        Keys are '<session>_<type name>' (e.g. 'ses-01_demo'); each value is
        the subset of rows of that table with at least one null cell.
    """
    type_names = {0:'demo', 1:'as',2:'hrv',3:'time',4:'crf'}
    session_ids = ['ses-01','ses-02','ses-03','ses-04']

    def rows_with_null(table):
        # Keep a row as soon as any of its cells is null.
        return table[table.isnull().any(axis=1)]

    return {
        ses_id + '_' + type_name: rows_with_null(extracted[ses_id][type_key])
        for ses_id in session_ids
        for type_key, type_name in type_names.items()
    }

In [6]:
# Gather, per session and table type, the rows that contain at least one missing value.
missing = missing_finder(extracted)

In [7]:
def prepro_as (df):
    """Strip 'retire' annotations from a table, then median-impute it.

    Steps:
      1. Use the 'ID' column as the index.
      2. For every string cell containing 'retire', keep only the text before
         the first '(' (presumably cells look like '<number>(retire)' --
         TODO confirm against the raw data).
      3. Fill missing values with SimpleImputer(strategy='median').

    The stripped cells stay strings; turning them into numbers is left to the
    imputer, exactly as before.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with an 'ID' column; all other columns are passed to the imputer.

    Returns
    -------
    pandas.DataFrame
        The imputer output, indexed by 'ID' and carrying the same column labels.
    """
    def _strip_retire(cell):
        # Only string cells can carry the annotation; everything else
        # (numbers, NaN) is passed through untouched.
        if isinstance(cell, str) and 'retire' in cell:
            return cell.split('(')[0]
        return cell

    df_r = df.set_index('ID')
    # Element-wise cleaning does not depend on the index being unique. The
    # previous row-by-row `.loc[i]` lookup returned a DataFrame for repeated
    # IDs, so annotated cells in those rows were never stripped.
    df_r = df_r.apply(lambda column: column.map(_strip_retire))
    imputer = SimpleImputer(strategy='median')
    df_r = pd.DataFrame(imputer.fit_transform(df_r), index = df_r.index, columns = df_r.columns)
    return df_r

    


In [8]:
def prepro_hrv (df):
    """Drop the 'Max-Min' columns from a table and median-impute the rest.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with an 'ID' column, which becomes the index.

    Returns
    -------
    pandas.DataFrame
        Output of SimpleImputer(strategy='median') for the kept columns,
        indexed by 'ID'.
    """
    indexed = df.set_index('ID')
    # Keep every column whose label does not contain 'Max-Min'.
    keep_mask = ~indexed.columns.str.contains('Max-Min')
    kept = indexed.loc[:, keep_mask]
    filled = SimpleImputer(strategy='median').fit_transform(kept)
    return pd.DataFrame(filled, index=kept.index, columns=kept.columns)


In [37]:
def prepro_crf(df):
    """Select the CRF feature columns, median-impute them and index by 'ID'.

    Columns are picked by matching their labels against 'vit', 'BMI', 'SUM',
    'HADS_a' and 'HADS_d', and joined in that order.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with an 'ID' column and the feature columns.

    Returns
    -------
    pandas.DataFrame
        Output of SimpleImputer(strategy='median') for the selected columns,
        indexed by 'ID'.
    """
    id_column = df.loc[:, ['ID']]
    selected = None
    for pattern in ('vit', 'BMI', 'SUM', 'HADS_a', 'HADS_d'):
        matching = df.loc[:, df.columns.str.contains(pattern)]
        selected = matching if selected is None else selected.join(matching)
    filled = SimpleImputer(strategy='median').fit_transform(selected)
    imputed = pd.DataFrame(filled, index=selected.index, columns=selected.columns)
    # 'ID' is held out of the imputation and re-attached afterwards as the index.
    return imputed.join(id_column).set_index('ID')



In [38]:
def prepro_demo (df):
    """Build the demographic feature table, indexed by 'ID'.

    Keeps the first column whose label matches 'group' (with 1 subtracted from
    every value), followed by the columns matching 'age_y', 'sex', 'edu_y',
    'smok1', 'alcohol' and 'marriage', in that order.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with an 'ID' column and the demographic columns.

    Returns
    -------
    pandas.DataFrame
        The selected columns, indexed by 'ID'.
    """
    def pick(pattern):
        # All columns whose label matches the pattern.
        return df.loc[:, df.columns.str.contains(pattern)]

    group_shifted = pick('group').iloc[:, 0] - 1
    result = df.loc[:, ['ID']].join(group_shifted)
    for pattern in ('age_y', 'sex', 'edu_y', 'smok1', 'alcohol', 'marriage'):
        result = result.join(pick(pattern))
    return result.set_index('ID')

In [39]:
def prepro_gen(dic):
    """Run the matching preprocessing function on every session table.

    Parameters
    ----------
    dic : mapping
        Indexed first by session label ('ses-01' .. 'ses-04') and then by the
        integer table key.

    Returns
    -------
    dict
        Keys are '<session>_<type name>'; values are the preprocessed tables.
        Table key 3 ('time') has no preprocessing step and is left out.
    """
    type_names = {0:'demo', 1:'as',2:'hrv',3:'time',4:'crf'}
    handlers = {0: prepro_demo, 1: prepro_as, 2: prepro_hrv, 4: prepro_crf}
    processed = {}
    for ses_id in ['ses-01','ses-02','ses-03','ses-04']:
        for type_key, type_name in type_names.items():
            handler = handlers.get(type_key)
            if handler is None:
                # No handler registered for this table type; skip it.
                continue
            processed[ses_id + '_' + type_name] = handler(dic[ses_id][type_key])
    return processed

    

In [40]:
# Preprocess every session table and persist the result next to the notebook.
preprocessed = prepro_gen(extracted)
with open('vrabes_preprocessed.pkl', 'wb') as pkl_out:
    pickle.dump(preprocessed, pkl_out)