In [1]:
import numpy as np
import pandas as pd

In [2]:
def f_get_Normalization(X, norm_mode):
    num_Patient, num_Feature = np.shape(X)

    if norm_mode == 'standard': #zero mean unit variance
        for j in range(num_Feature):
            if np.std(X[:,j]) != 0:
                X[:,j] = (X[:,j] - np.mean(X[:, j]))/np.std(X[:,j])
            else:
                X[:,j] = (X[:,j] - np.mean(X[:, j]))
    elif norm_mode == 'normal': #min-max normalization
        for j in range(num_Feature):
            X[:,j] = (X[:,j] - np.min(X[:,j]))/(np.max(X[:,j]) - np.min(X[:,j]))
    else:
        print("INPUT MODE ERROR!")

    return X

# MAKE CF TIME-SERIES DATASET

In [3]:
filename= '/mnt/85c3b84c-90ae-4506-81f1-4ff88bb29110/[ DATASET ]/[ TIME-SERIES DATA ]/CYSTIC FIBROSIS/Preprocessed/Longitudinal/version 5/stacked_Data_imputed_v5.csv'

df = pd.read_csv(filename)
tmp_df = df.copy(deep=True)

feat_list =  ['Age', 'Weight', 'Height', 'BMI', 'Gender', 'Smoking Status',
              'Class I Mutation', 'Class II Mutation', 'Class III Mutation', 'Class IV Mutation', 'Class V Mutation',
              'Class VI Mutation', 'DF508 Mutation', 'G551D Mutation',
              'FEV1', 'FEV1 Predicted', 'Best FEV1', 'Best FEV1 Predicted',
              'Pseudomonas Aeruginosa', 'Haemophilus Influenza', 'Klebsiella Pneumoniae', 'Ecoli', 'ALCA',
              'Aspergillus', 'NTM', 'Gram-Negative', 'Xanthomonas', 'Staphylococcus Aureus', 'Liver Disease',
              'Asthma', 'ABPA', 'Hypertension', 'Diabetes', 'Arthropathy', 'Bone fracture', 'Osteoporosis',
              'Osteopenia', 'Cancer', 'Cirrhosis', 'Kidney Stones', 'Depression', 'Hemoptysis', 'Pancreatitus',
              'Hearing Loss', 'Homozygous', 'Heterozygous', 'Gall bladder', 'Colonic structure',
              'Intestinal Obstruction', 'GI bleeding non-var source', 'GI bleeding var source', 
              'Non-IV Hospital Admission', 'IV Antibiotic Days Hosp', 'IV Antibiotic Days Home', 
              'Dornase Alpha', 'Anti-fungals', 'Liver Enzymes', 'Lab Liver Enzymes', 'HyperSaline', 'HypertonicSaline',
              'Tobi Solution', 'Cortico Combo', 'Noninvasive Ventilation', 'Acetylcysteine', 'Aminoglycoside',
              'iBuprofen', 'Drug Dornase', 'HDI Buprofen', 'Tobramycin', 'Leukotriene', 'Colistin',
              'Diabetes Inter Insulin', 'Macrolida Antibiotics', 'Inhaled Broncho BAAC', 'Inhaled Broncho LAAC',
              'Inhaled Broncho SAAC', 'Inhaled Broncho LABA', 'Inhaled Bronchodilators', 'Cortico Inhaled',
              'Oral Broncho THEOPH', 'Oral Broncho BA', 'Oral Hypoglycemic Agents', 'Chronic Oral Antibiotic',
              'Cortico Oral', 'O2 Prn', 'O2 Exc', 'O2 Noct', 'O2 Cont', 'Oxygen Therapy']

# UKCF - MORTALITY

In [4]:
for c in [0, 1, 2, 3]:
    print('label:{} -- {}'.format(c, np.sum(tmp_df['cause'] == c)/np.shape(tmp_df)[0]) )

label:0 -- 0.9229030717809963
label:1 -- 0.06175393920101862
label:2 -- 0.008594620404265478
label:3 -- 0.006748368613719561


In [4]:
tmp_df['new_time_to_event'] = np.nan
tmp_df['label'] = np.nan #this is 1-yr mortality label
# if time-to-event > 12 -> label 0
# if time-to-event <= 12 & cause !=0 -> label 1
# if time-to-event <= 12 & cause == 0 -> label -1 (these will be removed) 

grouped = tmp_df.groupby('ID')
id_list = pd.unique(tmp_df['ID'])

tmp_df['new_time_to_event'] = tmp_df['time_to_event'] - tmp_df['Age']
tmp_df['new_time_to_event'] = tmp_df['time_to_event'] - tmp_df['Age']

condition1 = (tmp_df['new_time_to_event'] > 12)
condition2 = (tmp_df['new_time_to_event'] <= 12) & (tmp_df['cause'] != 0)
condition3 = (tmp_df['new_time_to_event'] <= 12) & (tmp_df['cause'] == 0)

tmp_df.loc[condition1, 'label'] = 0
tmp_df.loc[condition2, 'label'] = 1
tmp_df.loc[condition3, 'label'] = -1


new_df = tmp_df[tmp_df['label'] != -1].reset_index(drop=True)
new_df = new_df.drop(columns=['cause', 'time_to_event', 'new_time_to_event'])

In [5]:
#make Delta_{j} = t_{j} - t_{j-1}
new_df['Delta'] = new_df['Age'].diff()
new_df.loc[new_df.drop_duplicates('ID').index, 'Delta'] = 0

new_df[feat_list] = f_get_Normalization(np.asarray(new_df[feat_list]), norm_mode='standard')

id_list = pd.unique(new_df['ID'])
grouped = new_df.groupby(by='ID')

max_length = 0
remove_list = []

for p_idx in range(len(id_list)):    
    tmp_length = grouped.get_group(id_list[p_idx]).shape[0]
    
    if tmp_length == 1:
        remove_list.append(id_list[p_idx])
        
    if max_length <= tmp_length:
        max_length = tmp_length

In [33]:
new_id_list = [p_id for p_id in id_list if p_id not in remove_list]

data_x        = np.zeros([len(new_id_list), max_length, len(feat_list)+1])
data_y        = np.zeros([len(new_id_list), max_length, 1])
data_y_onehot = np.zeros([len(new_id_list), max_length, 2])

for p_idx in range(len(new_id_list)):
    tmp = grouped.get_group(new_id_list[p_idx])
    tmp_length = tmp.shape[0]

    data_x[p_idx, :tmp_length, :] = np.asarray(tmp[['Delta']+feat_list])
    data_y[p_idx, :tmp_length, :] = np.asarray(tmp[['label']])
    data_y_onehot[p_idx, np.where(np.asarray(tmp['label']) == 0)[0], 0] = 1
    data_y_onehot[p_idx, np.where(np.asarray(tmp['label']) == 1)[0], 1] = 1

In [41]:
np.savez('./data/CF/data.npz', 
         data_x=data_x,
         data_y=data_y, 
         data_y_onehot=data_y_onehot, 
         feat_list= ['Delta'] + feat_list)

In [166]:
len(label_candidate_list)

21

# UKCF - COMORBIDITY

In [16]:
filename= '/mnt/85c3b84c-90ae-4506-81f1-4ff88bb29110/[ DATASET ]/[ TIME-SERIES DATA ]/CYSTIC FIBROSIS/Preprocessed/Longitudinal/version 5/stacked_Data_imputed_v5.csv'

df = pd.read_csv(filename)
tmp_df = df.copy(deep=True)

feat_list =  ['Age', 'Weight', 'Height', 'BMI', 'Gender', 'Smoking Status',
              'Class I Mutation', 'Class II Mutation', 'Class III Mutation', 'Class IV Mutation', 'Class V Mutation',
              'Class VI Mutation', 'DF508 Mutation', 'G551D Mutation',
              'FEV1', 'FEV1 Predicted', 'Best FEV1', 'Best FEV1 Predicted',
              'Pseudomonas Aeruginosa', 'Haemophilus Influenza', 'Klebsiella Pneumoniae', 'Ecoli', 'ALCA',
              'Aspergillus', 'NTM', 'Gram-Negative', 'Xanthomonas', 'Staphylococcus Aureus', 'Liver Disease',
              'Asthma', 'ABPA', 'Hypertension', 'Diabetes', 'Arthropathy', 'Bone fracture', 'Osteoporosis',
              'Osteopenia', 'Cancer', 'Cirrhosis', 'Kidney Stones', 'Depression', 'Hemoptysis', 'Pancreatitus',
              'Hearing Loss', 'Homozygous', 'Heterozygous', 'Gall bladder', 'Colonic structure',
              'Intestinal Obstruction', 'GI bleeding non-var source', 'GI bleeding var source', 
              'Non-IV Hospital Admission', 'IV Antibiotic Days Hosp', 'IV Antibiotic Days Home', 
              'Dornase Alpha', 'Anti-fungals', 'Liver Enzymes', 'HyperSaline', 'HypertonicSaline',
              'Tobi Solution', 'Cortico Combo', 'Noninvasive Ventilation', 'Acetylcysteine', 'Aminoglycoside',
              'iBuprofen', 'Drug Dornase', 'HDI Buprofen', 'Tobramycin', 'Leukotriene', 'Colistin',
              'Diabetes Inter Insulin', 'Macrolida Antibiotics', 'Inhaled Broncho BAAC', 'Inhaled Broncho LAAC',
              'Inhaled Broncho SAAC', 'Inhaled Broncho LABA', 'Inhaled Bronchodilators', 'Cortico Inhaled',
              'Oral Broncho THEOPH', 'Oral Broncho BA', 'Oral Hypoglycemic Agents', 'Chronic Oral Antibiotic',
              'Cortico Oral', 'O2 Prn', 'O2 Exc', 'O2 Noct', 'O2 Cont', 'Oxygen Therapy']

# removed  ['Lab Liver Enzymes']
label_candidate_list = ['Liver Disease', 'Asthma', 'ABPA', 'Hypertension', 'Diabetes', 'Arthropathy', 'Bone fracture', 'Osteoporosis',
                        'Osteopenia', 'Cancer', 'Cirrhosis', 'Kidney Stones', 'Depression', 'Hemoptysis', 'Pancreatitus',
                        'Hearing Loss', 'Gall bladder', 'Colonic structure', 'Intestinal Obstruction', 
                        'GI bleeding non-var source', 'GI bleeding var source', 'Liver Enzymes']

In [17]:
tmp_df = df.copy(deep=True)

label_list    = ['next_'+f for f in label_candidate_list]
selected_list = ['next_Diabetes', 'next_ABPA', 'next_Intestinal Obstruction'] #3 selected comorbidities for creating 

In [18]:
### assign the next comorbidity value to f in label_list
for f in label_list:
    tmp_df[f] = np.nan
    
for f_idx in range(len(label_list)):
    tmp_df[label_list[f_idx]].iloc[0:-1] = np.asarray(tmp_df[label_candidate_list[f_idx]].iloc[1:])     

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [19]:
grouped = tmp_df.groupby('ID')
id_list = pd.unique(tmp_df['ID'])

In [20]:
new_df = tmp_df.groupby('ID', as_index=False, group_keys=False).apply(lambda x: x.iloc[:-1])
new_df = new_df.reset_index(drop=True)

In [21]:
#make Delta_{j} = t_{j} - t_{j-1}
new_df['Delta'] = new_df['Age'].diff()
new_df.loc[new_df.drop_duplicates('ID').index, 'Delta'] = 0

new_df2 = new_df.copy(deep=True) #for original values
new_df[feat_list] = f_get_Normalization(np.asarray(new_df[feat_list]), norm_mode='standard')

id_list = pd.unique(new_df['ID'])
grouped = new_df.groupby(by='ID')
grouped2 = new_df2.groupby(by='ID')

max_length = 0
remove_list = []

for p_idx in range(len(id_list)):    
    tmp_length = grouped.get_group(id_list[p_idx]).shape[0]
    
    if tmp_length == 1:
        remove_list.append(id_list[p_idx])
        
    if max_length <= tmp_length:
        max_length = tmp_length

In [22]:
new_id_list = [p_id for p_id in id_list if p_id not in remove_list]

data_x        = np.zeros([len(new_id_list), max_length, len(feat_list)+1])
data_x_org    = np.zeros([len(new_id_list), max_length, len(feat_list)+1])
data_y        = np.zeros([len(new_id_list), max_length, len(label_list)])

# data_y_onehot = np.zeros([len(new_id_list), max_length, 2])

for p_idx in range(len(new_id_list)):
    tmp = grouped.get_group(new_id_list[p_idx])
    tmp2 = grouped2.get_group(new_id_list[p_idx])
    tmp_length = tmp.shape[0]

    data_x[p_idx, :tmp_length, :] = np.asarray(tmp[['Delta']+feat_list])
    data_x_org[p_idx, :tmp_length, :] = np.asarray(tmp2[['Delta']+feat_list])
    data_y[p_idx, :tmp_length, :] = np.asarray(tmp[label_list])
#     data_new_y[p_idx, np.where(np.asarray(tmp['label']) == 0)[0], 0] = 1
#     data_new_y[p_idx, np.where(np.asarray(tmp['label']) == 1)[0], 1] = 1

In [24]:
np.savez('./CF_comorbidity/data_como2.npz', 
         data_x=data_x,
         data_x_org=data_x_org,
         data_y=data_y, 
         feat_list= ['Delta'] + feat_list,
         label_list = label_list,
         selected_list = selected_list)

In [161]:
def print_full(x):
    pd.set_option('display.max_rows', len(x))
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 2000)
    pd.set_option('display.float_format', '{:20,.2f}'.format)
    pd.set_option('display.max_colwidth', -1)
    print(x)
    pd.reset_option('display.max_rows')
    pd.reset_option('display.max_columns')
    pd.reset_option('display.width')
    pd.reset_option('display.float_format')
    pd.reset_option('display.max_colwidth')

In [165]:
for f in label_list:
    print(new_df[[f]].describe())

        next_Asthma
count  25532.000000
mean       0.157802
std        0.364562
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        1.000000
          next_ABPA
count  25532.000000
mean       0.121260
std        0.326435
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        1.000000
       next_Hypertension
count       25532.000000
mean            0.042457
std             0.201632
min             0.000000
25%             0.000000
50%             0.000000
75%             0.000000
max             1.000000
       next_Diabetes
count   25532.000000
mean        0.292848
std         0.455078
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000
       next_Arthropathy
count      25532.000000
mean           0.095684
std            0.294163
min            0.000000
25%            0.000000
50%            0.000000
75%            0.000000
max            1.000000
       nex

In [158]:
new_df.loc[new_df['ID'] == new_id_list[i], label_list]

Unnamed: 0,next_Asthma,next_ABPA,next_Hypertension,next_Diabetes,next_Arthropathy,next_Bone fracture,next_Osteoporosis,next_Osteopenia,next_Cancer,next_Cirrhosis,...,next_Depression,next_Hemoptysis,next_Pancreatitus,next_Hearing Loss,next_Gall bladder,next_Colonic structure,next_Intestinal Obstruction,next_GI bleeding non-var source,next_GI bleeding var source,next_Liver Enzymes
27,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
28,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
