# Merge All DataFrames

In [1]:
import pandas as pd
import numpy as np

from sklearn import model_selection

In [2]:
# Folder holding the project data files, relative to this notebook.
# NOTE: `dir` shadows Python's built-in dir(); the name is kept because the
# loading/saving cells below build their paths from it.
dir = "../data/"
# File-name prefix/suffix strings; no cell visible in this notebook reads them.
prefix = "FONNESBECK_"
suffix = "_20151202.csv"

In [3]:
# Load the admission/discharge/transfer table (one row per hospital day) and
# store the patient identifier `ruid` as a 64-bit integer.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files
# produced by trusted steps of this project.
adt_cms_final = (
    pd.read_pickle(dir + './adt_cms_final.pkl')
      .astype({'ruid': 'int64'})
)
adt_cms_final.head()

Unnamed: 0,ruid,visit_id,admit_date,discharge_date,hospital_day,stay_length,n_transfers,readmit_time,readmit_30d
0,50135262,0,2007-02-08,2007-02-12,2007-02-08,4 days,2,172 days,0
1,50135262,0,2007-02-08,2007-02-12,2007-02-09,4 days,2,172 days,0
2,50135262,0,2007-02-08,2007-02-12,2007-02-10,4 days,2,172 days,0
3,50135262,0,2007-02-08,2007-02-12,2007-02-11,4 days,2,172 days,0
4,50135262,0,2007-02-08,2007-02-12,2007-02-12,4 days,2,172 days,0


In [4]:
# (rows, columns) of the ADT table; used below as the reference row count.
adt_cms_final.shape

(128407, 9)

In [5]:
# Number of distinct patients (ruid values) in the ADT table.
np.unique(adt_cms_final['ruid']).size

5651

In [6]:
# Load the phenotype table (sex, dob, dod, race keyed by ruid) and preview it.
phenotype = pd.read_pickle(dir + './phenotype.pkl')
phenotype.head()

Unnamed: 0,ruid,sex,dob,dod,race
0,50135262,F,1949-09-20,NaT,W
1,50135361,M,1932-02-15,2012-09-09,W
2,50135369,M,1958-05-04,2012-01-23,W
3,50135375,M,1943-05-01,2011-08-18,B
4,50135425,F,1946-10-02,NaT,W


In [7]:
# Number of distinct patients in the phenotype table.
np.unique(phenotype['ruid']).size

8000

In [8]:
# Attach the phenotype columns to every hospital-day row. A left join keeps
# all ADT rows, including any whose ruid is absent from `phenotype`.
merged = adt_cms_final.merge(phenotype, how='left', on='ruid')
# A left join can never lose a left-hand ruid, so comparing unique ruids cannot
# fail. The invariant worth checking is the row count: it changes if
# `phenotype` contains the same ruid more than once.
assert len(merged) == len(adt_cms_final), 'phenotype merge changed the row count'
merged.shape

(128407, 13)

In [9]:
# Load the wide CPT table (one column per CPT category) and rename its date
# column to `hospital_day`, the key used when merging onto the ADT rows.
cpt_wide = (
    pd.read_pickle(dir + './cpt_wide.pkl')
      .rename(columns={'event_date': 'hospital_day'})
)
cpt_wide.head()

code_cat,ruid,hospital_day,cpt_anesthesia,cpt_eval_manage,cpt_expired,cpt_medicine,cpt_modifier,cpt_path_lab,cpt_radiology,cpt_surgery,cpt_unknown
0,50135262,2005-01-09,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,50135262,2007-02-08,0.0,1.0,2.0,4.0,2.0,10.0,4.0,0.0,0.0
2,50135262,2007-02-09,0.0,2.0,1.0,0.0,3.0,9.0,5.0,2.0,0.0
3,50135262,2007-02-10,0.0,1.0,0.0,7.0,2.0,2.0,0.0,0.0,0.0
4,50135262,2007-02-11,0.0,1.0,0.0,2.0,1.0,2.0,0.0,0.0,0.0


In [10]:
# Add the per-day CPT columns to each hospital-day row (left join on patient
# and day; days without a CPT record get NaN).
# NOTE: `merged` is overwritten, so run this cell once per kernel session - a
# second run would merge the CPT columns again under _x/_y suffixes.
merged = merged.merge(cpt_wide, how='left', on=['ruid', 'hospital_day'])
# The unique-ruid comparison cannot fail after a left join; the row count is
# the invariant that detects duplicated (ruid, hospital_day) keys in `cpt_wide`.
assert len(merged) == len(adt_cms_final), 'CPT merge changed the row count'
merged.shape

(128407, 22)

In [11]:
# Load the wide ICD table (one column per ICD category) and rename its date
# column to `hospital_day`, the key used when merging onto the ADT rows.
icd_wide = (
    pd.read_pickle(dir + './icd_wide.pkl')
      .rename(columns={'event_date': 'hospital_day'})
)
icd_wide.head()

code_cat,ruid,hospital_day,icd_dx_blood,icd_dx_circulatory,icd_dx_congenital,icd_dx_digestive,icd_dx_endocrine,icd_dx_external,icd_dx_gu,icd_dx_infection,...,icd_dx_muscskel,icd_dx_neoplasm,icd_dx_nervous,icd_dx_obstetric,icd_dx_perinatal,icd_dx_respiratory,icd_dx_skin,icd_dx_symptoms,icd_proc,icd_visit
0,50135262,2005-01-09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,50135262,2007-02-08,0.0,2.0,0.0,0.0,3.0,1.0,0.0,2.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,3.0,0.0,4.0
2,50135262,2007-02-09,0.0,2.0,0.0,0.0,1.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3,50135262,2007-02-10,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,50135262,2007-02-11,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Add the per-day ICD columns to each hospital-day row (left join on patient
# and day).
# NOTE: `merged` is overwritten, so run this cell once per kernel session - a
# second run would merge the ICD columns again under _x/_y suffixes.
merged = merged.merge(icd_wide, how='left', on=['ruid', 'hospital_day'])
# The unique-ruid comparison cannot fail after a left join; the row count is
# the invariant that detects duplicated (ruid, hospital_day) keys in `icd_wide`.
assert len(merged) == len(adt_cms_final), 'ICD merge changed the row count'
merged.shape

(128407, 42)

In [13]:
# Load the wide medication-class table and rename `entry_date` to
# `hospital_day`, the key used when merging onto the ADT rows.
med_classes_final_ruids = (
    pd.read_pickle(dir + './med_classes_final_ruids.pkl')
      .rename(columns={'entry_date': 'hospital_day'})
)
med_classes_final_ruids.head()

Unnamed: 0,ruid,hospital_day,med_14-alpha Demethylase Inhibitors,med_5-alpha Reductase Inhibitors,"med_Abortifacient Agents, Nonsteroidal",med_Acetaldehyde Dehydrogenase Inhibitors,med_Acetylcholine Release Inhibitors,med_Acid Sensing Ion Channel Blockers,med_Adenosine A2 Receptor Agonists,med_Adenosine Deaminase Inhibitors,...,med_Uncoupling Agents,med_Uricosuric Agents,med_Urological Agents,med_Vasoconstrictor Agents,med_Vasodilator Agents,med_Viscosupplements,med_Vitamin B Complex,med_Vitamins,med_Voltage-Gated Sodium Channel Blockers,med_beta-Lactamase Inhibitors
0,50135262,2005-01-09,0.0,0.0,0.0,,0.0,0.0,0.0,,...,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,50135262,2007-01-15,0.0,0.0,0.0,,0.0,0.0,0.0,,...,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,50135262,2015-01-25,0.0,0.0,0.0,,0.0,0.0,0.0,,...,,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,50135262,2007-02-08,0.0,0.0,0.0,,0.0,0.0,0.0,,...,,0.0,3.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
4,50135262,2007-02-09,0.0,0.0,0.0,,0.0,0.0,0.0,,...,,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Column total for every medication class (missing entries counted as zero),
# ordered from the largest total to the smallest.
counts = (
    med_classes_final_ruids
    .drop(['ruid', 'hospital_day'], axis=1)
    .fillna(0.0)
    .sum(axis=0)
    .sort_values(ascending=False)
)

In [15]:
# Names of the 50 medication classes with the largest totals, followed by the
# two merge keys.
keep_cols = [*counts.index[:50], 'ruid', 'hospital_day']
keep_cols

['med_Antihypertensive Agents',
 'med_Analgesics, Opioid',
 'med_Narcotics',
 'med_Antipyretics',
 'med_Anti-Bacterial Agents',
 'med_Anti-Inflammatory Agents, Non-Steroidal',
 'med_Analgesics, Non-Narcotic',
 'med_Antiemetics',
 'med_Diuretics',
 'med_Anti-Arrhythmia Agents',
 'med_Anti-Allergic Agents',
 'med_Anti-Inflammatory Agents',
 'med_Vasodilator Agents',
 'med_Anti-Ulcer Agents',
 'med_Anti-Anxiety Agents',
 'med_Fibrinolytic Agents',
 'med_Bronchodilator Agents',
 'med_Antipruritics',
 'med_Cyclooxygenase Inhibitors',
 'med_Anticholesteremic Agents',
 'med_Proton Pump Inhibitors',
 'med_Hydroxymethylglutaryl-CoA Reductase Inhibitors',
 'med_Glucocorticoids',
 'med_Anticoagulants',
 'med_Sympatholytics',
 'med_Hypoglycemic Agents',
 'med_Enzyme Inhibitors',
 'med_Hypnotics and Sedatives',
 'med_Sodium Potassium Chloride Symporter Inhibitors',
 'med_Platelet Aggregation Inhibitors',
 'med_Calcium Channel Blockers',
 'med_Antineoplastic Agents, Hormonal',
 'med_Histamine H1 Ant

In [16]:
# Restrict the medication table to the columns listed in `keep_cols`.
# DataFrame.filter(items=...) silently skips any requested label that is absent.
top_med_classes = med_classes_final_ruids.filter(items=keep_cols)
top_med_classes.shape

(505474, 52)

In [17]:
# Add the selected medication-class columns to each hospital-day row (left
# join on patient and day).
# NOTE: `merged` is overwritten, so run this cell once per kernel session - a
# second run would merge the medication columns again under _x/_y suffixes.
merged = merged.merge(top_med_classes,
                      how='left', on=['ruid', 'hospital_day'])
# The unique-ruid comparison cannot fail after a left join; the row count is
# the invariant that detects duplicated (ruid, hospital_day) keys in
# `top_med_classes`.
assert len(merged) == len(adt_cms_final), 'medication merge changed the row count'
merged.shape

(128407, 92)

## Compress to Discharge Date Only

In [18]:
# Preview the day-level table before it is collapsed to one row per visit.
merged.head()

Unnamed: 0,ruid,visit_id,admit_date,discharge_date,hospital_day,stay_length,n_transfers,readmit_time,readmit_30d,sex,...,med_Serotonin Uptake Inhibitors,med_Vitamins,med_Antitussive Agents,med_Angiotensin-Converting Enzyme Inhibitors,med_Antifungal Agents,med_Analgesics,med_Tocolytic Agents,med_Bone Density Conservation Agents,med_Serotonin Antagonists,med_Cardiotonic Agents
0,50135262,0,2007-02-08,2007-02-12,2007-02-08,4 days,2,172 days,0,F,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,50135262,0,2007-02-08,2007-02-12,2007-02-09,4 days,2,172 days,0,F,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,50135262,0,2007-02-08,2007-02-12,2007-02-10,4 days,2,172 days,0,F,...,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,50135262,0,2007-02-08,2007-02-12,2007-02-11,4 days,2,172 days,0,F,...,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,50135262,0,2007-02-08,2007-02-12,2007-02-12,4 days,2,172 days,0,F,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Missing-value count for every column of the day-level table.
merged.isna().sum()

ruid                                                      0
visit_id                                                  0
admit_date                                                0
discharge_date                                            0
hospital_day                                              0
stay_length                                               0
n_transfers                                               0
readmit_time                                          32900
readmit_30d                                               0
sex                                                       0
dob                                                       0
dod                                                   83346
race                                                      0
cpt_anesthesia                                        13466
cpt_eval_manage                                       13466
cpt_expired                                           13466
cpt_medicine                            

In [20]:
# Grouping columns: every row sharing all of these values is collapsed into a
# single row (presumably one per visit - the sample output shows them constant
# within a visit).
visit_keys = ['ruid', 'visit_id', 'admit_date', 'discharge_date', 'stay_length',
              'n_transfers', 'readmit_time', 'readmit_30d', 'sex', 'dob', 'dod',
              'race']
# groupby discards rows whose key is missing, so the nullable key columns are
# filled with sentinel values first. These sentinels stay in the result
# (e.g. a readmit_time of 9999 days marks "no recorded readmission").
sentinel_fills = {'readmit_time': np.timedelta64(9999, 'D'),
                  'sex': 'U',
                  'dob': np.datetime64('1800-01-01'),
                  'dod': np.datetime64('2100-01-01')}
# Drop the day column, then sum every remaining (non-key) column per group.
merged2 = (
    merged.drop(columns=['hospital_day'])
          .fillna(value=sentinel_fills)
          .groupby(visit_keys)
          .sum()
          .reset_index()
)
# Guards against patients disappearing because a grouping key was missing.
assert np.unique(merged2['ruid']).size == np.unique(adt_cms_final['ruid']).size

In [21]:
# First rows of the collapsed table (one row per ruid / visit_id in the sample).
merged2.head()

Unnamed: 0,ruid,visit_id,admit_date,discharge_date,stay_length,n_transfers,readmit_time,readmit_30d,sex,dob,...,med_Serotonin Uptake Inhibitors,med_Vitamins,med_Antitussive Agents,med_Angiotensin-Converting Enzyme Inhibitors,med_Antifungal Agents,med_Analgesics,med_Tocolytic Agents,med_Bone Density Conservation Agents,med_Serotonin Antagonists,med_Cardiotonic Agents
0,50135262,0,2007-02-08,2007-02-12,4 days,2,172 days,0,F,1949-09-20,...,9.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
1,50135262,1,2007-08-03,2007-08-06,3 days,3,22 days,1,F,1949-09-20,...,9.0,0.0,3.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0
2,50135262,2,2007-08-28,2007-08-29,1 days,1,179 days,0,F,1949-09-20,...,6.0,0.0,3.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
3,50135262,3,2008-02-24,2008-02-28,4 days,2,44 days,0,F,1949-09-20,...,6.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0
4,50135262,4,2008-04-12,2008-04-13,1 days,1,928 days,0,F,1949-09-20,...,3.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Last rows of the collapsed table; the final index value shows the row count.
merged2.tail()

Unnamed: 0,ruid,visit_id,admit_date,discharge_date,stay_length,n_transfers,readmit_time,readmit_30d,sex,dob,...,med_Serotonin Uptake Inhibitors,med_Vitamins,med_Antitussive Agents,med_Angiotensin-Converting Enzyme Inhibitors,med_Antifungal Agents,med_Analgesics,med_Tocolytic Agents,med_Bone Density Conservation Agents,med_Serotonin Antagonists,med_Cardiotonic Agents
21128,53736421,6,2014-03-31,2014-04-02,2 days,4,5 days,1,F,1990-05-05,...,0.0,5.0,7.0,2.0,5.0,2.0,5.0,2.0,0.0,0.0
21129,53736421,7,2014-04-07,2014-04-08,1 days,1,4 days,1,F,1990-05-05,...,0.0,4.0,0.0,1.0,3.0,0.0,5.0,1.0,0.0,0.0
21130,53736421,8,2014-04-12,2014-04-15,3 days,3,401 days,0,F,1990-05-05,...,1.0,9.0,0.0,8.0,8.0,4.0,16.0,3.0,0.0,7.0
21131,53736421,9,2015-05-21,2015-05-25,4 days,3,9999 days,0,F,1990-05-05,...,0.0,5.0,2.0,0.0,4.0,1.0,6.0,1.0,0.0,0.0
21132,53736422,0,2006-04-21,2006-05-05,14 days,8,9999 days,0,M,1948-05-28,...,0.0,15.0,21.0,0.0,10.0,0.0,18.0,0.0,0.0,0.0


## Add Labs, BP, BMI, & eGFR

In [24]:
# Load the per-visit lab summary columns (keyed by ruid and visit_id) and preview them.
# NOTE(review): the section title also mentions BP, BMI and eGFR; only lab
# columns are visible in the truncated preview - confirm the file contains them.
labs_bp_bmi_egfr = pd.read_csv(dir + './labs.csv')
labs_bp_bmi_egfr.head()

Unnamed: 0,ruid,visit_id,an-gap_median,bun_median,co2_median,ca_median,cl_median,creat_median,glubed_median,gluc_median,...,k_95p,mch_95p,mchc_95p,mcv_95p,na_95p,pcv_95p,plt-ct_95p,rbc_95p,rdw_95p,wbc_95p
0,50135262,0,9.0,17.0,26.0,8.9,101.0,0.54,137.0,138.0,...,4.4,29.0,32.0,91.0,136.0,39.2,334.0,4.41,14.1,16.0
1,50135262,1,6.5,14.0,29.5,9.1,99.0,0.715,184.5,213.5,...,4.755,28.0,33.19,85.0,137.7,42.4,309.65,5.044,18.47,14.55
2,50135262,2,7.0,11.0,31.0,9.1,101.0,0.61,224.5,266.0,...,3.9,28.395,33.07,86.95,139.0,38.0,308.9,4.426,16.775,11.76
3,50135262,3,5.0,12.0,29.0,8.6,102.0,0.85,205.0,230.0,...,4.48,26.3,31.88,83.8,138.6,41.6,316.8,4.872,15.94,12.52
4,50135262,4,6.5,12.5,32.0,,100.5,0.75,228.5,162.0,...,4.19,25.59,31.575,81.95,139.9,36.0,340.8,4.513,15.79,10.27


In [25]:
# (rows, columns) of the lab table.
labs_bp_bmi_egfr.shape

(18849, 59)

In [26]:
# Number of distinct patients that have at least one row in the lab table.
np.unique(labs_bp_bmi_egfr['ruid']).size

5221

In [27]:
# Attach the lab columns to each visit (left join, so visits without lab rows
# are kept with NaN lab values).
merged_all = merged2.merge(labs_bp_bmi_egfr,
                           how='left', on=['ruid', 'visit_id'])
# A left join cannot drop a ruid, so the meaningful invariant is that no visit
# was duplicated by repeated (ruid, visit_id) keys in the lab table.
assert len(merged_all) == len(merged2), 'lab merge changed the row count'

## Add Age

In [28]:
# Age at admission in years: whole days between date of birth and admission,
# divided by 365.25. Computed with the vectorised .dt accessor instead of a
# per-row apply, and without a transient timedelta-typed `age` column.
# NOTE(review): a dob filled with the 1800-01-01 sentinel would produce an
# implausible age; the null counts shown earlier report no missing dob.
merged_all['age'] = (
    pd.to_datetime(merged_all['admit_date']) - pd.to_datetime(merged_all['dob'])
).dt.days / 365.25
merged_all.head()

Unnamed: 0,ruid,visit_id,admit_date,discharge_date,stay_length,n_transfers,readmit_time,readmit_30d,sex,dob,...,mch_95p,mchc_95p,mcv_95p,na_95p,pcv_95p,plt-ct_95p,rbc_95p,rdw_95p,wbc_95p,age
0,50135262,0,2007-02-08,2007-02-12,4 days,2,172 days,0,F,1949-09-20,...,29.0,32.0,91.0,136.0,39.2,334.0,4.41,14.1,16.0,57.385352
1,50135262,1,2007-08-03,2007-08-06,3 days,3,22 days,1,F,1949-09-20,...,28.0,33.19,85.0,137.7,42.4,309.65,5.044,18.47,14.55,57.867214
2,50135262,2,2007-08-28,2007-08-29,1 days,1,179 days,0,F,1949-09-20,...,28.395,33.07,86.95,139.0,38.0,308.9,4.426,16.775,11.76,57.935661
3,50135262,3,2008-02-24,2008-02-28,4 days,2,44 days,0,F,1949-09-20,...,26.3,31.88,83.8,138.6,41.6,316.8,4.872,15.94,12.52,58.428474
4,50135262,4,2008-04-12,2008-04-13,1 days,1,928 days,0,F,1949-09-20,...,25.59,31.575,81.95,139.9,36.0,340.8,4.513,15.79,10.27,58.55989


## Convert Length-of-Stay to Integer

In [29]:
# Convert the length of stay from a timedelta to a whole number of days.
# The dtype guard makes the cell safe to re-run: pd.to_timedelta() reads a
# plain integer as nanoseconds, so converting an already-converted column
# would silently turn every stay into 0 days.
if not pd.api.types.is_numeric_dtype(merged_all['stay_length']):
    merged_all['stay_length'] = pd.to_timedelta(merged_all['stay_length']).dt.days
merged_all.head()

Unnamed: 0,ruid,visit_id,admit_date,discharge_date,stay_length,n_transfers,readmit_time,readmit_30d,sex,dob,...,mch_95p,mchc_95p,mcv_95p,na_95p,pcv_95p,plt-ct_95p,rbc_95p,rdw_95p,wbc_95p,age
0,50135262,0,2007-02-08,2007-02-12,4,2,172 days,0,F,1949-09-20,...,29.0,32.0,91.0,136.0,39.2,334.0,4.41,14.1,16.0,57.385352
1,50135262,1,2007-08-03,2007-08-06,3,3,22 days,1,F,1949-09-20,...,28.0,33.19,85.0,137.7,42.4,309.65,5.044,18.47,14.55,57.867214
2,50135262,2,2007-08-28,2007-08-29,1,1,179 days,0,F,1949-09-20,...,28.395,33.07,86.95,139.0,38.0,308.9,4.426,16.775,11.76,57.935661
3,50135262,3,2008-02-24,2008-02-28,4,2,44 days,0,F,1949-09-20,...,26.3,31.88,83.8,138.6,41.6,316.8,4.872,15.94,12.52,58.428474
4,50135262,4,2008-04-12,2008-04-13,1,1,928 days,0,F,1949-09-20,...,25.59,31.575,81.95,139.9,36.0,340.8,4.513,15.79,10.27,58.55989


## Create Training-Validation-Testing Sets

In [30]:
# Reminder of the day-level ADT layout that the patient-level split is built from.
adt_cms_final.head()

Unnamed: 0,ruid,visit_id,admit_date,discharge_date,hospital_day,stay_length,n_transfers,readmit_time,readmit_30d
0,50135262,0,2007-02-08,2007-02-12,2007-02-08,4 days,2,172 days,0
1,50135262,0,2007-02-08,2007-02-12,2007-02-09,4 days,2,172 days,0
2,50135262,0,2007-02-08,2007-02-12,2007-02-10,4 days,2,172 days,0
3,50135262,0,2007-02-08,2007-02-12,2007-02-11,4 days,2,172 days,0
4,50135262,0,2007-02-08,2007-02-12,2007-02-12,4 days,2,172 days,0


In [31]:
# Number of distinct visit_id values for each patient.
encounter_counts = (
    adt_cms_final.groupby('ruid')['visit_id']
                 .nunique()
                 .reset_index(name='total_encounters')
)
# Patients with more than two encounters are labelled 'high', all others 'low'.
encounter_counts['freq'] = np.where(encounter_counts['total_encounters'].gt(2), 'high', 'low')

encounter_counts.head()

Unnamed: 0,ruid,total_encounters,freq
0,50135262,10,high
1,50135361,14,high
2,50135369,13,high
3,50135375,21,high
4,50135425,4,high


In [32]:
# Last rows of the per-patient encounter table, showing both frequency labels.
encounter_counts.tail()

Unnamed: 0,ruid,total_encounters,freq
5646,53736417,3,high
5647,53736419,1,low
5648,53736420,1,low
5649,53736421,10,high
5650,53736422,1,low


In [33]:
# Patients in the low-frequency stratum. An explicit copy is taken because a
# `group` column is added to this frame later; assigning to a bare
# boolean-mask slice raises pandas' SettingWithCopyWarning.
low_freq = encounter_counts[encounter_counts.freq=='low'].copy()
low_freq.shape

(3411, 3)

In [34]:
# Patients in the high-frequency stratum. An explicit copy is taken because a
# `group` column is added to this frame later; assigning to a bare
# boolean-mask slice raises pandas' SettingWithCopyWarning.
high_freq = encounter_counts[encounter_counts.freq=='high'].copy()
# The two strata must partition the patients: nobody missing, nobody twice.
assert low_freq.shape[0] + high_freq.shape[0] == encounter_counts.shape[0]
high_freq.shape

(2240, 3)

In [35]:
# Seed inside the cell so re-running it reproduces the same assignment.
np.random.seed(42)

# Each patient is independently assigned to train/valid/test with probability
# 0.6/0.2/0.2 within its frequency stratum, so realised proportions are only
# approximately 60/20/20. Low-frequency patients are drawn first, then
# high-frequency ones; keep this order to reproduce the same split.
split_labels = ['train', 'valid', 'test']
split_probs = [0.6, 0.2, 0.2]
# assign() returns a new frame instead of writing into a slice of
# `encounter_counts`, which avoids pandas' SettingWithCopyWarning.
low_freq = low_freq.assign(
    group=np.random.choice(split_labels, size=len(low_freq), p=split_probs))
high_freq = high_freq.assign(
    group=np.random.choice(split_labels, size=len(high_freq), p=split_probs))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [36]:
# Sanity check: realised split proportions in the low-frequency stratum
# (expected to be close to 0.6 / 0.2 / 0.2).
low_freq['group'].value_counts().div(len(low_freq))

train    0.592495
valid    0.208150
test     0.199355
Name: group, dtype: float64

In [37]:
# Sanity check: realised split proportions in the high-frequency stratum
# (expected to be close to 0.6 / 0.2 / 0.2).
high_freq['group'].value_counts().div(len(high_freq))

train    0.617411
test     0.198214
valid    0.184375
Name: group, dtype: float64

In [38]:
# Stack the two strata back into one patient-level table (original row labels
# are kept) and discard the 'freq' label.
all_groups = pd.concat([low_freq, high_freq], axis=0).drop('freq', axis=1)

In [39]:
# Preview the patient-level split table (ruid, total_encounters, group).
all_groups.head()

Unnamed: 0,ruid,total_encounters,group
7,50135735,2,train
13,50136007,1,test
16,50136118,1,valid
19,50136383,2,train
21,50136466,1,train


In [40]:
# Attach each patient's encounter count and train/valid/test label to every visit.
# - Columns coming from `all_groups` are dropped first (a no-op on the first
#   run), so re-running the cell does not create _x/_y duplicate columns.
# - validate='many_to_one' makes pandas raise if `all_groups` lists a ruid
#   more than once, which would silently duplicate visits.
merged_all = (
    merged_all.drop(columns=all_groups.columns.difference(['ruid']), errors='ignore')
              .merge(all_groups, how='left', on=['ruid'], validate='many_to_one')
)
# A left join cannot drop a ruid, so instead check that every visit actually
# received a split label.
assert merged_all['group'].notna().all(), 'some visits were not assigned to a split'

## Save Final File

In [41]:
# Final check of the first rows before writing the file.
merged_all.head()

Unnamed: 0,ruid,visit_id,admit_date,discharge_date,stay_length,n_transfers,readmit_time,readmit_30d,sex,dob,...,mcv_95p,na_95p,pcv_95p,plt-ct_95p,rbc_95p,rdw_95p,wbc_95p,age,total_encounters,group
0,50135262,0,2007-02-08,2007-02-12,4,2,172 days,0,F,1949-09-20,...,91.0,136.0,39.2,334.0,4.41,14.1,16.0,57.385352,10,train
1,50135262,1,2007-08-03,2007-08-06,3,3,22 days,1,F,1949-09-20,...,85.0,137.7,42.4,309.65,5.044,18.47,14.55,57.867214,10,train
2,50135262,2,2007-08-28,2007-08-29,1,1,179 days,0,F,1949-09-20,...,86.95,139.0,38.0,308.9,4.426,16.775,11.76,57.935661,10,train
3,50135262,3,2008-02-24,2008-02-28,4,2,44 days,0,F,1949-09-20,...,83.8,138.6,41.6,316.8,4.872,15.94,12.52,58.428474,10,train
4,50135262,4,2008-04-12,2008-04-13,1,1,928 days,0,F,1949-09-20,...,81.95,139.9,36.0,340.8,4.513,15.79,10.27,58.55989,10,train


In [42]:
# Final check of the last rows; visits without lab rows show NaN lab values.
merged_all.tail()

Unnamed: 0,ruid,visit_id,admit_date,discharge_date,stay_length,n_transfers,readmit_time,readmit_30d,sex,dob,...,mcv_95p,na_95p,pcv_95p,plt-ct_95p,rbc_95p,rdw_95p,wbc_95p,age,total_encounters,group
21128,53736421,6,2014-03-31,2014-04-02,2,4,5 days,1,F,1990-05-05,...,98.0,141.9,33.9,273.2,3.506,14.17,14.81,23.904175,10,valid
21129,53736421,7,2014-04-07,2014-04-08,1,1,4 days,1,F,1990-05-05,...,101.9,,31.9,323.1,3.112,14.385,10.5,23.92334,10,valid
21130,53736421,8,2014-04-12,2014-04-15,3,3,401 days,0,F,1990-05-05,...,100.85,145.55,31.6,572.85,3.2115,13.87,13.215,23.937029,10,valid
21131,53736421,9,2015-05-21,2015-05-25,4,3,9999 days,0,F,1990-05-05,...,,,,,,,,25.043121,10,valid
21132,53736422,0,2006-04-21,2006-05-05,14,8,9999 days,0,M,1948-05-28,...,96.95,,34.85,436.95,3.887,16.495,26.36,57.897331,1,train


In [43]:
# Write the visit-level table to merged.csv in the data folder, omitting the index.
# NOTE(review): sentinel values introduced earlier (e.g. readmit_time of 9999
# days) are written as-is - confirm downstream readers treat them as missing.
merged_all.to_csv(dir + './merged.csv', index=False)