### Reformat X_train and X_test with original categorical columns

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("processed_data1.csv", index_col=0)

In [3]:
lab = pd.read_csv("lab.csv", index_col=0)

In [4]:
df = df.merge(lab, how='left', on = 'icustay_id')

In [5]:
df = df.drop('ROW_ID', axis=1)

In [6]:
df.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,...,INR_min,INR_max,PT_min,PT_max,SODIUM_min,SODIUM_max,BUN_min,BUN_max,WBC_min,WBC_max
0,2,163353,2138/7/17 19:04,2138/7/21 15:48,,NEWBORN,PHYS REFERRAL/NORMAL DELI,HOME,Private,,...,,,,,,,,,0.1,22.0
1,3,145834,2101/10/20 19:08,2101/10/31 13:58,,EMERGENCY,EMERGENCY ROOM ADMIT,SNF,Medicare,,...,1.3,1.7,13.5,15.7,136.0,153.0,41.0,53.0,11.3,24.4
2,4,185777,2191/3/16 0:28,2191/3/23 18:41,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME WITH HOME IV PROVIDR,Private,,...,1.1,1.1,12.8,12.8,141.0,141.0,10.0,10.0,9.7,9.7
3,5,178980,2103/2/2 4:31,2103/2/4 12:15,,NEWBORN,PHYS REFERRAL/NORMAL DELI,HOME,Private,,...,,,,,,,,,13.9,13.9
4,6,107064,2175/5/30 7:15,2175/6/15 16:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,ENGL,...,1.0,1.4,12.6,14.6,134.0,138.0,62.0,65.0,10.6,10.6


In [7]:
# time in emergency department, if not enter, then 0
df['EDREGTIME'] = df['EDREGTIME'].fillna(0)
df['EDOUTTIME'] = df['EDOUTTIME'].fillna(0)
df['EDstay'] = pd.to_datetime(df.EDOUTTIME) - pd.to_datetime(df.EDREGTIME)

In [8]:
df['Hosp_LOS'] = pd.to_timedelta(df.Hosp_LOS).dt.total_seconds()
df['EDstay'] = df.EDstay.dt.total_seconds()

In [9]:
df_new = df.drop(['SUBJECT_ID', 'HADM_ID', 'icustay_id', 
                  'ADMITTIME', 'DISCHTIME', 'DEATHTIME', 'EDREGTIME', 'EDOUTTIME', 
                  'HOSPITAL_EXPIRE_FLAG', 'HAS_CHARTEVENTS_DATA', 
                  'HeartRate_Min', 'HeartRate_Max', 'SysBP_Min', 'SysBP_Max', 
                  'DiasBP_Min', 'DiasBP_Max', 'RespRate_Max', 'HeartRate_Mean_1',
                  'HeartRate_Min_1', 'Glucose_Max', 'Glucose_Min', 'INTIME', 'OUTTIME', 
                  'DOB', 'DOD', 'LANGUAGE', 'MARITAL_STATUS', 'Height'], axis=1)

In [10]:
df_new2 = df_new.drop(columns=['Weight','subject_id','hadm_id'])

In [11]:
df_new2.DISCHARGE_LOCATION.value_counts()

HOME                         16090
HOME HEALTH CARE             10924
SNF                           5545
REHAB/DISTINCT PART HOSP      5042
DEAD/EXPIRED                  4393
SHORT TERM HOSPITAL           1441
LONG TERM CARE HOSPITAL       1385
DISC-TRAN CANCER/CHLDRN H      589
DISCH-TRAN TO PSYCH HOSP       382
HOSPICE-HOME                   266
LEFT AGAINST MEDICAL ADVI      214
HOSPICE-MEDICAL FACILITY       115
OTHER FACILITY                  49
HOME WITH HOME IV PROVIDR       42
ICF                             31
DISC-TRAN TO FEDERAL HC         11
SNF-MEDICAID ONLY CERTIF         1
Name: DISCHARGE_LOCATION, dtype: int64

In [12]:
# target variable encoding
result = [] 
for value in df_new2['DISCHARGE_LOCATION']: 
    if value == 'DEAD/EXPIRED': 
        result.append(4) 
    elif 'HOME' in value: 
        result.append(1)
    elif value.startswith('SNF'): 
        result.append(2)
    else: 
        result.append(3) 
        
df_new2['target'] = result

In [13]:
df_new2.target.value_counts()

1    27322
3     9259
2     5546
4     4393
Name: target, dtype: int64

In [14]:
df_new2.columns

Index(['ADMISSION_TYPE', 'ADMISSION_LOCATION', 'DISCHARGE_LOCATION',
       'INSURANCE', 'RELIGION', 'ETHNICITY', 'DIAGNOSIS', 'HeartRate_Mean',
       'SysBP_Mean', 'DiasBP_Mean', 'TempC_Max', 'RespRate_Mean',
       'Glucose_Mean', 'ICU_LOS', 'GENDER', 'Hosp_LOS', 'age', 'ANIONGAP_min',
       'ANIONGAP_max', 'ALBUMIN_min', 'ALBUMIN_max', 'BANDS_min', 'BANDS_max',
       'BICARBONATE_min', 'BICARBONATE_max', 'BILIRUBIN_min', 'BILIRUBIN_max',
       'CREATININE_min', 'CREATININE_max', 'CHLORIDE_min', 'CHLORIDE_max',
       'GLUCOSE_min', 'GLUCOSE_max', 'HEMATOCRIT_min', 'HEMATOCRIT_max',
       'HEMOGLOBIN_min', 'HEMOGLOBIN_max', 'LACTATE_min', 'LACTATE_max',
       'PLATELET_min', 'PLATELET_max', 'POTASSIUM_min', 'POTASSIUM_max',
       'PTT_min', 'PTT_max', 'INR_min', 'INR_max', 'PT_min', 'PT_max',
       'SODIUM_min', 'SODIUM_max', 'BUN_min', 'BUN_max', 'WBC_min', 'WBC_max',
       'EDstay', 'target'],
      dtype='object')

In [15]:
df_new2.RELIGION.value_counts()

CATHOLIC                  15659
NOT SPECIFIED              9550
UNOBTAINABLE               7711
PROTESTANT QUAKER          5117
JEWISH                     3833
OTHER                      2104
EPISCOPALIAN                589
CHRISTIAN SCIENTIST         360
GREEK ORTHODOX              323
BUDDHIST                    195
MUSLIM                      157
JEHOVAH'S WITNESS           104
UNITARIAN-UNIVERSALIST      104
HINDU                       101
ROMANIAN EAST. ORTH          66
7TH DAY ADVENTIST            57
BAPTIST                      25
HEBREW                       15
METHODIST                     6
LUTHERAN                      1
Name: RELIGION, dtype: int64

### Reformat RELIGION and ETHNICITY columns:
- Subset 5 types from RELIGION and change the other types to 'RELIGION_others':
  - 'BUDDHIST', 'JEWISH', 'HINDU', 'MUSLIM', '7TH DAY ADVENTIST', 'RELIGION_others'
- Select 4 types from ETHNICITY:
  - 'ASIAN', 'WHITE', 'BLACK', 'ETHNICITY_Others' 

In [16]:
religion_list = ['BUDDHIST', 'JEWISH', 'HINDU', 'MUSLIM', '7TH DAY ADVENTIST']
df_new2.RELIGION[~df_new2.RELIGION.isin(religion_list)] = 'RELIGION_others'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [17]:
result = [] 
for value in df_new2['ETHNICITY']: 
    if 'ASIAN' in value: 
        result.append('ASIAN') 
    elif 'WHITE' in value: 
        result.append('WHITE')
    elif 'BLACK' in value: 
        result.append('BLACK')
    else: 
        result.append('ETHNICITY_Others') 
        
df_new2['ETHNICITY_new'] = result
df_new2 = df_new2.drop(columns = ['ETHNICITY'])
df_new2 = df_new2.rename(columns = {'ETHNICITY_new' : 'ETHNICITY'})

### DIAGNOSIS

In [18]:
result = [] 
for value in df_new2['DIAGNOSIS']: 
    if value == 'PNEUMONIA': 
        result.append('PNEUMONIA')
    elif value == 'CORONARY ARTERY DISEASE': 
        result.append('CORONARY ARTERY DISEASE')
    elif value == 'SEPSIS': 
        result.append('SEPSIS')
    elif value == 'INTRACRANIAL HEMORRHAGE': 
        result.append('INTRACRANIAL HEMORRHAGE')
    elif value == 'CHEST PAIN': 
        result.append('CHEST PAIN')
    elif value == 'CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS GRAFT /SDA': 
        result.append('CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS GRAFT /SDA')
    elif value == 'GASTROINTESTINAL BLEED': 
        result.append('GASTROINTESTINAL BLEED')
    elif value == 'CONGESTIVE HEART FAILURE': 
        result.append('CONGESTIVE HEART FAILURE')
    elif value == 'ALTERED MENTAL STATUS': 
        result.append('ALTERED MENTAL STATUS')
    else: 
        result.append('others') 
        
df_new2['DIAGNOSIS_new'] = result
df_new2 = df_new2.drop(columns = ['DIAGNOSIS'])
df_new2 = df_new2.rename(columns = {'DIAGNOSIS_new' : 'DIAGNOSIS'})

In [19]:
df_new2['ICU_LOS'] = df_new2['ICU_LOS'].fillna(0)

In [20]:
df_new2.drop(columns='Hosp_LOS', inplace=True)

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
df_new2.drop(columns=['DISCHARGE_LOCATION'], inplace=True)

In [23]:
y = df_new2.pop('target')
X_train, X_test, y_train, y_test = train_test_split(df_new2, y, test_size=0.2, random_state=123, stratify=y)

In [24]:
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

In [25]:
imp = SimpleImputer(strategy='mean')

In [26]:
numeric_columns = X_train.select_dtypes(include='number').columns

In [27]:
object_columns = X_train.select_dtypes(include='O').columns

In [28]:
imp.fit(X_train[numeric_columns])

SimpleImputer()

In [29]:
X_train1 = pd.concat([X_train[object_columns].reset_index(drop=True), 
           pd.DataFrame(imp.transform(X_train[numeric_columns]), columns=numeric_columns).reset_index(drop=True)], 
           axis = 1)

In [30]:
X_test1 = pd.concat([X_test[object_columns].reset_index(drop=True), 
           pd.DataFrame(imp.transform(X_test[numeric_columns]), columns=numeric_columns).reset_index(drop=True)], 
           axis = 1)

In [31]:
pd.DataFrame(X_train1).to_csv("X_train_nodummy.csv")

In [32]:
pd.DataFrame(X_test1).to_csv("X_test_nodummy.csv")