## Pipeline
- To aggregate all the exploratory data analysis and modelling of our previous steps, we need to build a pipeline to trasport this machine learning model.

In [1]:
import numpy as np 
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline

### Read in Raw data
- All these data are retrieved from `big query` of Google Cloud Platform.

In [2]:
raw_data = pd.read_csv('processed_data1.csv', index_col=0)

In [3]:
raw_data.columns

Index(['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME',
       'DEATHTIME', 'ADMISSION_TYPE', 'ADMISSION_LOCATION',
       'DISCHARGE_LOCATION', 'INSURANCE', 'LANGUAGE', 'RELIGION',
       'MARITAL_STATUS', 'ETHNICITY', 'EDREGTIME', 'EDOUTTIME', 'DIAGNOSIS',
       'HOSPITAL_EXPIRE_FLAG', 'HAS_CHARTEVENTS_DATA', 'HeartRate_Min',
       'HeartRate_Max', 'HeartRate_Mean', 'SysBP_Min', 'SysBP_Max',
       'SysBP_Mean', 'DiasBP_Min', 'DiasBP_Max', 'DiasBP_Mean', 'TempC_Max',
       'RespRate_Max', 'RespRate_Mean', 'HeartRate_Mean_1', 'HeartRate_Min_1',
       'Glucose_Max', 'Glucose_Min', 'Glucose_Mean', 'icustay_id', 'INTIME',
       'OUTTIME', 'ICU_LOS', 'Height', 'Weight', 'GENDER', 'DOB', 'DOD',
       'Hosp_LOS', 'age'],
      dtype='object')

In [4]:
lab = pd.read_csv("lab.csv", index_col=0)

In [5]:
df = raw_data.merge(lab, how='left', on = 'icustay_id')
df.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,...,INR_min,INR_max,PT_min,PT_max,SODIUM_min,SODIUM_max,BUN_min,BUN_max,WBC_min,WBC_max
0,1,2,163353,2138-07-17 19:04:00,2138-07-21 15:48:00,,NEWBORN,PHYS REFERRAL/NORMAL DELI,HOME,Private,...,,,,,,,,,0.1,22.0
1,2,3,145834,2101-10-20 19:08:00,2101-10-31 13:58:00,,EMERGENCY,EMERGENCY ROOM ADMIT,SNF,Medicare,...,1.3,1.7,13.5,15.7,136.0,153.0,41.0,53.0,11.3,24.4
2,3,4,185777,2191-03-16 00:28:00,2191-03-23 18:41:00,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME WITH HOME IV PROVIDR,Private,...,1.1,1.1,12.8,12.8,141.0,141.0,10.0,10.0,9.7,9.7
3,4,5,178980,2103-02-02 04:31:00,2103-02-04 12:15:00,,NEWBORN,PHYS REFERRAL/NORMAL DELI,HOME,Private,...,,,,,,,,,13.9,13.9
4,5,6,107064,2175-05-30 07:15:00,2175-06-15 16:00:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,...,1.0,1.4,12.6,14.6,134.0,138.0,62.0,65.0,10.6,10.6


In [6]:
df.columns

Index(['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME',
       'DEATHTIME', 'ADMISSION_TYPE', 'ADMISSION_LOCATION',
       'DISCHARGE_LOCATION', 'INSURANCE', 'LANGUAGE', 'RELIGION',
       'MARITAL_STATUS', 'ETHNICITY', 'EDREGTIME', 'EDOUTTIME', 'DIAGNOSIS',
       'HOSPITAL_EXPIRE_FLAG', 'HAS_CHARTEVENTS_DATA', 'HeartRate_Min',
       'HeartRate_Max', 'HeartRate_Mean', 'SysBP_Min', 'SysBP_Max',
       'SysBP_Mean', 'DiasBP_Min', 'DiasBP_Max', 'DiasBP_Mean', 'TempC_Max',
       'RespRate_Max', 'RespRate_Mean', 'HeartRate_Mean_1', 'HeartRate_Min_1',
       'Glucose_Max', 'Glucose_Min', 'Glucose_Mean', 'icustay_id', 'INTIME',
       'OUTTIME', 'ICU_LOS', 'Height', 'Weight', 'GENDER', 'DOB', 'DOD',
       'Hosp_LOS', 'age', 'subject_id', 'hadm_id', 'ANIONGAP_min',
       'ANIONGAP_max', 'ALBUMIN_min', 'ALBUMIN_max', 'BANDS_min', 'BANDS_max',
       'BICARBONATE_min', 'BICARBONATE_max', 'BILIRUBIN_min', 'BILIRUBIN_max',
       'CREATININE_min', 'CREATININE_max', 'CHLORIDE_min',

### Organization of the Pipelines
- Data Preprocessing
    - Categorical Variables Transformation
        - Select categorical variables
        - Delete redundant categories and get dummies
    - Numerical Variables Transformation
        - Select numerical variables
        - Impute missing values by mean
- Machine Learning Model
    - GBM model from xgboost package

In [1]:
# name of categorical and numerical features
categorical_features = ['ADMISSION_TYPE','ADMISSION_LOCATION','INSURANCE','DIAGNOSIS','GENDER','RELIGION','ETHNICITY']

numerical_features = ['HeartRate_Mean', 'SysBP_Mean', 'DiasBP_Mean', 'TempC_Max',
       'RespRate_Mean', 'Glucose_Mean', 'ICU_LOS', 'age', 'ANIONGAP_min',
       'ANIONGAP_max', 'ALBUMIN_min', 'ALBUMIN_max', 'BANDS_min', 'BANDS_max',
       'BICARBONATE_min', 'BICARBONATE_max', 'BILIRUBIN_min', 'BILIRUBIN_max',
       'CREATININE_min', 'CREATININE_max', 'CHLORIDE_min', 'CHLORIDE_max',
       'GLUCOSE_min', 'GLUCOSE_max', 'HEMATOCRIT_min', 'HEMATOCRIT_max',
       'HEMOGLOBIN_min', 'HEMOGLOBIN_max', 'LACTATE_min', 'LACTATE_max',
       'PLATELET_min', 'PLATELET_max', 'POTASSIUM_min', 'POTASSIUM_max',
       'PTT_min', 'PTT_max', 'INR_min', 'INR_max', 'PT_min', 'PT_max',
       'SODIUM_min', 'SODIUM_max', 'BUN_min', 'BUN_max', 'WBC_min', 'WBC_max','EDREGTIME','EDOUTTIME']

target = 'DISCHARGE_LOCATION'

### Feature Selector
- This transformer aims to select the categorical variables and continuous variables requested.

In [8]:
#Custom Transformer that extracts columns passed as argument to its constructor 
class FeatureSelector(BaseEstimator, TransformerMixin):
    #Class Constructor 
    def __init__(self, feature_names):
        self._feature_names = feature_names
    
    #Return self nothing else to do here    
    def fit(self, X, y = None):
        return self 
    
    #Method that describes what we need this transformer to do
    def transform(self, X, y = None):
        return X.loc[:,self._feature_names]

### Numerical Variables Transformer
- Calculate ED stay from raw data and fill the missingness as 0.
- Fill in missingness in ICU Length of Stay as 0.
- Drop the columns used for calculation.

In [9]:
# Fill in EDstay, ICUstay with 0 if missing
class NumericalTransformer(BaseEstimator, TransformerMixin):
        
    #Return self nothing else to do here
    def fit( self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X['EDREGTIME'] = X['EDREGTIME'].fillna(0)
        X['EDOUTTIME'] = X['EDOUTTIME'].fillna(0)
        X['EDstay'] = pd.to_datetime(X.EDOUTTIME) - pd.to_datetime(X.EDREGTIME)
        X['EDstay'] = X.EDstay.dt.total_seconds()
        
        X.loc[:,'ICU_LOS'] = X.ICU_LOS.fillna(0)
        X.drop(columns=['EDREGTIME','EDOUTTIME'], inplace=True)

        return X

In [10]:
# Numerical Features Pipeline: select, fill time variables with 0, fill other variables with mean
numerical_pipeline = Pipeline(steps=[('num_selector', FeatureSelector(numerical_features)),  
                                     ('num_transformer', NumericalTransformer()),
                                  ('imputer', SimpleImputer(strategy = 'mean'))])

In [11]:
numerical_pipeline.fit(df)



Pipeline(steps=[('num_selector', FeatureSelector(feature_names=None)),
                ('num_transformer', NumericalTransformer()),
                ('imputer', SimpleImputer())])

In [12]:
numerical_pipeline.transform(df)

array([[1.40000000e+02, 1.19264797e+02, 6.08049585e+01, ...,
        1.00000000e-01, 2.20000000e+01, 0.00000000e+00],
       [1.11785714e+02, 1.02960000e+02, 5.57200000e+01, ...,
        1.13000000e+01, 2.44000000e+01, 8.10000000e+03],
       [8.92173913e+01, 1.18000000e+02, 6.90000000e+01, ...,
        9.70000000e+00, 9.70000000e+00, 4.32000000e+04],
       ...,
       [6.85217391e+01, 1.24318182e+02, 5.18181818e+01, ...,
        1.71000000e+01, 2.14000000e+01, 1.86000000e+04],
       [6.99354839e+01, 1.34433333e+02, 5.03333333e+01, ...,
        7.30000000e+00, 8.20000000e+00, 0.00000000e+00],
       [9.55600000e+01, 1.26240000e+02, 6.40000000e+01, ...,
        9.30000000e+00, 1.24000000e+01, 0.00000000e+00]])

### Categorical Transformers
- Remove redundant categories.
- Get dummies of categorical variables

In [13]:
class CategoricalTransformer(BaseEstimator, TransformerMixin):
    #Class constructor method that takes in a list of values as its argument
        
    #Return self nothing else to do here
    def fit( self, X, y=None):
        return self
    
    #Transformer method we wrote for this transformer 
    def transform(self, X, y=None):
        '''
        This function transforms ethinicity, diagnosis, and get dummies.
        Ethinicity only keeps white, asian, black and others.
        Diagnosis only keeps the most frequent 9 diagnosis.
        '''
        
        result = [] 
        for value in X['ETHNICITY']: 
            if 'ASIAN' in value: 
                result.append('ASIAN') 
            elif 'WHITE' in value: 
                result.append('WHITE')
            elif 'BLACK' in value: 
                result.append('BLACK')
            else: 
                result.append('ETHNICITY_Others') 

        X['ETHNICITY'] = result
        
        result = [] 
        for value in X['DIAGNOSIS']: 
            if value == 'PNEUMONIA': 
                result.append('PNEUMONIA')
            elif value == 'CORONARY ARTERY DISEASE': 
                result.append('CORONARY ARTERY DISEASE')
            elif value == 'SEPSIS': 
                result.append('SEPSIS')
            elif value == 'INTRACRANIAL HEMORRHAGE': 
                result.append('INTRACRANIAL HEMORRHAGE')
            elif value == 'CHEST PAIN': 
                result.append('CHEST PAIN')
            elif value == 'CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS GRAFT /SDA': 
                result.append('CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS GRAFT /SDA')
            elif value == 'GASTROINTESTINAL BLEED': 
                result.append('GASTROINTESTINAL BLEED')
            elif value == 'CONGESTIVE HEART FAILURE': 
                result.append('CONGESTIVE HEART FAILURE')
            elif value == 'ALTERED MENTAL STATUS': 
                result.append('ALTERED MENTAL STATUS')
            else: 
                result.append('others')
        X['DIAGNOSIS'] = result
        
        X = pd.get_dummies(X)
        X.drop(columns=['ADMISSION_LOCATION_** INFO NOT AVAILABLE **', 'RELIGION_NOT SPECIFIED', 'RELIGION_CATHOLIC', 
               'RELIGION_PROTESTANT QUAKER','RELIGION_UNOBTAINABLE', 'RELIGION_OTHER', "RELIGION_JEHOVAH'S WITNESS", 
               'RELIGION_GREEK ORTHODOX', 'RELIGION_EPISCOPALIAN', 'RELIGION_CHRISTIAN SCIENTIST','RELIGION_METHODIST', 'RELIGION_UNITARIAN-UNIVERSALIST', 
               'RELIGION_HEBREW','RELIGION_BAPTIST', 'RELIGION_ROMANIAN EAST. ORTH','RELIGION_LUTHERAN','DIAGNOSIS_others','RELIGION_7TH DAY ADVENTIST'],
              inplace=True)

        return X

In [14]:
# Categorical Feature Pipeline
categorical_pipeline = Pipeline(steps=[('cat_selector', FeatureSelector(categorical_features)),                     
                                  ('cat_transformer', CategoricalTransformer()),
                                      ])

In [15]:
categorical_pipeline.transform(df)

Unnamed: 0,ADMISSION_TYPE_ELECTIVE,ADMISSION_TYPE_EMERGENCY,ADMISSION_TYPE_NEWBORN,ADMISSION_TYPE_URGENT,ADMISSION_LOCATION_CLINIC REFERRAL/PREMATURE,ADMISSION_LOCATION_EMERGENCY ROOM ADMIT,ADMISSION_LOCATION_HMO REFERRAL/SICK,ADMISSION_LOCATION_PHYS REFERRAL/NORMAL DELI,ADMISSION_LOCATION_TRANSFER FROM HOSP/EXTRAM,ADMISSION_LOCATION_TRANSFER FROM OTHER HEALT,...,GENDER_F,GENDER_M,RELIGION_BUDDHIST,RELIGION_HINDU,RELIGION_JEWISH,RELIGION_MUSLIM,ETHNICITY_ASIAN,ETHNICITY_BLACK,ETHNICITY_ETHNICITY_Others,ETHNICITY_WHITE
0,0,0,1,0,0,0,0,1,0,0,...,0,1,0,0,0,0,1,0,0,0
1,0,1,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
2,0,1,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
3,0,0,1,0,0,0,0,1,0,0,...,0,1,1,0,0,0,1,0,0,0
4,1,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46515,0,1,0,0,0,1,0,0,0,0,...,0,1,0,0,1,0,0,0,0,1
46516,1,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,1
46517,0,1,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
46518,1,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1


In [16]:
# Combine Categorical Feature Pipeline and Numerical Feature Pipeline
full_pipeline = FeatureUnion(transformer_list = [('categorical_pipeline', categorical_pipeline),  
                                                  ('numerical_pipeline', numerical_pipeline)])

### Target Variables

In [17]:
result = [] 
for value in df['DISCHARGE_LOCATION']: 
    if value == 'DEAD/EXPIRED': 
        result.append(4) 
    elif 'HOME' in value: 
        result.append(1)
    elif value.startswith('SNF'): 
        result.append(2)
    else: 
        result.append(3) 
target = result

In [18]:
full_pipeline.fit(df)



FeatureUnion(transformer_list=[('categorical_pipeline',
                                Pipeline(steps=[('cat_selector',
                                                 FeatureSelector(feature_names=None)),
                                                ('cat_transformer',
                                                 CategoricalTransformer())])),
                               ('numerical_pipeline',
                                Pipeline(steps=[('num_selector',
                                                 FeatureSelector(feature_names=None)),
                                                ('num_transformer',
                                                 NumericalTransformer()),
                                                ('imputer',
                                                 SimpleImputer())]))])

In [19]:
X_train = full_pipeline.transform(df)
X_train.shape

(46520, 83)

### Combine Preprocessing and Modelling

In [20]:
import xgboost as xgb

In [21]:
full_pipeline_m = Pipeline(steps=[('full_pipeline', full_pipeline),
                                ('model', xgb.XGBClassifier({'learning_rate': 0.04372790493554727,
                                                                  'colsample_bytree': 0.266798290436423,
                                                                  'n_estimators': 887,
                                                                  'gamma': 3.2261552797199973}))])

In [22]:
full_pipeline_m.fit(df,target)



Pipeline(steps=[('full_pipeline',
                 FeatureUnion(transformer_list=[('categorical_pipeline',
                                                 Pipeline(steps=[('cat_selector',
                                                                  FeatureSelector(feature_names=None)),
                                                                 ('cat_transformer',
                                                                  CategoricalTransformer())])),
                                                ('numerical_pipeline',
                                                 Pipeline(steps=[('num_selector',
                                                                  FeatureSelector(feature_names=None)),
                                                                 ('num_transformer',
                                                                  NumericalTransformer()),
                                                                 ('imputer',...
             

In [23]:
pred = full_pipeline_m.predict(df)

In [24]:
from sklearn.metrics import classification_report

In [25]:
print('Training')
print(classification_report(target, pred, target_names=['HOME','SNF','Other','Dead/Expired']))

Training
              precision    recall  f1-score   support

        HOME       0.83      0.96      0.89     27322
         SNF       0.77      0.62      0.68      5546
       Other       0.81      0.59      0.68      9259
Dead/Expired       0.94      0.77      0.85      4393

    accuracy                           0.83     46520
   macro avg       0.84      0.74      0.78     46520
weighted avg       0.83      0.83      0.82     46520



### Save the pipeline as a pickle file

In [29]:
import pickle

In [30]:
with open('model.pkl', 'wb') as model_file:     
    pickle.dump(full_pipeline_m, model_file)

In [32]:
with open('model.pkl', 'rb') as f:
    m_pickle = pickle.load(f)

In [34]:
m_pickle.predict(df)

array([1, 2, 1, ..., 2, 2, 1])