## logistic regression model

#### import libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

#### load cleaned feature data table

In [2]:
df = pd.read_feather('../data/processed/df_to_model')

In [3]:
df.columns

Index(['time_on_vent', 'anchor_age', 'spontrr', 'heartrate', 'std_spontrr',
       'weight', 'bloodpressure', 'std_pulseox', 'std_heartrate', 'height',
       'tidalvolume', 'temp', 'std_bloodpressure', 'pulseox', 're_intub_class',
       'gender', 'admission_type'],
      dtype='object')

In [4]:
df[df['re_intub_class']==0]

Unnamed: 0,time_on_vent,anchor_age,spontrr,heartrate,std_spontrr,weight,bloodpressure,std_pulseox,std_heartrate,height,tidalvolume,temp,std_bloodpressure,pulseox,re_intub_class,gender,admission_type
0,659.416667,70.0,0.0,127.0,11.631239,167.4,102.0,2.000000,7.314369,59.027559,0.0,37.500000,7.120393,94.0,0,F,URGENT
1,25.983333,72.0,19.0,79.0,3.718759,123.2,62.0,0.000000,1.971222,59.921260,387.0,36.444444,20.347548,100.0,0,F,EW EMER.
2,17.000000,23.0,33.0,122.0,3.577709,264.0,101.0,1.788854,13.771952,70.933071,527.0,39.166667,6.350853,96.0,0,M,EW EMER.
3,580.233333,81.0,24.0,63.0,1.788854,198.0,79.0,1.673320,4.324350,61.905512,437.0,36.833333,6.418723,96.0,0,F,EW EMER.
4,170.000000,65.0,15.0,80.0,2.316607,205.9,105.0,0.577350,3.633180,61.905512,614.0,37.333333,4.505552,100.0,0,F,URGENT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8071,49.616667,54.0,14.0,78.0,5.703313,203.5,75.0,1.187735,6.284903,70.039370,529.0,36.555556,16.954842,97.0,0,M,EW EMER.
8072,168.833333,77.0,16.0,100.0,3.907411,209.0,88.0,0.000000,2.097618,64.086614,553.0,37.000000,10.876162,100.0,0,F,ELECTIVE
8073,21.233333,73.0,9.0,76.0,7.863796,143.4,67.0,0.632456,5.609516,64.980315,431.0,38.277778,11.493081,98.0,0,M,EW EMER.
8074,113.800000,87.0,17.0,50.0,6.345696,193.6,85.0,0.516398,2.065591,72.023622,1137.0,36.833333,9.605475,99.0,0,M,EW EMER.


## Handle categoricals

In [5]:
X = df[df.columns.drop('re_intub_class')]
y = df['re_intub_class']

In [37]:
# Import libraries 
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import imblearn
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [38]:
numeric_features  = df[df.columns.drop(['gender','admission_type','re_intub_class'])].columns
numeric_transformer = Pipeline(steps=[
   # ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = df[['gender','admission_type']].columns
categorical_transformer = Pipeline(steps=[
   # ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])


# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('smote', SMOTE(random_state=101)),
                      ('classifier', SVC())])


param_grid = [
    {'classifier' : [LogisticRegression()],
     'classifier__penalty' : ['l1', 'l2'],
    'classifier__C' : np.logspace(-4, 4, 20),
    'classifier__solver' : ['liblinear']},
    {'classifier' : [RandomForestClassifier()],
    'classifier__n_estimators' : list(range(10,101,10)),
    'classifier__max_features' : list(range(6,32,5))}
]

svc_param_grid ={ 'svc__kernel':('linear', 'rbf'), 
                 'svc__C': [0.1,1, 10, 100, 1000], 
                 'svc__gamma': [1,0.1,0.01,0.001,0.0001]}

gs = GridSearchCV(clf, param_grid, verbose=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf.fit(X_train, y_train)
#print("model score: %.3f" % clf.score(X_test, y_test))

predictions = clf.predict(X_test)
print(classification_report(y_test,predictions))
print(confusion_matrix(y_test,predictions))

              precision    recall  f1-score   support

           0       0.95      0.86      0.90      1618
           1       0.07      0.17      0.09        95

    accuracy                           0.82      1713
   macro avg       0.51      0.51      0.50      1713
weighted avg       0.90      0.82      0.86      1713

[[1392  226]
 [  79   16]]


In [18]:
predictions = clf.predict(X_test)

#### 5. Evaulate model

In [19]:
from sklearn.metrics import classification_report, confusion_matrix

In [20]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.95      0.61      0.75      1615
           1       0.08      0.52      0.13        98

    accuracy                           0.61      1713
   macro avg       0.52      0.57      0.44      1713
weighted avg       0.90      0.61      0.71      1713



In [21]:
print(confusion_matrix(y_test,predictions))

[[992 623]
 [ 47  51]]


In [None]:
import joblib

In [None]:
#joblib.dump(logmodel, "reintubate_model_log.sav")

In [None]:
#joblib.dump(scaler, "reintubate_scaler.sav")

## Train model

### perform train test split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#### 3. SMOTE IT!

from collections import Counter
counter = Counter(y_train)
print(counter)

# transform the dataset
oversample = SMOTE(random_state = 101)

X_smote, y_smote = oversample.fit_resample(X_train, y_train)

counter = Counter(y_smote)
print(counter)

#### 4. Do logistic regression model

from sklearn.linear_model import LogisticRegression

logmodel = LogisticRegression(max_iter=1000, C=0.0001)
logmodel.fit(X_smote,y_smote)

import pickle

In [None]:
# Save the model as a pickle in a file 
#pickle.dump(logmodel, open("reintubate_model_log", 'wb')) 

# We create the preprocessing pipelines for both numeric and categorical data.
numeric_features  = df[df.columns.drop(['gender','admission_type','re_intub_class'])].columns
#numeric_transformer = ('scaler', StandardScaler())
numeric_transformer = make_column_transformer(
    (StandardScaler(), numeric_features),
    remainder='passthrough')


categorical_features = df[['gender','admission_type']].columns
#categorical_transformer =  ('onehot', OneHotEncoder(drop='first'))
categorical_transformer = make_column_transformer(
    (OneHotEncoder(drop='first'), categorical_features),
    remainder='passthrough')

#preprocessor = make_column_transformer(
 #   transformers=[
  #      ('num', numeric_transformer, numeric_features),
   #     ('cat', categorical_transformer, categorical_features)],
#remainder ='passthrough')

clf = Pipeline(steps=[('num', numeric_transformer),
                      ('cat', categorical_transformer),
                      ('classifier', LogisticRegression())])

#### 2. Perform feature scaling

Because the range of values in the features are not necessarily in the same order of magnitude, we will scale the feature data prior to training the model.

* actually... they might not be far off! 

mask=['spontRR', 'stdABP', 'meanABP', 'stdSpontRR', 'pulseox', 'stdPulseox',
       'temp', 'heartRate', 'stdHeartRate', 'weight', 'height', 'anchor_age',
       'time_on_vent']

X_traina = X_train.copy()
X_testa = X_test.copy()

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train.loc[:,mask])
X_traina.loc[:,mask] = scaler.transform(X_train.loc[:,mask])

#X_train = scaler.transform(X_train)
X_testa.loc[:,mask] = scaler.transform(X_test.loc[:,mask])