## logistic regression model

#### import libraries

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

#### load cleaned feature data table

In [3]:
# Read the pre-cleaned feature table from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='clean_ft_table.csv')

In [4]:
# Show the column index of the loaded table; 're_intub_class' is the column
# later used as the prediction target, the rest become model features.
df.columns

Index(['spontRR', 'stdABP', 'meanABP', 'stdSpontRR', 'pulseox', 'stdPulseox',
       'temp', 'heartRate', 'stdHeartRate', 'weight', 'height',
       're_intub_class', 'anchor_age', 'time_on_vent', 'adm_type_ELECTIVE',
       'adm_type_EW EMER.', 'adm_type_OBSERVATION ADMIT',
       'adm_type_SURGICAL SAME DAY ADMISSION', 'adm_type_URGENT', 'gender_M'],
      dtype='object')

In [5]:
# Display the rows whose re_intub_class label is 0.
df.loc[df['re_intub_class'].eq(0)]

Unnamed: 0,spontRR,stdABP,meanABP,stdSpontRR,pulseox,stdPulseox,temp,heartRate,stdHeartRate,weight,height,re_intub_class,anchor_age,time_on_vent,adm_type_ELECTIVE,adm_type_EW EMER.,adm_type_OBSERVATION ADMIT,adm_type_SURGICAL SAME DAY ADMISSION,adm_type_URGENT,gender_M
0,0.0,7.120393,102.0,11.631239,94.0,2.000000,37.500000,127.0,7.314369,167.4,59.027559,0,70,659.416667,0,0,0,0,1,0
1,19.0,20.347548,62.0,3.718759,100.0,0.000000,36.444444,79.0,1.971222,123.2,59.921260,0,72,25.983333,0,1,0,0,0,0
2,33.0,6.350853,101.0,3.577709,96.0,1.788854,39.166667,122.0,13.771952,264.0,70.933071,0,23,17.000000,0,1,0,0,0,1
3,24.0,6.418723,79.0,1.788854,96.0,1.673320,36.833333,63.0,4.324350,198.0,61.905512,0,81,580.233333,0,1,0,0,0,0
4,15.0,4.505552,105.0,2.316607,100.0,0.577350,37.333333,80.0,3.633180,205.9,61.905512,0,65,170.000000,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10632,14.0,2.401388,87.0,7.629829,98.0,0.816497,37.277778,135.0,6.228965,224.2,64.980315,0,37,12.483333,0,0,0,0,1,0
10633,19.0,3.405877,78.0,4.163332,96.0,1.032796,37.333333,83.0,2.041241,255.2,68.501969,0,57,21.216667,1,0,0,0,0,0
10634,19.0,5.656854,94.0,10.392305,97.0,0.983192,36.222222,50.0,2.994439,197.1,70.039370,0,61,8.550000,0,1,0,0,0,1
10635,26.0,7.098556,76.0,4.208371,92.0,2.267898,37.333333,103.0,5.006892,183.7,66.964567,0,87,17.983333,0,1,0,0,0,1


## Train model

#### 1. Perform train/test split

In [6]:
# Feature matrix: every column except the target label.
X = df.drop(columns=['re_intub_class'])
# Target vector: the re_intub_class label.
y = df.loc[:, 're_intub_class']

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

#### 2. Perform feature scaling

Because the ranges of values across the features are not necessarily of the same order of magnitude, we will scale the feature data prior to training the model.

* Note: on closer inspection, the feature ranges may not actually be far apart.

In [8]:
# Columns passed to the scaler. The adm_type_* and gender_M indicator
# columns are not included, so they are left untouched by scaling.
# The order here is the order in which the scaler sees the columns.
mask = [
    'spontRR',
    'stdABP',
    'meanABP',
    'stdSpontRR',
    'pulseox',
    'stdPulseox',
    'temp',
    'heartRate',
    'stdHeartRate',
    'weight',
    'height',
    'anchor_age',
    'time_on_vent',
]

In [9]:
# Work on independent copies so the unscaled train/test splits stay available.
X_traina, X_testa = X_train.copy(), X_test.copy()

In [10]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, restricted to the columns in `mask`.
scaler = StandardScaler()
scaler.fit(X_train[mask])

# Assign whole columns instead of writing through `.loc[:, mask]`: the scaled
# values are floats, and setting them in place into an integer-typed column
# (anchor_age appears to be one) is a silent upcast that newer pandas versions
# deprecate. Column assignment simply replaces the columns with float ones.
X_traina[mask] = scaler.transform(X_train[mask])

In [11]:
# Scale the test split with the scaler that was fitted on the training split
# (transform only, no re-fit). Whole-column assignment is used instead of
# `.loc[:, mask] = ...` so that float results never have to be written in place
# into an integer-typed column, which newer pandas versions deprecate.
X_testa[mask] = scaler.transform(X_test[mask])

#### 3. Oversample the minority class with SMOTE

In [12]:
from collections import Counter

# Tally how many training rows carry each class label before resampling.
counter = Counter()
counter.update(y_train)
print(counter)

Counter({0: 7423, 1: 723})


In [13]:
# Print the installed imbalanced-learn version so the resampling step can be
# reproduced with the same library release.
import imblearn
print(imblearn.__version__)

0.7.0


In [14]:
from imblearn.over_sampling import SMOTE

In [15]:
# Build the SMOTE oversampler; the fixed seed makes the synthetic samples reproducible.
oversample = SMOTE(random_state=101)

In [16]:
X_smote, y_smote = oversample.fit_resample(X_train, y_train)

In [17]:
counter = Counter(y_smote)
print(counter)

Counter({0: 7423, 1: 7423})


#### 4. Do logistic regression model

In [18]:
from sklearn.linear_model import LogisticRegression

In [19]:
logmodel = LogisticRegression(max_iter=1000, C=0.0001)
logmodel.fit(X_smote,y_smote)

LogisticRegression(C=0.0001, max_iter=1000)

In [20]:
predictions = logmodel.predict(X_test)

#### 5. Evaluate model

In [21]:
from sklearn.metrics import classification_report, confusion_matrix

In [22]:
# Per-class precision, recall and F1 on the held-out test labels.
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

           0       0.95      0.66      0.78      3214
           1       0.13      0.56      0.20       278

    accuracy                           0.66      3492
   macro avg       0.54      0.61      0.49      3492
weighted avg       0.88      0.66      0.73      3492



In [23]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
print(confusion_matrix(y_true=y_test, y_pred=predictions))

[[2133 1081]
 [ 123  155]]


In [24]:
import pickle

In [25]:
# Save the model as a pickle in a file.
# The context manager closes (and flushes) the file handle as soon as the dump
# finishes, instead of leaving an open handle behind for the kernel's lifetime.
with open("reintubate_model_log", 'wb') as model_file:
    pickle.dump(logmodel, model_file)

In [26]:
import joblib

In [27]:
# Persist the fitted logistic regression model with joblib.
joblib.dump(value=logmodel, filename="reintubate_model_log.sav")

['reintubate_model_log.sav']

In [28]:
# Persist the fitted StandardScaler with joblib, alongside the saved model.
joblib.dump(value=scaler, filename="reintubate_scaler.sav")

['reintubate_scaler.sav']