## logistic regression model

#### import libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

#### load cleaned feature data table

In [2]:
df = pd.read_feather('../data/processed/df_to_model_heartfail')

In [3]:
df.columns

Index(['time_on_vent', 'anchor_age', 'spontrr', 'heartrate', 'std_spontrr',
       'weight', 'sodium', 'abg_po2', 'abg_ph', 'hco3', 'abg_pco2',
       'bloodpressure', 'std_pulseox', 'std_heartrate', 'creatinine', 'bun',
       'lactic_acid', 'hemoglobin', 'wbg', 'tidalvolume', 'std_bloodpressure',
       'tidal_weight', 'pulseox', 're_intub_class', 'gender',
       'admission_type'],
      dtype='object')

In [4]:
df.drop(['admission_type','time_on_vent'],axis=1,inplace=True)

In [5]:
df

Unnamed: 0,anchor_age,spontrr,heartrate,std_spontrr,weight,sodium,abg_po2,abg_ph,hco3,abg_pco2,...,bun,lactic_acid,hemoglobin,wbg,tidalvolume,std_bloodpressure,tidal_weight,pulseox,re_intub_class,gender
0,4.248495,2.995732,4.615121,2.262005,5.125154,4.962845,5.214936,2.141242,3.332205,3.637586,...,2.772589,1.131402,2.104134,3.190476,6.317165,2.686546,1.460338,5.199338,0,F
1,4.248495,2.995732,4.615121,2.262005,5.125154,4.962845,5.214936,2.141242,3.332205,3.637586,...,2.772589,1.131402,2.104134,3.190476,6.317165,2.686546,1.460338,5.199338,0,F
2,3.850148,2.772589,4.488636,1.856050,5.571393,4.941642,4.718499,2.136531,3.367296,3.737670,...,3.737670,0.875469,2.388763,2.617396,6.388561,1.331541,1.184456,-0.395725,0,M
3,4.262680,3.218876,4.394449,1.167725,4.872139,4.927254,4.584967,2.145931,3.433987,3.713572,...,3.091042,0.875469,2.116256,2.595255,6.095825,1.741895,1.485735,-0.395725,0,M
4,4.262680,3.218876,4.394449,1.167725,4.872139,4.927254,4.584967,2.145931,3.433987,3.713572,...,3.091042,0.875469,2.116256,2.595255,6.095825,1.741895,1.485735,-0.395725,0,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3696,4.204693,2.944439,4.276666,1.405311,5.619676,5.017280,4.290459,2.124654,3.637586,4.127134,...,4.330733,0.993252,2.341806,2.251292,6.011267,1.933442,0.908693,5.199338,0,M
3697,4.442651,2.397895,4.804021,2.405094,5.252797,4.941642,4.787492,2.121063,3.044522,3.784190,...,3.688879,0.832909,2.302585,3.310543,5.916202,2.517143,1.080565,-0.709103,0,F
3698,4.356709,2.772589,4.488636,1.905745,5.031744,4.976734,5.379897,2.111425,3.401197,4.204693,...,4.007333,0.741937,2.208274,2.785011,5.880533,2.019496,1.207645,0.373159,0,F
3699,4.143135,3.068053,4.394449,1.777500,5.236442,4.905275,4.812184,2.129421,3.258097,3.332205,...,3.931826,0.788457,2.208274,2.104134,6.817831,2.185905,1.771957,5.199338,0,M


## Handle categoricals

In [6]:
X = df[df.columns.drop(['re_intub_class'])]
y = df['re_intub_class']

In [7]:
# Import libraries 
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import imblearn
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [8]:
numeric_features  = df[df.columns.drop(['gender','re_intub_class'])].columns
numeric_transformer = Pipeline(steps=[
   # ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = df[['gender']].columns
categorical_transformer = Pipeline(steps=[
   # ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(drop = 'first'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

#The order of the columns in the transformed feature matrix follows the order 
#of how the columns are specified in the transformers list. Columns of the original 
#feature matrix that are not specified are dropped from the resulting transformed feature matrix,
#unless specified in the passthrough keyword. Those columns specified with passthrough 
#are added at the right to the output of the transformers.

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state = 101)

In [10]:
numeric_feature_names = preprocessor.transformers[0][2]

In [11]:
training_data = X_train.copy()
training_data.reset_index(inplace=True)
training_data.drop('index',axis=1,inplace=True)

In [12]:
training_data.to_feather("strip_train_data")

In [13]:
scaled_train = preprocessor.fit_transform(X_train)

In [14]:
categs= preprocessor.named_transformers_['cat']['onehot']
onehot_features = categs.get_feature_names()
numeric_feature_names = preprocessor.transformers[0][2]
feature_names = np.concatenate((numeric_feature_names.tolist(),onehot_features))

In [15]:
feature_names

array(['anchor_age', 'spontrr', 'heartrate', 'std_spontrr', 'weight',
       'sodium', 'abg_po2', 'abg_ph', 'hco3', 'abg_pco2', 'bloodpressure',
       'std_pulseox', 'std_heartrate', 'creatinine', 'bun', 'lactic_acid',
       'hemoglobin', 'wbg', 'tidalvolume', 'std_bloodpressure',
       'tidal_weight', 'pulseox', 'x0_M'], dtype=object)

In [16]:
scaled_test = preprocessor.transform(X_test) 

In [17]:
oversample = SMOTE(random_state = 101)
X_smote, y_smote = oversample.fit_resample(scaled_train, y_train)

In [18]:
from imblearn.under_sampling import RandomUnderSampler

In [19]:
#smote = RandomUnderSampler(random_state = 101, replacement = True)
#X_smote, y_smote= smote.fit_resample(scaled_train, y_train)

In [20]:
w = {0:90, 1:10}
w1 = {0:80, 1:20}
w2 = {0:70, 1:30}
w3 = {0:60, 1:40}

param_grid = [
    {'penalty' : ['l1', 'l2'],
    'C' : np.logspace(-4, 4, 20),
    'solver' : ['liblinear'],
    'max_iter' : [1000],
    'class_weight': [w,w1,w2,w3]}]

rfc_param_grid=[
    {'n_estimators' : list(range(10,101,10)),
    'max_features' : list(range(4,24,4)),
    'class_weight' :[w,w1,w2,w3]}]

svc_param_grid ={ 'kernel':('linear', 'rbf'), 
                 'C': [0.1,1, 10, 100, 1000], 
                 'gamma': [1,0.1,0.01,0.001,0.0001]}

In [21]:
clf = GridSearchCV(LogisticRegression()),param_grid,refit=True)
clf.fit(X_smote,y_smote)

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid=[{'class_weight': [{0: 90, 1: 10}, {0: 80, 1: 20},
                                           {0: 70, 1: 30}, {0: 60, 1: 40}],
                          'max_features': [4, 8, 12, 16, 20],
                          'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90,
                                           100]}])

In [22]:
import statsmodels.api as sm

In [23]:
logit_model=sm.Logit(y_smote,X_smote)
result=logit_model.fit()
print(result.summary(alpha = 0.05))

Optimization terminated successfully.
         Current function value: 0.661810
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:         re_intub_class   No. Observations:                 5378
Model:                          Logit   Df Residuals:                     5355
Method:                           MLE   Df Model:                           22
Date:                Mon, 05 Oct 2020   Pseudo R-squ.:                 0.04521
Time:                        14:08:22   Log-Likelihood:                -3559.2
converged:                       True   LL-Null:                       -3727.7
Covariance Type:            nonrobust   LLR p-value:                 3.470e-58
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
x1            -0.0034      0.032     -0.107      0.915      -0.067       0.060
x2            -0.0861      0.

In [24]:
result.params

x1    -0.003449
x2    -0.086085
x3     0.056036
x4    -0.088615
x5    -0.805651
x6    -0.023027
x7    -0.007538
x8     0.137930
x9    -0.051949
x10    0.170528
x11    0.152577
x12    0.164565
x13    0.001726
x14   -0.012712
x15    0.421192
x16    0.080283
x17   -0.079866
x18    0.136912
x19    0.680586
x20    0.037831
x21   -0.848831
x22   -0.111177
x23   -0.082090
dtype: float64

In [25]:
clf.best_estimator_.coef_

AttributeError: 'RandomForestClassifier' object has no attribute 'coef_'

In [None]:
clf.best_estimator_

In [None]:
predictions = clf.predict(scaled_test)
print(classification_report(y_test,predictions))
print(confusion_matrix(y_test,predictions))
tn, fp, fn, tp = confusion_matrix(y_test,predictions).ravel()
tn, fp, fn, tp

In [None]:
clf.predict_log_proba(scaled_test)

In [None]:
train_predictions = clf.predict(scaled_train)
print(classification_report(y_train,train_predictions))
print(confusion_matrix(y_train,train_predictions))

In [None]:
y_pred_proba = clf.predict_proba(scaled_test)[::,1]
fpr, tpr, _ = roc_curve(y_test,  y_pred_proba)
auc = roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()

In [None]:
np.savetxt('heartfail_fpr.txt',fpr)
np.savetxt('heartfail_tpr.txt',fpr)

#### Model explainability

In [None]:
import lime
import lime.lime_tabular

In [None]:
X_test.reset_index(inplace=True)
X_test.drop(['index'],axis=1,inplace=True)

In [None]:
X_test

In [None]:
explainer = lime.lime_tabular.LimeTabularExplainer(X_smote,  
                              feature_names=feature_names,  
                              #class_names=['re_intub_class'], 
                              #categorical_features=categorical_features ,
                              verbose=True, 
                              mode='classification',
                              discretize_continuous=True)

In [None]:
scaled_test[0,:]

In [None]:
clf.predict(scaled_test[8,:].reshape(1,-1))
explog = explainer.explain_instance(scaled_test[0,:], clf.predict_proba, num_features=5)
explog.show_in_notebook(show_table=True)

In [None]:
x = explog.show_in_notebook(show_table=True)

In [None]:
explainer.feature_names

In [None]:
explog.as_list()

In [None]:
feature_list = explog.as_list()

In [None]:
salient_feature = feature_list[2][0].split(' ')

In [None]:
num_top_feats = len(feature_list)

In [None]:
j = 0
for j in np.arange(num_top_feats):
    salient_feature = feature_list[j][0].split(' ')
    j = j+1
    for i in salient_feature:
        if i in feature_names:
            print (i)

#### 5. Evaulate model

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
print(classification_report(y_test,predictions))

In [None]:
print(confusion_matrix(y_test,predictions))

In [None]:
import joblib

In [None]:
#joblib.dump(clf, "reintubate_model_strip.sav")

In [None]:
#joblib.dump(preprocessor, "reintubate_preprocessor_strip.sav")

## Train model

### perform train test split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#### 3. SMOTE IT!

from collections import Counter
counter = Counter(y_train)
print(counter)

# transform the dataset
oversample = SMOTE(random_state = 101)

X_smote, y_smote = oversample.fit_resample(X_train, y_train)

counter = Counter(y_smote)
print(counter)

#### 4. Do logistic regression model

from sklearn.linear_model import LogisticRegression

logmodel = LogisticRegression(max_iter=1000, C=0.0001)
logmodel.fit(X_smote,y_smote)

import pickle

In [None]:
# Save the model as a pickle in a file 
#pickle.dump(logmodel, open("reintubate_model_log", 'wb')) 

# We create the preprocessing pipelines for both numeric and categorical data.
numeric_features  = df[df.columns.drop(['gender','admission_type','re_intub_class'])].columns
#numeric_transformer = ('scaler', StandardScaler())
numeric_transformer = make_column_transformer(
    (StandardScaler(), numeric_features),
    remainder='passthrough')


categorical_features = df[['gender','admission_type']].columns
#categorical_transformer =  ('onehot', OneHotEncoder(drop='first'))
categorical_transformer = make_column_transformer(
    (OneHotEncoder(drop='first'), categorical_features),
    remainder='passthrough')

#preprocessor = make_column_transformer(
 #   transformers=[
  #      ('num', numeric_transformer, numeric_features),
   #     ('cat', categorical_transformer, categorical_features)],
#remainder ='passthrough')

clf = Pipeline(steps=[('num', numeric_transformer),
                      ('cat', categorical_transformer),
                      ('classifier', LogisticRegression())])

#### 2. Perform feature scaling

Because the range of values in the features are not necessarily in the same order of magnitude, we will scale the feature data prior to training the model.

* actually... they might not be far off! 

mask=['spontRR', 'stdABP', 'meanABP', 'stdSpontRR', 'pulseox', 'stdPulseox',
       'temp', 'heartRate', 'stdHeartRate', 'weight', 'height', 'anchor_age',
       'time_on_vent']

X_traina = X_train.copy()
X_testa = X_test.copy()

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train.loc[:,mask])
X_traina.loc[:,mask] = scaler.transform(X_train.loc[:,mask])

#X_train = scaler.transform(X_train)
X_testa.loc[:,mask] = scaler.transform(X_test.loc[:,mask])