In [159]:
%matplotlib inline

# general libraries
import re
import string
import sys
import os
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# importing date libraries
import datetime as dt
import dateutil.parser as dparser

# scikit-learn libraries for preprocessing
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# scikit-learn libraries for constructing pipelines
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin

# scikit-learn libraries for clustering and dimensionality reduction
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import DBSCAN
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.mixture import GaussianMixture

# scikit-learn libraries for evaluation
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# saving models
import pickle
from sklearn.externals import joblib

# setting pandas display options
pd.set_option("display.max_columns", 999)
pd.set_option("display.max_rows", 10000)
pd.set_option('display.max_colwidth', 100)
pd.set_option('precision', 5)
pd.options.mode.chained_assignment = None

#for train-dev-test splitting
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
#https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.RandomOverSampler.html
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

Goals: 70+% recall, 90+% precision

In [182]:
def getData(fileName):
    path = '../data/' + fileName +'.pkl'
    data = pd.read_pickle(path)
    Y = data.pop('strokeBin')
    data.pop('recordId')
    X = data
    return X, Y

In [186]:
X_Train, Y_Train = getData('pre_op_X_B_train')
X_Dev, Y_Dev = getData('pre_op_X_B_dev')

In [187]:
X_Train.head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_year_2011,surgdt_year_2012,surgdt_year_2013,surgdt_year_2014,surgdt_year_2015,surgdt_year_2016,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jun,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Wed,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_Mid,surgdt_PartOfMonth_End,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,hitanti,TobaccoUse,cigsmoker,cigsmokercurr,chrlungd,prcvint,prcab,prvalve,chf,priorhf,medinotr,hdefd,vdaort,vdstena,vdstenm,hmo2,ivdrugab,alcohol,cvawhen,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv,CardSympTimeOfAdm_ANGINA,CardSympTimeOfAdm_STEMI,CardSympTimeOfSurg_ANGINA,CardSympTimeOfSurg_STEMI,anginalclass_SLIGHT,anginalclass_REST,classnyh_SLIGHT,classnyh_REST,vdinsufm_TRIVIAL,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsuft_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,incidencREOP_FOURTH,status_URGENT,status_EMERGENCY,status_SALVAGE,cvd,cva,cvdtia,cvdpcarsurg,cvdcarsten_NONE,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_NONE,cvdstenrt_50%-79%,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_NONE,cvdstenlft_50%-79%,cvdstenlft_80-99%,cvdstenlft_100%,Arrhythmia,arrhyafib,ArrhythAFlutter_NONE,ArrhythAFlutter_REMOTE,ArrhythAFlutter_RECENT,ArrhythAFib_NONE,ArrhythAFib_PAROXYSMAL,ArrhythAFib_CONTINOUS,ArrhythAFibDur_NONE,ArrhythAFibDur_SHORT,ArrhythAFibDur_LONG,arrhythwhen_NONE,arrhythwhen_SHORT,arrhythwhen_LONG
33392,46,170.0,88.3,30.55363,43.0,1.0,3.8,5.6,8.2,40.0,45.0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,1.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.0,0.0,0.0,0.0,1,0,0,0,1,0,0,0,1,0,0,0,0.0,0.0,1,0,0,1,0,0,1,0,0,0,1,0
8466,54,162.60001,93.9,35.51603,39.7,0.78,3.8,6.3,7.47,61.0,41.0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0.0,1.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,1.0,0.0,1.0,0.0,1,0,0,0,1,0,0,0,1,0,0,0,1.0,0.0,1,0,0,0,1,0,1,0,0,1,0,0
11475,59,175.0,82.0,26.77551,37.0,1.7,3.9,5.9,11.49,45.0,49.0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,1.0,1,0,1,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1,0,0,0,1,0,0,0,1,0,0,0,0.0,0.0,1,0,0,1,0,0,1,0,0,1,0,0
12607,80,168.0,80.0,28.34467,38.6,0.96,3.1,5.6,7.16,65.0,23.1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0,1.0,1.0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0.0,0.0,0.0,0.0,1,0,0,0,1,0,0,0,1,0,0,0,0.0,0.0,1,0,0,1,0,0,1,0,0,1,0,0
33429,85,187.0,108.4,30.99888,41.0,1.4,3.5,5.8,18.4,55.0,35.0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1.0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1,0,0,0,1,0,0,0,1,0,0,0,0.0,0.0,1,0,0,1,0,0,1,0,0,0,1,0


In [192]:
def logistic(xTrain, yTrain, xDev, yDev, solver = 'lbfgs', max_iter = 400):
    clf = LogisticRegression(random_state=0, solver=solver, 
                             class_weight='balanced', max_iter=max_iter)
    clf.fit(xTrain, yTrain)
    print("Accuracy: " + str(clf.score(xDev, yDev)))
    y_pred = clf.predict(xDev)
    metrics = precision_recall_fscore_support(yDev, y_pred)
    print("Precision:") 
    print(metrics[0])
    print("Recall:") 
    print(metrics[1])

Let's get metrics for the vanilla model with no special sampling

In [193]:
logistic(X_Train, Y_Train, X_Dev, Y_Dev)

Accuracy: 0.7045256744995648
Precision:
[0.99071782 0.02639296]
Recall:
[0.70684327 0.54545455]




Next oversample to parity with RandomOverSampler

In [210]:
ros = RandomOverSampler(1, random_state = 4) #50-50 split stroke to no-stroke
X_res, y_res = ros.fit_sample(X=X_Train, y=Y_Train)
logistic(X_res, y_res, X_Dev, Y_Dev)

Accuracy: 0.7393385552654482
Precision:
[0.99       0.02675585]
Recall:
[0.74304636 0.48484848]


In [195]:
def predictForSamplingRatio(ratio, xTrain, yTrain, xDev, yDev):
    ros = RandomOverSampler(ratio, random_state = 4) #50-50 split stroke to no-stroke
    X_res, y_res = ros.fit_sample(X=xTrain, y=yTrain)
    clf = LogisticRegression(random_state=0, solver='lbfgs', max_iter=400).fit(X_res, y_res)
    print("Score: " + str(clf.score(xDev, yDev))) #accuracy
    y_pred = clf.predict(xDev)
    return y_pred
yPred = predictForSamplingRatio(1, X_Train, Y_Train, X_Dev, Y_Dev)
precision_recall_fscore_support(Y_Dev, yPred)

Score: 0.7071366405570061




(array([0.99075216, 0.02662722]),
 array([0.70949227, 0.54545455]),
 array([0.82685876, 0.05077574]),
 array([2265,   33]))

In [197]:
def shrinkDatasetButBalance(overSampleRatio, xTrain, yTrain):
    ros = RandomOverSampler(overSampleRatio, random_state = 4)
    X_res, y_res = ros.fit_sample(X=xTrain, y=yTrain)
    rus = RandomUnderSampler(1, random_state = 4) #50-50 split stroke to no-stroke
    X_res, y_res = rus.fit_sample(X=X_res, y=y_res)
    return X_res, y_res

In [196]:
xRes, yRes = shrinkDatasetButBalance(0.25, X_Train, Y_Train)
logistic(xRes, yRes, X_Dev, Y_Dev,max_iter = 4000)

Accuracy: 0.7132288946910357
Precision:
[0.99023199 0.02575758]
Recall:
[0.71611479 0.51515152]


In [199]:
yRes.sum() #only have 4530 positive samples and 4530 negative examples

4530

## Scale numerical features

In [104]:
numerical_features = ['age',
 'heightcm',
 'weightkg',
 'hct',
 'creatlst',
 'totalbumin',
 'a1clvl',
 'meldscr',
 'hdef',
 'pasys']

In [200]:
#get refreshed data
X_Train, Y_Train = getData('pre_op_X_B_train')
X_Dev, Y_Dev = getData('pre_op_X_B_dev')

In [201]:
def scaleNumericalData(xDf):
    mm_scaler = MinMaxScaler()
    for col in X_Train.columns:
        if col in numerical_features:
            df = xDf[col].values.reshape([-1, 1])
            xDf[col] = mm_scaler.fit_transform(df)
    return xDf

In [202]:
X_Scaled_Train = scaleNumericalData(X_Train)
X_Scaled_Dev = scaleNumericalData(X_Dev)

Let's try just the scaled data, no sampling

In [203]:
logistic(X_Scaled_Train, Y_Train, X_Scaled_Dev, Y_Dev,max_iter = 4000)

Accuracy: 0.7297650130548303
Precision:
[0.99045346 0.02733119]
Recall:
[0.73289183 0.51515152]


Results are worse than when the data was unscaled

Let's do some sampling

In [204]:
xRes, yRes = shrinkDatasetButBalance(0.25, X_Scaled_Train, Y_Train)
logistic(xRes, yRes, X_Scaled_Dev, Y_Dev,max_iter = 4000)

Accuracy: 0.6845082680591819
Precision:
[0.99044586 0.02472527]
Recall:
[0.68653422 0.54545455]


Questions:
- Why are the results exactly the same despite variations?

# Let's try trees

In [213]:
X_Train, Y_Train = getData('pre_op_X_B_train')
X_Dev, Y_Dev = getData('pre_op_X_B_dev')

In [208]:
def tree(X_Train, Y_Train, X_Dev, Y_Dev):
    treeClf = DecisionTreeClassifier(criterion = 'entropy',class_weight = 'balanced', random_state=0)
    treeClf.fit(X_Train, Y_Train)
    y_pred = treeClf.predict(X_Dev)
    print("Accuracy: " + str(clf.score(X_Dev, Y_Dev)))
    metrics = precision_recall_fscore_support(Y_Dev, y_pred)
    print("Precision:") 
    print(metrics[0])
    print("Recall:") 
    print(metrics[1])

In [214]:
ros = RandomOverSampler(1, random_state = 4) #50-50 split stroke to no-stroke
X_res, y_res = ros.fit_sample(X=X_Train, y=Y_Train)

In [216]:
tree(X_Train, Y_Train, X_Dev, Y_Dev)

Accuracy: 0.6984334203655352
Precision:
[0.98532681 0.        ]
Recall:
[0.97836645 0.        ]


In [215]:
tree(X_res, y_res, X_Dev, Y_Dev)

Accuracy: 0.6984334203655352
Precision:
[0.98585323 0.02777778]
Recall:
[0.98454746 0.03030303]


In [217]:
X_Scaled_Train = scaleNumericalData(X_Train)
X_Scaled_Dev = scaleNumericalData(X_Dev)

In [219]:
tree(X_Scaled_Train, Y_Train, X_Scaled_Dev, Y_Dev)

Accuracy: 0.9521322889469104
Precision:
[0.98558322 0.        ]
Recall:
[0.99602649 0.        ]


In [220]:
xRes, yRes = shrinkDatasetButBalance(0.25, X_Scaled_Train, Y_Train)
tree(xRes, yRes, X_Scaled_Dev, Y_Dev)

Accuracy: 0.9521322889469104
Precision:
[0.98512173 0.        ]
Recall:
[0.96467991 0.        ]


In [221]:
def forest(X_Train, Y_Train, X_Dev, Y_Dev):
    forestClf = RandomForestClassifier(criterion = 'entropy', random_state = 0, bootstrap = True, 
                                   class_weight = "balanced_subsample")
    forestClf.fit(X_Train, Y_Train)
    y_pred = forestClf.predict(X_Dev)
    print("Accuracy: " + str(clf.score(X_Dev, Y_Dev)))
    metrics = precision_recall_fscore_support(Y_Dev, y_pred)
    print("Precision:") 
    print(metrics[0])
    print("Recall:") 
    print(metrics[1])

In [222]:
forest(X_Train, Y_Train, X_Dev, Y_Dev)



Accuracy: 0.9521322889469104
Precision:
[0.98563969 0.        ]
Recall:
[1. 0.]


  'precision', 'predicted', average, warn_for)


Try the "tree" dataset

In [223]:
X_Tree_Train, Y_Tree_Train = getData('pre_op_X_tree_B_train')
X_Tree_Dev, Y_Tree_Dev = getData('pre_op_X_tree_B_dev')

In [224]:
forestClf = RandomForestClassifier(criterion = 'gini', random_state = 0, bootstrap = True, 
                                   class_weight = "balanced_subsample")
forestClf.fit(X_Tree_Train, Y_Tree_Train)
y_pred = forestClf.predict(X_Dev)
precision_recall_fscore_support(Y_Dev, y_pred)



ValueError: could not convert string to float: 'Feb'

In [225]:
X_Tree_Train.head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_year,surgdt_month,surgdt_DayOfWeek,surgdt_PartOfMonth,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,diabctrl,dyslip,dialysis,hypertn,infendo,infendty,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,ThAoDisease,syncope,unrespstat,hitanti,TobaccoUse,cigsmoker,cigsmokercurr,chrlungd,prcvint,prcab,prvalve,chf,priorhf,medinotr,hdefd,vdaort,vdstena,vdstenm,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv,CardSympTimeOfAdm,CardSympTimeOfSurg,anginalclass,classnyh,vdinsufm,vdinsuft,incidencREOP,status,cvd,cva,cvawhen,cvdtia,cvdpcarsurg,cvdcarsten,cvdstenrt,cvdstenlft,Arrhythmia,arrhyafib,ArrhythAFlutter,ArrhythAFib,ArrhythAFibDur,arrhythwhen
33392,46,170.0,88.3,30.55363,43.0,1.0,3.8,5.6,8.2,40.0,45.0,2014,Feb,Mon,Beg,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,1.0,NONE,NONE,REST,NONE,NONE,NONE,NONE,URGENT,0.0,0.0,0.0,0.0,0.0,NONE,NONE,NONE,0.0,0.0,NONE,NONE,NONE,SHORT
8466,54,162.60001,93.9,35.51603,39.7,0.78,3.8,6.3,7.47,61.0,41.0,2015,Aug,Tues,End,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0.0,1.0,NONE,NONE,NONE,NONE,MILD,MODERATE,FIRST,NONE,1.0,0.0,0.0,1.0,0.0,NONE,NONE,NONE,1.0,0.0,NONE,PAROXYSMAL,NONE,NONE
11475,59,175.0,82.0,26.77551,37.0,1.7,3.9,5.9,11.49,45.0,49.0,2015,Jan,Wed,End,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,1.0,ANGINA,ANGINA,SLIGHT,NONE,MODERATE,MODERATE,NONE,NONE,0.0,0.0,0.0,0.0,0.0,NONE,NONE,NONE,0.0,0.0,NONE,NONE,NONE,NONE
12607,80,168.0,80.0,28.34467,38.6,0.96,3.1,5.6,7.16,65.0,23.1,2014,Jul,Mon,Mid,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,1.0,1.0,NONE,NONE,NONE,NONE,TRIVIAL,TRIVIAL,NONE,URGENT,0.0,0.0,0.0,0.0,0.0,NONE,NONE,NONE,0.0,0.0,NONE,NONE,NONE,NONE
33429,85,187.0,108.4,30.99888,41.0,1.4,3.5,5.8,18.4,55.0,35.0,2014,Mar,Thurs,Beg,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1.0,NONE,NONE,NONE,NONE,MILD,MILD,NONE,NONE,0.0,0.0,0.0,0.0,0.0,NONE,NONE,NONE,0.0,0.0,NONE,NONE,NONE,SHORT


In [226]:
X_Tree_Train.shape

(18387, 84)

In [227]:
X_Train.shape

(18387, 140)

In [228]:
Y_Train.sum()

265

# Let's try Neural Nets