## Capstone Project

### Modeling Notebook - `LogisticRegression`  `COMBINED DATASET` from 10/24/19

#### Importing Libraries

In [1]:
%matplotlib inline

# general libraries
import re
import string
import sys
import os
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# importing date libraries
import datetime as dt
import dateutil.parser as dparser

# scikit-learn libraries for preprocessing
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# scikit-learn libraries for constructing pipelines
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin

# scikit-learn libraries for clustering and dimensionality reduction
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import DBSCAN
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.mixture import GaussianMixture

# scikit-learn libraries for evaluation
from sklearn import metrics

# scikit-learn libraries for feature selection
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import RFECV

# scikit-learn libraries for learning
from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV, cross_validate, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# saving models
import pickle
from sklearn.externals import joblib

# setting pandas display options
pd.set_option("display.max_columns", 999)
pd.set_option("display.max_rows", 10000)
pd.set_option('display.max_colwidth', 100)
pd.set_option('precision', 5)
pd.options.mode.chained_assignment = None



#### Directory/File Structure

In [2]:
sys.version

'3.6.8 |Anaconda, Inc.| (default, Dec 29 2018, 19:04:46) \n[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]'

In [3]:
print ('Running pandas version:', pd.__version__)
print ('Running numpy version:', np.__version__)
print ('Running sklearn version:', sklearn.__version__)

Running pandas version: 0.25.1
Running numpy version: 1.14.2
Running sklearn version: 0.21.3


In [4]:
os.getcwd()

'/Users/nate_velarde/Documents/UC_Berkeley/Courses/W210_Capstone/stroke_project/sandbox/notebooks'

In [5]:
os.chdir('../data')

In [6]:
sorted(os.listdir())

['.DS_Store',
 '273_vs_281_null_count_by_feature.csv',
 '273_vs_281_null_count_by_feature.xlsm',
 'Capstone - Complication list - complete.xlsx',
 'Capstone - STS risk factor list.xlsx',
 'Capstone_Fall_Shannon_Sept2019_request.csv',
 'PREOP_dataset_10_24.pkl',
 'PREOP_dataset_TREE_10_24.pkl',
 'X_A_DREF.pkl',
 'X_A_DREF_TREE_SKLEARN.pkl',
 'X_PREOP_10_24.pkl',
 'X_PREOP_TREE_10_24.pkl',
 'X_dev_A_DREF.pkl',
 'X_dev_A_DREF_TREE_SKLEARN.pkl',
 'X_dev_PREOP_10_24.pkl',
 'X_dev_PREOP_TREE_10_24.pkl',
 'X_test_A_DREF.pkl',
 'X_test_A_DREF_TREE_SKLEARN.pkl',
 'X_test_PREOP_10_24.pkl',
 'X_test_PREOP_TREE_10_24.pkl',
 'X_train_A_DREF.pkl',
 'X_train_A_DREF_TREE_SKLEARN.pkl',
 'X_train_PREOP_10_24.pkl',
 'X_train_PREOP_TREE_10_24.pkl',
 'capstone_STS_risk_factor_features.xlsx',
 'capstone_cleaned_data.csv',
 'capstone_data-version-2.xlsx',
 'capstone_data.xlsx',
 'capstone_data_binarized_outcome.pkl',
 'capstone_data_binarized_outcome.xlsx',
 'capstone_data_binarized_outcome_compressed.pkl',


#### Loading Datasets

#### `X_train`, `y_train`
- designation of `_all` denotes complete feature set

In [7]:
X_train_all = pd.read_pickle('X_train_PREOP_10_24.pkl')
y_train = pd.read_pickle('y_train_PREOP_10_24.pkl')

In [8]:
X_train_all.shape, y_train.shape

((34192, 109), (34192,))

#### `X_dev`, `y_dev`
- designation of `_all` denotes complete feature set

In [9]:
X_dev_all = pd.read_pickle('X_dev_PREOP_10_24.pkl')
y_dev = pd.read_pickle('y_dev_PREOP_10_24.pkl')

In [10]:
X_dev_all.shape, y_dev.shape

((4274, 109), (4274,))

#### `X_test`, `y_test`
- designation of `_all` denotes complete feature set

In [11]:
X_test_all = pd.read_pickle('X_dev_PREOP_10_24.pkl')
y_test = pd.read_pickle('y_dev_PREOP_10_24.pkl')

In [12]:
X_test_all.shape, y_dev.shape

((4274, 109), (4274,))

- validating row count for `COMBINED DATASET` from 10/24/19 - `42,740` total observations

In [13]:
42740 - X_train_all.shape[0] - X_dev_all.shape[0] - X_test_all.shape[0]

0

- last look at the data (`X_train_all`) before modeling

In [14]:
X_train_all.head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv,anginalclass_SLIGHT,anginalclass_REST,classnyh_SLIGHT,classnyh_REST,vdinsufm_TRIVIAL,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsuft_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,incidencREOP_FOURTH,status_URGENT,status_EMERGENCY,status_SALVAGE,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_50%-79%,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_50%-79%,cvdstenlft_80-99%,cvdstenlft_100%,ArrhythDur_when_Combo_SHORT,ArrhythDur_when_Combo_LONG
0,-2.12457,0.11594,0.32942,0.157,-2.65268,-0.35221,-2.27155,-1.62927,2.47597,0.98148,3.58585,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,1.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1.08243,-0.82771,-0.95497,-0.39563,-0.10617,-0.13691,0.04597,0.25206,-0.29493,0.33103,0.49141,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0,0.0,0.0,0.0,0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,1.0,0,1,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,-0.20037,1.5314,1.56497,0.35976,0.59504,-0.25533,-0.34028,0.32442,-0.67978,0.57495,-0.12748,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,1.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,0.44103,-0.31888,-0.21754,-0.04732,0.59504,-0.25533,0.04597,-0.76096,-0.29493,0.16842,-0.64322,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,1.0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,-0.75014,-1.059,0.19268,0.56118,-2.1729,-0.36298,0.04597,0.39678,-0.29493,0.57495,-0.65353,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,1.0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,1,0


In [15]:
X_train_all.tail()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv,anginalclass_SLIGHT,anginalclass_REST,classnyh_SLIGHT,classnyh_REST,vdinsufm_TRIVIAL,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsuft_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,incidencREOP_FOURTH,status_URGENT,status_EMERGENCY,status_SALVAGE,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_50%-79%,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_50%-79%,cvdstenlft_80-99%,cvdstenlft_100%,ArrhythDur_when_Combo_SHORT,ArrhythDur_when_Combo_LONG
34187,-0.292,0.58776,0.78848,0.26686,0.04145,-0.25533,0.04597,0.83093,-0.29493,0.00581,0.56361,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,1.0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0
34188,-1.11666,0.60626,-0.14917,-0.31935,-0.51214,-0.47063,-2.07842,0.61386,0.57972,-3.16512,1.62604,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,1.0,0,1,0,1,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
34189,0.62428,-1.059,0.17314,0.54625,-0.32761,-0.07232,-0.34028,-1.0504,-0.45237,1.388,-0.12748,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,1.0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0
34190,0.9908,-1.52157,0.75918,1.25109,-0.19844,-0.47063,0.62535,1.04801,-0.67978,0.57495,2.03863,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0.0,1.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,0.0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
34191,-0.20037,0.60626,0.71034,0.21143,-0.69667,-0.04003,-0.14715,3.14642,-0.36491,0.81886,-0.95266,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,1.0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0


## Logistic Regression
- see Albon 11.1 pages 179-182

- instantiating a `LogisticRegression()` object

In [16]:
lr_clf = LogisticRegression(class_weight='balanced',
                            random_state=0)

- creating a `StratifiedKFold` cross validation

In [17]:
skf = StratifiedKFold(n_splits=5,
                      shuffle=True,
                      random_state=0)

- defining `scoring_metrics`

In [18]:
scoring_metrics = ['accuracy',
                   'f1',
                   'f1_macro',
                   'f1_weighted',
                   'precision',
                   'precision_macro',
                   'precision_weighted',
                   'recall',
                   'recall_macro',
                   'recall_weighted',
                   'roc_auc']

- conduct `StratifiedKFold` cross-validation

In [19]:
skf_results = cross_validate(lr_clf, # model
                             X_train_all, # feature matrix - X
                             y_train, # target vector - y
                             cv=skf, # cross-validation technique
                             scoring=scoring_metrics, # loss functions
                             return_train_score=True, # returns training score
                             verbose=2, # verbosity level to check progress of calculations
                             n_jobs=-1)                          

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    6.3s finished


In [20]:
all_features_results = pd.DataFrame(skf_results).drop(['fit_time', 'score_time'], axis=1)

In [21]:
all_features_results

Unnamed: 0,test_accuracy,train_accuracy,test_f1,train_f1,test_f1_macro,train_f1_macro,test_f1_weighted,train_f1_weighted,test_precision,train_precision,test_precision_macro,train_precision_macro,test_precision_weighted,train_precision_weighted,test_recall,train_recall,test_recall_macro,train_recall_macro,test_recall_weighted,train_recall_weighted,test_roc_auc,train_roc_auc
0,0.6732,0.68362,0.05337,0.06705,0.42794,0.43828,0.78969,0.79689,0.02807,0.0353,0.50816,0.5135,0.97182,0.97544,0.53846,0.66882,0.607,0.67635,0.6732,0.68362,0.65924,0.73768
1,0.68607,0.68267,0.0521,0.06707,0.43199,0.43794,0.79888,0.79621,0.02747,0.0353,0.50755,0.51352,0.97121,0.97548,0.50427,0.67097,0.59675,0.67692,0.68607,0.68267,0.63299,0.74625
2,0.68397,0.68707,0.05592,0.06611,0.43307,0.43908,0.79743,0.79933,0.02945,0.03483,0.50915,0.51304,0.97258,0.97497,0.55172,0.65021,0.61899,0.66896,0.68397,0.68707,0.66522,0.7374
3,0.67315,0.68114,0.05974,0.06677,0.43097,0.43725,0.7896,0.7951,0.0314,0.03514,0.51079,0.5134,0.9739,0.97537,0.61207,0.66953,0.64314,0.67544,0.67315,0.68114,0.69263,0.73341
4,0.6749,0.67906,0.04633,0.06676,0.42519,0.43648,0.7912,0.79361,0.02438,0.03512,0.50548,0.51343,0.97027,0.97545,0.46552,0.67382,0.57202,0.67649,0.6749,0.67906,0.65766,0.73334


In [22]:
all_features_results_summary = pd.DataFrame(all_features_results.mean(),
                                            columns=['all_score']).round(4)

In [23]:
all_features_results_summary

Unnamed: 0,all_score
test_accuracy,0.6783
train_accuracy,0.6827
test_f1,0.0535
train_f1,0.0668
test_f1_macro,0.4298
train_f1_macro,0.4378
test_f1_weighted,0.7934
train_f1_weighted,0.7962
test_precision,0.0282
train_precision,0.0351


#### Takeaways
- Straight out of the box `LogisticRegression` model scoring much better with `COMBINED DATASET` and `strokeBin2` relative to `2.73` and `strokeBin` which has `test_roc_auc` score of `0.5981`
- Mean `test_roc_auc` is slightly below `STS` model benchmark of `0.69`
- Disparity between `train` and `test` scores is not as wide as it was in `2.73`/`strokeBin` - model is overfitting much less in this setup
- `2.73` `train` - `test` score disparity was `0.18` versus only `0.07` here
- is the effect of having ~2x training data or more inclusive outcome variable or both??

### Logistic Regression with `SelectKBest` Feature Set
- let us see if automated feature selection can improve results

In [24]:
X_train_all.head(2)

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv,anginalclass_SLIGHT,anginalclass_REST,classnyh_SLIGHT,classnyh_REST,vdinsufm_TRIVIAL,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsuft_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,incidencREOP_FOURTH,status_URGENT,status_EMERGENCY,status_SALVAGE,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_50%-79%,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_50%-79%,cvdstenlft_80-99%,cvdstenlft_100%,ArrhythDur_when_Combo_SHORT,ArrhythDur_when_Combo_LONG
0,-2.12457,0.11594,0.32942,0.157,-2.65268,-0.35221,-2.27155,-1.62927,2.47597,0.98148,3.58585,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,1.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1.08243,-0.82771,-0.95497,-0.39563,-0.10617,-0.13691,0.04597,0.25206,-0.29493,0.33103,0.49141,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0,0.0,0.0,0.0,0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,1.0,0,1,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### `SelectKBest` features from `Automated_Feature_Selection_Combined_Dataset` Notebook
- uncurated list
- `47` features selected out of `110`

In [25]:
kbest_features = ['age',
                  'heightcm',
                  'weightkg',
                  'bmi', # put bmi back in since it was marginally insignificant
                  'hct',
                  'creatlst',
                  'totalbumin',
                  'a1clvl',
                  'meldscr',
                  'hdef',
                  'pasys',
                     
                  'surgdt_DayOfWeek_Mon',   # identified by SelectKBest
                  'surgdt_DayOfWeek_Tues',  # including given `surgdt_DayOfWeek_Mon`
                  'surgdt_DayOfWeek_Thurs', # including given `surgdt_DayOfWeek_Mon`
                  'surgdt_DayOfWeek_Fri',   # including given `surgdt_DayOfWeek_Mon`
                  'surgdt_DayOfWeek_Sat',   # including given `surgdt_DayOfWeek_Mon`
                  'surgdt_DayOfWeek_Sun',   # including given `surgdt_DayOfWeek_Mon`
                  
                  'gender',
                  'raceblack',
                  'diabetes',
                  'infendo',
                  'mediastrad',
                  'pvd',
                  'cvd',
                  'cva',
                  'cvdtia',
                  'cvdpcarsurg',
                  'prcvint',
                  'prvalve',
                  'chf',
                  'priorhf',
                  'arrhyafib',
                  'medinotr',
                  'vdaort',
                  'diabctrl',
                  'infendty',
                  'chrlungd',
                  'carshock24',
                  
                  'classnyh_REST',      # identified by SelectKBest
                  'classnyh_SLIGHT',    # including given `classnyh_REST`
                  
                  'vdinsufm_MILD',      # identified by SelectKBest
                  'vdinsufm_MODERATE',  # identified by SelectKBest
                  'vdinsufm_SEVERE',    # identified by SelectKBest
                  'vdinsufm_TRIVIAL',   # including given `vdinsufm_MILD`, `vdinsufm_MODERATE`, `vdinsufm_SEVERE`
                  
                  'vdinsuft_MILD',      # identified by SelectKBest
                  'vdinsuft_MODERATE',  # identified by SelectKBest
                  'vdinsuft_TRIVIAL',   # including given `vdinsuft_MILD` and `vdinsuft_MODERATE`
                  'vdinsuft_SEVERE',    # including given `vdinsuft_MILD` and `vdinsuft_MODERATE`
                  
                  'incidencREOP_FIRST',  # identified by SelectKBest
                  'incidencREOP_FOURTH', # identified by SelectKBest
                  'incidencREOP_SECOND', # including given `incidencREOP_FIRST` and `incidencREOP_FOURTH`
                  'incidencREOP_THIRD',  # including given `incidencREOP_FIRST` and `incidencREOP_FOURTH`
                  
                  'status_EMERGENCY',    # identified by SelectKBest
                  'status_URGENT',       # including given `status_EMERGENCY`
                  'status_SALVAGE',      # including given `status_EMERGENCY`
                  
                  'cvdcarsten_RIGHT',    # identified by SelectKBest
                  'cvdcarsten_LEFT',     # identified by SelectKBest
                  'cvdcarsten_BOTH',     # including given `cvdcarsten_RIGHT` and `cvdcarsten_LEFT`
                  
                  'cvdstenrt_80-99%',    # identified by SelectKBest
                  'cvdstenrt_100%',      # identified by SelectKBest
                  'cvdstenrt_50%-79%',   # including given `cvdstenrt_80-99%` and `cvdstenrt_100%`
                  
                  'cvdstenlft_80-99%',   # identified by SelectKBest
                  'cvdstenlft_100%',     # identified by SelectKBest
                  'cvdstenlft_80-99%']   # including given `cvdstenlft_80-99%` and `cvdstenlft_100%`

In [26]:
len(kbest_features)

64

- in the `2.73` `Automated_Feature_Selection` notebook, `SelectKBest` identified `38` significant features according to `chi-squared` and `ANOVA F-value` statistics for categorical and numerical features, respectively
- creating  `X_train_kbest`

In [27]:
X_train_all.shape

(34192, 109)

In [28]:
X_train_kbest = X_train_all.copy()[kbest_features]

In [29]:
X_train_kbest.head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,gender,raceblack,diabetes,infendo,mediastrad,pvd,cvd,cva,cvdtia,cvdpcarsurg,prcvint,prvalve,chf,priorhf,arrhyafib,medinotr,vdaort,diabctrl,infendty,chrlungd,carshock24,classnyh_REST,classnyh_SLIGHT,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsufm_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_TRIVIAL,vdinsuft_SEVERE,incidencREOP_FIRST,incidencREOP_FOURTH,incidencREOP_SECOND,incidencREOP_THIRD,status_EMERGENCY,status_URGENT,status_SALVAGE,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenrt_50%-79%,cvdstenlft_80-99%,cvdstenlft_100%,cvdstenlft_80-99%.1
0,-2.12457,0.11594,0.32942,0.157,-2.65268,-0.35221,-2.27155,-1.62927,2.47597,0.98148,3.58585,0,0,0,0,0,0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1.08243,-0.82771,-0.95497,-0.39563,-0.10617,-0.13691,0.04597,0.25206,-0.29493,0.33103,0.49141,1,0,0,0,0,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,-0.20037,1.5314,1.56497,0.35976,0.59504,-0.25533,-0.34028,0.32442,-0.67978,0.57495,-0.12748,0,1,0,0,0,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0.44103,-0.31888,-0.21754,-0.04732,0.59504,-0.25533,0.04597,-0.76096,-0.29493,0.16842,-0.64322,0,0,1,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,-0.75014,-1.059,0.19268,0.56118,-2.1729,-0.36298,0.04597,0.39678,-0.29493,0.57495,-0.65353,0,0,0,1,0,0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0


In [30]:
X_train_kbest.shape

(34192, 64)

In [31]:
y_train.shape

(34192,)

### Running `X_train_kbest` through the Logistic Regression Modeling Process
- instantiating a `LogisticRegression()` object

In [32]:
lr_clf = LogisticRegression(class_weight='balanced',
                            random_state=0)

- creating a `StratifiedKFold` cross validation

In [33]:
skf = StratifiedKFold(n_splits=5,
                      shuffle=True,
                      random_state=0)

- conduct `StratifiedKFold` cross-validation

In [34]:
skf_results = cross_validate(lr_clf, # model
                             X_train_kbest, # feature matrix - X
                             y_train, # target vector - y
                             cv=skf, # cross-validation technique
                             scoring=scoring_metrics, # loss functions
                             return_train_score=True, # returns training score
                             verbose=2, # verbosity level to check progress of calculations
                             n_jobs=-1)                          

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.6s finished


In [35]:
kbest_features_results = pd.DataFrame(skf_results).drop(['fit_time', 'score_time'], axis=1)

In [36]:
kbest_features_results

Unnamed: 0,test_accuracy,train_accuracy,test_f1,train_f1,test_f1_macro,train_f1_macro,test_f1_weighted,train_f1_weighted,test_precision,train_precision,test_precision_macro,train_precision_macro,test_precision_weighted,train_precision_weighted,test_recall,train_recall,test_recall_macro,train_recall_macro,test_recall_weighted,train_recall_weighted,test_roc_auc,train_roc_auc
0,0.66691,0.67484,0.0579,0.06261,0.4278,0.43296,0.78503,0.79072,0.03042,0.03292,0.51003,0.51188,0.97323,0.97455,0.59829,0.63871,0.6332,0.65709,0.66691,0.67484,0.67591,0.71694
1,0.68548,0.67532,0.05782,0.06289,0.43453,0.43327,0.79834,0.79105,0.03047,0.03307,0.50978,0.51198,0.97269,0.97461,0.5641,0.64086,0.62585,0.65839,0.68548,0.67532,0.66752,0.7215
2,0.67154,0.68213,0.04992,0.06375,0.42568,0.43616,0.7887,0.79588,0.02625,0.03356,0.50691,0.51219,0.97127,0.97452,0.50862,0.63519,0.59149,0.65907,0.67154,0.68213,0.66519,0.72139
3,0.67432,0.6746,0.06153,0.06078,0.43225,0.43199,0.79039,0.79056,0.03234,0.03196,0.51148,0.51113,0.97436,0.97397,0.62931,0.61803,0.6522,0.6468,0.67432,0.6746,0.69584,0.71514
4,0.66642,0.67409,0.05235,0.06306,0.42497,0.4329,0.78494,0.79013,0.0275,0.03315,0.50792,0.51204,0.97204,0.97461,0.5431,0.64378,0.60583,0.6592,0.66642,0.67409,0.6748,0.71719


In [37]:
kbest_features_results_summary = pd.DataFrame(kbest_features_results.mean(),
                                              columns=['kbest_score']).round(4)

In [38]:
kbest_features_results_summary

Unnamed: 0,kbest_score
test_accuracy,0.6729
train_accuracy,0.6762
test_f1,0.0559
train_f1,0.0626
test_f1_macro,0.429
train_f1_macro,0.4335
test_f1_weighted,0.7895
train_f1_weighted,0.7917
test_precision,0.0294
train_precision,0.0329


#### Comparing results between `all_features` and `kbest_features`

In [39]:
results_summary = pd.concat([all_features_results_summary, 
                             kbest_features_results_summary],
                            axis=1)

In [40]:
results_summary['kbest_all_delta'] = results_summary['kbest_score'] - results_summary['all_score']

In [41]:
results_summary

Unnamed: 0,all_score,kbest_score,kbest_all_delta
test_accuracy,0.6783,0.6729,-0.0054
train_accuracy,0.6827,0.6762,-0.0065
test_f1,0.0535,0.0559,0.0024
train_f1,0.0668,0.0626,-0.0042
test_f1_macro,0.4298,0.429,-0.0008
train_f1_macro,0.4378,0.4335,-0.0043
test_f1_weighted,0.7934,0.7895,-0.0039
train_f1_weighted,0.7962,0.7917,-0.0045
test_precision,0.0282,0.0294,0.0012
train_precision,0.0351,0.0329,-0.0022


#### Takeaways
- Did not improve results significantly versus running full feature list
- still marginally short of `STS Model` benchmark levels on `roc_auc`
- Restricted feature matrix has reduced overfitting as measured by difference between `train` and `test` scores
- Also see less overfitting in lower `train` scores and higher `test` scores in `kbest_features` versus `all_features`
- Results point to exploring `Logistic Regression` with `Lasso` as an alternative automated feature selection process

### Logistic Regression with Lasso using `GridSearchCV`
- instantiating a `LogisticRegression()` object

In [42]:
log_reg = LogisticRegression(penalty='l1',
                             class_weight='balanced',
                             solver='liblinear',
                             random_state=0)

- defining hyperparamter candidates

In [43]:
hyper_params = {'C': [0.00001, 
                      0.0001, 
                      0.001,
                      0.01,
                      0.015,
                      0.1,
                      1,
                      10]}             

- create `GridSearchCV` object

In [44]:
gs = GridSearchCV(log_reg, # estimator
                  hyper_params, # param_grid
                  scoring=scoring_metrics, # loss functions
                  cv=5, # 
                  refit='roc_auc', # best_estimator_ will be based on this scoring metric
                  verbose=2,
                  return_train_score=True,
                  n_jobs=-1)

- fit `GridSearchcV`

In [45]:
log_reg_lasso = gs.fit(X_train_all,
                       y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   21.5s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   36.2s finished


- examining results

In [46]:
gs_results = pd.DataFrame(log_reg_lasso.cv_results_)

In [47]:
gs_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,split3_test_accuracy,split4_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy,split0_train_accuracy,split1_train_accuracy,split2_train_accuracy,split3_train_accuracy,split4_train_accuracy,mean_train_accuracy,std_train_accuracy,split0_test_f1,split1_test_f1,split2_test_f1,split3_test_f1,split4_test_f1,mean_test_f1,std_test_f1,rank_test_f1,split0_train_f1,split1_train_f1,split2_train_f1,split3_train_f1,split4_train_f1,mean_train_f1,std_train_f1,split0_test_f1_macro,split1_test_f1_macro,split2_test_f1_macro,split3_test_f1_macro,split4_test_f1_macro,mean_test_f1_macro,std_test_f1_macro,rank_test_f1_macro,split0_train_f1_macro,split1_train_f1_macro,split2_train_f1_macro,split3_train_f1_macro,split4_train_f1_macro,mean_train_f1_macro,std_train_f1_macro,split0_test_f1_weighted,split1_test_f1_weighted,split2_test_f1_weighted,split3_test_f1_weighted,split4_test_f1_weighted,mean_test_f1_weighted,std_test_f1_weighted,rank_test_f1_weighted,split0_train_f1_weighted,split1_train_f1_weighted,split2_train_f1_weighted,split3_train_f1_weighted,split4_train_f1_weighted,mean_train_f1_weighted,std_train_f1_weighted,split0_test_precision,split1_test_precision,split2_test_precision,split3_test_precision,split4_test_precision,mean_test_precision,std_test_precision,rank_test_precision,split0_train_precision,split1_train_precision,split2_train_precision,split3_train_precision,split4_train_precision,mean_train_precision,std_train_precision,split0_test_precision_macro,split1_test_precision_macro,split2_test_precision_macro,split3_test_precision_macro,split4_test_precision_macro,mean_test_precision_macro,std_test_precision_macro,rank_test_precision_macro,split0_train_precision_macro,split1_train_precision_macro,split2_train_precision_macro,split3_train_precision_macro,split4_train_precision_macro,mean_train_precision_macro,std_train_precision_macro,split0_test_precision_weighted,split1_test_precision_weighted,split2_test_precision_weighted,split3_test_precision_weighted,split4_test_precision_weighted,mean_test_precision_weighted,std_test_precision_weighted,rank_test_precision_weighted,split0_train_precision_weighted,split1_train_precision_weighted,split2_train_precision_weighted,split3_train_precision_weighted,split4_train_precision_weighted,mean_train_precision_weighted,std_train_precision_weighted,split0_test_recall,split1_test_recall,split2_test_recall,split3_test_recall,split4_test_recall,mean_test_recall,std_test_recall,rank_test_recall,split0_train_recall,split1_train_recall,split2_train_recall,split3_train_recall,split4_train_recall,mean_train_recall,std_train_recall,split0_test_recall_macro,split1_test_recall_macro,split2_test_recall_macro,split3_test_recall_macro,split4_test_recall_macro,mean_test_recall_macro,std_test_recall_macro,rank_test_recall_macro,split0_train_recall_macro,split1_train_recall_macro,split2_train_recall_macro,split3_train_recall_macro,split4_train_recall_macro,mean_train_recall_macro,std_train_recall_macro,split0_test_recall_weighted,split1_test_recall_weighted,split2_test_recall_weighted,split3_test_recall_weighted,split4_test_recall_weighted,mean_test_recall_weighted,std_test_recall_weighted,rank_test_recall_weighted,split0_train_recall_weighted,split1_train_recall_weighted,split2_train_recall_weighted,split3_train_recall_weighted,split4_train_recall_weighted,mean_train_recall_weighted,std_train_recall_weighted,split0_test_roc_auc,split1_test_roc_auc,split2_test_roc_auc,split3_test_roc_auc,split4_test_roc_auc,mean_test_roc_auc,std_test_roc_auc,rank_test_roc_auc,split0_train_roc_auc,split1_train_roc_auc,split2_train_roc_auc,split3_train_roc_auc,split4_train_roc_auc,mean_train_roc_auc,std_train_roc_auc
0,0.16652,0.00622,0.28036,0.09986,1e-05,{'C': 1e-05},0.98289,0.98289,0.98304,0.98304,0.98304,0.98298,7e-05,1,0.983,0.983,0.98296,0.98296,0.98296,0.98298,2e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.49569,0.49569,0.49572,0.49572,0.49572,0.49571,2e-05,1,0.49571,0.49571,0.4957,0.4957,0.4957,0.49571,4.47714e-06,0.97441,0.97441,0.97463,0.97463,0.97463,0.97454,0.00011,1,0.97457,0.97457,0.97452,0.97452,0.97452,0.97454,3e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.49145,0.49145,0.49152,0.49152,0.49152,0.49149,4e-05,7,0.4915,0.4915,0.49148,0.49148,0.49148,0.49149,8.80255e-06,0.96608,0.96608,0.96636,0.96636,0.96636,0.96625,0.00014,7,0.96629,0.96629,0.96622,0.96622,0.96622,0.96625,3e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.5,0.5,0.0,7,0.5,0.5,0.5,0.5,0.5,0.5,0.0,0.98289,0.98289,0.98304,0.98304,0.98304,0.98298,7e-05,1,0.983,0.983,0.98296,0.98296,0.98296,0.98298,2e-05,0.5,0.5,0.5,0.5,0.5,0.5,0.0,7,0.5,0.5,0.5,0.5,0.5,0.5,0.0
1,0.16198,0.00669,0.0792,0.00459,0.0001,{'C': 0.0001},0.98289,0.98289,0.98304,0.98304,0.98304,0.98298,7e-05,1,0.983,0.983,0.98296,0.98296,0.98296,0.98298,2e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.49569,0.49569,0.49572,0.49572,0.49572,0.49571,2e-05,1,0.49571,0.49571,0.4957,0.4957,0.4957,0.49571,4.47714e-06,0.97441,0.97441,0.97463,0.97463,0.97463,0.97454,0.00011,1,0.97457,0.97457,0.97452,0.97452,0.97452,0.97454,3e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.49145,0.49145,0.49152,0.49152,0.49152,0.49149,4e-05,7,0.4915,0.4915,0.49148,0.49148,0.49148,0.49149,8.80255e-06,0.96608,0.96608,0.96636,0.96636,0.96636,0.96625,0.00014,7,0.96629,0.96629,0.96622,0.96622,0.96622,0.96625,3e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.5,0.5,0.0,7,0.5,0.5,0.5,0.5,0.5,0.5,0.0,0.98289,0.98289,0.98304,0.98304,0.98304,0.98298,7e-05,1,0.983,0.983,0.98296,0.98296,0.98296,0.98298,2e-05,0.5,0.5,0.5,0.5,0.5,0.5,0.0,7,0.5,0.5,0.5,0.5,0.5,0.5,0.0
2,0.19902,0.00316,0.09239,0.0032,0.001,{'C': 0.001},0.50943,0.49364,0.50687,0.50468,0.50951,0.50483,0.00587,8,0.50579,0.49768,0.50775,0.50179,0.5098,0.50456,0.00434,0.0417,0.04258,0.04961,0.04511,0.04499,0.0448,0.00275,6,0.04601,0.04358,0.04429,0.04392,0.04637,0.04483,0.00114,0.35602,0.34919,0.35833,0.35536,0.3575,0.35528,0.00322,8,0.35627,0.35149,0.3564,0.35352,0.35824,0.35518,0.00238459,0.65958,0.64531,0.65659,0.65509,0.65941,0.65519,0.00523,8,0.65597,0.64893,0.65787,0.65257,0.65948,0.65497,0.0038,0.02157,0.022,0.02564,0.02332,0.02326,0.02316,0.00142,6,0.02379,0.02252,0.0229,0.0227,0.02398,0.02318,0.00059,0.50442,0.50501,0.50871,0.50638,0.50626,0.50615,0.00148,6,0.5068,0.50561,0.50584,0.50571,0.5069,0.50617,0.000559749,0.97074,0.97149,0.97539,0.97304,0.97286,0.97271,0.00159,3,0.97339,0.97228,0.97233,0.97226,0.97337,0.97273,0.00054,0.62393,0.65812,0.75862,0.68966,0.68103,0.68227,0.04443,1,0.70108,0.67312,0.66953,0.67167,0.69957,0.68299,0.0142,0.56568,0.57445,0.63057,0.59557,0.59379,0.59201,0.02238,6,0.60175,0.58388,0.58724,0.58526,0.60304,0.59223,0.00837,0.50943,0.49364,0.50687,0.50468,0.50951,0.50483,0.00587,8,0.50579,0.49768,0.50775,0.50179,0.5098,0.50456,0.00434,0.61294,0.60712,0.67212,0.6242,0.62323,0.62792,0.02301,6,0.63956,0.62948,0.62529,0.6341,0.64562,0.63481,0.0072
3,0.44303,0.01958,0.08644,0.00375,0.01,{'C': 0.01},0.66238,0.63927,0.65253,0.6499,0.65867,0.65255,0.00797,7,0.64903,0.64911,0.64927,0.65994,0.66111,0.65369,0.00559,0.06024,0.05443,0.06161,0.04545,0.04812,0.05397,0.0064,4,0.05771,0.05754,0.05608,0.06078,0.06022,0.05847,0.00176,0.42723,0.41578,0.4242,0.41555,0.42009,0.42057,0.0046,7,0.42104,0.42098,0.42035,0.42658,0.42675,0.42314,0.00289068,0.78167,0.76476,0.77449,0.77308,0.77943,0.77469,0.00587,7,0.772,0.77206,0.77221,0.77992,0.78079,0.7754,0.00406,0.03162,0.02849,0.03228,0.02383,0.02526,0.0283,0.00335,4,0.03024,0.03015,0.02939,0.03189,0.0316,0.03065,0.00095,0.51103,0.50895,0.51185,0.50528,0.5063,0.50868,0.00256,3,0.51027,0.5102,0.50957,0.51134,0.5111,0.51049,0.000644362,0.97404,0.97298,0.97514,0.97039,0.97102,0.97271,0.00179,2,0.97398,0.97392,0.97339,0.97445,0.97425,0.974,0.00036,0.63248,0.60684,0.67241,0.49138,0.50862,0.58235,0.07062,3,0.63226,0.63011,0.61159,0.64592,0.63734,0.63144,0.01132,0.64769,0.62334,0.6623,0.57201,0.58494,0.61806,0.03487,3,0.64079,0.63977,0.63075,0.65305,0.64943,0.64276,0.00784,0.66238,0.63927,0.65253,0.6499,0.65867,0.65255,0.00797,7,0.64903,0.64911,0.64927,0.65994,0.66111,0.65369,0.00559,0.6832,0.6743,0.71031,0.64704,0.63996,0.67096,0.02546,2,0.69957,0.69773,0.69346,0.71078,0.7115,0.70261,0.00725
4,0.49653,0.01137,0.08586,0.00356,0.015,{'C': 0.015},0.66881,0.65097,0.65779,0.66057,0.66949,0.66153,0.00697,6,0.65708,0.65653,0.65614,0.66747,0.6723,0.6619,0.0067,0.06056,0.05689,0.0655,0.04838,0.04962,0.05619,0.00648,1,0.05918,0.06041,0.05789,0.06188,0.06195,0.06026,0.00157,0.42976,0.42138,0.42802,0.42092,0.42479,0.42497,0.00351,6,0.42475,0.42513,0.42379,0.4299,0.43171,0.42706,0.00314465,0.78634,0.77339,0.77825,0.78081,0.78724,0.7812,0.00515,6,0.7779,0.77745,0.77722,0.78538,0.78887,0.78136,0.00484,0.03182,0.02983,0.03434,0.0254,0.02608,0.02949,0.00339,1,0.03104,0.03168,0.03036,0.0325,0.03256,0.03163,0.00085,0.51107,0.50983,0.51335,0.50639,0.50681,0.50949,0.00262,1,0.51076,0.51126,0.51022,0.51167,0.51163,0.51111,0.000552303,0.97392,0.97341,0.97611,0.97106,0.97123,0.97315,0.00187,1,0.97416,0.97455,0.97373,0.97451,0.97437,0.97426,0.0003,0.62393,0.61538,0.7069,0.50862,0.50862,0.59269,0.07572,2,0.63441,0.64946,0.62017,0.64378,0.63519,0.6366,0.00994,0.64676,0.63349,0.68192,0.58591,0.59045,0.62771,0.03598,1,0.64594,0.65306,0.63847,0.65583,0.65407,0.64947,0.00645,0.66881,0.65097,0.65779,0.66057,0.66949,0.66153,0.00697,6,0.65708,0.65653,0.65614,0.66747,0.6723,0.6619,0.0067,0.68634,0.68335,0.70833,0.64618,0.64046,0.67293,0.02574,1,0.70963,0.70711,0.70223,0.71966,0.72281,0.71229,0.00775
5,3.50716,2.26062,0.08857,0.00535,0.1,{'C': 0.1},0.677,0.67744,0.67242,0.67257,0.68529,0.67694,0.00468,5,0.67689,0.67934,0.67442,0.68217,0.69178,0.68092,0.00601,0.0588,0.05646,0.06276,0.05087,0.04863,0.0555,0.00516,2,0.06456,0.06741,0.06292,0.06496,0.06685,0.06534,0.00162,0.43192,0.43096,0.43214,0.42651,0.43004,0.43032,0.00204,3,0.43464,0.4369,0.43295,0.43675,0.44113,0.43647,0.0027453,0.79228,0.79265,0.78899,0.78941,0.79852,0.79237,0.00341,5,0.79214,0.79382,0.79037,0.79588,0.80265,0.79497,0.00425,0.03094,0.02972,0.03298,0.02675,0.02563,0.0292,0.0027,2,0.03395,0.03546,0.03308,0.03419,0.03524,0.03439,0.00087,0.51026,0.50934,0.512,0.50728,0.50631,0.50904,0.00204,2,0.51262,0.51371,0.51198,0.51267,0.51326,0.51285,0.000590402,0.97319,0.97255,0.97476,0.97151,0.97069,0.97254,0.0014,4,0.97502,0.9757,0.97457,0.97484,0.97498,0.97502,0.00038,0.58974,0.5641,0.64655,0.51724,0.47414,0.55836,0.05929,4,0.65591,0.68172,0.64163,0.64807,0.64807,0.65508,0.01407,0.63413,0.62176,0.65971,0.59624,0.58153,0.61868,0.02762,2,0.66658,0.68051,0.65831,0.66541,0.6703,0.66822,0.00727,0.677,0.67744,0.67242,0.67257,0.68529,0.67694,0.00468,5,0.67689,0.67934,0.67442,0.68217,0.69178,0.68092,0.00601,0.67871,0.67837,0.70238,0.64645,0.63286,0.66775,0.02491,3,0.73093,0.73337,0.72586,0.73669,0.74397,0.73416,0.00604
6,7.36375,1.33298,0.08744,0.00335,1.0,{'C': 1},0.67773,0.67729,0.67286,0.67315,0.68734,0.67767,0.00524,3,0.67616,0.68172,0.67709,0.68436,0.69251,0.68237,0.0059,0.05812,0.05643,0.06048,0.04772,0.04724,0.054,0.00548,3,0.06522,0.06748,0.0638,0.06659,0.06824,0.06627,0.00159,0.43186,0.4309,0.43121,0.42522,0.43011,0.42986,0.00239,4,0.43469,0.4378,0.43435,0.43833,0.44206,0.43744,0.00280604,0.79282,0.79255,0.78937,0.78991,0.79999,0.79293,0.00379,3,0.79159,0.79552,0.79227,0.7974,0.80314,0.79598,0.00416,0.03059,0.0297,0.03179,0.0251,0.02491,0.02842,0.00287,3,0.03429,0.03551,0.03356,0.03506,0.03598,0.03488,0.00086,0.50999,0.50933,0.51108,0.50604,0.50577,0.50844,0.00215,4,0.51289,0.5137,0.51229,0.51328,0.51378,0.51319,0.000549796,0.97298,0.97254,0.97412,0.97066,0.97031,0.97212,0.00144,5,0.97522,0.97563,0.97471,0.9752,0.97531,0.97521,0.00029,0.5812,0.5641,0.62069,0.48276,0.4569,0.54113,0.06159,5,0.66452,0.67742,0.64592,0.66094,0.66094,0.66195,0.01005,0.6303,0.62168,0.64722,0.5796,0.5741,0.61058,0.02879,4,0.67044,0.67961,0.66177,0.67286,0.677,0.67234,0.00616,0.67773,0.67729,0.67286,0.67315,0.68734,0.67767,0.00524,3,0.67616,0.68172,0.67709,0.68436,0.69251,0.68237,0.0059,0.67139,0.67555,0.69274,0.64151,0.62962,0.66216,0.02317,4,0.73448,0.73708,0.73,0.74025,0.74785,0.73793,0.00599
7,9.85828,2.65765,0.07111,0.01362,10.0,{'C': 10},0.67715,0.67773,0.67403,0.67227,0.68646,0.67753,0.00489,4,0.67601,0.68186,0.67617,0.68381,0.69281,0.68213,0.00616,0.05802,0.05651,0.05989,0.0476,0.04626,0.05366,0.00561,5,0.06499,0.06791,0.06383,0.06689,0.06892,0.06651,0.00186,0.4316,0.43109,0.43136,0.42484,0.42933,0.42965,0.00253,5,0.43453,0.43805,0.43403,0.43827,0.44249,0.43747,0.00305542,0.79241,0.79286,0.79023,0.78929,0.79939,0.79283,0.00354,4,0.7915,0.79561,0.79161,0.797,0.80333,0.79581,0.00434,0.03053,0.02974,0.03149,0.02503,0.02439,0.02824,0.00294,5,0.03417,0.03573,0.03357,0.03522,0.03634,0.03501,0.00101,0.50995,0.50935,0.51083,0.506,0.5054,0.50831,0.00219,5,0.51281,0.51386,0.51232,0.5134,0.51404,0.51329,0.000646322,0.97297,0.97255,0.97392,0.97064,0.97008,0.97203,0.00145,6,0.97517,0.97574,0.97475,0.9753,0.97548,0.97529,0.00033,0.5812,0.5641,0.61207,0.48276,0.44828,0.53768,0.06186,6,0.66237,0.68172,0.64807,0.66524,0.66738,0.66495,0.01076,0.63001,0.62191,0.64358,0.57915,0.56942,0.60881,0.02919,5,0.66931,0.68179,0.66236,0.67469,0.68031,0.67369,0.00718,0.67715,0.67773,0.67403,0.67227,0.68646,0.67753,0.00489,4,0.67601,0.68186,0.67617,0.68381,0.69281,0.68213,0.00616,0.67085,0.67155,0.69093,0.63916,0.62892,0.66028,0.02284,5,0.73463,0.73723,0.7302,0.74034,0.74803,0.73808,0.00598


In [48]:
gs_results_sorted = gs_results.sort_values(by=['rank_test_roc_auc'])

In [49]:
display_cols = ['param_C',
                'rank_test_roc_auc',
                'mean_test_roc_auc',
                'mean_train_roc_auc']

In [50]:
gs_results_summary = gs_results_sorted.copy()[display_cols].reset_index(drop=True)

In [51]:
gs_results_summary

Unnamed: 0,param_C,rank_test_roc_auc,mean_test_roc_auc,mean_train_roc_auc
0,0.015,1,0.67293,0.71229
1,0.01,2,0.67096,0.70261
2,0.1,3,0.66775,0.73416
3,1.0,4,0.66216,0.73793
4,10.0,5,0.66028,0.73808
5,0.001,6,0.62792,0.63481
6,1e-05,7,0.5,0.5
7,0.0001,7,0.5,0.5


- examining features selected by `LogisticRegression` with `l1` penalty

In [52]:
log_reg_lasso.best_estimator_

LogisticRegression(C=0.015, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=0, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [53]:
log_reg_best = log_reg_lasso.best_estimator_.fit(X_train_all,
                                                 y_train)

- number of features selected - versus `47` features selected by `SelectKBest`

In [54]:
np.count_nonzero(log_reg_best.coef_)

53

- putting into a `DataFrame` for easier analysis

In [55]:
log_reg_gs_features = pd.DataFrame({'feature': X_train_all.columns.tolist(),
                                    'coef': log_reg_best.coef_[0]})

- these are the relevant features selected by the model

In [56]:
lr_relevant_features = log_reg_gs_features[log_reg_gs_features['coef'] != 0]

In [57]:
lr_relevant_features

Unnamed: 0,feature,coef
0,age,0.30865
1,heightcm,-0.08417
2,weightkg,-0.06372
6,totalbumin,-0.15457
7,a1clvl,0.06835
8,meldscr,0.00884
9,hdef,-0.1213
10,pasys,0.06863
12,surgdt_month_Feb,-0.08556
13,surgdt_month_Mar,0.11137


In [58]:
lr_relevant_features.shape[0]

53

- number of features in our "curated" `KBest` list

In [59]:
len(kbest_features)

64

- number of features in the "uncurated" `KBest` list was `38` - see `Automated_Feature_Selection_A` which based on `2.73` and `strokeBin`
- number of features in the "uncurated" `KBest` list was `47` - see `Automated_Feature_Selection_Combined_Dataset`

#### Key Takeaways
- `LogisticRegression` with `l1` penalty results in a model even more parsimonius than `SelectKBest`
- Seems we are at the limits with linear models, given no improvement in `roc_auc`
- perhaps not too surprisng given that the `STS` model is based on a linear model