## Capstone Project

### Automated Feature Selection / EDA Notebook - `COMBINED DATASET` from 10/24/19

#### Importing Libraries

In [1]:
%matplotlib inline

# general libraries
import re
import string
import sys
import os
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# importing date libraries
import datetime as dt
import dateutil.parser as dparser

# scikit-learn libraries for preprocessing
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# scikit-learn libraries for constructing pipelines
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin

# scikit-learn libraries for clustering and dimensionality reduction
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import DBSCAN
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.mixture import GaussianMixture

# scikit-learn libraries for evaluation
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# scikit-learn libraries for feature selection
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import RFECV

# scikit-learn libraries for learning
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# saving models
import pickle
# `sklearn.externals.joblib` is deprecated as of scikit-learn 0.21 and removed in 0.23.
# Prefer the standalone `joblib` package; fall back to the vendored copy only on
# old environments where the standalone package is not installed.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# setting pandas display options
# Every option is addressed by its fully-qualified name: the bare 'precision'
# key relies on pattern matching and raises OptionError on newer pandas releases,
# where it matches more than one registered option.
pd.set_option("display.max_columns", 999)
pd.set_option("display.max_rows", 10000)
pd.set_option("display.max_colwidth", 100)
pd.set_option("display.precision", 5)
# NOTE: this silences SettingWithCopyWarning for the whole session
pd.options.mode.chained_assignment = None



#### Directory/File Structure

In [2]:
sys.version

'3.6.8 |Anaconda, Inc.| (default, Dec 29 2018, 19:04:46) \n[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]'

In [3]:
# record the library versions this notebook was executed with, so results can be reproduced
print ('Running pandas version:', pd.__version__)
print ('Running numpy version:', np.__version__)
print ('Running sklearn version:', sklearn.__version__)

Running pandas version: 0.25.1
Running numpy version: 1.14.2
Running sklearn version: 0.21.3


In [4]:
os.getcwd()

'/Users/nate_velarde/Documents/UC_Berkeley/Courses/W210_Capstone/stroke_project/sandbox/notebooks'

In [5]:
# Move into the sibling `data` directory so the relative file names used by the
# loading cells resolve. Guarded so that re-running this cell does not step one
# directory further (or fail) on each execution.
if os.path.basename(os.getcwd()) != 'data':
    os.chdir('../data')

In [6]:
sorted(os.listdir())

['.DS_Store',
 '273_vs_281_null_count_by_feature.csv',
 '273_vs_281_null_count_by_feature.xlsm',
 'Capstone - Complication list - complete.xlsx',
 'Capstone - STS risk factor list.xlsx',
 'Capstone_Fall_Shannon_Sept2019_request.csv',
 'PREOP_dataset_10_24.pkl',
 'PREOP_dataset_TREE_10_24.pkl',
 'X_A_DREF.pkl',
 'X_A_DREF_TREE_SKLEARN.pkl',
 'X_PREOP_10_24.pkl',
 'X_PREOP_TREE_10_24.pkl',
 'X_dev_A_DREF.pkl',
 'X_dev_A_DREF_TREE_SKLEARN.pkl',
 'X_dev_PREOP_10_24.pkl',
 'X_dev_PREOP_TREE_10_24.pkl',
 'X_test_A_DREF.pkl',
 'X_test_A_DREF_TREE_SKLEARN.pkl',
 'X_test_PREOP_10_24.pkl',
 'X_test_PREOP_TREE_10_24.pkl',
 'X_train_A_DREF.pkl',
 'X_train_A_DREF_TREE_SKLEARN.pkl',
 'X_train_PREOP_10_24.pkl',
 'X_train_PREOP_TREE_10_24.pkl',
 'capstone_STS_risk_factor_features.xlsx',
 'capstone_cleaned_data.csv',
 'capstone_data-version-2.xlsx',
 'capstone_data.xlsx',
 'capstone_data_binarized_outcome.pkl',
 'capstone_data_binarized_outcome.xlsx',
 'capstone_data_binarized_outcome_compressed.pkl',


#### Loading Dataset

In [7]:
# load the pickled feature matrix; the path is relative to the current working directory
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source
feature_matrix = pd.read_pickle('X_PREOP_10_24.pkl')

In [8]:
feature_matrix.head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv,anginalclass_SLIGHT,anginalclass_REST,classnyh_SLIGHT,classnyh_REST,vdinsufm_TRIVIAL,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsuft_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,incidencREOP_FOURTH,status_URGENT,status_EMERGENCY,status_SALVAGE,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_50%-79%,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_50%-79%,cvdstenlft_80-99%,cvdstenlft_100%,ArrhythDur_when_Combo_SHORT,ArrhythDur_when_Combo_LONG
0,54,180.0,117.0,36.11111,43.0,0.9,3.8,7.2,6.5,47.0,42.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,1.0,1.0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,65,175.3,79.4,25.83787,45.0,1.2,,,,55.0,40.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,1.0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2,83,162.60001,102.1,38.61754,29.0,1.2,3.3,6.2,8.6,60.0,36.0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,1.0,1.0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,59,160.0,127.5,49.80469,35.0,0.9,3.5,7.4,6.4,60.0,35.0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1.0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,72,160.0,64.0,25.0,37.0,0.9,3.8,5.7,6.4,60.0,40.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,1.0,1.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0


In [9]:
feature_matrix.shape

(42740, 109)

- loading `target` vector

In [10]:
# load the pickled target vector; the path is relative to the current working directory
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source
target = pd.read_pickle('y_PREOP_10_24.pkl')

In [11]:
target.head()

0    0
1    0
2    0
3    0
4    0
Name: strokeBin2, dtype: int64

In [12]:
feature_matrix.shape, target.shape

((42740, 109), (42740,))

### Handling Highly Correlated Features

- Albon 10.3, pages 172-174
- checking to see if features are highly correlated
- creating correlation matrix

In [13]:
# pairwise correlation between every pair of feature columns, returned as a pandas DataFrame;
# .abs() drops the sign so that strong negative correlations are treated like strong positive ones
corr_matrix = feature_matrix.corr().abs()

In [14]:
corr_matrix.head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv,anginalclass_SLIGHT,anginalclass_REST,classnyh_SLIGHT,classnyh_REST,vdinsufm_TRIVIAL,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsuft_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,incidencREOP_FOURTH,status_URGENT,status_EMERGENCY,status_SALVAGE,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_50%-79%,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_50%-79%,cvdstenlft_80-99%,cvdstenlft_100%,ArrhythDur_when_Combo_SHORT,ArrhythDur_when_Combo_LONG
age,1.0,0.12913,0.19149,0.08017,0.13915,0.01009,0.05175,0.11569,0.07197,0.04351,0.0716,0.01571,0.01272,0.00799,0.0021,0.00061,0.00697,0.00309,0.012,0.00597,0.00705,0.01225,0.00099,0.00287,0.00299,0.00392,0.00869,0.00874,0.00113,5e-05,0.06355,0.11662,0.11406,0.01695,0.02627,0.00803,0.03244,0.00299,0.10161,0.05707,0.13889,0.10901,0.01917,0.04538,0.00233,0.01183,0.10202,0.08099,0.03224,0.00726,0.12783,0.03513,0.08577,0.10423,0.00764,0.02621,0.07124,0.03433,0.05804,0.0618,0.10434,0.01363,0.00569,0.22509,0.23273,0.00204,0.06415,0.09,0.17448,0.02393,0.01659,0.12305,0.05806,0.01091,0.00254,0.03384,0.0036,0.06452,0.08845,0.00875,0.04823,0.05878,0.01094,0.03473,0.10197,0.07145,0.06793,0.04924,0.08056,0.07754,0.02174,0.03159,0.00496,0.00931,0.01768,0.04127,0.02895,0.00939,0.04847,0.04265,0.03963,0.05118,0.02825,0.0159,0.05365,0.02194,0.004,0.05594,0.09674
heightcm,0.12913,1.0,0.39307,0.29162,0.2199,0.05223,0.05566,0.01893,0.06221,0.08246,0.06073,0.01537,0.00833,0.00569,0.00128,0.00264,0.00393,0.00258,0.01088,0.00091,0.00213,0.01082,0.00262,0.01046,0.00079,0.00167,0.00481,0.00853,0.00265,0.00125,0.64799,0.04242,0.01163,0.05701,0.00013,0.00238,0.05606,0.034,0.01269,0.0067,0.03009,0.02226,0.06002,0.01011,0.02599,0.05249,0.00704,0.01736,0.00505,0.00123,0.06298,0.02153,0.04768,0.0303,0.00013,0.0252,0.00852,0.01002,0.05194,0.05836,0.00644,0.00414,0.00042,0.06855,0.07279,0.07618,0.02685,0.01563,0.06934,0.00168,0.02643,0.03118,0.09284,0.00804,0.00121,0.04667,0.00202,0.01952,0.05818,0.0266,0.00613,0.04308,0.02385,0.02386,0.03231,0.04642,0.03162,0.03497,0.05189,0.06582,0.03618,0.00259,0.00234,0.00106,0.00735,0.00664,0.00014,0.00226,0.00993,0.02259,0.03251,0.03018,0.01449,0.00778,0.0364,0.01968,0.00165,0.03,0.00771
weightkg,0.19149,0.39307,1.0,0.58348,0.15529,0.03548,0.02269,0.17311,0.04351,0.02032,0.08834,0.00794,0.0005,0.00144,0.00324,0.00466,0.00111,0.00772,0.00935,0.00093,0.00177,0.0018,0.00504,0.01061,0.00064,0.00114,0.00416,0.00076,0.0054,0.00248,0.29382,0.04195,0.01047,0.07926,0.00473,0.01255,0.02433,0.21117,0.08968,0.01143,0.10738,0.02272,0.27969,0.01435,0.03327,0.03618,0.01652,0.05101,0.03742,0.00804,0.06265,0.03169,0.03121,0.04457,0.0055,0.05124,0.00304,0.01713,0.01371,0.0037,0.00981,0.00535,0.00279,0.0497,0.00108,0.03548,0.1487,0.01722,0.0004,0.01919,0.01255,0.00837,0.00234,0.00413,0.00222,0.05514,0.00144,0.06217,0.056,0.02058,0.03799,0.00526,0.01386,0.0387,0.01977,0.06068,0.10523,0.03237,0.05899,0.07075,0.03548,0.00223,0.01186,0.00153,0.00021,0.00135,0.01218,0.00224,0.0211,0.02624,0.02009,0.02346,0.01266,0.00553,0.0217,0.02643,0.00371,0.03676,0.01411
bmi,0.08017,0.29162,0.58348,1.0,0.02024,0.00601,0.01453,0.11914,0.00699,0.01372,0.09141,0.00613,0.00873,0.00549,0.00157,0.00231,0.00273,0.00657,0.00336,0.00138,0.00594,0.00022,0.00335,0.00175,0.00219,5e-05,0.00484,0.00501,0.00166,0.00395,0.04507,0.00913,0.00448,0.0373,0.0019,0.0023,0.00668,0.15171,0.06638,0.00322,0.08253,0.02295,0.15857,0.00598,0.01067,0.00763,0.01273,0.02968,0.02922,0.00586,0.01849,0.01633,0.00114,0.01924,0.00427,0.02176,0.00315,0.01113,0.01318,0.01887,0.00778,0.00328,0.00134,0.01128,0.02728,0.00211,0.10945,0.019,0.02962,0.01255,0.02119,0.0186,0.03592,0.00111,0.00209,0.01923,0.00015,0.03588,0.01884,0.00315,0.02777,0.01289,0.00421,0.01995,0.00812,0.02296,0.05784,0.00706,0.0232,0.02356,0.01168,0.00592,0.01008,0.00094,0.003,0.00094,0.0104,0.00259,0.00847,0.00794,0.00054,0.00533,0.00442,0.00803,0.00084,0.01193,0.00092,0.01005,0.00575
hct,0.13915,0.2199,0.15529,0.02024,1.0,0.23134,0.39666,0.06369,0.28776,0.05791,0.1765,0.0188,0.00393,0.00292,0.00514,0.00038,0.00699,0.00143,0.00819,0.00093,0.00183,0.00287,0.00593,0.01077,0.00377,0.01515,0.0205,0.01417,0.00523,0.00751,0.27865,0.11751,0.13835,0.00704,0.00166,0.00173,0.02232,0.1554,0.00246,0.2146,0.07016,0.1372,0.02678,0.03928,0.08344,0.02567,0.04592,0.1092,0.03852,0.02768,0.12631,0.09363,0.04719,0.06595,0.02018,0.03667,0.02367,0.06641,0.18015,0.12691,0.05577,0.05818,0.00649,0.04879,0.00978,0.05309,0.16173,0.15456,0.05578,0.09467,0.02919,0.01335,0.0595,0.03391,0.00113,0.0233,0.01866,0.02695,0.01989,0.07551,0.06126,0.09797,0.15609,0.03736,0.05612,0.11089,0.01925,0.03034,0.07072,0.09693,0.0471,0.0418,0.02158,0.0187,0.00953,0.21869,0.00342,0.00992,0.03551,0.04101,0.04477,0.04651,0.0347,0.00389,0.05172,0.03552,0.00643,0.02404,0.02245


- selecting upper triangle of correlation matrix

In [15]:
# Keep only the strict upper triangle of the correlation matrix (k=1 excludes the
# diagonal) so each feature pair appears exactly once; all other cells become NaN.
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool),
                                  k=1))

In [16]:
upper.head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv,anginalclass_SLIGHT,anginalclass_REST,classnyh_SLIGHT,classnyh_REST,vdinsufm_TRIVIAL,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsuft_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,incidencREOP_FOURTH,status_URGENT,status_EMERGENCY,status_SALVAGE,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_50%-79%,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_50%-79%,cvdstenlft_80-99%,cvdstenlft_100%,ArrhythDur_when_Combo_SHORT,ArrhythDur_when_Combo_LONG
age,,0.12913,0.19149,0.08017,0.13915,0.01009,0.05175,0.11569,0.07197,0.04351,0.0716,0.01571,0.01272,0.00799,0.0021,0.00061,0.00697,0.00309,0.012,0.00597,0.00705,0.01225,0.00099,0.00287,0.00299,0.00392,0.00869,0.00874,0.00113,5e-05,0.06355,0.11662,0.11406,0.01695,0.02627,0.00803,0.03244,0.00299,0.10161,0.05707,0.13889,0.10901,0.01917,0.04538,0.00233,0.01183,0.10202,0.08099,0.03224,0.00726,0.12783,0.03513,0.08577,0.10423,0.00764,0.02621,0.07124,0.03433,0.05804,0.0618,0.10434,0.01363,0.00569,0.22509,0.23273,0.00204,0.06415,0.09,0.17448,0.02393,0.01659,0.12305,0.05806,0.01091,0.00254,0.03384,0.0036,0.06452,0.08845,0.00875,0.04823,0.05878,0.01094,0.03473,0.10197,0.07145,0.06793,0.04924,0.08056,0.07754,0.02174,0.03159,0.00496,0.00931,0.01768,0.04127,0.02895,0.00939,0.04847,0.04265,0.03963,0.05118,0.02825,0.0159,0.05365,0.02194,0.004,0.05594,0.09674
heightcm,,,0.39307,0.29162,0.2199,0.05223,0.05566,0.01893,0.06221,0.08246,0.06073,0.01537,0.00833,0.00569,0.00128,0.00264,0.00393,0.00258,0.01088,0.00091,0.00213,0.01082,0.00262,0.01046,0.00079,0.00167,0.00481,0.00853,0.00265,0.00125,0.64799,0.04242,0.01163,0.05701,0.00013,0.00238,0.05606,0.034,0.01269,0.0067,0.03009,0.02226,0.06002,0.01011,0.02599,0.05249,0.00704,0.01736,0.00505,0.00123,0.06298,0.02153,0.04768,0.0303,0.00013,0.0252,0.00852,0.01002,0.05194,0.05836,0.00644,0.00414,0.00042,0.06855,0.07279,0.07618,0.02685,0.01563,0.06934,0.00168,0.02643,0.03118,0.09284,0.00804,0.00121,0.04667,0.00202,0.01952,0.05818,0.0266,0.00613,0.04308,0.02385,0.02386,0.03231,0.04642,0.03162,0.03497,0.05189,0.06582,0.03618,0.00259,0.00234,0.00106,0.00735,0.00664,0.00014,0.00226,0.00993,0.02259,0.03251,0.03018,0.01449,0.00778,0.0364,0.01968,0.00165,0.03,0.00771
weightkg,,,,0.58348,0.15529,0.03548,0.02269,0.17311,0.04351,0.02032,0.08834,0.00794,0.0005,0.00144,0.00324,0.00466,0.00111,0.00772,0.00935,0.00093,0.00177,0.0018,0.00504,0.01061,0.00064,0.00114,0.00416,0.00076,0.0054,0.00248,0.29382,0.04195,0.01047,0.07926,0.00473,0.01255,0.02433,0.21117,0.08968,0.01143,0.10738,0.02272,0.27969,0.01435,0.03327,0.03618,0.01652,0.05101,0.03742,0.00804,0.06265,0.03169,0.03121,0.04457,0.0055,0.05124,0.00304,0.01713,0.01371,0.0037,0.00981,0.00535,0.00279,0.0497,0.00108,0.03548,0.1487,0.01722,0.0004,0.01919,0.01255,0.00837,0.00234,0.00413,0.00222,0.05514,0.00144,0.06217,0.056,0.02058,0.03799,0.00526,0.01386,0.0387,0.01977,0.06068,0.10523,0.03237,0.05899,0.07075,0.03548,0.00223,0.01186,0.00153,0.00021,0.00135,0.01218,0.00224,0.0211,0.02624,0.02009,0.02346,0.01266,0.00553,0.0217,0.02643,0.00371,0.03676,0.01411
bmi,,,,,0.02024,0.00601,0.01453,0.11914,0.00699,0.01372,0.09141,0.00613,0.00873,0.00549,0.00157,0.00231,0.00273,0.00657,0.00336,0.00138,0.00594,0.00022,0.00335,0.00175,0.00219,5e-05,0.00484,0.00501,0.00166,0.00395,0.04507,0.00913,0.00448,0.0373,0.0019,0.0023,0.00668,0.15171,0.06638,0.00322,0.08253,0.02295,0.15857,0.00598,0.01067,0.00763,0.01273,0.02968,0.02922,0.00586,0.01849,0.01633,0.00114,0.01924,0.00427,0.02176,0.00315,0.01113,0.01318,0.01887,0.00778,0.00328,0.00134,0.01128,0.02728,0.00211,0.10945,0.019,0.02962,0.01255,0.02119,0.0186,0.03592,0.00111,0.00209,0.01923,0.00015,0.03588,0.01884,0.00315,0.02777,0.01289,0.00421,0.01995,0.00812,0.02296,0.05784,0.00706,0.0232,0.02356,0.01168,0.00592,0.01008,0.00094,0.003,0.00094,0.0104,0.00259,0.00847,0.00794,0.00054,0.00533,0.00442,0.00803,0.00084,0.01193,0.00092,0.01005,0.00575
hct,,,,,,0.23134,0.39666,0.06369,0.28776,0.05791,0.1765,0.0188,0.00393,0.00292,0.00514,0.00038,0.00699,0.00143,0.00819,0.00093,0.00183,0.00287,0.00593,0.01077,0.00377,0.01515,0.0205,0.01417,0.00523,0.00751,0.27865,0.11751,0.13835,0.00704,0.00166,0.00173,0.02232,0.1554,0.00246,0.2146,0.07016,0.1372,0.02678,0.03928,0.08344,0.02567,0.04592,0.1092,0.03852,0.02768,0.12631,0.09363,0.04719,0.06595,0.02018,0.03667,0.02367,0.06641,0.18015,0.12691,0.05577,0.05818,0.00649,0.04879,0.00978,0.05309,0.16173,0.15456,0.05578,0.09467,0.02919,0.01335,0.0595,0.03391,0.00113,0.0233,0.01866,0.02695,0.01989,0.07551,0.06126,0.09797,0.15609,0.03736,0.05612,0.11089,0.01925,0.03034,0.07072,0.09693,0.0471,0.0418,0.02158,0.0187,0.00953,0.21869,0.00342,0.00992,0.03551,0.04101,0.04477,0.04651,0.0347,0.00389,0.05172,0.03552,0.00643,0.02404,0.02245


- finding the names of feature columns that have a correlation greater than `0.95` with another feature

In [17]:
# names of the columns that contain at least one upper-triangle correlation above 0.95
# (NaN cells compare as False, so the masked lower triangle and diagonal are ignored)
high_corr_features = upper.columns[(upper > 0.95).any(axis=0)].tolist()

In [18]:
high_corr_features

[]

- in a prior version of this notebook, `cva` was found to be highly correlated with `cvawhen`
- as a result, dropped `cvawhen`

### Removing Irrelevant Features for Classification

- Albon 10.4, pages 174-176
- for `categorical` variables, calculate a `chi-square` statistic between each `feature` and the `target` vector
- for `quantitative` variables, compute the `ANOVA F-value` between each `feature` and the `target` vector

- need to split `feature_matrix` into `numerical` and `categorical` features

In [19]:
feature_matrix.head(2)

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys,surgdt_month_Jan,surgdt_month_Feb,surgdt_month_Mar,surgdt_month_Apr,surgdt_month_May,surgdt_month_Jul,surgdt_month_Aug,surgdt_month_Sep,surgdt_month_Oct,surgdt_month_Nov,surgdt_month_Dec,surgdt_DayOfWeek_Mon,surgdt_DayOfWeek_Tues,surgdt_DayOfWeek_Thurs,surgdt_DayOfWeek_Fri,surgdt_DayOfWeek_Sat,surgdt_DayOfWeek_Sun,surgdt_PartOfMonth_Beg,surgdt_PartOfMonth_End,gender,racecaucasian,raceblack,raceasian,racenativeam,racnativepacific,ethnicity,diabetes,dyslip,dialysis,hypertn,infendo,slpapn,liverdis,immsupp,mediastrad,cancer,pvd,syncope,unrespstat,cvd,cva,cvdtia,cvdpcarsurg,hitanti,prcvint,prcab,prvalve,chf,priorhf,arrhyafib,medinotr,hdefd,vdaort,vdstena,vdstenm,diabctrl,infendty,Tobacco_Combined,chrlungd,hmo2,ivdrugab,alcohol,carshock24,resusc24,medasa,medaplt5days,medlipid,numdisv,anginalclass_SLIGHT,anginalclass_REST,classnyh_SLIGHT,classnyh_REST,vdinsufm_TRIVIAL,vdinsufm_MILD,vdinsufm_MODERATE,vdinsufm_SEVERE,vdinsuft_TRIVIAL,vdinsuft_MILD,vdinsuft_MODERATE,vdinsuft_SEVERE,incidencREOP_FIRST,incidencREOP_SECOND,incidencREOP_THIRD,incidencREOP_FOURTH,status_URGENT,status_EMERGENCY,status_SALVAGE,cvdcarsten_RIGHT,cvdcarsten_LEFT,cvdcarsten_BOTH,cvdstenrt_50%-79%,cvdstenrt_80-99%,cvdstenrt_100%,cvdstenlft_50%-79%,cvdstenlft_80-99%,cvdstenlft_100%,ArrhythDur_when_Combo_SHORT,ArrhythDur_when_Combo_LONG
0,54,180.0,117.0,36.11111,43.0,0.9,3.8,7.2,6.5,47.0,42.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,1.0,1.0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,65,175.3,79.4,25.83787,45.0,1.2,,,,55.0,40.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,1.0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


- selecting `numerical` features

In [20]:
# names of the quantitative columns of `feature_matrix`; this list is used to pull
# out the numerical block, and every column not listed here is treated as categorical
num_features = ['age',
                'heightcm',
                'weightkg',
                'bmi',
                'hct',
                'creatlst',
                'totalbumin',
                'a1clvl',
                'meldscr',
                'hdef',
                'pasys']

In [21]:
len(num_features)

11

- splitting `feature_matrix`

In [22]:
numerical_features = feature_matrix[num_features]

In [23]:
categorical_features = feature_matrix.drop(num_features, axis=1)

In [24]:
feature_matrix.shape, numerical_features.shape, categorical_features.shape

((42740, 109), (42740, 11), (42740, 98))

### For `X` Only - Need to Do Some Cleanup - `X_train` version (run on `2.73`) of this notebook skips this step
- Fill `NaN`s in `numerical_features` with `medians` and transform using `StandardScaler`
- This leaks some `dev` and `test` data into the medians and scaling statistics (they are computed on all of `X`), but the goal here is to compare feature selection on `X` against `X_train`

In [25]:
numerical_features.head(2)

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys
0,54,180.0,117.0,36.11111,43.0,0.9,3.8,7.2,6.5,47.0,42.0
1,65,175.3,79.4,25.83787,45.0,1.2,,,,55.0,40.0


In [26]:
numerical_features[numerical_features['hct'].isnull()].head(2)

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys
3123,85,179.0,82.6,25.77947,,,,,,63.0,39.0
6012,47,157.0,86.2,34.97099,,0.6,4.1,5.9,6.4,55.0,26.0


In [27]:
numerical_features[numerical_features['hdef'].isnull()].head(1)

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys
391,61,175.0,77.0,25.14286,42.0,1.0,3.4,5.4,7.2,,


In [28]:
validation_rows = [391, 4188]

In [29]:
# median of each numerical column, collected in column order
numeric_medians = [numerical_features[name].median()
                   for name in numerical_features.columns.tolist()]

In [30]:
# two-column summary table: numerical feature name alongside its median
numeric_median_df = pd.DataFrame({'numeric_feature': numerical_features.columns.tolist(),
                                  'median': numeric_medians})

In [31]:
numeric_median_df

Unnamed: 0,numeric_feature,median
0,age,67.0
1,heightcm,172.7
2,weightkg,87.3
3,bmi,29.41176
4,hct,39.1
5,creatlst,1.0
6,totalbumin,3.8
7,a1clvl,5.9
8,meldscr,7.5
9,hdef,55.0


- remarkably, overall `X` and `X_train` numerical feature medians are the same
- replacing `NaN`s with column medians

In [32]:
numerical_features = numerical_features.fillna(numerical_features.median())

In [33]:
# `validation_rows` holds index labels (391 was read off the index of the
# null-inspection output; 4188 presumably likewise), so select by label with `.loc`.
# Positional `.iloc` only returns the same rows while the index is a default RangeIndex.
numerical_features.loc[validation_rows, :]

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys
391,61,175.0,77.0,25.14286,42.0,1.0,3.4,5.4,7.2,55.0,35.0
4188,62,167.0,102.0,36.57356,43.6,0.9,3.8,6.9,6.4,60.0,35.0


- checking if all of the `NaN`s are gone

In [34]:
numerical_features.isnull().sum()

age           0
heightcm      0
weightkg      0
bmi           0
hct           0
creatlst      0
totalbumin    0
a1clvl        0
meldscr       0
hdef          0
pasys         0
dtype: int64

- now, need to standardize

In [35]:
numerical_features.head(2)

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys
0,54,180.0,117.0,36.11111,43.0,0.9,3.8,7.2,6.5,47.0,42.0
1,65,175.3,79.4,25.83787,45.0,1.2,3.8,5.9,7.5,55.0,40.0


In [36]:
numerical_features.shape

(42740, 11)

In [37]:
scaler = StandardScaler()

In [38]:
numerical_features = scaler.fit_transform(numerical_features)

In [39]:
# Rebuild a labelled DataFrame from the ndarray returned by the scaler.
# The scaler output carries no index, so the index of `feature_matrix` is reattached
# explicitly; this keeps the rows aligned with `categorical_features` even when the
# index is not a default RangeIndex.
numerical_features = pd.DataFrame(numerical_features,
                                  index=feature_matrix.index,
                                  columns=num_features)

In [40]:
numerical_features.head()

Unnamed: 0,age,heightcm,weightkg,bmi,hct,creatlst,totalbumin,a1clvl,meldscr,hdef,pasys
0,-1.11656,0.79395,1.34316,0.51981,0.77935,-0.25503,0.04458,0.61265,-0.64518,-0.48711,0.59303
1,-0.10874,0.35695,-0.49368,-0.45245,1.14854,0.06657,0.04458,-0.32735,-0.2946,0.16419,0.38702
2,1.54043,-0.82387,0.61526,0.75701,-1.80494,0.06657,-0.92159,-0.11042,0.09104,0.57126,-0.02499
3,-0.65846,-1.06562,1.8561,1.81576,-0.69739,-0.25503,-0.53512,0.75727,-0.68024,0.57126,-0.128
4,0.53261,-1.06562,-1.24601,-0.53174,-0.3282,-0.25503,0.04458,-0.47196,-0.68024,0.57126,0.38702


In [41]:
numerical_features.shape

(42740, 11)

#### `SelectKBest`
- going to calculate `chi-square` statistic for every `categorical` feature
- the `chi-square` statistic tests the independence of two categorical vectors - that is, the statistic measures the difference between the observed number of observations in each class of a categorical feature and the number we would expect if that feature were independent of (i.e., had no relationship with) the target vector
- A `chi-square` statistic is a single number that tells you how much difference exists between your observed counts and the counts you would expect if there were no relationship at all in the population - by calculating the `chi-square` statistic between a feature and the target, we obtain a measurement of the independence between the two
- If the target is independent of the feature variable, then it is irrelevant for our purposes because it contains no information we can use for classification.
- On the other hand, if the feature and the target are highly dependent, the feature is likely to be very informative for training models.

- instantiating a `SelectKBest` object

In [42]:
chi2_selector = SelectKBest(chi2, k='all')

In [43]:
cat_features_kbest = chi2_selector.fit_transform(categorical_features, target)

#### Let's look closely at the `chi-square` scores using the `scores_` and `pvalues_` attributes for `chi2_selector` object

In [44]:
# one row per categorical feature: its name, chi-square score and p-value
# (`scores_` and `pvalues_` follow the column order of `categorical_features`)
cat_chi2_summary = pd.DataFrame({'Feature': categorical_features.columns.tolist(),
                                 'chi-square_statistic': chi2_selector.scores_,
                                 'pvalue': chi2_selector.pvalues_})

In [45]:
cat_chi2_summary.head()

Unnamed: 0,Feature,chi-square_statistic,pvalue
0,surgdt_month_Jan,2.9524,0.08575
1,surgdt_month_Feb,0.90525,0.34138
2,surgdt_month_Mar,1.99958,0.15734
3,surgdt_month_Apr,0.02043,0.88635
4,surgdt_month_May,2.18153,0.13968


- assuming a significance level of `0.05`, if `pvalue` `<=` `0.05` we can reject the null hypothesis that the feature and target are independent -- which is what we want

#### Finding features where `pvalue` `<=` `0.05`

In [46]:
# categorical features significant at the 0.05 level, most significant first
sig_cat_features = (cat_chi2_summary
                    .loc[cat_chi2_summary['pvalue'].le(0.05)]
                    .sort_values(by='pvalue'))

In [47]:
sig_cat_features

Unnamed: 0,Feature,chi-square_statistic,pvalue
40,cva,86.53887,1.37018e-20
39,cvd,76.47458,2.2306600000000003e-18
83,incidencREOP_FOURTH,53.7781,2.2446e-13
71,classnyh_REST,45.64067,1.42061e-11
36,pvd,37.75265,8.03075e-10
47,chf,26.56232,2.55172e-07
87,cvdcarsten_RIGHT,25.88135,3.63059e-07
41,cvdtia,25.21442,5.12973e-07
85,status_EMERGENCY,23.62103,1.17299e-06
95,cvdstenlft_100%,23.54697,1.21901e-06


In [48]:
sig_cat_features.shape

(37, 3)

#### Now let's use the same coding pattern with the numerical features, except using `ANOVA F-value` between each numerical feature and target vector

- instantiating a `SelectKBest` object

In [49]:
fvalue_selector = SelectKBest(f_classif, k='all')

In [50]:
num_features_kbest = fvalue_selector.fit_transform(numerical_features, target)

#### Let's look closely at the `F-values` scores using the `scores_` and `pvalues_` attributes for `fvalue_selector` object

In [51]:
# one row per numerical feature: its name, ANOVA F-value and p-value
# (`scores_` and `pvalues_` follow the column order of `numerical_features`)
num_fvalue_summary = pd.DataFrame({'Feature': numerical_features.columns.tolist(),
                                   'F_value_statistic': fvalue_selector.scores_,
                                   'pvalue': fvalue_selector.pvalues_})

In [52]:
num_fvalue_summary

Unnamed: 0,Feature,F_value_statistic,pvalue
0,age,76.16942,2.69557e-18
1,heightcm,43.93248,3.43933e-11
2,weightkg,24.06487,9.34862e-07
3,bmi,0.77314,0.379252
4,hct,47.01463,7.14141e-12
5,creatlst,7.48545,0.00622251
6,totalbumin,44.30874,2.83844e-11
7,a1clvl,8.68581,0.00320871
8,meldscr,18.28324,1.90788e-05
9,hdef,16.28832,5.44928e-05


In [53]:
num_fvalue_summary.shape

(11, 3)

#### Finding features where `pvalue` `<=` `0.05`

In [54]:
# numerical features significant at the 0.05 level, most significant first
sig_num_features = (num_fvalue_summary
                    .loc[num_fvalue_summary['pvalue'].le(0.05)]
                    .sort_values(by='pvalue'))

In [55]:
sig_num_features

Unnamed: 0,Feature,F_value_statistic,pvalue
0,age,76.16942,2.69557e-18
4,hct,47.01463,7.14141e-12
6,totalbumin,44.30874,2.83844e-11
1,heightcm,43.93248,3.43933e-11
2,weightkg,24.06487,9.34862e-07
10,pasys,19.23413,1.15904e-05
8,meldscr,18.28324,1.90788e-05
9,hdef,16.28832,5.44928e-05
7,a1clvl,8.68581,0.00320871
5,creatlst,7.48545,0.00622251


- `bmi` did not have a statistically significant `F-value`

In [56]:
sig_num_features.shape

(10, 3)

- creating `sig_features`

In [57]:
# significant numerical feature names followed by significant categorical feature names,
# each group kept in its p-value order
sig_features = [*sig_num_features['Feature'], *sig_cat_features['Feature']]

In [58]:
sig_features

['age',
 'hct',
 'totalbumin',
 'heightcm',
 'weightkg',
 'pasys',
 'meldscr',
 'hdef',
 'a1clvl',
 'creatlst',
 'cva',
 'cvd',
 'incidencREOP_FOURTH',
 'classnyh_REST',
 'pvd',
 'chf',
 'cvdcarsten_RIGHT',
 'cvdtia',
 'status_EMERGENCY',
 'cvdstenlft_100%',
 'cvdstenrt_80-99%',
 'cvdstenrt_100%',
 'arrhyafib',
 'medinotr',
 'vdaort',
 'prvalve',
 'carshock24',
 'gender',
 'infendo',
 'vdinsufm_MODERATE',
 'vdinsuft_MODERATE',
 'cvdcarsten_LEFT',
 'cvdpcarsurg',
 'cvdstenlft_80-99%',
 'vdinsufm_MILD',
 'vdinsuft_MILD',
 'infendty',
 'diabetes',
 'priorhf',
 'vdinsufm_SEVERE',
 'prcvint',
 'diabctrl',
 'chrlungd',
 'raceblack',
 'surgdt_DayOfWeek_Mon',
 'mediastrad',
 'incidencREOP_FIRST']

In [59]:
len(sig_features), feature_matrix.shape

(47, (42740, 109))

- creating a mask of `sig_features` for `feature_matrix`
- used to select features in the same general order as the columns of `feature_matrix` instead of by `p-value` rank

In [60]:
# significant feature names, listed in the column order of `feature_matrix`
auto_select_mask = feature_matrix.columns[feature_matrix.columns.isin(sig_features)].tolist()

In [61]:
auto_select_mask

['age',
 'heightcm',
 'weightkg',
 'hct',
 'creatlst',
 'totalbumin',
 'a1clvl',
 'meldscr',
 'hdef',
 'pasys',
 'surgdt_DayOfWeek_Mon',
 'gender',
 'raceblack',
 'diabetes',
 'infendo',
 'mediastrad',
 'pvd',
 'cvd',
 'cva',
 'cvdtia',
 'cvdpcarsurg',
 'prcvint',
 'prvalve',
 'chf',
 'priorhf',
 'arrhyafib',
 'medinotr',
 'vdaort',
 'diabctrl',
 'infendty',
 'chrlungd',
 'carshock24',
 'classnyh_REST',
 'vdinsufm_MILD',
 'vdinsufm_MODERATE',
 'vdinsufm_SEVERE',
 'vdinsuft_MILD',
 'vdinsuft_MODERATE',
 'incidencREOP_FIRST',
 'incidencREOP_FOURTH',
 'status_EMERGENCY',
 'cvdcarsten_RIGHT',
 'cvdcarsten_LEFT',
 'cvdstenrt_80-99%',
 'cvdstenrt_100%',
 'cvdstenlft_80-99%',
 'cvdstenlft_100%']

In [62]:
len(auto_select_mask)

47

### Key Takeaways
- on `Combined` dataset, `SelectKBest` identified more features as significant relative to the same analysis run on `2.73` only
- this could be because of the increased number of observations OR more inclusive definition of stroke as encoded in `strokeBin2`

#### Can apply `auto_select_mask` to a `feature_matrix` and model using those features identified using `SelectKBest`