# Analysis

In [115]:
import os

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import seaborn as sns

In [70]:
# Make the data directory the working directory and remember its absolute path.
# Relative paths used later in the notebook (e.g. '../models/...') are resolved
# against this working directory, not against the notebook's own location.
os.chdir('../data')
DATA_DIR = os.getcwd()

## Convert data to proper dtypes

In [71]:
# Column groups, keyed by the dtype each column should be read with.
float16_cols = ['oldpeak_eq_st_depression']
categorical_cols = ['thal']
uint8_cols = ['sex', 'fasting_blood_sugar_gt_120_mg_per_dl', 'exercise_induced_angina', 'slope_of_peak_exercise_st_segment', 'resting_blood_pressure', 'chest_pain_type', 'num_major_vessels', 'resting_ekg_results', 'age', 'max_heart_rate_achieved']
uint16_cols = ['serum_cholesterol_mg_per_dl']

# Map every column name to its dtype for pd.read_csv(dtype=...).
# The builtin `object` replaces `np.object`: the alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
dtype_dict = {
    col: dtype
    for cols, dtype in (
        (float16_cols, np.float16),
        (categorical_cols, object),
        (uint8_cols, np.uint8),
        (uint16_cols, np.uint16),
    )
    for col in cols
}

In [72]:
# Load training features, training labels and test features, in that order.
# The same dtype mapping is passed to every read.
raw_files = ('/raw/train_values.csv', '/raw/train_labels.csv', '/raw/test_values.csv')
df_x, df_y, df_x_test = [
    pd.read_csv(DATA_DIR + rel_path, dtype=dtype_dict)
    for rel_path in raw_files
]

In [73]:
# Confirm that the columns were read with the dtypes requested in dtype_dict.
df_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 14 columns):
patient_id                              180 non-null object
slope_of_peak_exercise_st_segment       180 non-null uint8
thal                                    180 non-null object
resting_blood_pressure                  180 non-null uint8
chest_pain_type                         180 non-null uint8
num_major_vessels                       180 non-null uint8
fasting_blood_sugar_gt_120_mg_per_dl    180 non-null uint8
resting_ekg_results                     180 non-null uint8
serum_cholesterol_mg_per_dl             180 non-null uint16
oldpeak_eq_st_depression                180 non-null float16
sex                                     180 non-null uint8
age                                     180 non-null uint8
max_heart_rate_achieved                 180 non-null uint8
exercise_induced_angina                 180 non-null uint8
dtypes: float16(1), object(2), uint16(1), uint8(10)
memory usage

## Scale numerical data with StandardScaler() and MinMaxScaler()

In [74]:
# Continuous features that get rescaled; the remaining columns are categorical
# or binary and are encoded further down instead.
cols_for_scaler = [
    'age',
    'max_heart_rate_achieved',
    'oldpeak_eq_st_depression',
    'resting_blood_pressure',
    'serum_cholesterol_mg_per_dl',
]

In [75]:
# Standard Scaler
# Fit on the training frame only, then reuse the fitted scaler on the test
# frame so both are transformed with the training statistics.
# The columns are cast to float64 up front: they were loaded as
# uint8/uint16/float16, and handing those dtypes to the scaler makes it convert
# them itself and emit a conversion warning on every call.
std_scaler = StandardScaler()
df_x[cols_for_scaler] = std_scaler.fit_transform(df_x[cols_for_scaler].astype(np.float64))
df_x_test[cols_for_scaler] = std_scaler.transform(df_x_test[cols_for_scaler].astype(np.float64))

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  after removing the cwd from sys.path.


In [83]:
# Rescale the same columns with a MinMaxScaler fitted on the training frame and
# reused for the test frame.
# NOTE(review): when the notebook runs top to bottom, this cell is applied to
# columns that the StandardScaler cell has already transformed, so the two
# scalings are stacked. The head() outputs further down show standardized
# values, which suggests this cell was not part of the run that produced them.
# Confirm which scaler is intended and drop the other.
# NOTE(review): copy=False presumably has no visible effect here because the
# result is assigned back explicitly - confirm.
min_max_scaler = MinMaxScaler(copy=False)
df_x[cols_for_scaler] = min_max_scaler.fit_transform(df_x[cols_for_scaler])
df_x_test[cols_for_scaler] = min_max_scaler.transform(df_x_test[cols_for_scaler])



  return self.partial_fit(X, y)


In [76]:
# The data set is tiny, but store the scaled columns in the smallest float
# type that can hold them to keep the memory footprint low.
df_x[cols_for_scaler] = df_x[cols_for_scaler].apply(
    lambda col: pd.to_numeric(col, downcast='float')
)
df_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 14 columns):
patient_id                              180 non-null object
slope_of_peak_exercise_st_segment       180 non-null uint8
thal                                    180 non-null object
resting_blood_pressure                  180 non-null float32
chest_pain_type                         180 non-null uint8
num_major_vessels                       180 non-null uint8
fasting_blood_sugar_gt_120_mg_per_dl    180 non-null uint8
resting_ekg_results                     180 non-null uint8
serum_cholesterol_mg_per_dl             180 non-null float32
oldpeak_eq_st_depression                180 non-null float32
sex                                     180 non-null uint8
age                                     180 non-null float32
max_heart_rate_achieved                 180 non-null float32
exercise_induced_angina                 180 non-null uint8
dtypes: float32(5), object(2), uint8(7)
memory usage: 7.6

In [77]:
# Preview the training frame after scaling.
df_x.head()

Unnamed: 0,patient_id,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina
0,0z64un,1,normal,-0.195195,2,0,0,2,1.118269,-0.9032,1,-1.053964,0.932485,0
1,ryoo3j,2,normal,-1.25632,3,0,0,0,-0.669778,0.527267,0,-0.087134,0.387084,0
2,yt1s1x,1,normal,-0.372049,4,3,0,2,1.042182,-0.9032,1,2.383654,0.568884,1
3,l2xjde,1,reversible_defect,1.219639,4,0,0,0,-0.498582,-0.9032,1,-1.591092,1.432436,0
4,oyt4ek,3,reversible_defect,2.752375,1,0,0,2,0.395442,2.851995,1,0.449994,-0.203768,0


## One-hot encode categorical data

In [78]:
# Columns whose string labels are first mapped to integer codes.
label_enc_cols = ['thal']
# Columns expanded into dummy (one-hot) indicator columns.
one_hot_cols = [
    'slope_of_peak_exercise_st_segment',
    'num_major_vessels',
    'chest_pain_type',
    'resting_ekg_results',
    'sex',
    'thal',
]

In [79]:
# Fit the encoder on the full set of expected 'thal' labels rather than on the
# data, so the integer codes do not depend on which labels happen to be present.
label_encoder = LabelEncoder()
label_encoder.fit(['normal', 'fixed_defect', 'reversible_defect'])

LabelEncoder()

In [80]:
# Replace the 'thal' string labels with their integer codes in both the
# training and the test frame, using the encoder fitted above.
for frame in (df_x, df_x_test):
    frame[label_enc_cols] = label_encoder.transform(frame[label_enc_cols].values.ravel())

In [81]:
# Preview the training frame: 'thal' now holds integer codes.
df_x.head()

Unnamed: 0,patient_id,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina
0,0z64un,1,1,-0.195195,2,0,0,2,1.118269,-0.9032,1,-1.053964,0.932485,0
1,ryoo3j,2,1,-1.25632,3,0,0,0,-0.669778,0.527267,0,-0.087134,0.387084,0
2,yt1s1x,1,1,-0.372049,4,3,0,2,1.042182,-0.9032,1,2.383654,0.568884,1
3,l2xjde,1,2,1.219639,4,0,0,0,-0.498582,-0.9032,1,-1.591092,1.432436,0
4,oyt4ek,3,2,2.752375,1,0,0,2,0.395442,2.851995,1,0.449994,-0.203768,0


In [82]:
# One-hot encode the categorical columns.
# Train and test are concatenated first so that both end up with exactly the
# same dummy columns, then split apart again by row position.
# dtype is pinned to uint8 so the indicators are 0/1 integers regardless of the
# pandas version (newer releases default to bool, which would turn the mixed
# frame's `.values` into an object array).
# The slices are copied so each frame owns its data instead of being a view
# into `encoded`.
n_train = df_x.shape[0]
encoded = pd.get_dummies(
    pd.concat([df_x, df_x_test], axis=0),
    columns=one_hot_cols,
    drop_first=True,
    dtype=np.uint8,
)
df_x = encoded.iloc[:n_train, :].copy()
df_x_test = encoded.iloc[n_train:, :].copy()

In [83]:
# Preview the training frame after one-hot encoding.
df_x.head()

Unnamed: 0,patient_id,resting_blood_pressure,fasting_blood_sugar_gt_120_mg_per_dl,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,age,max_heart_rate_achieved,exercise_induced_angina,slope_of_peak_exercise_st_segment_2,slope_of_peak_exercise_st_segment_3,...,num_major_vessels_2,num_major_vessels_3,chest_pain_type_2,chest_pain_type_3,chest_pain_type_4,resting_ekg_results_1,resting_ekg_results_2,sex_1,thal_1,thal_2
0,0z64un,-0.195195,0,1.118269,-0.9032,-1.053964,0.932485,0,0,0,...,0,0,1,0,0,0,1,1,1,0
1,ryoo3j,-1.25632,0,-0.669778,0.527267,-0.087134,0.387084,0,1,0,...,0,0,0,1,0,0,0,0,1,0
2,yt1s1x,-0.372049,0,1.042182,-0.9032,2.383654,0.568884,1,0,0,...,0,1,0,0,1,0,1,1,1,0
3,l2xjde,1.219639,0,-0.498582,-0.9032,-1.591092,1.432436,0,0,0,...,0,0,0,0,1,0,0,1,0,1
4,oyt4ek,2.752375,0,0.395442,2.851995,0.449994,-0.203768,0,0,1,...,0,0,0,0,0,0,1,1,0,1


In [84]:
# Preview the test frame; it should show the same columns as the training frame.
df_x_test.head()

Unnamed: 0,patient_id,resting_blood_pressure,fasting_blood_sugar_gt_120_mg_per_dl,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,age,max_heart_rate_achieved,exercise_induced_angina,slope_of_peak_exercise_st_segment_2,slope_of_peak_exercise_st_segment_3,...,num_major_vessels_2,num_major_vessels_3,chest_pain_type_2,chest_pain_type_3,chest_pain_type_4,resting_ekg_results_1,resting_ekg_results_2,sex_1,thal_1,thal_2
0,olalu7,2.280764,0,0.737834,-0.724392,0.449994,0.432534,0,1,0,...,0,0,0,0,0,0,1,1,0,1
1,z9n6mx,0.394319,0,-1.259453,0.349114,-2.12822,1.477887,0,0,0,...,0,0,0,0,1,0,0,0,1,0
2,5k4413,-0.666806,0,-1.373584,1.332451,-1.268815,-1.340021,1,1,0,...,0,0,0,0,1,0,1,1,0,1
3,mrg7q5,-1.727931,0,1.308487,-0.9032,0.557419,0.477984,0,0,0,...,0,0,0,1,0,0,0,0,1,0
4,uki4do,0.394319,0,-1.582823,2.315788,0.664845,-1.112771,1,1,0,...,0,0,0,0,1,0,1,1,1,0


# Train models


## Logistic Regression

In [94]:
# Baseline logistic regression with a fixed seed for reproducibility.
log_reg = LogisticRegression(random_state=0, solver='liblinear')


In [85]:
# Build the feature matrix and the flat label vector as NumPy arrays;
# patient_id is an identifier, not a feature, so it is dropped from both.
X = df_x.drop(columns='patient_id').values
y = df_y.drop(columns='patient_id').values.ravel()

In [103]:
# Fit on the full training set.
log_reg.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [104]:
# Scored on the same X, y the model was fitted on, so this is training
# accuracy, not an estimate of generalization.
log_reg.score(X, y)

0.8666666666666667

## Random Forest

In [105]:
# Shallow random forest (100 trees, depth 2) with a fixed seed.
rf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)

In [106]:
# Fit on the full training set.
rf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [107]:
# Training accuracy: scored on the same data used for fitting.
rf.score(X, y)

0.8388888888888889

## Naive Bayes

In [108]:
# Gaussian naive Bayes with default settings.
gnb = GaussianNB()

In [109]:
# Fit on the full training set.
gnb.fit(X, y)

GaussianNB(priors=None, var_smoothing=1e-09)

In [110]:
# Training accuracy: scored on the same data used for fitting.
gnb.score(X, y)

0.7833333333333333

In [96]:
# Seed shared by every estimator that accepts a random_state.
rs = 10

# Candidate models, each kept next to its display name so the two cannot drift
# out of step; `names` and `classifiers` are derived as parallel lists.
_named_classifiers = [
    ("QDA", QuadraticDiscriminantAnalysis()),
    ("AdaBoost", AdaBoostClassifier(random_state=rs)),
    ("RF", RandomForestClassifier(max_depth=5, n_estimators=10, random_state=rs)),
    ("Gaussian Process", GaussianProcessClassifier(random_state=rs, kernel=RBF(1.0))),
    ("Logistic Regression", LogisticRegression(random_state=rs, solver='liblinear')),
    ("NB", GaussianNB()),
    ("Nearest Neighbors", KNeighborsClassifier(3)),
    ("MLP", MLPClassifier(alpha=1, random_state=rs)),
    ("Linear SVM", SVC(kernel='linear', C=0.025, random_state=rs, probability=True)),
    ("RBF SVM", SVC(gamma=2, C=1, random_state=rs, probability=True)),
    ("Decision Tree", DecisionTreeClassifier(max_depth=5, random_state=rs)),
]
names = [label for label, _ in _named_classifiers]
classifiers = [model for _, model in _named_classifiers]

In [97]:
# Stratified 10-fold split of the training data.
# random_state is deliberately not passed: without shuffle=True it has no
# effect on the folds, and recent scikit-learn releases raise a ValueError for
# that combination.
# split() returns a one-shot generator, so the folds are materialized into a
# list; otherwise re-running the cross-validation cell would iterate nothing.
kfold = list(StratifiedKFold(n_splits=10).split(X, y))

In [98]:
# One empty list of per-fold scores for every classifier name.
scores = {name: [] for name in names}

In [99]:
# Fit every candidate on each training fold and record its accuracy on the
# matching held-out fold.
for train_idx, test_idx in kfold:
    print('Length of train: {}'.format(len(train_idx)))
    print('Length of test: {}'.format(len(test_idx)))
    X_fit, y_fit = X[train_idx], y[train_idx]
    X_held_out, y_held_out = X[test_idx], y[test_idx]
    for name, clf in zip(names, classifiers):
        clf.fit(X_fit, y_fit)
        scores[name].append(clf.score(X_held_out, y_held_out))

# Report the mean accuracy across folds for each classifier.
print('='*20)
print('Average score')
for name in names:
    fold_scores = scores[name]
    print(name + ': {}'.format(sum(fold_scores)/len(fold_scores)))

Length of train: 162
Length of test: 18




Length of train: 162
Length of test: 18




Length of train: 162
Length of test: 18




Length of train: 162
Length of test: 18




Length of train: 162
Length of test: 18




Length of train: 162
Length of test: 18




Length of train: 162
Length of test: 18




Length of train: 162
Length of test: 18




Length of train: 162
Length of test: 18




Length of train: 162
Length of test: 18
Average score
QDA: 0.5833333333333334
AdaBoost: 0.7499999999999999
RF: 0.7888888888888889
Gaussian Process: 0.8277777777777777
Logistic Regression: 0.8
NB: 0.7555555555555556
Nearest Neighbors: 0.8
MLP: 0.8055555555555556
Linear SVM: 0.8333333333333333
RBF SVM: 0.5555555555555555
Decision Tree: 0.7222222222222221




In [100]:
# The linear SVM had the best cross-validated accuracy, so refit it on all of
# the training data and predict class probabilities for the test set.
X_test = df_x_test.drop(columns='patient_id').values
lin_svm = SVC(kernel='linear', C=0.025, random_state=rs, probability=True)
lin_svm.fit(X, y)
y_pred = lin_svm.predict_proba(X_test)

In [104]:
# Check the class order to know which predict_proba column belongs to which label.
lin_svm.classes_

array([0, 1])

In [106]:
# Column 1 holds the probability of the second entry of lin_svm.classes_ (label 1).
y_pred[:,1]

array([0.57870992, 0.11232051, 0.94155549, 0.05582626, 0.88239374,
       0.05618466, 0.3010724 , 0.76353629, 0.15091929, 0.08337639,
       0.16364025, 0.46432286, 0.2189881 , 0.95455555, 0.11634225,
       0.05218495, 0.0196208 , 0.09182306, 0.68895965, 0.05389297,
       0.96556769, 0.28492663, 0.12400337, 0.07161804, 0.42149319,
       0.92132273, 0.17393008, 0.13082716, 0.51416087, 0.04332829,
       0.94189655, 0.55821172, 0.5       , 0.45296222, 0.15935082,
       0.1066624 , 0.37966751, 0.27536966, 0.23178891, 0.07361597,
       0.93735342, 0.15137043, 0.93450919, 0.05978641, 0.89462   ,
       0.1068575 , 0.12050823, 0.25932882, 0.28560859, 0.5528476 ,
       0.72764799, 0.06681485, 0.96705673, 0.13541016, 0.31926133,
       0.07356099, 0.93646383, 0.10006997, 0.10839874, 0.79090138,
       0.08096919, 0.94506351, 0.12330714, 0.92286347, 0.10988302,
       0.74911329, 0.67646978, 0.53020636, 0.83693379, 0.64516353,
       0.19845651, 0.97543002, 0.96653191, 0.96527857, 0.91880

In [108]:
# Start the submission frame from the test-set patient ids.
df_submit = pd.DataFrame(data=df_x_test['patient_id'], columns=['patient_id'])

In [110]:
# Submit the predicted probability of label 1 (column 1 of predict_proba).
df_submit['heart_disease_present'] = y_pred[:,1]

In [114]:
# The relative path is resolved against the current working directory, which
# was changed to the data directory near the top of the notebook.
df_submit.to_csv('../models/sub_01.csv', index=False)

# First submission results

I trained a linear SVM with C=0.025. The numerical features were scaled with the standard scaler; the dummy-variable features were left unscaled.

The log loss result was 0.33193. At the time of submission, this placed me at 29/579, including DrivenData's benchmark logistic regression.

With just some basic preprocessing and no model optimization, I managed to get to about the top 5%.

# Model Optimization

To optimize the models, I'll use grid search (`GridSearchCV`) as implemented by scikit-learn.

In [120]:
# Linear SVM
# Parameters to adjust: C
# Coarse first pass: sweep C over powers of ten from 1e-4 to 1e3.
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
param_grid = [{'C': param_range, 'kernel': ['linear'], 'random_state': [0]}]

In [121]:
# Exhaustive search over param_grid, scored by 10-fold cross-validated accuracy.
gs = GridSearchCV(
    estimator=lin_svm,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=2,
).fit(X, y)
print(gs.best_score_)
print(gs.best_params_)

0.8444444444444444
{'C': 100.0, 'kernel': 'linear', 'random_state': 0}


In [122]:
# The first submission used C=0.025, but the coarse sweep above preferred C=100.
# Refine the search around that value: 50 to 150 in steps of 10.
param_range = [float(c) for c in range(50, 151, 10)]
param_grid = [dict(C=param_range, kernel=['linear'], random_state=[0])]

In [123]:
# Re-run the accuracy-scored 10-fold grid search on the refined C range.
gs = GridSearchCV(
    estimator=lin_svm,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=2,
).fit(X, y)
print(gs.best_score_)
print(gs.best_params_)

0.85
{'C': 50.0, 'kernel': 'linear', 'random_state': 0}


In [126]:
# The previous search settled on C=50, the lower edge of its range, so shift
# the window downwards: 10 to 70 in steps of 10.
param_range = [float(c) for c in range(10, 71, 10)]
param_grid = [dict(C=param_range, kernel=['linear'], random_state=[0])]

In [127]:
# Re-run the accuracy-scored 10-fold grid search on the shifted C range.
gs = GridSearchCV(
    estimator=lin_svm,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=2,
).fit(X, y)
print(gs.best_score_)
print(gs.best_params_)

0.85
{'C': 30.0, 'kernel': 'linear', 'random_state': 0}


In [128]:
# The previous search found C=30 to be best.
# Narrow the grid around it: 20 to 40 in steps of 2.
param_range = [float(c) for c in range(20, 41, 2)]
param_grid = [dict(C=param_range, kernel=['linear'], random_state=[0])]

In [129]:
# Final accuracy-scored 10-fold grid search on the narrowest C range.
gs = GridSearchCV(
    estimator=lin_svm,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=2,
).fit(X, y)
print(gs.best_score_)
print(gs.best_params_)

0.85
{'C': 28.0, 'kernel': 'linear', 'random_state': 0}


The search seems to have leveled off at an accuracy of 85% with C=28.0.
Re-train the model with this setting and submit for new results.


In [131]:
# Refit the linear SVM on all of the training data with the C value chosen by
# the grid search, then predict class probabilities for the test set.
X_test = df_x_test.drop(columns='patient_id').values
lin_svm = SVC(kernel='linear', C=28.0, random_state=rs, probability=True)
lin_svm.fit(X, y)
y_pred = lin_svm.predict_proba(X_test)

In [133]:
# Build the second submission: patient ids plus the predicted probability of
# label 1, written next to the first submission file.
df_submit = pd.DataFrame(
    data=df_x_test['patient_id'], columns=['patient_id']
).assign(heart_disease_present=y_pred[:, 1])
df_submit.to_csv('../models/sub_02.csv', index=False)

# Results
This second model yielded a log-loss of 0.37185, which is worse than my first submission (0.33193).

I could be overfitting the training data. The grid search optimized accuracy, while the competition is scored on log loss, so I should look at metrics other than accuracy to get a handle on how the models are behaving.