https://www.drivendata.org/competitions/54/machine-learning-with-a-heart/page/107/

In [60]:
%pylab inline

#%matplotlib inline

from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

#DATA_DIR = Path('..', 'data', 'final', 'public')
DATA_DIR = Path('.')

train_values = pd.read_csv(DATA_DIR / 'train_values.csv', index_col='patient_id')
train_labels = pd.read_csv(DATA_DIR / 'train_labels.csv', index_col='patient_id')


Populating the interactive namespace from numpy and matplotlib


In [39]:
print(train_values.max_heart_rate_achieved.unique())



[170 158 162 181 145 150 157 112 140 151 178 152 182 126 175 144 202 147
 142 138 143 115 159 184 155 123 168 114 154 165 186 173 163 121 161 137
 172 130 167 141 166 125 103 120 132 169 179  99 177 160 156 109 139 134
 113 149 174 131 148 153 133 122 105 106 192 108  96 171 180 188 111  97
 117]


In [8]:
train_values.describe()

Unnamed: 0,slope_of_peak_exercise_st_segment,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina
count,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0
mean,1.55,131.311111,3.155556,0.694444,0.161111,1.05,249.211111,1.01,0.688889,54.811111,149.483333,0.316667
std,0.618838,17.010443,0.938454,0.969347,0.368659,0.998742,52.717969,1.121357,0.464239,9.334737,22.063513,0.466474
min,1.0,94.0,1.0,0.0,0.0,0.0,126.0,0.0,0.0,29.0,96.0,0.0
25%,1.0,120.0,3.0,0.0,0.0,0.0,213.75,0.0,0.0,48.0,132.0,0.0
50%,1.0,130.0,3.0,0.0,0.0,2.0,245.5,0.8,1.0,55.0,152.0,0.0
75%,2.0,140.0,4.0,1.0,0.0,2.0,281.25,1.6,1.0,62.0,166.25,1.0
max,3.0,180.0,4.0,3.0,1.0,2.0,564.0,6.2,1.0,77.0,202.0,1.0


In [49]:
train_values.keys()

Index(['slope_of_peak_exercise_st_segment', 'thal', 'resting_blood_pressure',
       'chest_pain_type', 'num_major_vessels',
       'fasting_blood_sugar_gt_120_mg_per_dl', 'resting_ekg_results',
       'serum_cholesterol_mg_per_dl', 'oldpeak_eq_st_depression', 'sex', 'age',
       'max_heart_rate_achieved', 'exercise_induced_angina'],
      dtype='object')

In [11]:
train_labels.head()

Unnamed: 0_level_0,heart_disease_present
patient_id,Unnamed: 1_level_1
0z64un,0
ryoo3j,0
yt1s1x,1
l2xjde,1
oyt4ek,0


In [12]:
y = train_labels['heart_disease_present'].values
y

array([0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 0], dtype=int64)

### Converting categorical data to numerical features


In [61]:
def cat_to_num(data):
    categories = unique(data)
    features = []
    for cat in categories:
        binary = (data == cat)
        features.append(binary.astype("int"))
    return features

def cat_proc(df): 
    arr = cat_to_num(df['slope_of_peak_exercise_st_segment'])
    df['slope_up'] = arr[0]
    df['slope_flat'] = arr[1]
    df['slope_down'] = arr[2]
    df.drop('slope_of_peak_exercise_st_segment', axis=1, inplace=True)

    arr = cat_to_num(df['chest_pain_type'])
    df['cp_1'] = arr[0]
    df['cp_2'] = arr[1]
    df['cp_3'] = arr[2]
    df['cp_4'] = arr[3]
    df.drop('chest_pain_type', axis=1, inplace=True)

    arr = cat_to_num(df['thal'])
    df['thal_fixd'] = arr[0]
    df['thal_norm'] = arr[1]
    df['thal_revd'] = arr[2]
    df.drop('thal', axis=1, inplace=True)

    arr = cat_to_num(df['sex'])
    df['sex_f'] = arr[0]
    df['sex_m'] = arr[1]
    df.drop('sex', axis=1, inplace=True)

    arr = cat_to_num(df['exercise_induced_angina'])
    df['exercise_induced_angina_0'] = arr[0]
    df['exercise_induced_angina_1'] = arr[1]
    df.drop('exercise_induced_angina', axis=1, inplace=True)

    arr = cat_to_num(df['fasting_blood_sugar_gt_120_mg_per_dl'])
    df['fasting_blood_sugar_gt_120_mg_per_dl_0'] = arr[0]
    df['fasting_blood_sugar_gt_120_mg_per_dl_1'] = arr[1]
    df.drop('fasting_blood_sugar_gt_120_mg_per_dl', axis=1, inplace=True)

    arr = cat_to_num(df['resting_ekg_results'])
    df['resting_ekg_results_0'] = arr[0]
    df['resting_ekg_results_1'] = arr[1]
    df['resting_ekg_results_2'] = arr[2]
    df.drop('resting_ekg_results', axis=1, inplace=True)

cat_proc(train_values)

In [62]:
train_values.keys()

Index(['resting_blood_pressure', 'num_major_vessels',
       'serum_cholesterol_mg_per_dl', 'oldpeak_eq_st_depression', 'age',
       'max_heart_rate_achieved', 'slope_up', 'slope_flat', 'slope_down',
       'cp_1', 'cp_2', 'cp_3', 'cp_4', 'thal_fixd', 'thal_norm', 'thal_revd',
       'sex_f', 'sex_m', 'exercise_induced_angina_0',
       'exercise_induced_angina_1', 'fasting_blood_sugar_gt_120_mg_per_dl_0',
       'fasting_blood_sugar_gt_120_mg_per_dl_1', 'resting_ekg_results_0',
       'resting_ekg_results_1', 'resting_ekg_results_2'],
      dtype='object')

In [63]:
selected_features = ['resting_blood_pressure', 'num_major_vessels',
       'serum_cholesterol_mg_per_dl', 'oldpeak_eq_st_depression', 'age',
       'max_heart_rate_achieved', 'slope_up', 'slope_flat', 'slope_down',
       'cp_1', 'cp_2', 'cp_3', 'cp_4', 'thal_fixd', 'thal_norm', 'thal_revd',
       'sex_f', 'sex_m', 'exercise_induced_angina_0',
       'exercise_induced_angina_1', 'fasting_blood_sugar_gt_120_mg_per_dl_0',
       'fasting_blood_sugar_gt_120_mg_per_dl_1', 'resting_ekg_results_0',
       'resting_ekg_results_1', 'resting_ekg_results_2']
train_values_subset = train_values[selected_features]


## Logistic Regression

In [64]:
# for preprocessing the data
from sklearn.preprocessing import StandardScaler

# the model
from sklearn.linear_model import LogisticRegression

# for combining the preprocess with model training
from sklearn.pipeline import Pipeline

# for optimizing parameters of the pipeline
from sklearn.model_selection import GridSearchCV

pipe = Pipeline(steps=[('scale', StandardScaler()), 
                       ('logistic', LogisticRegression())])


In [65]:
param_grid = {'logistic__C': [0.0001, 0.001, 0.01, 1, 10], 
              'logistic__penalty': ['l1', 'l2']}
gs = GridSearchCV(estimator=pipe, 
                  param_grid=param_grid, 
                  cv=3)

With the parameter grid we've created and cross-validation, we're about to test 30 different models and take the best one!

In [66]:
gs.fit(train_values_subset, train_labels.heart_disease_present)

  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  Xt = transform

  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform

  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logistic', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'logistic__C': [0.0001, 0.001, 0.01, 1, 10], 'logistic__penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

Let's look at the best parameters:

In [67]:
gs.best_params_

{'logistic__C': 1, 'logistic__penalty': 'l2'}

In [68]:
from sklearn.metrics import log_loss

in_sample_preds = gs.predict_proba(train_values[selected_features])
log_loss(train_labels.heart_disease_present, in_sample_preds)

  Xt = transform.transform(Xt)


0.3311809922625794

## Predict

In [69]:
test_values = pd.read_csv(DATA_DIR / 'test_values.csv', index_col='patient_id')

cat_proc(test_values)

test_values_subset = test_values[selected_features]

predictions = gs.predict_proba(test_values_subset)[:, 1]

submission_format = pd.read_csv(DATA_DIR / 'submission_format.csv', index_col='patient_id')

my_submission = pd.DataFrame(data=predictions,
                             columns=submission_format.columns,
                             index=submission_format.index)

my_submission.head()

  Xt = transform.transform(Xt)


Unnamed: 0_level_0,heart_disease_present
patient_id,Unnamed: 1_level_1
olalu7,0.407035
z9n6mx,0.049859
5k4413,0.964079
mrg7q5,0.019228
uki4do,0.958967


In [70]:
my_submission.to_csv('submission.csv')

In [71]:
!more submission.csv


patient_id,heart_disease_present
olalu7,0.4070347580105988
z9n6mx,0.0498588348229431
5k4413,0.9640791584919114
mrg7q5,0.01922761948080944
uki4do,0.9589665235525946
kev1sk,0.018647395292761416
9n6let,0.021293916551493576
jxmtyg,0.9732966985607755
51s2ff,0.2134246709511192
wi9mcs,0.08139131572476754
741h4l,0.19223488424853383
1ef64a,0.597819137088304
wa2ix6,0.4110423203454415
8167zl,0.9937206435756372
n6nldr,0.1277813774814942
ph85fp,0.019857683192539093
jfan5p,0.004770049455901711
7c4iz1,0.006897521822292978
ukigml,0.9436613499344035
flwvnq,0.029840135715533254
5i4fw2,0.9831470170306105
du1pqf,0.18621883717524285
vs68qz,0.19641162700066492
pfyez0,0.005673604267647336
azvkw2,0.3620791085161123
cird1i,0.9428602032821646
3bg32t,0.167455264041848
xzd050,0.15453925436540683
eyi8et,0.7107256290680442
ce4x2h,0.015724786767940375
sm91nr,0.9176461889936156
2il8hh,0.30694440267050177
yq9cqg,0.693863182681815
520v5j,0.3981134428360165
ammgu2,0.07676047345646753
jix8hj,0.034812906553572165
lj5zrq,0