In [3]:
# Add your imports here!
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

In [4]:
# Read data
data = pd.read_csv("Data/heart_2020_cleaned.csv")

In [5]:
data["Smoking"] = data["Smoking"].map(dict(Yes=1, No=0))
data["AlcoholDrinking"] = data["AlcoholDrinking"].map(dict(Yes=1, No=0))
data["DiffWalking"] = data["DiffWalking"].map(dict(Yes=1, No=0))
data["Stroke"] = data["Stroke"].map(dict(Yes=1, No=0))
# data["Diabetic"] = data["Diabetic"].map(dict(Yes=1, No=0))
data["PhysicalActivity"] = data["PhysicalActivity"].map(dict(Yes=1, No=0))
data["Asthma"] = data["Asthma"].map(dict(Yes=1, No=0))
data["KidneyDisease"] = data["KidneyDisease"].map(dict(Yes=1, No=0))
data["SkinCancer"] = data["SkinCancer"].map(dict(Yes=1, No=0))
data["Sex"] = data["Sex"].map(dict(Female=1, Male=0))
data["HeartDisease"] = data["HeartDisease"].map(dict(Yes=1, No=0))
data["AgeCategory"] = data["AgeCategory"].map({"18-24":0, "25-29":1, "30-34":2, "35-39":3, "40-44":4, "45-49":5, "50-54":6, "55-59":7, "60-64":8, "65-69":9, "70-74":10, "75-79":11, "80 or older":12})
data["GenHealth"] = data["GenHealth"].map({"Poor":0, "Fair":1, "Good":2, "Very good":3, "Excellent":4})

In [6]:
race_dummies = pd.get_dummies(data["Race"])
diabetes_dummies = pd.get_dummies(data['Diabetic'])

In [7]:
data = pd.concat([data, race_dummies], axis = 1)
data = pd.concat([data, diabetes_dummies], axis = 1)
data = data.drop(columns = 'Race')
data = data.drop(columns = 'Diabetic')
data.columns

Index(['HeartDisease', 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke',
       'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory',
       'PhysicalActivity', 'GenHealth', 'SleepTime', 'Asthma', 'KidneyDisease',
       'SkinCancer', 'American Indian/Alaskan Native', 'Asian', 'Black',
       'Hispanic', 'Other', 'White', 'No', 'No, borderline diabetes', 'Yes',
       'Yes (during pregnancy)'],
      dtype='object')

In [8]:
data

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,...,American Indian/Alaskan Native,Asian,Black,Hispanic,Other,White,No,"No, borderline diabetes",Yes,Yes (during pregnancy)
0,0,16.60,1,0,0,3.0,30.0,0,1,7,...,0,0,0,0,0,1,0,0,1,0
1,0,20.34,0,0,1,0.0,0.0,0,1,12,...,0,0,0,0,0,1,1,0,0,0
2,0,26.58,1,0,0,20.0,30.0,0,0,9,...,0,0,0,0,0,1,0,0,1,0
3,0,24.21,0,0,0,0.0,0.0,0,1,11,...,0,0,0,0,0,1,1,0,0,0
4,0,23.71,0,0,0,28.0,0.0,1,1,4,...,0,0,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,1,27.41,1,0,0,7.0,0.0,1,0,8,...,0,0,0,1,0,0,0,0,1,0
319791,0,29.84,1,0,0,0.0,0.0,0,0,3,...,0,0,0,1,0,0,1,0,0,0
319792,0,24.24,0,0,0,0.0,0.0,0,1,5,...,0,0,0,1,0,0,1,0,0,0
319793,0,32.81,0,0,0,0.0,0.0,0,1,1,...,0,0,0,1,0,0,1,0,0,0


In [9]:
hd_corr = data.corr()['HeartDisease'].abs().sort_values()
hd_corr

Other                             0.003039
SleepTime                         0.008327
American Indian/Alaskan Native    0.008547
Black                             0.010156
Yes (during pregnancy)            0.013930
No, borderline diabetes           0.016182
MentalHealth                      0.028591
Asian                             0.030262
AlcoholDrinking                   0.032080
Hispanic                          0.036163
White                             0.040121
Asthma                            0.041444
BMI                               0.051803
Sex                               0.070040
SkinCancer                        0.093317
PhysicalActivity                  0.100030
Smoking                           0.107764
KidneyDisease                     0.145197
PhysicalHealth                    0.170721
No                                0.170977
Yes                               0.183072
Stroke                            0.196835
DiffWalking                       0.201258
AgeCategory

In [10]:
cols_to_drop = hd_corr.index[:14].values

In [11]:
data = data.drop(columns = cols_to_drop)
data

Unnamed: 0,HeartDisease,Smoking,Stroke,PhysicalHealth,DiffWalking,AgeCategory,PhysicalActivity,GenHealth,KidneyDisease,SkinCancer,No,Yes
0,0,1,0,3.0,0,7,1,3,0,1,0,1
1,0,0,1,0.0,0,12,1,3,0,0,1,0
2,0,1,0,20.0,0,9,1,1,0,0,0,1
3,0,0,0,0.0,0,11,0,2,0,1,1,0
4,0,0,0,28.0,1,4,1,3,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
319790,1,1,0,7.0,1,8,0,1,0,0,0,1
319791,0,1,0,0.0,0,3,1,3,0,0,1,0
319792,0,0,0,0.0,0,5,1,2,0,0,1,0
319793,0,0,0,0.0,0,1,0,2,0,0,1,0


In [12]:
data = data.drop(columns = 'Yes')

In [13]:
data

Unnamed: 0,HeartDisease,Smoking,Stroke,PhysicalHealth,DiffWalking,AgeCategory,PhysicalActivity,GenHealth,KidneyDisease,SkinCancer,No
0,0,1,0,3.0,0,7,1,3,0,1,0
1,0,0,1,0.0,0,12,1,3,0,0,1
2,0,1,0,20.0,0,9,1,1,0,0,0
3,0,0,0,0.0,0,11,0,2,0,1,1
4,0,0,0,28.0,1,4,1,3,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
319790,1,1,0,7.0,1,8,0,1,0,0,0
319791,0,1,0,0.0,0,3,1,3,0,0,1
319792,0,0,0,0.0,0,5,1,2,0,0,1
319793,0,0,0,0.0,0,1,0,2,0,0,1


In [15]:
no_hd_count, hd_count = data.HeartDisease.value_counts()
no_hd = data[data['HeartDisease'] == 0]
hd = data[data['HeartDisease'] == 1]
no_hd_sample = no_hd.sample(hd_count)

In [16]:
undersampled_df = pd.concat([no_hd_sample, hd], axis=0)

In [17]:
undersampled_df

Unnamed: 0,HeartDisease,Smoking,Stroke,PhysicalHealth,DiffWalking,AgeCategory,PhysicalActivity,GenHealth,KidneyDisease,SkinCancer,No
14892,0,0,0,0.0,0,11,1,3,0,0,0
192249,0,0,0,0.0,0,2,1,3,0,0,1
251411,0,0,0,12.0,0,3,1,1,0,0,1
214119,0,0,0,0.0,0,9,1,2,0,0,1
229741,0,1,0,0.0,0,6,1,4,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
319765,1,0,1,30.0,1,7,1,0,0,0,0
319767,1,1,0,0.0,1,11,1,2,0,0,0
319781,1,1,0,0.0,0,3,1,3,0,0,1
319786,1,1,0,0.0,0,8,1,3,0,0,0


In [19]:
X = undersampled_df.drop(columns = 'HeartDisease')
y = undersampled_df['HeartDisease']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [77]:
model = SVC()
model.fit(X_train, y_train)
y_preds = model.predict(X_test)
accuracy_score(y_test, y_preds)

0.750593607305936

In [78]:
param_grid = {'C': [0.1, 1, 10, 100],
              'gamma': [1, 0.1, 0.01, 0.001],
              'kernel': ['rbf']}

grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)

grid.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.739 total time= 1.6min


[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.739 total time= 1.6min


[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.734 total time= 1.8min


[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.734 total time= 1.6min


[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.736 total time= 1.6min


[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.759 total time= 1.1min


[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.753 total time= 1.2min


[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.749 total time= 1.1min


[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.746 total time= 1.2min


[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.751 total time= 1.1min


[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.757 total time= 1.1min


[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.752 total time= 1.1min


[CV 3/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.748 total time= 1.1min


[CV 4/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.742 total time= 1.1min


[CV 5/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.746 total time= 1.1min


[CV 1/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.749 total time= 1.1min


[CV 2/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.746 total time= 1.1min


[CV 3/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.741 total time= 1.1min


[CV 4/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.736 total time= 1.1min


[CV 5/5] END ....C=0.1, gamma=0.001, kernel=rbf;, score=0.741 total time= 1.1min


[CV 1/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.750 total time= 2.2min


[CV 2/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.748 total time= 2.2min


[CV 3/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.743 total time= 2.2min


[CV 4/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.740 total time= 2.2min


[CV 5/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.744 total time= 2.2min


[CV 1/5] END ........C=1, gamma=0.1, kernel=rbf;, score=0.761 total time= 1.5min


[CV 2/5] END ........C=1, gamma=0.1, kernel=rbf;, score=0.755 total time= 1.4min


[CV 3/5] END ........C=1, gamma=0.1, kernel=rbf;, score=0.750 total time= 1.7min


[CV 4/5] END ........C=1, gamma=0.1, kernel=rbf;, score=0.750 total time= 2.9min


[CV 5/5] END ........C=1, gamma=0.1, kernel=rbf;, score=0.753 total time= 2.9min


[CV 1/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.759 total time= 1.8min


[CV 2/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.753 total time= 2.0min


[CV 3/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.749 total time= 2.0min


[CV 4/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.748 total time= 2.0min


[CV 5/5] END .......C=1, gamma=0.01, kernel=rbf;, score=0.751 total time= 1.8min


[CV 1/5] END ......C=1, gamma=0.001, kernel=rbf;, score=0.758 total time= 2.1min


[CV 2/5] END ......C=1, gamma=0.001, kernel=rbf;, score=0.754 total time= 1.8min


[CV 3/5] END ......C=1, gamma=0.001, kernel=rbf;, score=0.749 total time= 2.1min


[CV 4/5] END ......C=1, gamma=0.001, kernel=rbf;, score=0.747 total time= 1.8min


[CV 5/5] END ......C=1, gamma=0.001, kernel=rbf;, score=0.749 total time= 2.1min


[CV 1/5] END .........C=10, gamma=1, kernel=rbf;, score=0.733 total time= 6.0min


[CV 2/5] END .........C=10, gamma=1, kernel=rbf;, score=0.731 total time= 3.4min


In [0]:
grid.best_estimators_

In [27]:

from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier().fit(X_train, y_train)


<bound method ForestClassifier.predict of RandomForestClassifier()>

In [23]:
y_pred = model.predict(X_test)

In [24]:
accuracy_score(y_test, y_pred)

0.73662100456621