In [2]:
# Add your imports here!
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

In [3]:
# Read data: load the heart_2020_cleaned CSV into a DataFrame. The path is relative,
# so the notebook must be run from the directory that contains the "Data" folder.
# Later cells expect columns such as HeartDisease, Smoking, Race and Diabetic.
data = pd.read_csv("Data/heart_2020_cleaned.csv")

In [4]:
# Encode the Yes/No columns as 1/0. "Diabetic" is deliberately left out here:
# it has more than two answers and is one-hot encoded in a later cell.
yes_no_codes = {"Yes": 1, "No": 0}
yes_no_columns = [
    "Smoking", "AlcoholDrinking", "DiffWalking", "Stroke", "PhysicalActivity",
    "Asthma", "KidneyDisease", "SkinCancer", "HeartDisease",
]
for column in yes_no_columns:
    # Series.map turns any value missing from the mapping into NaN.
    data[column] = data[column].map(yes_no_codes)

# Sex is also two-valued, but uses its own labels.
data["Sex"] = data["Sex"].map({"Female": 1, "Male": 0})

# Ordinal encodings: each label is replaced by its position in the ordered list.
age_labels = [
    "18-24", "25-29", "30-34", "35-39", "40-44", "45-49", "50-54",
    "55-59", "60-64", "65-69", "70-74", "75-79", "80 or older",
]
data["AgeCategory"] = data["AgeCategory"].map(
    {label: rank for rank, label in enumerate(age_labels)}
)

health_labels = ["Poor", "Fair", "Good", "Very good", "Excellent"]
data["GenHealth"] = data["GenHealth"].map(
    {label: rank for rank, label in enumerate(health_labels)}
)

In [5]:
# One-hot encode the two multi-valued categorical columns. No prefix is used, so
# the indicator columns are named after the raw category labels.
race_dummies, diabetes_dummies = (
    pd.get_dummies(data[source_col]) for source_col in ("Race", "Diabetic")
)

In [6]:
# Append the indicator columns (race first, then diabetes) and remove the two
# original categorical columns they replace, then list the resulting columns.
data = pd.concat([data, race_dummies, diabetes_dummies], axis=1).drop(
    columns=['Race', 'Diabetic']
)
data.columns

Index(['HeartDisease', 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke',
       'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory',
       'PhysicalActivity', 'GenHealth', 'SleepTime', 'Asthma', 'KidneyDisease',
       'SkinCancer', 'American Indian/Alaskan Native', 'Asian', 'Black',
       'Hispanic', 'Other', 'White', 'No', 'No, borderline diabetes', 'Yes',
       'Yes (during pregnancy)'],
      dtype='object')

In [7]:
# Display the fully encoded frame (pandas truncates the rendering for large frames).
data

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,...,American Indian/Alaskan Native,Asian,Black,Hispanic,Other,White,No,"No, borderline diabetes",Yes,Yes (during pregnancy)
0,0,16.60,1,0,0,3.0,30.0,0,1,7,...,0,0,0,0,0,1,0,0,1,0
1,0,20.34,0,0,1,0.0,0.0,0,1,12,...,0,0,0,0,0,1,1,0,0,0
2,0,26.58,1,0,0,20.0,30.0,0,0,9,...,0,0,0,0,0,1,0,0,1,0
3,0,24.21,0,0,0,0.0,0.0,0,1,11,...,0,0,0,0,0,1,1,0,0,0
4,0,23.71,0,0,0,28.0,0.0,1,1,4,...,0,0,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,1,27.41,1,0,0,7.0,0.0,1,0,8,...,0,0,0,1,0,0,0,0,1,0
319791,0,29.84,1,0,0,0.0,0.0,0,0,3,...,0,0,0,1,0,0,1,0,0,0
319792,0,24.24,0,0,0,0.0,0.0,0,1,5,...,0,0,0,1,0,0,1,0,0,0
319793,0,32.81,0,0,0,0.0,0.0,0,1,1,...,0,0,0,1,0,0,1,0,0,0


In [8]:
# Rank every column by the absolute value of its correlation with the target,
# weakest first. The target's own self-correlation is part of the result.
hd_corr = (
    data.corr()
    .loc[:, 'HeartDisease']
    .abs()
    .sort_values(ascending=True)
)
hd_corr

Other                             0.003039
SleepTime                         0.008327
American Indian/Alaskan Native    0.008547
Black                             0.010156
Yes (during pregnancy)            0.013930
No, borderline diabetes           0.016182
MentalHealth                      0.028591
Asian                             0.030262
AlcoholDrinking                   0.032080
Hispanic                          0.036163
White                             0.040121
Asthma                            0.041444
BMI                               0.051803
Sex                               0.070040
SkinCancer                        0.093317
PhysicalActivity                  0.100030
Smoking                           0.107764
KidneyDisease                     0.145197
PhysicalHealth                    0.170721
No                                0.170977
Yes                               0.183072
Stroke                            0.196835
DiffWalking                       0.201258
AgeCategory

In [9]:
# hd_corr is sorted ascending, so its first entries are the columns least
# correlated with HeartDisease; the 14 weakest are marked for removal.
n_weakest = 14
cols_to_drop = hd_corr.index.values[:n_weakest]

In [10]:
# Remove the columns listed in cols_to_drop and show what is left.
# Re-running this cell without re-running the earlier ones fails, because the
# columns are already gone from `data`.
data = data.drop(cols_to_drop, axis=1)
data

Unnamed: 0,HeartDisease,Smoking,Stroke,PhysicalHealth,DiffWalking,AgeCategory,PhysicalActivity,GenHealth,KidneyDisease,SkinCancer,No,Yes
0,0,1,0,3.0,0,7,1,3,0,1,0,1
1,0,0,1,0.0,0,12,1,3,0,0,1,0
2,0,1,0,20.0,0,9,1,1,0,0,0,1
3,0,0,0,0.0,0,11,0,2,0,1,1,0
4,0,0,0,28.0,1,4,1,3,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
319790,1,1,0,7.0,1,8,0,1,0,0,0,1
319791,0,1,0,0.0,0,3,1,3,0,0,1,0
319792,0,0,0,0.0,0,5,1,2,0,0,1,0
319793,0,0,0,0.0,0,1,0,2,0,0,1,0


In [11]:
# 'Yes' is the indicator column produced by one-hot encoding Diabetic.
# NOTE(review): presumably dropped because it is largely redundant with the
# 'No' indicator that is kept - confirm the intent.
data = data.drop(columns=['Yes'])

In [12]:
# Display the reduced feature frame after the 'Yes' indicator was removed.
data

Unnamed: 0,HeartDisease,Smoking,Stroke,PhysicalHealth,DiffWalking,AgeCategory,PhysicalActivity,GenHealth,KidneyDisease,SkinCancer,No
0,0,1,0,3.0,0,7,1,3,0,1,0
1,0,0,1,0.0,0,12,1,3,0,0,1
2,0,1,0,20.0,0,9,1,1,0,0,0
3,0,0,0,0.0,0,11,0,2,0,1,1
4,0,0,0,28.0,1,4,1,3,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
319790,1,1,0,7.0,1,8,0,1,0,0,0
319791,0,1,0,0.0,0,3,1,3,0,0,1
319792,0,0,0,0.0,0,5,1,2,0,0,1
319793,0,0,0,0.0,0,1,0,2,0,0,1


In [13]:
# Balance the classes by random undersampling: keep every heart-disease row and
# draw an equally sized random sample from the rows without heart disease.
class_counts = data['HeartDisease'].value_counts()

# Look the counts up by class label. Unpacking value_counts() positionally only
# works while class 0 happens to be the most frequent one.
no_hd_count = int(class_counts.loc[0])
hd_count = int(class_counts.loc[1])

no_hd = data[data['HeartDisease'] == 0]
hd = data[data['HeartDisease'] == 1]

# Fixed seed so that Restart & Run All reproduces the same sample.
no_hd_sample = no_hd.sample(n=hd_count, random_state=42)

In [14]:
# Stack the sampled negatives on top of all positives (row-wise concatenation;
# the original row labels are kept).
balanced_parts = [no_hd_sample, hd]
undersampled_df = pd.concat(balanced_parts)

In [15]:
# Display the class-balanced frame: sampled HeartDisease == 0 rows first,
# followed by all HeartDisease == 1 rows.
undersampled_df

Unnamed: 0,HeartDisease,Smoking,Stroke,PhysicalHealth,DiffWalking,AgeCategory,PhysicalActivity,GenHealth,KidneyDisease,SkinCancer,No
137309,0,0,0,0.0,0,2,1,4,0,0,1
159981,0,0,0,0.0,0,11,1,3,0,0,0
280254,0,1,0,0.0,0,0,1,3,0,0,1
60548,0,0,0,0.0,0,3,1,2,0,0,0
102826,0,0,0,2.0,0,4,1,3,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
319765,1,0,1,30.0,1,7,1,0,0,0,0
319767,1,1,0,0.0,1,11,1,2,0,0,0
319781,1,1,0,0.0,0,3,1,3,0,0,1
319786,1,1,0,0.0,0,8,1,3,0,0,0


In [16]:
# Separate features from the target and hold out 20% of the rows for testing.
X = undersampled_df.drop(columns='HeartDisease')
y = undersampled_df['HeartDisease']

# random_state makes the split reproducible across kernel restarts; stratify
# keeps the class proportions the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [17]:
# Baseline: an SVC with default hyper-parameters, scored by accuracy on the
# held-out rows. The recorded run of this cell ended in a KeyboardInterrupt.
model = SVC().fit(X_train, y_train)  # fit() returns the fitted estimator itself
y_preds = model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_preds)

KeyboardInterrupt: 

In [0]:
# Exhaustive search over C and gamma for an RBF-kernel SVC: 4 x 4 = 16 candidates,
# each cross-validated (GridSearchCV's default is 5 folds for a classifier).
param_grid = {'C': [0.1, 1, 10, 100],
              'gamma': [1, 0.1, 0.01, 0.001],
              'kernel': ['rbf']}

# n_jobs=-1 spreads the independent fits over all CPU cores. Run serially, this
# search is many times more work than the single baseline fit.
# refit=True retrains the best candidate on the full training set afterwards.
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3, n_jobs = -1)

grid.fit(X_train, y_train)

In [0]:
# Show the refitted best model. The attribute is `best_estimator_` (singular);
# `best_estimators_` does not exist on GridSearchCV and raises AttributeError.
grid.best_estimator_

In [0]:
# NOTE(review): `x` is not defined anywhere in the visible notebook, so this cell
# would raise NameError on a fresh kernel - looks like leftover scratch; confirm
# and remove.
x