In [3]:
import pandas as pd
import altair as alt
import numpy as np
import sklearn
from sklearn.compose import make_column_transformer
from sklearn.metrics import confusion_matrix
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_validate,
    train_test_split,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

The dataset, directly read from the web:

In [141]:
# Processed Cleveland heart-disease file from the UCI repository.
# The file has no header row (header=None), so the columns are numbered 0-13
# here and only receive descriptive names in the cleaning step further down.
# Rows containing the "?" marker are filtered out in that same cleaning step.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
hd_original_data = pd.read_csv(url, header=None)
hd_original_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,2
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,3
301,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,1


To clean and wrangle this dataset into a tidy format, we give the columns descriptive names and drop incomplete rows. The last column, called "num" in the original dataset, is renamed to `heart_disease_presence` and records the heart disease status of the patient; it ranges from 0 to 4, with 0 indicating no heart disease. The original column "thalach" is renamed to `max_heart_rate` and records the maximum heart rate achieved, and the resting blood pressure column is named `trestbps(systolic)`. Missing values appear as "?" in the raw file; every row containing one is dropped.

In [150]:
# Name the 14 raw columns. The frame is renamed in place (as before) so that any
# later cell referring to hd_original_data by column name keeps working.
hd_original_data.columns = [
    "age", "sex", "cp", "trestbps(systolic)", "chol", "fbs", "restecg",
    "max_heart_rate", "exang", "oldpeak", "slope", "ca", "thal",
    "heart_disease_presence",
]
# The diagnosis (0-4) is a class label, not a quantity, so store it as categorical.
hd_original_data["heart_disease_presence"] = pd.Categorical(
    hd_original_data["heart_disease_presence"]
)

# The raw file marks missing values with "?". Test every column in one pass
# instead of listing them by hand, so no column can be forgotten.
has_missing_value = hd_original_data.eq("?").any(axis=1)

# Any column that contained "?" was parsed as text, so its surviving values are
# strings such as "3.0". Cast every feature to float so all features are
# numeric; the categorical label is left untouched.
feature_columns = hd_original_data.columns.drop("heart_disease_presence")
hd_data = hd_original_data.loc[~has_missing_value].astype(
    {column: "float64" for column in feature_columns}
)
hd_data


Unnamed: 0,age,sex,cp,trestbps(systolic),chol,fbs,restecg,max_heart_rate,exang,oldpeak,slope,ca,thal,heart_disease_presence
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,57.0,0.0,4.0,140.0,241.0,0.0,0.0,123.0,1.0,0.2,2.0,0.0,7.0,1
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,2
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,3


For our project, we will use 75% of the data as our training set and hold out the remaining 25% as our test set.

In [151]:
# Hold out a quarter of the cleaned rows for testing; the fixed random_state
# makes the split reproducible across runs.
hd_train, hd_test = train_test_split(
    hd_data,
    test_size=0.25,
    random_state=123,
)
hd_train

Unnamed: 0,age,sex,cp,trestbps(systolic),chol,fbs,restecg,max_heart_rate,exang,oldpeak,slope,ca,thal,heart_disease_presence
278,57.0,1.0,2.0,154.0,232.0,0.0,2.0,164.0,0.0,0.0,1.0,1.0,3.0,1
259,57.0,1.0,2.0,124.0,261.0,0.0,0.0,141.0,0.0,0.3,1.0,0.0,7.0,1
7,57.0,0.0,4.0,120.0,354.0,0.0,0.0,163.0,1.0,0.6,1.0,0.0,3.0,0
186,42.0,1.0,3.0,120.0,240.0,1.0,0.0,194.0,0.0,0.8,3.0,0.0,7.0,0
172,59.0,0.0,4.0,174.0,249.0,0.0,0.0,143.0,1.0,0.0,2.0,0.0,3.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,57.0,1.0,3.0,128.0,229.0,0.0,2.0,150.0,0.0,0.4,2.0,1.0,7.0,1
83,68.0,1.0,3.0,180.0,274.0,1.0,2.0,150.0,1.0,1.6,2.0,0.0,7.0,3
17,54.0,1.0,4.0,140.0,239.0,0.0,0.0,160.0,0.0,1.2,1.0,0.0,3.0,0
233,74.0,0.0,2.0,120.0,269.0,0.0,2.0,121.0,1.0,0.2,1.0,1.0,3.0,0


Table of the training data, with missing values dropped, keeping only our predictors (systolic resting blood pressure, cholesterol, and maximum heart rate) and the heart disease presence:

In [152]:
# Keep only the three predictors and the class label.
# Rows with the "?" missing-value marker were already removed when hd_data was
# built, and hd_train is a subset of hd_data, so no further filtering is needed.
hd_train_predictors = hd_train[
    ["trestbps(systolic)", "chol", "max_heart_rate", "heart_disease_presence"]
].copy()
hd_train_predictors

Unnamed: 0,trestbps(systolic),chol,max_heart_rate,heart_disease_presence
278,154.0,232.0,164.0,1
259,124.0,261.0,141.0,1
7,120.0,354.0,163.0,0
186,120.0,240.0,194.0,0
172,174.0,249.0,143.0,1
...,...,...,...,...
107,128.0,229.0,150.0,1
83,180.0,274.0,150.0,3
17,140.0,239.0,160.0,0
233,120.0,269.0,121.0,0


In [153]:

# Class balance of the training set: rows per diagnosis level and their share.
# observed=False keeps every category of the label in the result, including
# any that happen to have no rows in the training split.
class_groups = hd_train.groupby("heart_disease_presence", observed=False)

# Per-column non-null counts for each class (kept for reference).
explore_hd_grouped = class_groups.count()

# Use the group size (number of rows) rather than the non-null count of one
# arbitrary column, so the numerator matches the len(hd_train) denominator.
explore_hd = class_groups.size().to_frame(name="count")
explore_hd = explore_hd.assign(
    percentage=100 * explore_hd["count"] / len(hd_train)
)
explore_hd

Unnamed: 0_level_0,count,percentage
heart_disease_presence,Unnamed: 1_level_1,Unnamed: 2_level_1
0,120,54.054054
1,43,19.369369
2,24,10.810811
3,25,11.261261
4,10,4.504505


In [154]:
# Average of each predictor over the training set, first as a single-column
# frame (one row per predictor) ...
hd_mean = (
    hd_train_predictors[["trestbps(systolic)", "chol", "max_heart_rate"]]
    .mean()
    .to_frame()
)
# ... then flipped into one row labelled "mean" for display.
hd_mean_transposed = hd_mean.T.set_axis(["mean"], axis="index")
hd_mean_transposed



Unnamed: 0,trestbps(systolic),chol,max_heart_rate
mean,132.490991,246.022523,149.756757


In [155]:
# Scatter plot of cholesterol (x) against systolic resting blood pressure (y)
# on the training data, one point per patient, coloured by diagnosis level.
chol_vs_restbps = alt.Chart(hd_train_predictors).mark_circle().encode(
    alt.X("chol", title="Serum cholesterol level (mg/dl)"),
    alt.Y(
        "trestbps(systolic)",
        title="Systolic resting blood pressure (mm Hg)",
        # Do not force the axis to start at zero.
        scale=alt.Scale(zero=False),
    ),
    alt.Color(
        "heart_disease_presence",
        title="Heart Disease Presence",
        scale=alt.Scale(scheme="dark2"),
    ),
)
chol_vs_restbps

  for col_name, dtype in df.dtypes.iteritems():


In [156]:
# Scatter plot of cholesterol (x) against maximum heart rate (y) on the
# training data, coloured by diagnosis level.
# It uses the predictor-only frame and the "dark2" colour scheme, matching the
# cholesterol vs. blood pressure chart, so each diagnosis level keeps the same
# colour across figures and only the plotted columns are embedded in the chart.
chol_vs_max_htrt = (
    alt.Chart(hd_train_predictors)
    .mark_circle()
    .encode(
        x=alt.X("chol", title="Serum cholesterol level (mg/dl)"),
        y=alt.Y(
            "max_heart_rate",
            title="Maximum heart rate (BPM)",
            # Do not force the axis to start at zero.
            scale=alt.Scale(zero=False),
        ),
        color=alt.Color(
            "heart_disease_presence",
            title="Heart Disease Presence",
            scale=alt.Scale(scheme="dark2"),
        ),
    )
)
chol_vs_max_htrt

In [157]:
# Scatter plot of systolic resting blood pressure (x) against maximum heart
# rate (y) on the training data, coloured by diagnosis level.
# It uses the predictor-only frame and the "dark2" colour scheme, matching the
# cholesterol vs. blood pressure chart, so each diagnosis level keeps the same
# colour across figures. Neither axis is forced to start at zero, which is how
# the blood-pressure axis is already treated in that chart.
restbps_vs_max_htrt = (
    alt.Chart(hd_train_predictors)
    .mark_circle()
    .encode(
        x=alt.X(
            "trestbps(systolic)",
            title="Systolic resting blood pressure (mm Hg)",
            scale=alt.Scale(zero=False),
        ),
        y=alt.Y(
            "max_heart_rate",
            title="Maximum heart rate (BPM)",
            scale=alt.Scale(zero=False),
        ),
        color=alt.Color(
            "heart_disease_presence",
            title="Heart Disease Presence",
            scale=alt.Scale(scheme="dark2"),
        ),
    )
)
restbps_vs_max_htrt