In [5]:
import pandas as pd
import altair as alt
import numpy as np
import sklearn
from sklearn.compose import make_column_transformer
from sklearn.metrics import confusion_matrix
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_validate,
    train_test_split,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

The last column, initially referred to as "num", has been renamed to "heart disease presence", and refers to the heart disease status of the patient. This status ranges from 0-4, 0 indicating no presence of heart disease.

In [41]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
hd_data = pd.read_csv(url, header=None)
hd_data.columns = ["age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", "exang", "oldpeak", "slope", "ca", "thal", "heart disease presence"]
hd_data




Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heart disease presence
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,2
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,3
301,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,1


In [42]:
class_distribution = pd.DataFrame({'0':'164',
                 '1':'55', '2':'36', '3':'35', '4':'14', 'Total':'303'}, index=["Cleveland"])
class_distribution

Unnamed: 0,0,1,2,3,4,Total
Cleveland,164,55,36,35,14,303


For our project, we will split 75% of the data to use as our training data and the other 25% as our testing data.

In [43]:
hd_train, hd_test = train_test_split(hd_data, test_size=0.25, random_state=123) 
hd_train

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heart disease presence
36,43.0,1.0,4.0,120.0,177.0,0.0,2.0,120.0,1.0,2.5,2.0,0.0,7.0,3
148,45.0,1.0,2.0,128.0,308.0,0.0,2.0,170.0,0.0,0.0,1.0,0.0,3.0,0
21,58.0,0.0,1.0,150.0,283.0,1.0,2.0,162.0,0.0,1.0,1.0,0.0,3.0,0
187,66.0,1.0,2.0,160.0,246.0,0.0,0.0,120.0,1.0,0.0,2.0,3.0,6.0,2
161,77.0,1.0,4.0,125.0,304.0,0.0,2.0,162.0,1.0,0.0,1.0,3.0,3.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,59.0,1.0,4.0,140.0,177.0,0.0,0.0,162.0,1.0,0.0,1.0,1.0,7.0,2
83,68.0,1.0,3.0,180.0,274.0,1.0,2.0,150.0,1.0,1.6,2.0,0.0,7.0,3
17,54.0,1.0,4.0,140.0,239.0,0.0,0.0,160.0,0.0,1.2,1.0,0.0,3.0,0
230,52.0,0.0,3.0,136.0,196.0,0.0,2.0,169.0,0.0,0.1,2.0,0.0,3.0,0


In [45]:

explore_hd_grouped = (hd_train.groupby('heart disease presence').count())
explore_hd = explore_hd_grouped[["age"]].rename(columns={"age":"count"})
explore_hd = explore_hd.assign(
    percentage=100*explore_hd['count']/len(hd_train)
)
explore_hd

Unnamed: 0_level_0,count,percentage
heart disease presence,Unnamed: 1_level_1,Unnamed: 2_level_1
0,122,53.744493
1,39,17.180617
2,28,12.334802
3,27,11.894273
4,11,4.845815


In [55]:
missing_hd = hd_train.shape[0] - hd_train.dropna().shape[0]
print("Number of rows with missing data =", missing_hd)

Number of rows with missing data = 0
