In [1]:
# importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [4]:
# Location of the processed Cleveland heart-disease file on the UCI repository.
DATA_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data'

# header=None: the first line of the file is read as data, so pandas
# labels the columns with integers.
hd_df = pd.read_csv(DATA_URL, header=None)

In [6]:
# check the first 5 rows (DataFrame.head() shows 5 rows by default)
hd_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


We notice that the column headers are just numbers, which makes the data hard to interpret. To make it easier to understand and format the data, we'll replace the column numbers with the following column names:

- age
- sex
- cp, chest pain
- restbp, resting blood pressure
- chol, serum cholesterol in mg/dl
- fbs, fasting blood sugar
- restecg, resting electrocardiographic results
- thalach, maximum heart rate achieved
- exang, exercise induced angina
- oldpeak, ST depression induced by exercise relative to rest
- slope, the slope of the peak exercise ST segment
- ca, number of major vessels (0-3) colored by fluoroscopy
- thal, short for thallium heart scan
- hd, diagnosis of heart disease, the predicted attribute

In [9]:
# Replace the integer column labels with descriptive names, one per column.
column_names = [
    'age', 'sex', 'cp', 'restbp', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'hd',
]
hd_df.columns = column_names

# Print the first 5 rows to check that the column names have changed
hd_df.head()

Unnamed: 0,age,sex,cp,restbp,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,hd
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [10]:
# Inspect the dtype pandas inferred for each column
hd_df.dtypes

age        float64
sex        float64
cp         float64
restbp     float64
chol       float64
fbs        float64
restecg    float64
thalach    float64
exang      float64
oldpeak    float64
slope      float64
ca          object
thal        object
hd           int64
dtype: object

We observe that ca and thal are of type object, which suggests a mixture of different data types.

In [11]:
# List the distinct values stored in 'ca'
hd_df['ca'].unique()

array(['0.0', '3.0', '2.0', '1.0', '?'], dtype=object)

In [12]:
# List the distinct values stored in 'thal'
hd_df['thal'].unique()

array(['6.0', '3.0', '7.0', '?'], dtype=object)

We observe that ca and thal contain numbers stored as strings together with question marks; the question marks suggest missing data, which we need to handle.

### Missing data 

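Scikit-learn's classification trees cannot work with the `?` placeholders used in this dataset (and versions before 1.3 do not accept missing values at all), so the missing entries must be dealt with before fitting a tree.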

In [18]:
# Count the rows in which 'ca' or 'thal' holds the '?' placeholder.
has_placeholder = hd_df[['ca', 'thal']].eq('?').any(axis=1)
q_len = int(has_placeholder.sum())

In [24]:
# Report how many rows contain the '?' placeholder
print(f'number of items with missing values: {q_len}')

number of items with missing values: 6


We observe that 6 rows contain a question mark (`?`).

In [41]:
# Display every row that has '?' in either 'ca' or 'thal'
rows_with_placeholder = hd_df['ca'].eq('?') | hd_df['thal'].eq('?')
hd_df.loc[rows_with_placeholder]

Unnamed: 0,age,sex,cp,restbp,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,hd
87,53.0,0.0,3.0,128.0,216.0,0.0,2.0,115.0,0.0,0.0,1.0,0.0,?,0
166,52.0,1.0,3.0,138.0,223.0,0.0,0.0,169.0,0.0,0.0,1.0,?,3.0,0
192,43.0,1.0,4.0,132.0,247.0,1.0,2.0,143.0,1.0,0.1,2.0,?,7.0,1
266,52.0,1.0,4.0,128.0,204.0,1.0,0.0,156.0,1.0,1.0,2.0,0.0,?,2
287,58.0,1.0,2.0,125.0,220.0,0.0,0.0,144.0,0.0,0.4,2.0,?,7.0,0
302,38.0,1.0,3.0,138.0,175.0,0.0,0.0,173.0,0.0,0.0,1.0,?,3.0,0


In [26]:
# Total number of rows in the original dataframe
df_len = hd_df.shape[0]

In [32]:
# Share of rows containing the '?' placeholder, expressed as a percentage
missing_fraction = q_len / df_len
per_MV = missing_fraction * 100
per_MV

1.9801980198019802

The amount of missing data is insignificant (about 2% of the rows), so we can simply drop the affected rows.

We proceed to select the rows with no missing values and store them in a new dataframe.

In [36]:
# Keep only the rows in which neither 'ca' nor 'thal' is the '?' placeholder
# and store them in a new dataframe.
is_incomplete = (hd_df['ca'] == '?') | (hd_df['thal'] == '?')
df_clean = hd_df.loc[~is_incomplete]

In [40]:
# Compare the row count of the original dataframe with that of the cleaned subset
print(df_len)
df_clean.shape[0]

303


297

We observe that the two dataframes differ by 6 rows, which matches the number of rows with missing values (q_len) counted earlier.

In [44]:
# checking to verify whether there are still any missing values ('?') in 'ca'
df_clean['ca'].unique()

array(['0.0', '3.0', '2.0', '1.0'], dtype=object)

In [47]:
# same check for 'thal': the '?' placeholder should no longer appear
df_clean['thal'].unique()

array(['6.0', '3.0', '7.0'], dtype=object)

### 

### 

#

### split data into target and feature

In [67]:
# Features: every column except the target 'hd', stored as a separate copy.
# (equivalent positional form, since 'hd' is the last column:
#  X = df_clean.iloc[:, :-1])
X = df_clean.drop(columns='hd').copy()
X.head()

Unnamed: 0,age,sex,cp,restbp,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0


In [87]:
# Target: a separate copy of the 'hd' column
y = df_clean.loc[:, 'hd'].copy()
y.head()

0    0
1    2
2    1
3    0
4    0
Name: hd, dtype: int64

#### Data types and format for encoding
- age	Feature	Discrete	
- sex	Feature	Categorical	
    - 0 = female
    - 1 = male
- cp	Feature	Categorical	
- restbp	Feature	Discrete	resting blood pressure (on admission to the hospital)	mm Hg
- chol	Feature	Discrete	serum cholesterol
- fbs	Feature	Categorical	fasting blood sugar > 120 mg/dl
    - 0 = false (<= 120 mg/dl)
    - 1 = true (> 120 mg/dl)

- restecg	Feature	Categorical	resting electrocardiographic results
    - 0 = normal
    - 1 = having ST-T wave abnormality
    - 2 = showing probable or definite left ventricular hypertrophy
- thalach	Feature	Discrete	maximum heart rate achieved
- exang	Feature	Categorical	exercise induced angina
    - 0 = no
    - 1 = yes
- oldpeak	Feature	Discrete	ST depression induced by exercise relative to rest
- slope	Feature	Categorical
    - 1= upsloping
    - 2 = flat
    - 3 = downsloping
- ca	Feature	Discrete	number of major vessels (0-3) colored by fluoroscopy
- thal	Feature	Categorical
    - 3 = normal (no cold spots)
    - 6 = fixed defect (cold spots during rest and exercise)
    - 7 = reversible defect (when spots only appear during exercise)
- num	Target	Discrete	diagnosis of heart disease (stored as column `hd` in this notebook)

from source : https://archive-beta.ics.uci.edu/dataset/45/heart+disease

In [89]:
# let's check the datatypes of the feature columns in X
X.dtypes

age        float64
sex        float64
cp         float64
restbp     float64
chol       float64
fbs        float64
restecg    float64
thalach    float64
exang      float64
oldpeak    float64
slope      float64
ca          object
thal        object
dtype: object

Age, restbp, chol and thalach are all float64, which is correct, but we
would like to convert some of the categorical columns into numerical
(one-hot encoded) columns because scikit-learn's decision trees do not natively support categorical data.

In [92]:
# let's start by inspecting the distinct values of a column to determine
# whether it is categorical and needs to be one-hot encoded
X['cp'].unique()

array([1., 4., 3., 2.])

In [94]:
# use pandas' get_dummies for one-hot encoding
# (preview only: the result is displayed, not stored)
pd.get_dummies(X, columns=['cp']).head()

Unnamed: 0,age,sex,restbp,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,cp_1.0,cp_2.0,cp_3.0,cp_4.0
0,63.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,1,0,0,0
1,67.0,1.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,0,0,0,1
2,67.0,1.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,0,0,0,1
3,37.0,1.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0,0,1,0
4,41.0,0.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0,1,0,0


In [97]:
# 'ca' is still an object column of numeric strings ('0.0', '1.0', ...),
# a leftover of the '?' placeholders in the raw file. It is a count, so
# convert it to a numeric dtype rather than passing strings to the model.
X_numeric = X.assign(ca=pd.to_numeric(X['ca']))

# One-hot encode the categorical columns that have more than two levels.
X_encoded = pd.get_dummies(X_numeric, columns=['cp', 'restecg', 'slope', 'thal'])

X_encoded.head()

Unnamed: 0,age,sex,restbp,chol,fbs,thalach,exang,oldpeak,ca,cp_1.0,...,cp_4.0,restecg_0.0,restecg_1.0,restecg_2.0,slope_1.0,slope_2.0,slope_3.0,thal_3.0,thal_6.0,thal_7.0
0,63.0,1.0,145.0,233.0,1.0,150.0,0.0,2.3,0.0,1,...,0,0,0,1,0,0,1,0,1,0
1,67.0,1.0,160.0,286.0,0.0,108.0,1.0,1.5,3.0,0,...,1,0,0,1,0,1,0,1,0,0
2,67.0,1.0,120.0,229.0,0.0,129.0,1.0,2.6,2.0,0,...,1,0,0,1,0,1,0,0,0,1
3,37.0,1.0,130.0,250.0,0.0,187.0,0.0,3.5,0.0,0,...,0,1,0,0,0,0,1,1,0,0
4,41.0,0.0,130.0,204.0,0.0,172.0,0.0,1.4,0.0,0,...,0,0,0,1,1,0,0,1,0,0


In [98]:
# let's check the distinct values of the target variable to verify its categories
y.unique()

array([0, 2, 1, 3, 4])

We see that the target has 5 levels (0-4); however, we're only interested in
whether or not the individual has heart disease, so we collapse levels 1-4 into a single class.

In [102]:
# Collapse the target to two classes: every positive level becomes 1.
has_disease = y.gt(0)   # boolean mask, True where y > 0
y.loc[has_disease] = 1
y.unique()

array([0, 1])