In [None]:
# if a virtual env is created, ignore this step
%pip install -r requirements.txt

In [139]:
# this script used to generate requirements.txt 
%pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.


## Dataset preparation

For this assignment, we use a heart disease dataset, with patients' symtoms as attributes, and whether they have heart disease or not as labels.

- age
- sex
- chest pain type (4 values from 0 -> 3)
- resting blood pressure
- serum cholestoral in mg/dl
- fasting blood sugar > 120 mg/dl: 1 -> diabete, 0 -> normal
- resting electrocardiographic results (values 0, 1, 2)
- maximum heart rate achieved
- exercise induced angina
- oldpeak = ST depression induced by exercise relative to rest
- the slope of the peak exercise ST segment
- number of major vessels (0-3) colored by flourosopy
- thal: 0 = normal; 1 = fixed defect; 2 = reversable defect

In [2]:
'''Run this code if you didn't have the dataset installed'''
import kagglehub

# Download latest version
path = kagglehub.dataset_download("johnsmith88/heart-disease-dataset")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/johnsmith88/heart-disease-dataset?dataset_version_number=2...


100%|██████████| 6.18k/6.18k [00:00<00:00, 2.86MB/s]

Extracting files...
Path to dataset files: C:\Users\PC\.cache\kagglehub\datasets\johnsmith88\heart-disease-dataset\versions\2





In [15]:
import pandas as pd
import os

In [22]:
df = pd.read_csv(os.path.join(path, 'heart.csv'))

In [23]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB


In [29]:
# inspect value each column 's metadata
for col in df.columns:
    print(df[col].value_counts())

age
58    68
57    57
54    53
59    46
52    43
56    39
51    39
62    37
60    37
44    36
64    34
41    32
63    32
61    31
67    31
55    30
65    27
42    26
43    26
53    26
45    25
66    25
46    23
48    23
50    21
47    18
49    17
35    15
70    14
39    14
38    12
68    12
71    11
40    11
69     9
34     6
37     6
29     4
76     3
77     3
74     3
Name: count, dtype: int64
sex
1    713
0    312
Name: count, dtype: int64
cp
0    497
2    284
1    167
3     77
Name: count, dtype: int64
trestbps
120    128
130    123
140    107
110     64
150     55
138     45
128     39
125     38
160     36
112     30
132     28
118     24
108     21
124     20
135     20
152     17
145     17
134     17
170     15
122     14
100     14
136     11
180     10
126     10
142      9
115      9
105      9
146      8
148      7
94       7
178      7
102      6
144      6
165      4
200      4
114      4
154      4
117      4
123      4
104      3
106      3
174      3
129      3
192   

In [31]:
# see if there's any null value
df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [34]:
# inspect the 'target' class
# postive: 526, negative: 499
df['target'].value_counts()

target
1    526
0    499
Name: count, dtype: int64

In [41]:
# separate the columns into inputs and outputs
X = df.drop(['target'], axis=1) # specify axis=1 so that pd knows that target is a col name
y = df['target']

In [42]:
# split the dataset into training set and test set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, shuffle=True)

In [44]:
# check for number of data points in training/test set
print(X_train.count()[0])

768


  print(X_train.count()[0])


## Feature Engineering

In [45]:
X_train.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
949,57,1,0,132,207,0,1,168,1,0.0,2,0,3
1021,60,1,0,125,258,0,0,141,1,2.8,1,1,3
94,62,1,1,128,208,1,0,140,0,0.0,2,0,2
33,70,1,2,160,269,0,1,112,1,2.9,1,1,3
259,66,1,0,120,302,0,0,151,0,0.4,1,0,2


We can see that the data doesn't need any further encoding (ordial categorial datatypes of attributes like cp or restecg has already been converted into integer)

## Decision Tree Classifier

In [94]:
from sklearn.tree import DecisionTreeClassifier

### Decision Tree with Entropy criterion

In [95]:
entropy_dtree = DecisionTreeClassifier(criterion="entropy", max_depth=5, random_state=42)
entropy_dtree.fit(X_train, y_train)

In [96]:
y_pred_test_entropy = entropy_dtree.predict(X_test)

In [97]:
# predict y_pred_train to check for overfitting
y_pred_train_entropy = entropy_dtree.predict(X_train)

### Decision Tree with Gini criterion

In [98]:
gini_dtree = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=42)
gini_dtree.fit(X_train, y_train)

In [99]:
y_pred_test_gini = gini_dtree.predict(X_test)

In [100]:
# predict y_pred_train to check for overfitting
y_pred_train_gini = gini_dtree.predict(X_train)

## Accuracy Checking

In [101]:
from sklearn.metrics import accuracy_score

In [102]:
print(f"GINI CRITERION -----------------------")
print(f"Training set accuracy: {accuracy_score(y_pred_test_gini, y_test)}")
print(f"Test set accuracy score: {accuracy_score(y_pred_train_gini, y_train)}")

GINI CRITERION -----------------------
Training set accuracy: 0.8249027237354085
Test set accuracy score: 0.921875


In [130]:
print(X_train.iloc[0])

age          57.0
sex           1.0
cp            0.0
trestbps    132.0
chol        207.0
fbs           0.0
restecg       1.0
thalach     168.0
exang         1.0
oldpeak       0.0
slope         2.0
ca            0.0
thal          3.0
Name: 949, dtype: float64


In [None]:
from sklearn.tree import plot_tree
from matplotlib import pyplot as plt

plt.figure(figsize=(30, 20))
feature_names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']
plot_tree(decision_tree=gini_dtree, fontsize=10, feature_names=feature_names, class_names=['0', '1'], precision=2, rounded=True)

In [103]:
print("ENTROPY CRITERION ----------------------")
print(f"Training set accuracy: {accuracy_score(y_pred_test_entropy, y_test)}")
print(f"Test set accuracy score: {accuracy_score(y_pred_train_entropy, y_train)}")

ENTROPY CRITERION ----------------------
Training set accuracy: 0.8560311284046692
Test set accuracy score: 0.9205729166666666


In [None]:
plt.figure(figsize=(30, 20))
feature_names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']
plot_tree(decision_tree=entropy_dtree, fontsize=10, feature_names=feature_names, class_names=['0', '1'], precision=2, rounded=True)