#### Load the necessary libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.impute import SimpleImputer

#### Load the dataset

In [2]:
# Read the dataset from heart.csv (path is relative to the working directory).
data = pd.read_csv('heart.csv')

# Preview the first few rows to check that the columns were parsed as expected.
data.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


#### Check for missing values

In [3]:
# Count the missing (NaN) entries in each column.
data.isnull().sum()

male                 0
age                  0
education          105
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

***We can see that there are several null values in the dataset. We will replace the null values with the mean values.***

#### Define the Simple Imputer

In [4]:
# Mean imputation: each NaN is replaced by the mean of its column.
imputer = SimpleImputer(
    missing_values=np.nan,
    strategy='mean',
)

##### Fit the imputer to the data (compute the column means)

In [5]:
# Learn the per-column statistics that will be used to fill missing values.
# NOTE(review): the statistics are computed on the whole frame, which includes
# the rows later held out for testing; consider fitting on the training rows only.
imputer.fit(data)

##### Replace the NaN values with the mean values

In [6]:
# Apply the fitted imputer; the filled-in result is stored under a new name so
# the original `data` frame stays available unchanged.
imputed_data = imputer.transform(data)

##### Split the data into features and targets

In [7]:
# The last column (TenYearCHD) is the label; every other column is a feature.
x = imputed_data[:, :-1]
y = imputed_data[:, -1].astype('int')  # cast the labels to integers

y

array([0, 0, 0, ..., 0, 0, 0])

#### Split the dataset into testing and training sets

In [8]:
# Hold out the final 20% of the rows for testing; the rows keep their original
# order (no shuffling), so the first 80% become the training set.
train_size = int(0.8 * len(x))
test_size = len(x) - train_size

x_train, y_train = x[:train_size], y[:train_size]
x_test, y_test = x[train_size:], y[train_size:]

len(x_train)

3390

#### Define the classifier

In [9]:
# Use entropy (information gain) as the splitting criterion.
# random_state is fixed so that repeated runs build the same tree; without it,
# ties between equally good splits are broken randomly and the accuracy can
# change from one run to the next.
classifier = tree.DecisionTreeClassifier(criterion = 'entropy', random_state = 42)

#### Train the model

In [10]:
# Build the entropy-based tree from the training rows only.
classifier.fit(x_train, y_train)

#### Get the prediction

In [11]:
# Predict labels for the held-out test rows.
prediction = classifier.predict(x_test)

### Calculate the Accuracy

In [12]:
def calculate_accuracy(y, y_hat):
    """Print the percentage of predictions that match the true labels.

    Parameters
    ----------
    y : sequence
        True labels.
    y_hat : sequence
        Predicted labels, in the same order as ``y``.

    Returns
    -------
    None
        The accuracy is printed as ``Accuracy = <value> %``.

    Raises
    ------
    ValueError
        If ``y`` is empty, or if ``y`` and ``y_hat`` differ in length.
    """
    m = len(y)
    if m == 0:
        # An empty label set would otherwise fail with a ZeroDivisionError below.
        raise ValueError("Cannot compute accuracy: no labels were given")
    if len(y_hat) != m:
        # A longer y_hat would be silently truncated and a shorter one would
        # fail part-way through, so reject mismatched inputs up front.
        raise ValueError(
            "Cannot compute accuracy: got %d labels but %d predictions"
            % (m, len(y_hat))
        )

    # Compare the two sequences position by position and count the matches.
    count = sum(1 for actual, predicted in zip(y, y_hat) if actual == predicted)

    accuracy = count / m * 100
    print("Accuracy =", accuracy, "%")

In [13]:
# Accuracy of the entropy-based tree on the held-out test rows.
calculate_accuracy(y_test, prediction)

Accuracy = 74.64622641509435 %


***Therefore, using Entropy for gain calculation, we got an accuracy of around 75%***

## Changing the Hyperparameters

### Using Gini as Splitting Criterion

In [14]:
# Same kind of tree, but with Gini impurity as the splitting criterion.
# random_state is fixed so that repeated runs build the same tree and the
# reported accuracy is reproducible.
classifier = tree.DecisionTreeClassifier(criterion = 'gini', random_state = 42)

#### Train the model

In [15]:
# Refit on the same training rows, now with the Gini-based tree.
classifier.fit(x_train, y_train)

#### Get the predictions

In [16]:
# Overwrite the earlier predictions with those of the Gini-based tree.
prediction = classifier.predict(x_test)

#### Calculate the accuracy

In [17]:
# Accuracy of the Gini-based tree on the held-out test rows.
calculate_accuracy(y_test, prediction)

Accuracy = 73.23113207547169 %


***We got an accuracy of around 73% using Gini as the splitting criterion***

### Changing the maximum depth

In [18]:
# Entropy-based tree whose depth is capped at 2 levels (the earlier trees were
# grown without a depth limit).
# random_state is fixed so that repeated runs build the same tree and the
# reported accuracy is reproducible.
classifier = tree.DecisionTreeClassifier(criterion = 'entropy', max_depth = 2, random_state = 42)

#### Train the model

In [19]:
# Refit on the same training rows, now with the depth-limited tree.
classifier.fit(x_train, y_train)

#### Get the predictions

In [20]:
# Overwrite the earlier predictions with those of the depth-limited tree.
prediction = classifier.predict(x_test)

#### Calculate the accuracy

In [21]:
# Accuracy of the depth-limited tree on the held-out test rows.
calculate_accuracy(y_test, prediction)

Accuracy = 85.02358490566037 %


***Amazingly, on limiting the maximum depth to 2, the accuracy of our model rises to about 85%***

### Changing the minimum number of samples per leaf

In [22]:
# Entropy-based tree in which every leaf must contain at least 20 training samples.
# random_state is fixed so that repeated runs build the same tree and the
# reported accuracy is reproducible.
classifier = tree.DecisionTreeClassifier(criterion = 'entropy', min_samples_leaf = 20, random_state = 42)

#### Train the model

In [23]:
# Refit on the same training rows, now with the minimum-leaf-size constraint.
classifier.fit(x_train, y_train)

#### Get the predictions

In [24]:
# Overwrite the earlier predictions with those of the leaf-size-constrained tree.
prediction = classifier.predict(x_test)

#### Calculate the accuracy

In [25]:
# Accuracy of the leaf-size-constrained tree on the held-out test rows.
calculate_accuracy(y_test, prediction)

Accuracy = 82.66509433962264 %


***On setting the minimum number of samples per leaf to 20, we notice that our accuracy has become around 83%***