## Decision Tree

In [20]:
%matplotlib notebook
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [21]:
# Read data 
data = pd.read_csv('./cardio_train.csv',sep=";")
data 

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,19240,2,168,76.0,120,80,1,1,1,0,1,0
69996,99995,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,99996,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,99998,22431,1,163,72.0,135,80,1,2,0,0,0,1


Preprocessing

In [22]:
# 1.  Check if there is any missing data 
data.isnull().values.any()

False

In [23]:
# 2. convert age from days to years
data['age'] = round(data['age'] /365)
data


Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,50.0,2,168,62.0,110,80,1,1,0,0,1,0
1,1,55.0,1,156,85.0,140,90,3,1,0,0,1,1
2,2,52.0,1,165,64.0,130,70,3,1,0,0,0,1
3,3,48.0,2,169,82.0,150,100,1,1,0,0,1,1
4,4,48.0,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,53.0,2,168,76.0,120,80,1,1,1,0,1,0
69996,99995,62.0,1,158,126.0,140,90,2,2,0,0,1,1
69997,99996,52.0,2,183,105.0,180,90,3,1,0,1,0,1
69998,99998,61.0,1,163,72.0,135,80,1,2,0,0,0,1


In [24]:
# Know more about data to deal with it and check for outliers
data.describe()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,49972.4199,53.338686,1.349571,164.359229,74.20569,128.817286,96.630414,1.366871,1.226457,0.088129,0.053771,0.803729,0.4997
std,28851.302323,6.765294,0.476838,8.210126,14.395757,154.011419,188.47253,0.68025,0.57227,0.283484,0.225568,0.397179,0.500003
min,0.0,30.0,1.0,55.0,10.0,-150.0,-70.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,25006.75,48.0,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
50%,50001.5,54.0,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
75%,74889.25,58.0,2.0,170.0,82.0,140.0,90.0,2.0,1.0,0.0,0.0,1.0,1.0
max,99999.0,65.0,2.0,250.0,200.0,16020.0,11000.0,3.0,3.0,1.0,1.0,1.0,1.0


In [25]:
# There is an error in data in weight and height --> min age is 30 and min weight is 10 and min height is 55 aslo max height is 250 and max weight is 200
# So, We should remove outliers by dropping all rows that contains outliers
data.drop(data[(data['height'] > data['height'].quantile(0.975)) | (data['height'] < data['height'].quantile(0.025))].index,inplace=True)
data.drop(data[(data['weight'] > data['weight'].quantile(0.975)) | (data['weight'] < data['weight'].quantile(0.025))].index,inplace=True) 

In [26]:
# There is also an error in ap_hi and ap_io --> They contain negative value and maximum values 16020 and 11000 respectively which is uncredible.
# So we need to remove all these values that considered as outliers
data.drop(data[(data['ap_hi'] > data['ap_hi'].quantile(0.975)) | (data['ap_hi'] < data['ap_hi'].quantile(0.025))].index,inplace=True)
data.drop(data[(data['ap_lo'] > data['ap_lo'].quantile(0.975)) | (data['ap_lo'] < data['ap_lo'].quantile(0.025))].index,inplace=True)

In [27]:
# Remove duplicates if any
data.drop_duplicates(inplace=True)

In [28]:
# Make copy of data
ndata = data.copy()

# Insert new feature bmi calculated by equation using weight and height
ndata.insert(3,'bmi', round(data['weight']/((data['height']/100)**2),2))

# categories bmi and set rangess (0 --> Underweight), (1 --> Ideal), (2 --> Overweight), (3 --> Obesity) 
ndata['bmi'] = pd.cut(ndata['bmi'], bins = [0, 18.5, 25,30,10000], labels = [0, 1, 2, 3])

# drop feature that are not used
ndata.drop(['id','height', 'weight'],axis=1, inplace=True)

ndata

Unnamed: 0,age,gender,bmi,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,50.0,2,1,110,80,1,1,0,0,1,0
1,55.0,1,3,140,90,3,1,0,0,1,1
2,52.0,1,1,130,70,3,1,0,0,0,1
3,48.0,2,2,150,100,1,1,0,0,1,1
4,48.0,1,1,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
69993,54.0,1,1,130,90,1,1,0,0,1,1
69994,58.0,1,2,150,80,1,1,0,0,1,1
69995,53.0,2,2,120,80,1,1,1,0,1,0
69998,61.0,1,2,135,80,1,2,0,0,0,1


In [29]:
# categories age and set rangess (0 --> 0-5), (1 --> 5-10), (2 --> 10-15), (3 --> 15-20), (4 --> 20-25), (5 --> 25-30), (6 --> 30-35), (7 --> 35-40), (8 --> 40-45)
# (9 --> 45-50), (10 --> 50-55), (11 --> 55-60), (12 --> 60-65), (13 --> 65-70), (14 --> 70-75), (15 --> 75-80), (16 --> 80-85), (17 --> 85-90), (18 --> 90-95), (19 --> 950-100)
ndata['age'] = pd.cut(ndata['age'], bins = [0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,90,95,100], labels = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19])
ndata

def BPCategorize(x,y):
    if x<=120 and y<=80:
        return 0
    elif x<=129 and y<=80:
        return 1
    elif x<=139 or y<=89:
        return 2
    elif x<=180 or y<=120:
        return 3
    elif x>180 or y>120:
        return 4
    else:
        return None

# (normal --> 0), (elevated --> 1), (high 1 --> 2), (high 2 --> 3), (high 3 --> 4)
ndata.insert(4, "bp_cat", ndata.apply(lambda row: BPCategorize(row['ap_hi'], row['ap_lo']), axis=1))
ndata.drop(['ap_hi', 'ap_lo'],axis=1, inplace=True)
ndata

Unnamed: 0,age,gender,bmi,bp_cat,cholesterol,gluc,smoke,alco,active,cardio
0,9,2,1,0,1,1,0,0,1,0
1,10,1,3,3,3,1,0,0,1,1
2,10,1,1,2,3,1,0,0,0,1
3,9,2,2,3,1,1,0,0,1,1
4,9,1,1,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
69993,10,1,1,2,1,1,0,0,1,1
69994,11,1,2,2,1,1,0,0,1,1
69995,10,2,2,0,1,1,1,0,1,0
69998,12,1,2,2,1,2,0,0,0,1


Model

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X = ndata.iloc[:, 0:9]
y = ndata.iloc[:, 9:10]

# Splitting the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.7 ,random_state = 0)

Using Built In Function

In [31]:
entropy = DecisionTreeClassifier(criterion = "entropy", random_state = 0, max_depth= 9)

In [32]:
entropy.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=9, random_state=0)

In [33]:
train_score = entropy.score(X_train, y_train)
print("Train score accuracy : " , round(train_score*100,2), "%", " --> Using Built In Function.")

Train score accuracy :  72.76 %  --> Using Built In Function.


In [34]:
test_score = entropy.score(X_test, y_test)
print("Test score accuracy : " , round(test_score*100,2), "%", " --> Using Built In Function.")

Test score accuracy :  71.18 %  --> Using Built In Function.


Using My Implementation

In [35]:
from DecisionTree import *

tree = DecisionTreeClassifier()

In [36]:
tree.fit(X_train, y_train)

In [37]:
train_accuracy = tree.accuracy_score(X_train, y_train)
print("Train score accuracy : " , round(train_accuracy*100,2), "%", " --> Using My Implementation.")

Train score accuracy :  70.52 %  --> Using My Implementation.


In [38]:
test_accuracy = tree.accuracy_score(X_test, y_test)
print("Test score accuracy : " , round(test_accuracy*100,2), "%"," --> Using My Implementation.")

Test score accuracy :  70.87 %  --> Using My Implementation.
