##### Author: Ahmed Badra
##### Topic: CART decision tree implementation
> Note: the implementation lives in ```cart.py```; it is kept in a separate file to make development in the notebook easier.

------

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# from imblearn.over_sampling import SMOTE
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
from sklearn.model_selection import train_test_split
import math
from sklearn.metrics import f1_score
from cart import CART

# preparing the data

In [2]:
# Load cardio_train.csv; the file uses ';' as its field separator.
data = pd.read_csv("cardio_train.csv", sep=";")

In [3]:
# Preview the frame exactly as loaded, before any column is dropped.
data

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,19240,2,168,76.0,120,80,1,1,1,0,1,0
69996,99995,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,99996,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,99998,22431,1,163,72.0,135,80,1,2,0,0,0,1


In [4]:
# Count missing values per column.
data.isna().sum()

id             0
age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
dtype: int64

In [5]:
# Remove the `id` column (presumably a row identifier rather than a feature).
# errors="ignore" keeps this cell safe to re-run: the cell rebinds `data`, so
# a second execution would otherwise raise KeyError because `id` is already gone.
data = data.drop(columns=["id"], errors="ignore")

In [6]:
# Inspect the column dtypes; the frame is converted to NumPy arrays further down.
data.dtypes

age              int64
gender           int64
height           int64
weight         float64
ap_hi            int64
ap_lo            int64
cholesterol      int64
gluc             int64
smoke            int64
alco             int64
active           int64
cardio           int64
dtype: object

In [7]:
# Features are every column except the last; the last column is the target.
X, y = data.iloc[:, :-1], data.iloc[:, -1]

# Hold out 10% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_tst, y_train, y_tst = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
)

In [8]:
# Shape of the training features: (rows, feature columns).
X_train.shape

(63000, 11)

In [9]:
# Shape of the held-out test features: (rows, feature columns).
X_tst.shape

(7000, 11)

In [10]:
# Convert the training features to a 2-D NumPy array (this is what gets passed
# to CART.fit below). np.asarray replaces `.to_numpy().reshape(-1, 11)`:
#   * the frame is already 2-D, so the reshape was a no-op that only
#     hard-coded the column count;
#   * np.asarray also accepts an ndarray, so re-running this cell no longer
#     fails with AttributeError after X_train has been rebound to an array.
X_train = np.asarray(X_train)

In [11]:
# Display the training features after conversion to a NumPy array.
X_train

array([[1.5938e+04, 1.0000e+00, 1.7100e+02, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       [1.8038e+04, 1.0000e+00, 1.7500e+02, ..., 0.0000e+00, 1.0000e+00,
        1.0000e+00],
       [2.2793e+04, 1.0000e+00, 1.6500e+02, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       ...,
       [2.2385e+04, 2.0000e+00, 1.7400e+02, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       [1.8321e+04, 1.0000e+00, 1.5300e+02, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       [2.0424e+04, 2.0000e+00, 1.6900e+02, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00]])

In [12]:
# Convert the test features to a 2-D NumPy array (this is what gets passed to
# predict below). np.asarray replaces `.to_numpy().reshape(-1, 11)`:
#   * the frame is already 2-D, so the reshape was a no-op that only
#     hard-coded the column count;
#   * np.asarray also accepts an ndarray, so re-running this cell no longer
#     fails with AttributeError after X_tst has been rebound to an array.
X_tst = np.asarray(X_tst)

In [13]:
# Convert the training labels to a NumPy column vector of shape (n, 1).
# np.asarray accepts both the original Series and an already-converted
# ndarray, so this cell can be re-run without raising AttributeError.
y_train = np.asarray(y_train).reshape(-1, 1)

In [14]:
# Convert the test labels to a NumPy column vector of shape (n, 1).
# np.asarray accepts both the original Series and an already-converted
# ndarray, so this cell can be re-run without raising AttributeError.
y_tst = np.asarray(y_tst).reshape(-1, 1)

# evaluation with testing data

### Gini impurity

training the Model

In [15]:
# Build the project's CART tree (imported from cart.py) with max_depth=10.
# metric="gini" presumably selects Gini impurity as the split criterion —
# confirm against cart.py.
clf = CART(max_depth=10, metric = "gini")
# Train on the NumPy training features and labels.
clf.fit(X_train, y_train)

In [16]:
# Predict labels for the held-out test features with the Gini-based tree.
y_predicted = clf.predict(X_tst)

getting the F1 score for y_predicted and y_actual of the testing data using this model (with gini)

In [17]:
# F1 on the test split. y_tst is stored as an (n, 1) column, so flatten it
# to 1-D before scoring.
# NOTE: despite its name, f1_err holds an F1 *score* (higher is better).
f1_err = f1_score(y_tst.reshape(-1), y_predicted)
print(f1_err)

0.7045997610513739


checking the F1 score on the training data

In [20]:
# Predict on the training features to compare against the test-set score.
# This rebinds y_predicted, replacing the test-set predictions held before.
y_predicted = clf.predict(X_train)

In [21]:
# F1 on the training split. y_train is stored as an (n, 1) column, so
# flatten it to 1-D before scoring.
# NOTE: despite its name, f1_err holds an F1 *score* (higher is better).
f1_err = f1_score(y_train.reshape(-1), y_predicted)
print(f1_err)

0.7446972555324072


### Entropy

training the Model

In [22]:
# Build a second CART tree with the same max_depth=10 for comparison.
# metric="entropy" presumably selects entropy as the split criterion —
# confirm against cart.py.
clf_en = CART(max_depth=10, metric = "entropy")
# Train on the same training features and labels as the Gini model.
clf_en.fit(X_train, y_train)

getting the F1 score for y_predicted and y_actual of the testing data using this model (with entropy)

In [23]:
# Predict labels for the held-out test features with the entropy-based tree.
# This rebinds y_predicted, replacing the predictions from the Gini model.
y_predicted = clf_en.predict(X_tst)

In [24]:
# F1 on the test split for the entropy model. y_tst is stored as an (n, 1)
# column, so flatten it to 1-D before scoring.
# NOTE: despite its name, f1_err holds an F1 *score* (higher is better).
f1_err = f1_score(y_tst.reshape(-1), y_predicted)
print(f1_err)

0.7094963816275291


checking the F1 score on the training data

In [25]:
# Predict on the training features with the entropy-based tree.
# This rebinds y_predicted, replacing the test-set predictions held before.
y_predicted = clf_en.predict(X_train)

In [26]:
# F1 on the training split for the entropy model. y_train is stored as an
# (n, 1) column, so flatten it to 1-D before scoring.
# NOTE: despite its name, f1_err holds an F1 *score* (higher is better).
f1_err = f1_score(y_train.reshape(-1), y_predicted)
print(f1_err)

0.7431501831501832
