In [75]:
import sys
import os
sys.path.append(os.path.abspath(".."))

import pandas as pd
import numpy as np
import json
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier as skClassifier
from Scratch.DecisionTreeClassifier import DecisionTreeClassifier 
from Scratch.DataEncoders import CategoricalEncoder
from sklearn.preprocessing import OrdinalEncoder

### Reading Data 

In [86]:
# Load the stroke dataset (path is relative to the notebook's folder) and preview the first rows.
stroke_csv_path = "../datasets/stroke.csv"
data = pd.read_csv(stroke_csv_path)
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,No,Yes,Yes,Private,Urban,228.69,36.6,formerly smoked,Yes
1,Male,80.0,No,Yes,Yes,Private,Rural,105.92,32.5,never smoked,Yes
2,Female,49.0,No,No,Yes,Private,Urban,171.23,34.4,smokes,Yes
3,Female,79.0,Yes,No,Yes,Self-employed,Rural,174.12,24.0,never smoked,Yes
4,Male,81.0,No,No,Yes,Private,Urban,186.21,29.0,formerly smoked,Yes


### Encoding Data (choose one cell)

---------------------

In [77]:
# Option A: encode the categorical attributes with the scratch CategoricalEncoder.
cat_atrbs = [
    'gender',
    'hypertension',
    'heart_disease',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
]
enc = CategoricalEncoder()
data_enc = enc.fit_transform_df(data, cat_atrbs)
obj = data_enc['stroke']  # objective column, kept apart for the stratified split done later
data_enc.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,1,2,1,228.69,36.6,0,Yes
1,1,80.0,0,1,1,2,0,105.92,32.5,1,Yes
2,0,49.0,0,0,1,2,1,171.23,34.4,2,Yes
3,0,79.0,1,0,1,3,0,174.12,24.0,1,Yes
4,1,81.0,0,0,1,2,1,186.21,29.0,0,Yes


In [87]:
# Option B: encode the categorical attributes with sklearn's OrdinalEncoder.
# This cell defines everything it needs (cat_atrbs, obj), so it can be chosen on its own
# from a fresh kernel without running the CategoricalEncoder cell first.
cat_atrbs = ['gender','hypertension','heart_disease','ever_married','work_type','Residence_type','smoking_status']
data_enc = data.copy()  # work on a copy so the raw frame stays untouched
enc = OrdinalEncoder()
data_enc[cat_atrbs] = enc.fit_transform(data[cat_atrbs])
obj = data_enc['stroke']  # objective column; it is not encoded, only the attributes are
# The train/test split is not done here: the dedicated split cell further down performs it
# for whichever encoding was chosen, so doing it here as well would be redundant.
data_enc.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1.0,67.0,0.0,1.0,1.0,2.0,1.0,228.69,36.6,0.0,Yes
1,1.0,80.0,0.0,1.0,1.0,2.0,0.0,105.92,32.5,1.0,Yes
2,0.0,49.0,0.0,0.0,1.0,2.0,1.0,171.23,34.4,2.0,Yes
3,0.0,79.0,1.0,0.0,1.0,3.0,0.0,174.12,24.0,1.0,Yes
4,1.0,81.0,0.0,0.0,1.0,2.0,1.0,186.21,29.0,0.0,Yes


--------------------------------------------

In [88]:
# Using sklearn's implementation for splitting the data (coding a splitter from scratch is out of
# scope right now), then re-attaching the objective column so the scratch classifier, whose fit()
# takes a single frame plus the objective column name, can also work with it.
SPLIT_SEED = 42  # fixed seed so Restart & Run All reproduces the same split and the same metrics
X_train, X_test, y_train, y_test = train_test_split(
    data_enc.iloc[:, 0:-1],  # every column except the last one ('stroke')
    obj,
    test_size=0.2,
    stratify=obj,  # keep the class proportions equal in both partitions
    random_state=SPLIT_SEED,
)
train = X_train.copy()
train['stroke'] = y_train
test = X_test.copy()
test['stroke'] = y_test

### With my implementations

#### Training the tree

In [89]:
# Fit the scratch tree: fit() receives the training frame and the name of the objective column.
clsf = DecisionTreeClassifier(max_depth=4)
fitted_tree = clsf.fit(train, 'stroke')
# The value returned by fit() is dumped as indented JSON so the nested nodes are easy to read.
print(json.dumps(fitted_tree, indent=2))

{
  "leaf": false,
  "column": "age",
  "th": 67.5,
  "left": {
    "leaf": false,
    "column": "age",
    "th": 47.5,
    "left": {
      "leaf": false,
      "column": "bmi",
      "th": 56.3,
      "left": {
        "leaf": false,
        "column": "age",
        "th": 38.5,
        "left": {
          "leaf": true,
          "class": "No"
        },
        "right": {
          "leaf": true,
          "class": "No"
        }
      },
      "right": {
        "leaf": false,
        "column": "avg_glucose_level",
        "th": 217.29,
        "left": {
          "leaf": true,
          "class": "No"
        },
        "right": {
          "leaf": true,
          "class": "Yes"
        }
      }
    },
    "right": {
      "leaf": false,
      "column": "avg_glucose_level",
      "th": 103.4,
      "left": {
        "leaf": false,
        "column": "age",
        "th": 60.5,
        "left": {
          "leaf": true,
          "class": "No"
        },
        "right": {
          "lea

#### Predictions and accuracy

In [90]:
# Predict the whole test frame with the scratch tree and store the result as a numpy array.
scratch_output = clsf.predict_all(test)
preds = np.array(scratch_output)


In [91]:
# Score the scratch tree's predictions against the held-out labels. Not bad :)
acc_scratch = clsf.accuracy(y_test, preds)
acc_scratch

0.9445255474452555

### With sklearn's implementations

#### Training the tree

In [92]:
# Same depth limit as the scratch tree so that both models are comparable.
clsf2 = skClassifier(max_depth=4).fit(X_train, y_train)  # fit() returns the fitted estimator itself
clsf2

#### Predictions and accuracy

In [93]:
# Predict the test attributes with sklearn's tree and store the result as a numpy array.
sklearn_output = clsf2.predict(X_test)
preds2 = np.array(sklearn_output)

In [94]:
# Score sklearn's predictions with the same accuracy() helper used for the scratch tree.
acc_sklearn = clsf.accuracy(y_test, preds2)
acc_sklearn

0.9445255474452555

In [95]:
# Show the scratch predictions that differ from sklearn's; an empty array means full agreement.
disagreement = preds != preds2
preds[disagreement]

array([], dtype='<U3')

### Conclusions

When the same encoder is used for both, my implementation and sklearn's give exactly the same results, though sklearn's is far faster. Also, my implementation of the encoder is good enough that, even when a different encoding is used for each implementation, the results differ in only a very small percentage of examples. Altogether, I'm very happy with my implementations of both the CategoricalEncoder and the DecisionTreeClassifier.

Implementing the CART regressor from here is trivial, since only a few changes are needed. For the regressor, instead of a "class", each leaf should return the mean of the objective values of the examples that fall into it. Also, variance should be used as the impurity metric instead of Gini, and a max depth should be mandatory in order to avoid infinite recursion.