In [1]:
import pandas as pd
import numpy as np
import scipy.optimize as opt

In [2]:
class Leaf:
    def __init__(self,value):
        self.value = value

In [3]:
class Node:
    def __init__(self,branches,attribute,threshold):
        self.branches = branches
        self.threshold = threshold
        self.attribute = attribute
        
    def get(self,df):
        return self.branches[0] if df[self.attribute] < self.threshold else self.branches[1]
        

In [4]:
class Tree:
    def __init__(self,root):
        self.root = root
        
    def predict(self,x):
        item = self.root
        while isinstance(item,Node):
            item = item.get(x)
        return item

In [5]:
r=Node([Leaf('young'),Leaf('old')],"age",18)
t=Tree(r)
print(t.predict({"age":2}).value)

young


In [6]:
df=pd.read_csv("iris.csv")

In [7]:
print(t.predict({"age":20}).value)

old


In [8]:
class CART:
    def __init__(self,df,y_name,X_names):
        self.df = df
        self.y_name = y_name
        self.X_names = X_names
        self.tree = None
        
    def create_tree(self):
        root = self._node_or_leaf(self.df)
        self.tree = Tree(root)
        return self.tree
    
    def _gini_impurity(self, df):
        unique, counts = np.unique(df[self.y_name].values, return_counts=True)
        N = df[self.y_name].values.ravel().size
        p = counts/N
        #print(unique)
        #print(p)
        return 1. - np.sum(p**2)
    
    def _opt_fun(self,df,split_name):
        df1 = df[df[split_name]<split_threshold]
        
    def _node_or_leaf(self,df,loss_parent=0.99):
        loss_best, split_df, split_threshold, split_name = self._loss_best(df)
        print(f"Computed split:\nloss: {loss_best:.2f} (parent: {loss_parent:.2f})\nattribute: {split_name}\nthreshold: {split_threshold}\ncount: {[len(df_.index) for df_ in split_df]}")
        if loss_best < loss_parent and loss_best > 0.:
            branches = []
            for i in range(2):
                branches.append(self._node_or_leaf(split_df[i],loss_parent=loss_best))
            item = Node(branches,split_name,split_threshold)
            print(f"\n * creating Node({split_name}, {split_threshold})")
        else:
            value = np.unique(df[self.y_name])[0]
            item = Leaf(value)
            print(f"\n * creating Leaf({value})")
        return item
    
    def _loss_best(self,df):
        loss0 = 10
        for name in self.X_names:
            split_threshold_ = np.median(df[name].values)
            split_df_ = [df[df[name]<split_threshold_],
                        df[df[name]>=split_threshold_]]
            loss = self._loss(split_df_[0]) + self._loss(split_df_[1])
            if loss < loss0:
                loss0 = loss
                split_threshold = split_threshold_
                split_df = split_df_
                split_name = name
                
        #print(loss0)
                
        return loss0, split_df, split_threshold, split_name
    
    def _loss(self,df):
        return self._gini_impurity(df)
            
        
        

In [9]:
df.columns
X_names=["sepal_length","sepal_width"]
df[X_names]

Unnamed: 0,sepal_length,sepal_width
0,5.1,3.5
1,4.9,3.0
2,4.7,3.2
3,4.6,3.1
4,5.0,3.6
...,...,...
145,6.7,3.0
146,6.3,2.5
147,6.5,3.0
148,6.2,3.4


In [10]:
df.iloc[0]

sepal_length       5.1
sepal_width        3.5
petal_length       1.4
petal_width        0.2
species         setosa
Name: 0, dtype: object

In [11]:
c = CART(df,"species",X_names)
c.create_tree()

Computed split:
loss: 0.95 (parent: 0.99)
attribute: sepal_length
threshold: 5.8
count: [73, 77]
Computed split:
loss: 0.54 (parent: 0.95)
attribute: sepal_width
threshold: 3.2
count: [36, 37]
Computed split:
loss: 0.67 (parent: 0.54)
attribute: sepal_length
threshold: 5.300000000000001
count: [18, 18]

 * creating Leaf(setosa)
Computed split:
loss: 0.00 (parent: 0.54)
attribute: sepal_length
threshold: 5.1
count: [16, 21]

 * creating Leaf(setosa)

 * creating Node(sepal_width, 3.2)
Computed split:
loss: 0.91 (parent: 0.95)
attribute: sepal_length
threshold: 6.4
count: [35, 42]
Computed split:
loss: 1.03 (parent: 0.91)
attribute: sepal_width
threshold: 2.8
count: [14, 21]

 * creating Leaf(setosa)
Computed split:
loss: 0.80 (parent: 0.91)
attribute: sepal_length
threshold: 6.7
count: [14, 28]
Computed split:
loss: 0.88 (parent: 0.80)
attribute: sepal_width
threshold: 3.0
count: [6, 8]

 * creating Leaf(versicolor)
Computed split:
loss: 0.65 (parent: 0.80)
attribute: sepal_width
thresh

<__main__.Tree at 0x3eca268>

In [12]:
c.tree.predict(df.iloc[0]).value

'setosa'