In [1]:
!pip install ipynb ipynb



In [2]:

from ipynb.fs.defs import treeRegression as tr
import pandas as pd
import numpy as np


In [3]:
class GradientBoostedTree:
    def __init__(self,df,y_name,X_names=None,learning_rate=0.1):
        self.df = df
        self.y_name = y_name
        if X_names is None:
            self.X_names = [s for s in df.columns if s not in [y_name]]
        else:
            self.X_names = X_names
        
        self.init_tree = None
        self.trees = []
        self.learning_rate = learning_rate
            
    def _initial_model(self):
        unique, counts = np.unique(self.df[self.y_name].values, return_counts=True)
        N = len(self.df.index)
        ind_max = np.argmax(counts)
        p = counts[ind_max]/N
        v = unique[ind_max]
        n = tr.Node(value=v)
        t = tr.Tree(root=n)
        t.show()
        return t, p
    
    def _initial_tree(self):
        c = tr.CART(self.df,self.y_name,X_names=self.X_names, max_depth=0)
        c.create_tree()
        c.prune()
        self.init_tree = c.tree
        return c
            
    def predict(self,x):
        p = self.init_tree.predict(x).value
        for t in self.trees:
            p += self.learning_rate * t.predict(x).value
        return p
        
    def _pseudo_residuals(self):
        res = np.empty_like(self.df[self.y_name].values).astype(np.float64)
        for i, x in enumerate(self.df.iloc):
            res[i] = x[self.y_name] - self.predict(x)
        return res
    
    def create_trees(self,M):
        res = self._pseudo_residuals()
        df = self.df
        df["pseudo_residuals"] = res
        for i in range(M):
            res = self._pseudo_residuals()
            print("\n>>>",np.linalg.norm(res))
            df["pseudo_residuals"] = res
            c = tr.CART(df,"pseudo_residuals",X_names=self.X_names,max_depth=3,min_leaf_samples=5,min_split_samples=4)
            c.create_tree()
            self.trees.append(c.tree)
            
    def _confusion_matrix(self,df):
        m = np.zeros((2,2),dtype=int)
        for i, x in enumerate(df.iloc):
            y = int(x[self.y_name])
            y_hat = int(round(self.predict(x)))
            m[y,y_hat] += 1
        return m
            
        

In [4]:

df = pd.read_csv("titanic.csv")
df_titanic = df.sample(frac=0.75)
df_test = df.loc[df.index.difference(df_titanic),:]
    

In [5]:
gbt_titanic = GradientBoostedTree(df_titanic,"Survived")

In [6]:
gbt_titanic.X_names

['Pclass',
 'Name',
 'Sex',
 'Age',
 'Siblings/Spouses Aboard',
 'Parents/Children Aboard',
 'Fare']

In [7]:

gbt_titanic._initial_model()


0



(<ipynb.fs.defs.treeRegression.Tree at 0x7f8c193b4b90>, 0.6210526315789474)

In [8]:
it=gbt_titanic._initial_tree()
it.tree.show()


Computed split:
loss: 0.16 (parent: 0.24)
attribute: Sex
threshold: ('female',)
count: [240, 425]
A tree with 2 leafs was created
0.37894736842105264



In [9]:
it.tree.root

<ipynb.fs.defs.treeRegression.Node at 0x7f8c193b4110>

In [10]:

gbt_titanic.learning_rate=0.5
gbt_titanic.create_trees(20)



>>> 12.5102063595248
Computed split:
loss: 0.16 (parent: 0.24)
attribute: Sex
threshold: ('female',)
count: [240, 425]
Computed split:
loss: 0.14 (parent: 0.19)
attribute: Pclass
threshold: 2.236073938360776
count: [128, 112]
Computed split:
loss: 0.04 (parent: 0.04)
attribute: Parents/Children Aboard
threshold: 1.1459035597723533
count: [110, 18]
Computed split:
loss: 0.03 (parent: 0.04)
attribute: Pclass
threshold: 1.9999940391390134
count: [59, 51]
Computed split:
loss: 0.08 (parent: 0.10)
attribute: Fare
threshold: 72.01699977578016
count: [11, 7]
Computed split:
loss: 0.22 (parent: 0.25)
attribute: Fare
threshold: 23.909402370825795
count: [91, 21]
Computed split:
loss: 0.22 (parent: 0.24)
attribute: Age
threshold: 37.84685038229026
count: [83, 8]
Computed split:
loss: 0.12 (parent: 0.12)
attribute: Fare
threshold: 34.86749371914438
count: [17, 4]
Computed split:
loss: 0.14 (parent: 0.14)
attribute: Pclass
threshold: 1.472141915860566
count: [88, 337]
Computed split:
loss: 0.18 (

In [11]:

gbt_titanic.predict(df_titanic.iloc[0])

0.8873924576231718

In [12]:
m = gbt_titanic._confusion_matrix(df_test)

In [13]:

P = tr.CART._precision(m)
R = tr.CART._recall(m)
A = tr.CART._accuracy(m)
F = tr.CART._F1(P,R)

print(P,R,F,A)




[0.91743119 0.77777778] [0.86805556 0.85530547] [0.89206066 0.81470138] 0.8635851183765502
