In [1]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import sys
import math
import numpy as np
import time

In [2]:
x, y = make_regression(n_samples=1000, n_features=50, n_targets=1, noise=1.5)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
print(x_train.shape, x_test.shape)

(800, 50) (200, 50)


In [3]:
sklearn_tree = DecisionTreeRegressor()
sklearn_tree.fit(x_train[:100], y_train[:100])
sklearn_predict = sklearn_tree.predict(x_test)
sklearn_mse = mean_squared_error(y_test, sklearn_predict)
print('sklearn mse: {} on test data'.format(sklearn_mse))

sklearn mse: 56623.7016733636 on test data


In [4]:
class RegressionTree:
    def __init__(self, min_samples_leaf):
        self.min_samples_leaf = min_samples_leaf
    
    
    def fit(self, x, y):
        self.tree = self.build_tree(x, y)
        
    
    def build_tree(self, x, y):
        
        best_feature_index, threshold, c1, c2 = self.choose_best_feature_to_split(x, y)
        
        tree = {}
        x1, y1, x2, y2 = self.split_data(x, y, best_feature_index, threshold)
        
        
        # 构造树的终止条件
        if len(x1) < self.min_samples_leaf:
            tree[(best_feature_index, threshold, '<=')] = c1
        else:
            tree[(best_feature_index, threshold, '<=')] = self.build_tree(x1, y1)
            
        if len(x2) < self.min_samples_leaf:
            tree[(best_feature_index, threshold, '>')] = c2
        else:
            tree[(best_feature_index, threshold, '>')] = self.build_tree(x2, y2)
            
        return tree
    
            
    def split_data(self, x, y, best_feature_index, threshold):
        
        x1, x2, y1, y2 = [], [], [], []
        
        for i in range(len(x)):
            if x[i][best_feature_index] <= threshold:
                x1.append(x[i])
                y1.append(y[i])
            else:
                x2.append(x[i])
                y2.append(y[i])
        
        return np.array(x1), np.array(y1), np.array(x2), np.array(y2)
    
    
    def calculate_mse(self, feature_index, x, y):
        values = []
        for i in range(len(x)):
            values.append(x[i][feature_index])
        
        values = list(set(values))
        
        n1, n2 = 0, 0
        y1, y2 = 0, 0
        
        best_mse = sys.maxsize
        best_threshold = None
        best_c1, best_c2 = None, None
        
        for value in values:
            for i in range(len(x)):
                if x[i][feature_index] <= value:
                    n1 += 1
                    y1 += y[i]
                elif x[i][feature_index] > value:
                    n2 += 1
                    y2 += y[i]
            
            if n1 != 0:
                c1 = y1/n1
            else:
                c1 = 0
            
            if n2 != 0:
                c2 = y2/n2
            else:
                c2 = 0
            
            mse = 0
            for i in range(len(x)):
                if x[i][feature_index] <= value:
                    mse += (c1 - y[i])*(c1 - y[i])
                elif x[i][feature_index] > value:
                    mse += (c2 - y[i])*(c2 - y[i])
                    
            if mse < best_mse:
                best_mse = mse
                best_threshold = value
                best_c1 = c1
                best_c2 = c2
        
        # 不会发生
        if best_threshold is None:
            pass
        
        return best_mse, best_threshold, best_c1, best_c2
    
    
    def choose_best_feature_to_split(self, x, y):
        n_features = x.shape[1]
        
        best_feature_index = -1
        best_mse = sys.maxsize
        best_feature_threshold = None
        best_c1 = None
        best_c2 = None
        
        for feature_index in range(n_features):
            mse, threshold, c1, c2 = self.calculate_mse(feature_index, x, y)
            
            if mse < best_mse:
                best_feature_index = feature_index
                best_mse = mse
                best_feature_threshold = threshold
                best_c1 = c1
                best_c2 = c2
        # 不会发生
        if best_feature_index == -1:
            pass
        
        return best_feature_index, best_feature_threshold, best_c1, best_c2
    
    
    def predict_value(self, x):
        tree = self.tree
        
        while type(tree).__name__ == 'dict':
            
            for key in tree.keys():
                if key[2] == '<=':
                    key1 = key
                elif key[2] == '>':
                    key2 = key
                    
                
            feature_index = key1[0]
            threshold = key1[1]
            
            if x[feature_index] <= threshold:
                tree = tree[key1]
            elif x[feature_index] > threshold:
                tree = tree[key2]

        
        if type(tree).__name__ != 'dict':
            return tree
        else:
            pass
    
    
    def predict(self, X):
         return np.array([self.predict_value(x) for x in X])

In [5]:
custom_tree = RegressionTree(min_samples_leaf=3)

print('start training regression tree...')
start = time.time()
custom_tree.fit(x_train[:100], y_train[:100])
end = time.time()
print('finish training... time cost: {}s'.format(end-start))

custom_predict = custom_tree.predict(x_test)
custom_mse = mean_squared_error(y_test, custom_predict)
print('custom mse: {} on test data'.format(custom_mse))

start training regression tree...
finish training... time cost: 2.53222918510437s
custom mse: 58110.76725332661 on test data
