In [148]:
import numpy as np
import pandas as pd
import warnings
# NOTE(review): this silences every warning for the whole session. Among other
# things it hides numpy's "Mean of empty slice" RuntimeWarning, which
# xgboost.choosebest triggers whenever a candidate threshold leaves one side
# of the split empty (np.mean of an empty list returns NaN).
warnings.filterwarnings("ignore")


class decisionNode(object):
    '''One node of a binary decision tree.'''

    def __init__(self,
                 col=-1,
                 value=None,
                 results=None,
                 tb=None,
                 fb=None,
                 samples=None):
        '''Record the split, the payload and the children of the node.

        col: index of the column this node tests (-1 by default).
        value: value the tested column is compared with to decide whether
            the "true" branch is taken.
        results: outcome stored for the node (None by default).
        tb, fb: nodes of the next level for the "true" and the "false"
            outcome; both default to None.
        samples: extra per-node value, stored exactly as given.
        '''
        # Attributes are set in the same order as the constructor arguments.
        self.col, self.value, self.results = col, value, results
        self.tb, self.fb, self.samples = tb, fb, samples


class xgboost:
    '''Small boosted ensemble of hand-rolled threshold trees.

    fit() grows up to ``n_estimators`` trees in sequence. The first tree is
    trained on the targets, every later tree on the residuals left after
    subtracting the previous tree's prediction. A sample's raw prediction is
    the sum of the outputs of all trees; predict() snaps that sum to the
    closest target value seen during training.
    '''

    # Each feature's [min, max] range is cut into this many equal intervals;
    # every interval boundary (both ends included) is tried as a threshold.
    _N_INTERVALS = 10

    def __init__(self, max_depth=8, n_estimators=30, lamda=0.1,
                 yita=0.0001):
        '''
        max_depth: depth at which tree growth stops (the root has depth 1)
        n_estimators: maximum number of trees to grow
        lamda: regularisation term added to every hessian sum in the gain
        yita: penalty; twice this value is subtracted from every split gain
        '''
        self.max_depth = max_depth
        self.n_estimators = n_estimators
        self.lamda = lamda
        self.yita = yita
        self.trees = []
        self.label = []

    def log_reg(self, y_pre, y_true):
        '''Gradient and hessian of the logistic loss at raw score y_pre.

        Returns (g, h) with p = sigmoid(y_pre), g = p - y_true and
        h = p * (1 - p).
        '''
        p = 1.0 / (1.0 + np.exp(-y_pre))
        g = p - y_true
        h = p * (1 - p)
        return g, h

    def _grad_sums(self, targets):
        '''Sum the log_reg gradients and hessians over one side of a split.'''
        # Every sample of a side is scored against the same raw score: the
        # mean target of that side.
        y_pre = np.mean(targets)
        # log_reg(y_pre, 0) returns (p, h). Both are the same for every sample
        # of the side, so the exponential is evaluated once instead of once
        # per sample; the per-sample gradient is still p - target.
        p, h = self.log_reg(y_pre, 0)
        G, H = 0, 0
        for target in targets:
            G += p - target
            H += h
        return G, H

    def choosebest(self, x, y):
        '''Search every feature for the threshold with the largest gain.

        x: non-ragged sequence of rows, y: one target per row.
        Candidate thresholds are the evenly spaced boundaries between the
        minimum and the maximum of each feature. Rows with a value below the
        threshold go left, the others go right. Candidates that leave one
        side empty are skipped because they do not split anything.

        Returns (feature index, threshold, gain); (-1, -1.0, 0.0) when no
        candidate has a positive gain.
        '''
        bestfeature, bestpoint, bestInfoGain = -1, -1.0, 0.0
        m = len(x)
        if m == 0:
            return bestfeature, bestpoint, bestInfoGain
        for i in range(len(x[0])):
            column = [row[i] for row in x]
            low, high = min(column), max(column)
            if low == high:
                # A constant feature cannot separate the samples.
                continue
            step = (high - low) / self._N_INTERVALS
            for j in range(self._N_INTERVALS + 1):
                point = low + step * j
                left = [y[k] for k in range(m) if column[k] < point]
                right = [y[k] for k in range(m) if column[k] >= point]
                if not left or not right:
                    continue
                GL, HL = self._grad_sums(left)
                GR, HR = self._grad_sums(right)
                # Score of the two children minus the score of the unsplit
                # node, minus the fixed penalty for adding a split.
                gain = (GL * GL / (HL + self.lamda)
                        + GR * GR / (HR + self.lamda)
                        - (GL + GR)**2 / (HL + HR + self.lamda)
                        - 2 * self.yita)
                if gain > bestInfoGain:
                    bestInfoGain = gain
                    bestfeature = i
                    bestpoint = point
        return bestfeature, bestpoint, bestInfoGain

    # Build the decision tree
    def buildDTree(self, x, y, depth):
        '''Recursively grow a tree on rows x with targets y.

        depth is the depth of the node being built; growth stops once it
        reaches max_depth or when no split has a positive gain. Leaves keep
        the targets that reached them in ``results``; inner nodes keep the
        targets of their whole subtree there as well.
        '''
        if depth >= self.max_depth:
            return decisionNode(results=self.result_list(x, y))

        bestFeat, bestpoint, best_gain = self.choosebest(x, y)
        if not best_gain > 0:
            # Store a copy so the leaf never aliases the caller's list.
            return decisionNode(results=self.result_list(x, y))

        # Rows at or above the threshold form the "true" branch; this must
        # mirror the comparison used by tree_predict.
        true_x, true_y, false_x, false_y = [], [], [], []
        for row, target in zip(x, y):
            if row[bestFeat] >= bestpoint:
                true_x.append(row)
                true_y.append(target)
            else:
                false_x.append(row)
                false_y.append(target)

        TrueBranch = self.buildDTree(true_x, true_y, depth + 1)
        FalseBranch = self.buildDTree(false_x, false_y, depth + 1)
        return decisionNode(col=bestFeat,
                            value=bestpoint,
                            tb=TrueBranch,
                            fb=FalseBranch,
                            results=self.result_list(x, y))

    def result_list(self, x, y):
        '''Return a new list with the targets of the rows in x.

        Copies the first len(x) entries of y.
        '''
        return [y[i] for i in range(len(x))]

    def tree_predict(self, tree, x):
        '''Route sample x down the tree and return the mean of the targets
        stored in the leaf it reaches.'''
        # A node without a "false" child is a leaf.
        while tree.fb is not None:
            if x[tree.col] < tree.value:
                tree = tree.fb
            else:
                tree = tree.tb
        return np.mean(tree.results)

    def fit(self, x, y):
        '''Train the ensemble on rows x and numeric targets y.

        pandas DataFrame/Series inputs are converted to plain lists. Any
        previously fitted trees are discarded, so calling fit again retrains
        from scratch instead of stacking new trees on the old ensemble.

        Raises ValueError when x is empty.
        '''
        if isinstance(x, (pd.DataFrame, pd.Series)):
            x = x.values.tolist()
        if isinstance(y, (pd.DataFrame, pd.Series)):
            y = y.values.tolist()
        if len(x) == 0:
            raise ValueError('cannot fit on an empty training set')
        self.label = list(set(y))
        self.trees = []
        y_i = y
        for _ in range(self.n_estimators):
            dt = self.buildDTree(x, y_i, 1)
            # The next tree is trained on what this one failed to explain.
            y_i = [y_i[i] - self.tree_predict(dt, x[i]) for i in range(len(x))]
            self.trees.append(dt)
            # Stop early once every residual is exactly zero.
            if set(y_i) == {0}:
                break

    def predict_sample(self, x):
        '''Predict one sample: sum the outputs of all trees and return the
        training label closest to that sum (the first one on ties).

        Raises ValueError when the model has no labels, i.e. before fit.
        '''
        if not self.label:
            raise ValueError('predict_sample called before fit: no labels known')
        predict = 0
        for tree in self.trees:
            predict += self.tree_predict(tree, x)
        # min() has no distance cap, so a label is always returned however
        # far the raw prediction lies from every label.
        return min(self.label, key=lambda label: abs(label - predict))

    def predict(self, x):
        '''Predict a label for every row of x (list of rows or pandas
        object) and return the labels as a list.'''
        if isinstance(x, (pd.DataFrame, pd.Series)):
            x = x.values.tolist()
        return [self.predict_sample(sample) for sample in x]

In [149]:
# Smoke test on a three-row toy set: fit the model and predict the very rows
# it was trained on.
x = [
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
]
y = [1, 2, 3]

xgb = xgboost()
xgb.fit(x, y)
xgb.predict(x)

[1, 2, 3]

In [150]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20;
# `train_test_split` lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

# NOTE(review): machine-specific absolute path; point it at your own copy of
# the cleaned data set before running.
DATA_PATH = r'D:\python data\kaggle-titanic-master\input\clean.csv'
data = pd.read_csv(DATA_PATH)

# The last column is used as the target, every other column as a feature.
# The split is seeded so that a fresh "Restart & Run All" reproduces the same
# train/test partition (accuracies recorded from earlier, unseeded runs may
# differ slightly).
x_train, x_test, y_train, y_test = train_test_split(data.iloc[:, :-1],
                                                    data.iloc[:, -1],
                                                    test_size=0.2,
                                                    random_state=42)

In [155]:
# Fit the hand-written model on the training split and measure how many
# training rows it labels correctly.
xgb = xgboost(max_depth=10, n_estimators=100)
xgb.fit(x_train, y_train)
y_pred = xgb.predict(x_train)
# Fraction of positions where the prediction equals the true label.
accuracy = sum(1 for pred, truth in zip(y_pred, y_train) if pred == truth) / len(y_pred)
print('train accuracy:', accuracy)

train accuracy: 0.9043600562587905


In [156]:
# Accuracy of the hand-written model on the held-out split.
y_pred = xgb.predict(x_test)
# Fraction of positions where the prediction equals the true label.
accuracy = sum(1 for pred, truth in zip(y_pred, y_test) if pred == truth) / len(y_pred)
print('test accuracy:', accuracy)

test accuracy: 0.8089887640449438


In [153]:
# Baseline: the real XGBoost library on the same split.
# Import the classifier by name: `import xgboost as xgb` would rebind `xgb`,
# which holds the hand-written model that the accuracy cells call `.predict`
# on, so those cells would break when re-run after this one.
from xgboost import XGBClassifier

model = XGBClassifier(n_estimators=100)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
# Fraction of positions where the prediction equals the true label.
accuracy = sum(1 for pred, truth in zip(y_pred, y_test) if pred == truth) / len(y_pred)
accuracy

0.797752808988764