In [1]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
iris = load_iris()

X = iris.data
y = iris.target

In [13]:
(a, b) = X.shape
a

150

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


In [6]:
model = GradientBoostingClassifier(n_estimators=100, learning_rate= 0.2, max_depth=3, min_samples_leaf=1, 
                                   subsample= 0.7, random_state=1)

In [8]:

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}".format(accuracy))

Accuracy: 0.97


In [9]:
import numpy as np

# 手搓一个

In [46]:
from sklearn.tree import DecisionTreeRegressor

class GBDT:
    def __init__(self, n_estimators = 100, learning_rate = 0.3, max_depth = 3):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.estimators = []
        
    def sigmoid(self, z):
        return 1.0 / (1.0+np.exp(-z))
    
    def gradient_loss(self, y_true, y_pred):
        return y_true - self.sigmoid(y_pred)
    
    def fit(self, X, y):
        n_sample, n_feature = X.shape
        y_pred = np.zeros(n_sample, dtype = float)
        
        for i in range(self.n_estimators):
            residual = self.gradient_loss(y, y_pred)
            #print(y, y, residual)
            
            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            tree.fit(X, residual)
            
            y_pred_tree = tree.predict(X)
            
            y_pred += self.learning_rate * y_pred_tree
            
            self.estimators.append(tree)
            
    def predict_proba(self, X):
        n_samples = X.shape[0]
        y_pred = np.zeros(n_samples)
        
        for tree in self.estimators:
            y_pred_tree = tree.predict(X)
            y_pred += self.learning_rate * y_pred_tree
            return self.sigmoid(y_pred)
    
    def predict(self, X, threshold = 0.5):
        y_proba = self.predict_proba(X)
        y_pred = np.where(y_proba>= threshold, 1, 0)
        
        return y_pred

In [47]:
gbclassfy = GBDT(n_estimators=100, learning_rate=0.2, max_depth=4)

#from sklearn.preprocessing import LabelEncoder

gbclassfy.fit(X_train, y_train)

In [48]:
y_pred = gbclassfy.predict(X_test)

accuracy = np.mean(y_pred == y_test)
accuracy

0.8

# XGBOOST を学びましょう!

In [49]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-1.7.6-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-1.7.6
Note: you may need to restart the kernel to use updated packages.


In [2]:
import xgboost

In [6]:
xgboost.config_context()

<contextlib._GeneratorContextManager at 0x7f93d13ff2e0>

In [10]:
model_eg = xgboost.XGBClassifier()
model_eg.get_params()

{'objective': 'binary:logistic',
 'use_label_encoder': None,
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'gpu_id': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'predictor': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

## example 1

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.datasets import load_breast_cancer

In [12]:
cancer = load_breast_cancer()

X = cancer.data
y = cancer.target

In [13]:
train_x, valid_x, train_y, valid_y = train_test_split(X, y, test_size=0.3, random_state=114514)

In [14]:
xlf = xgboost.XGBClassifier(max_depth =10, learning_rate = 0.2, n_estimator = 500, objective='binary:logistic',
                           nthread = -1, gamma = 0, nim_child_weight = 1, max_delta_step = 0, subsample = 0.8,
                           colsample_bytree=0.7, colsample_bylevel=1, reg_alpha = 0, reg_lambda = 1, 
                            scale_pos_weight=1,seed=114514)

In [15]:
xlf.fit(train_x, train_y, eval_metric='error', verbose=True, 
        eval_set=[(valid_x, valid_y)], early_stopping_rounds=30)

Parameters: { "n_estimator", "nim_child_weight" } are not used.

[0]	validation_0-error:0.05848
[1]	validation_0-error:0.06433
[2]	validation_0-error:0.05848
[3]	validation_0-error:0.05263
[4]	validation_0-error:0.04094
[5]	validation_0-error:0.02924
[6]	validation_0-error:0.03509
[7]	validation_0-error:0.03509
[8]	validation_0-error:0.04094
[9]	validation_0-error:0.04094
[10]	validation_0-error:0.04094
[11]	validation_0-error:0.04094
[12]	validation_0-error:0.03509
[13]	validation_0-error:0.03509
[14]	validation_0-error:0.03509
[15]	validation_0-error:0.03509
[16]	validation_0-error:0.03509
[17]	validation_0-error:0.02924
[18]	validation_0-error:0.03509
[19]	validation_0-error:0.02924
[20]	validation_0-error:0.03509
[21]	validation_0-error:0.03509
[22]	validation_0-error:0.03509
[23]	validation_0-error:0.02924
[24]	validation_0-error:0.03509
[25]	validation_0-error:0.03509
[26]	validation_0-error:0.02924
[27]	validation_0-error:0.03509
[28]	validation_0-error:0.03509
[29]	validation_0



XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=1, colsample_bynode=None, colsample_bytree=0.7,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0, gpu_id=None,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.2, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=0,
              max_depth=10, max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, n_estimator=500, n_estimators=100,
              n_jobs=None, nim_child_weight=1, nthread=-1, ...)

In [16]:
y_pred = xlf.predict(valid_x, ntree_limit=xlf.best_ntree_limit)
auc_score = roc_auc_score(valid_y, y_pred)



In [17]:
auc_score

0.965990990990991

## example 2

In [18]:
import pandas as pd

white_wine = pd.read_csv('winequality-white.csv', sep=';')
red_wine = pd.read_csv('winequality-red.csv', sep=';')

red_wine['wine_type'] = 'red'   # add a column for the type
white_wine['wine_type'] = 'white'
wines = pd.concat([red_wine, white_wine])
wines['quality_label'] = wines['quality'].apply(lambda value: 'low' if value <= 5 else 'medium' if value <= 7 else 'high')
wines = wines.sample(frac=1, random_state=42).reset_index(drop=True)

In [21]:
import numpy as np
from collections import Counter

wqp_features = wines.iloc[:,:-3]
wqp_class_labels = np.array(wines['quality_label'])
wqp_label_names = ['low', 'medium', 'high']
wqp_feature_names = list(wqp_features.columns)
wqp_train_X, wqp_test_X, wqp_train_y, wqp_test_y = train_test_split(wqp_features, wqp_class_labels, 
                                                                    test_size=0.3, random_state=42)

print(Counter(wqp_train_y), Counter(wqp_test_y))
print('Features:', wqp_feature_names)

Counter({'medium': 2737, 'low': 1666, 'high': 144}) Counter({'medium': 1178, 'low': 718, 'high': 54})
Features: ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']


In [22]:
from sklearn.preprocessing import StandardScaler

wqp_ss = StandardScaler().fit(wqp_train_X)

# Scale the train set
wqp_train_SX = wqp_ss.transform(wqp_train_X)

# Scale the test set
wqp_test_SX = wqp_ss.transform(wqp_test_X)

In [26]:
wqp_train_y

array(['medium', 'low', 'low', ..., 'medium', 'medium', 'medium'],
      dtype=object)

In [28]:
from sklearn.preprocessing import LabelEncoder

Encoder =  LabelEncoder()

y_train = Encoder.fit_transform(wqp_train_y)

y_test = Encoder.transform(wqp_test_y)

In [29]:
winexgb = xgboost.XGBClassifier(objective= 'multi:softmax', num_class = 3)

winexgb.fit(wqp_train_SX, y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_class=3,
              num_parallel_tree=None, objective='multi:softmax', ...)

In [31]:
predictions = winexgb.predict(wqp_test_SX)

print(np.mean(predictions == y_test))

0.7974358974358975


In [33]:
import time
from sklearn.model_selection import GridSearchCV

start_time = time.time()

param_grid = {
                'n_estimators': [500, 250, 750], 
                'max_depth': [5, 10],
                'learning_rate': [0.3, 0.4]
              }

wine_clf = GridSearchCV(xgboost.XGBClassifier(objective= 'multi:softmax', num_class = 3, seed=42), param_grid, 
                       cv=10, scoring='accuracy')

wine_clf.fit(wqp_train_SX, y_train)

end_time = time.time()

print(wine_clf.best_params_)
elapsed_time = end_time - start_time
print("代码运行时间：", elapsed_time, "秒")

{'learning_rate': 0.3, 'max_depth': 10, 'n_estimators': 250}
代码运行时间： 301.1964690685272 秒


In [35]:
results = wine_clf.cv_results_

for param, score_mean, score_sd in zip(results['params'], results['mean_test_score'], results['std_test_score']):
    print(param, round(score_mean, 4), round(score_sd, 4))

{'learning_rate': 0.3, 'max_depth': 5, 'n_estimators': 500} 0.7818 0.021
{'learning_rate': 0.3, 'max_depth': 5, 'n_estimators': 250} 0.7869 0.019
{'learning_rate': 0.3, 'max_depth': 5, 'n_estimators': 750} 0.7779 0.0182
{'learning_rate': 0.3, 'max_depth': 10, 'n_estimators': 500} 0.7867 0.0145
{'learning_rate': 0.3, 'max_depth': 10, 'n_estimators': 250} 0.7893 0.0165
{'learning_rate': 0.3, 'max_depth': 10, 'n_estimators': 750} 0.7867 0.0129
{'learning_rate': 0.4, 'max_depth': 5, 'n_estimators': 500} 0.782 0.0196
{'learning_rate': 0.4, 'max_depth': 5, 'n_estimators': 250} 0.7812 0.016
{'learning_rate': 0.4, 'max_depth': 5, 'n_estimators': 750} 0.7794 0.0204
{'learning_rate': 0.4, 'max_depth': 10, 'n_estimators': 500} 0.7891 0.0126
{'learning_rate': 0.4, 'max_depth': 10, 'n_estimators': 250} 0.7891 0.0096
{'learning_rate': 0.4, 'max_depth': 10, 'n_estimators': 750} 0.7875 0.0125
