In [120]:
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPRegressor

In [112]:
df = pd.read_csv('./BlackFriday.csv')
X = df.loc[:, df.columns != 'Purchase'].copy()
y = df.loc[:, df.columns == 'Purchase'].copy()

for c in ['Product_ID', 'Gender', 'Age', 'City_Category', 'Stay_In_Current_City_Years']:
    X[c] = LabelEncoder().fit_transform(X[c])

X = SimpleImputer().fit_transform(X)
y = y.values.reshape(-1)
print(X.shape, y.shape)

(537577, 11) (537577,)


In [113]:
model = DecisionTreeRegressor(max_depth=3)
model.fit(X, y)
print('CART base r2_score:', r2_score(y, model.predict(X)))

model = GradientBoostingRegressor(max_depth=3)
model.fit(X, y)
print('GBDT base r2_score:', r2_score(y, model.predict(X)))

CART base r2_score: 0.38927463076192326
GBDT base r2_score: 0.6553513882194604


In [115]:
F = np.zeros_like(y, dtype=float)
n_estimators = 101
learning_rate = 0.1
base_estimators = list()
is_init = False

for i in range(n_estimators):
    negative_gradient = y - F
    base = DecisionTreeRegressor(max_depth=3)
    base.fit(X, negative_gradient)
    base_estimators.append(base)
    
    if not init_flag:
        F = base.predict(X)
        is_init = True
    else:
        F += learning_rate * base.predict(X)
    
    if i % 10 == 0:
        print('iter: {} train_r2_scorre: {:.2f}'.format(i, r2_score(y, F)))

iter: 0 train_r2_scorre: -2.77
iter: 10 train_r2_scorre: 0.11
iter: 20 train_r2_scorre: 0.51
iter: 30 train_r2_scorre: 0.58
iter: 40 train_r2_scorre: 0.62
iter: 50 train_r2_scorre: 0.63
iter: 60 train_r2_scorre: 0.64
iter: 70 train_r2_scorre: 0.65
iter: 80 train_r2_scorre: 0.65
iter: 90 train_r2_scorre: 0.65
iter: 100 train_r2_scorre: 0.66


### Classification

In [116]:
df = pd.read_csv('./heart.csv', index_col=0)

X = df.loc[:, df.columns != 'target'].copy()
y = df.loc[:, df.columns == 'target'].copy()

X = X.values
y = LabelEncoder().fit_transform(y)

In [117]:
model = DecisionTreeClassifier(max_depth=3)
model.fit(X, y)
print('CART base accuracy:', accuracy_score(y, model.predict(X)))

model = GradientBoostingClassifier(max_depth=3)
model.fit(X, y)
print('GBDT base accuracy:', accuracy_score(y, model.predict(X)))

CART base accuracy: 0.8448844884488449
GBDT base accuracy: 0.9933993399339934


In [118]:
def logit(F):
    return 1.0 / (1.0 + np.exp(-F))

In [119]:
F = np.zeros_like(y, dtype=float)
n_estimators = 101
learning_rate = 0.1
is_init = False
base_estimators = list()

for i in range(n_estimators):
    negative_gradient = y - logit(F)
    base = DecisionTreeRegressor(max_depth=3)
    base.fit(X, negative_gradient)
    base_estimators.append(base)
    if not is_init:
        F = base.predict(X)
        is_init = True
    else:
        F += learning_rate * base.predict(X)
    if i % 10 == 0:
        print('iter: {} train_accuracy: {:.4f}'.format(
            i, accuracy_score(y, (logit(F) > 0.5).astype(int))))

iter: 0 train_accuracy: 0.8449
iter: 10 train_accuracy: 0.8713
iter: 20 train_accuracy: 0.8845
iter: 30 train_accuracy: 0.8845
iter: 40 train_accuracy: 0.8845
iter: 50 train_accuracy: 0.8944
iter: 60 train_accuracy: 0.8977
iter: 70 train_accuracy: 0.9043
iter: 80 train_accuracy: 0.9010
iter: 90 train_accuracy: 0.9043
iter: 100 train_accuracy: 0.9142
