In [1]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

x, y = make_moons(n_samples=500, noise=0.30, random_state=42)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard'
)
voting_clf.fit(x_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()), ('svc', SVC())])

In [3]:
from sklearn.metrics import accuracy_score
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print(clf.__class__.__name__, accuracy_score(y_test,y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.904
SVC 0.896
VotingClassifier 0.904


In [4]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    max_samples=100, bootstrap=True, n_jobs=-1)
bag_clf.fit(x_train, y_train)
y_pred = bag_clf.predict(x_test)
print(clf.__class__.__name__, accuracy_score(y_test,y_pred))

VotingClassifier 0.92


In [5]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    bootstrap=True, n_jobs=-1, oob_score=True)
bag_clf.fit(x_train, y_train)
bag_clf.oob_score_

0.904

In [6]:
y_pred = bag_clf.predict(x_test)
accuracy_score(y_test, y_pred)

0.904

In [7]:
bag_clf.oob_decision_function_

array([[0.421875  , 0.578125  ],
       [0.3442623 , 0.6557377 ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.06451613, 0.93548387],
       [0.37113402, 0.62886598],
       [0.00507614, 0.99492386],
       [1.        , 0.        ],
       [0.9673913 , 0.0326087 ],
       [0.82978723, 0.17021277],
       [0.00568182, 0.99431818],
       [0.77714286, 0.22285714],
       [0.85465116, 0.14534884],
       [0.94923858, 0.05076142],
       [0.06451613, 0.93548387],
       [0.        , 1.        ],
       [0.99411765, 0.00588235],
       [0.93846154, 0.06153846],
       [0.99450549, 0.00549451],
       [0.02312139, 0.97687861],
       [0.38693467, 0.61306533],
       [0.91803279, 0.08196721],
       [1.        , 0.        ],
       [0.98295455, 0.01704545],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.62721893, 0.37278107],
       [0.

In [8]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(x_train, y_train)

y_pred_rf = rnd_clf.predict(x_test)

# rought equivalent
# bag_clf = BaggingClassifier(
#     DecisionTreeClassifier(splitter="random", max_leaf_nodes=16),
#     n_estimators=500, max_samples=1.0, bootstrap=True, n_jobs=-1)

In [9]:
accuracy_score(y_test, y_pred_rf)

0.92

In [10]:
from sklearn.datasets import load_iris
iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_clf.fit(iris["data"], iris["target"])
for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.10057649144401189
sepal width (cm) 0.025033691548530666
petal length (cm) 0.4497585566850691
petal width (cm) 0.42463126032238846


In [11]:
# AdaBoosting
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200,
    algorithm="SAMME.R", learning_rate=0.5
)
ada_clf.fit(x_train, y_train)

y_pred_ada = ada_clf.predict(x_test)

In [12]:
accuracy_score(y_test, y_pred_ada)

0.896

In [13]:
from sklearn.tree import DecisionTreeRegressor

tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(x_train, y_train)

DecisionTreeRegressor(max_depth=2)

In [14]:
y2 = y_train - tree_reg1.predict(x_train)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(x_train, y2)

DecisionTreeRegressor(max_depth=2)

In [15]:
y3 = y2 - tree_reg2.predict(x_train)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(x_train, y3)

DecisionTreeRegressor(max_depth=2)

In [16]:
y_pred = sum(tree.predict(x_test) for tree in (tree_reg1, tree_reg2, tree_reg3))
# print(y_pred)

In [17]:
from sklearn.ensemble import GradientBoostingRegressor
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(x_train, y_train)

GradientBoostingRegressor(learning_rate=1.0, max_depth=2, n_estimators=3)

In [18]:
y_pred = gbrt.predict(x_test)

In [19]:
# find optimal # of tress with staged_prdict
import numpy
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(x_train, y_train)

errors = [mean_squared_error(y_test, y_pred) for y_pred in gbrt.staged_predict(x_test)]
bst_n_estimators = numpy.argmin(errors) + 1

gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=bst_n_estimators)
gbrt_best.fit(x_train, y_train)

GradientBoostingRegressor(max_depth=2, n_estimators=75)

In [20]:
# early stopping by warm_start, which keeps existing trees

gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True)

min_val_error = float("inf")
error_increasing = 0
for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(x_train, y_train)
    y_pred = gbrt.predict(x_test)
    val_error = mean_squared_error(y_test, y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        error_incrasing = 0
    else:
        error_increasing += 1
        if error_increasing == 5:
            print(n_estimators)
            break

52


In [21]:
# XGBoost : Extreme Gradient Boosting
# import xgboost