In [4]:
from sklearn.datasets import make_moons
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

X, y = make_moons(n_samples=10000, noise=0.15)
X_train, X_test = train_test_split(X, test_size=0.1, random_state=41, shuffle=True)
y_train, y_test = train_test_split(y, test_size=0.1, random_state=41, shuffle=True)

In [1]:
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard'
)
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()), ('svc', SVC())])

In [2]:
from sklearn.metrics import accuracy_score
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.872
RandomForestClassifier 0.983
SVC 0.986
VotingClassifier 0.984


In [3]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    max_samples=1.0, bootstrap=True, n_jobs=-1,
    oob_score=True
)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)
bag_clf.oob_score_

0.9881111111111112

In [3]:
from sklearn.metrics import accuracy_score
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

NameError: name 'bag_clf' is not defined

In [5]:
bag_clf.oob_decision_function_

array([[1.        , 0.        ],
       [0.        , 1.        ],
       [1.        , 0.        ],
       ...,
       [0.99441341, 0.00558659],
       [0.        , 1.        ],
       [1.        , 0.        ]])

random forest
배깅(페이스팅)을 적용한 결정 트리의 앙상블

In [5]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(X_train, y_train)

y_pred = rnd_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.992

In [6]:
from sklearn.ensemble import ExtraTreesClassifier

ext_clf = ExtraTreesClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
ext_clf.fit(X_train, y_train)

y_pred = rnd_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.992

In [7]:
#위의 코드와 동일한 구조를 가짐
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(max_features="auto", max_leaf_nodes=16),
    n_estimators=500, max_samples=1.0, bootstrap=True, n_jobs=-1
)

bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.976

In [8]:
#결정 트리는 일부 특성을 무시하지만 랜덤 포레스트의 경우 모든 특성을 골고루 사용하기 때문에 중요도를 측정하기 유용하다.
from sklearn.datasets import load_iris
iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_clf.fit(iris['data'], iris['target'])
for name, score in zip(iris['feature_names'], rnd_clf.feature_importances_):
    print(name, score)
#아래의 결과를 봐서 petal length와 petal width가 중요하고 sepal width는 별로 중요하지 않은 것으로 나타난다.

sepal length (cm) 0.09763783728746056
sepal width (cm) 0.022416816927722995
petal length (cm) 0.4145232801597839
petal width (cm) 0.4654220656250325


In [10]:
#depth가 1인 약간 트리를 여러개 쌓아 강한 예측기를 만들어 테스트함
from sklearn.ensemble import AdaBoostClassifier

#SAMME.R의 SAMME의 변종으로 클래스가 2개 뿐인 경우 AdaBoost와 동일함
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200,
    algorithm='SAMME.R', learning_rate=0.5
)
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                   learning_rate=0.5, n_estimators=200)

In [19]:
#gradient boost는 ada와 달리 샘플의 가중치를 조절하지 않고 앞의 예측기의 잔여 오차(y - prediction)를 학습시킨다.
from sklearn.tree import DecisionTreeRegressor
import numpy as np

tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X_train, y_train)
y2_train = y_train - tree_reg1.predict(X_train)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X_train, y2_train)
y3_train = y2_train - tree_reg2.predict(X_train)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X_train, y3_train)

#새로운 값을 추론할 경우 모든 트리에서 추론한 값들을 모두 더한 값이 예측값이 됨
y_pred = sum(tree.predict(X_test) for tree in (tree_reg1, tree_reg2, tree_reg3))
y_pred = np.where(y_pred > 0.5, 1, 0)#0~1 사이의 값이 아닌 클래스 값을 가지도록 변경
accuracy_score(y_test, y_pred)

0.91

In [22]:
from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(X_train, y_train)
#위의 코드와 동일한 구조를 가짐
y_pred = gbrt.predict(X_test)
y_pred = np.where(y_pred > 0.5, 1, 0)
accuracy_score(y_test, y_pred)

0.91

In [23]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X_train, X_val, y_train, y_val = train_test_split(X, y)

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train, y_train)

#staged_predict -> 트리의 수에 따라 예측한 값
errors = [mean_squared_error(y_val, y_pred) for y_pred in gbrt.staged_predict(X_val)]
#가장 낮은 에러율을 가진 트리 수를 파악해서 다음 모델 생성 시, 사용함
bst_n_estimators = np.argmin(errors) + 1

gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=bst_n_estimators)
gbrt_best.fit(X_train, y_train)

GradientBoostingRegressor(max_depth=2, n_estimators=120)

In [24]:
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True)

min_val_error = float('inf')
error_going_up = 0

#early stopping을 구현
for n_estimators in range(1, 120):#epochs
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)
    if val_error < min_val_error:#최소 에러값 갱신 및 patient count 값 초기화
        min_val_error = val_error
        error_going_up = 0
    else:#patient 값 추가하고 임계값을 넘을 경우 훈련 종료
        error_going_up += 1
        if error_going_up == 5:
            break#early stopping

xgboost는 파이썬 라이브러리 중 하나로 최적화된 그레디언트 부스팅 구현이 가능함

In [26]:
import xgboost

xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(X_train, y_train)
y_pred = xgb_reg.predict(X_val)

xgb_reg.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=5)
y_pred = xgb_reg.predict(X_val)

[0]	validation_0-rmse:0.35806
[1]	validation_0-rmse:0.25900
[2]	validation_0-rmse:0.19199
[3]	validation_0-rmse:0.14904
[4]	validation_0-rmse:0.12240
[5]	validation_0-rmse:0.10731
[6]	validation_0-rmse:0.09828
[7]	validation_0-rmse:0.09493
[8]	validation_0-rmse:0.09330
[9]	validation_0-rmse:0.09186
[10]	validation_0-rmse:0.09154
[11]	validation_0-rmse:0.09122
[12]	validation_0-rmse:0.09081
[13]	validation_0-rmse:0.09025
[14]	validation_0-rmse:0.08968
[15]	validation_0-rmse:0.08952
[16]	validation_0-rmse:0.08978
[17]	validation_0-rmse:0.09020
[18]	validation_0-rmse:0.09029
[19]	validation_0-rmse:0.09004
[20]	validation_0-rmse:0.08996


