In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier,VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC




In [15]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=42)

from sklearn.model_selection import train_test_split
#x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

# Voting classifier

#### hard voting

In [26]:
log_clf = LogisticRegression(solver='lbfgs',random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=100,random_state=42)
svm_clf = SVC(random_state=42)

votting_clf = VotingClassifier(
    estimators=[('lr',log_clf),('rf',rnd_clf),('svc',svm_clf)],
    voting='hard'
)


from sklearn.metrics import accuracy_score

for clf in (log_clf,rnd_clf,svm_clf,votting_clf):
    clf.fit(x_train,y_train)
    y_pred = clf.predict(x_test)
    print(clf.__class__.__name__, accuracy_score(y_test,y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.896
SVC 0.896
VotingClassifier 0.912


#### soft voting  (preobability required)

In [25]:
log_clf = LogisticRegression(solver='lbfgs',random_state=42)
rnd_clf = RandomForestClassifier(random_state=42)
svm_clf = SVC(gamma='scale',probability=True,random_state=42)


votting_clf = VotingClassifier(
    estimators=[('lr',log_clf),('rf',rnd_clf),('svc',svm_clf)],
    voting='soft'
)


from sklearn.metrics import accuracy_score

for clf in (log_clf,rnd_clf,svm_clf,votting_clf):
    clf.fit(x_train,y_train)
    y_pred = clf.predict(x_test)
    print(clf.__class__.__name__, accuracy_score(y_test,y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.896
SVC 0.896
VotingClassifier 0.92


# Bagging and pasting in Scikit learn

In [32]:
# Bagging and pasting in Scikit learn
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),n_estimators=500,max_samples=100,
    bootstrap=True, n_jobs=-1,random_state=42)
bag_clf.fit(x_train,y_train)
y_pred = bag_clf.predict(x_test)
accuracy_score(y_test,y_pred)

0.904

In [35]:
# Decision Tree
tree_clf = DecisionTreeClassifier(random_state = 42)
tree_clf.fit(x_train,y_train)
y_pred_tree = tree_clf.predict(x_test)
accuracy_score(y_test,y_pred_tree)

0.856

#### Out of bag evaluation

In [38]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),n_estimators=500,
    bootstrap=True,oob_score=True, random_state=40)

bag_clf.fit(x_train,y_train)
bag_clf.oob_score_

0.8986666666666666

In [39]:
bag_clf.oob_decision_function_

array([[0.32275132, 0.67724868],
       [0.34117647, 0.65882353],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.09497207, 0.90502793],
       [0.31147541, 0.68852459],
       [0.01754386, 0.98245614],
       [0.97109827, 0.02890173],
       [0.97765363, 0.02234637],
       [0.74404762, 0.25595238],
       [0.        , 1.        ],
       [0.7173913 , 0.2826087 ],
       [0.85026738, 0.14973262],
       [0.97222222, 0.02777778],
       [0.0625    , 0.9375    ],
       [0.        , 1.        ],
       [0.97837838, 0.02162162],
       [0.94642857, 0.05357143],
       [1.        , 0.        ],
       [0.01704545, 0.98295455],
       [0.39473684, 0.60526316],
       [0.88700565, 0.11299435],
       [1.        , 0.        ],
       [0.97790055, 0.02209945],
       [0.        , 1.        ],
       [0.99428571, 0.00571429],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.62569832, 0.37430168],
       [0.

In [40]:
y_pred = bag_clf.predict(x_test)
accuracy_score(y_test,y_pred)

0.912

# Random forest
- Random forest is an ensemble of Decision Tree trained via `bagging` method
- `max_samples` set the size of training set.

In [41]:
from sklearn.ensemble import RandomForestClassifier
rnd_clf = RandomForestClassifier(n_estimators=500,max_leaf_nodes=16,n_jobs=-1)
rnd_clf.fit(x_train,y_train)
y_pred_rf = rnd_clf.predict(x_test)
accuracy_score(y_test,y_pred_rf)

0.912

#### Equivalet bagging classifer as random forest

In [42]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(splitter='random',max_leaf_nodes=16),
    n_estimators=500, max_samples=1.0,bootstrap=True,n_jobs=-1)

bag_clf.fit(x_train,y_train)
y_pred_bag = bag_clf.predict(x_test)
accuracy_score(y_test,y_pred)

0.912

In [43]:
# Feature importance
from sklearn.datasets import load_iris
iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500,n_jobs=-1)
rnd_clf.fit(iris['data'],iris['target'])
for name, score in zip(iris['feature_names'],rnd_clf.feature_importances_):
    print(name,score)

sepal length (cm) 0.09018222200537772
sepal width (cm) 0.022820361284059912
petal length (cm) 0.46082890912800634
petal width (cm) 0.4261685075825561


- Most important feature `petatl length (44%)` and `peatal width (42%)`
- `sepal width 2%` and `lenght (11%)` are unimportant.

# Boosting

#### Adaboost (Adaptive Boosting)

In [46]:
from sklearn.ensemble import AdaBoostClassifier
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),n_estimators=200,
    algorithm='SAMME.R', learning_rate=0.5)
ada_clf.fit(x_train,y_train)
y_pred_ada = ada_clf.predict(x_test)
accuracy_score(y_test,y_pred_ada)

0.896

#### Gradient Boosting

In [73]:
np.random.seed(42)
X = np.random.rand(100, 1) - 0.5
y = 3*X[:, 0]**2 + 0.05 * np.random.randn(100)

In [64]:
from sklearn.tree import DecisionTreeRegressor

tree_reg1 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg1.fit(X, y)

DecisionTreeRegressor(max_depth=2, random_state=42)

In [65]:
y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg2.fit(X, y2)

DecisionTreeRegressor(max_depth=2, random_state=42)

In [66]:
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg3.fit(X, y3)

DecisionTreeRegressor(max_depth=2, random_state=42)

In [67]:
X_new = np.array([[0.8]])

In [69]:
y_pred = sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))
y_pred

array([0.75026781])

In [76]:
from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor(max_depth=2,n_estimators=3,learning_rate=1.0)
gbrt.fit(X,y)



GradientBoostingRegressor(learning_rate=1.0, max_depth=2, n_estimators=3)

In [77]:
gbrt_slow = GradientBoostingRegressor(max_depth=2, n_estimators=200, learning_rate=0.1, random_state=42)
gbrt_slow.fit(X, y)

GradientBoostingRegressor(max_depth=2, n_estimators=200, random_state=42)

In [80]:
# Early stopping
from sklearn.metrics import mean_squared_error
x_train,x_test,y_train,y_test = train_test_split(X,y)


In [81]:
gbrt = GradientBoostingRegressor(max_depth=2,n_estimators=200)
gbrt.fit(x_train,y_train)

GradientBoostingRegressor(max_depth=2, n_estimators=200)

In [83]:
errors = [mean_squared_error(y_test,y_pred) for y_pred in gbrt.staged_predict(x_test)]

best_n_estimator = np.argmin(errors)

In [84]:
gbrt_best = GradientBoostingRegressor(max_depth=2,n_estimators=best_n_estimator)
gbrt_best.fit(x_train,y_train)

GradientBoostingRegressor(max_depth=2, n_estimators=67)

In [85]:
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True, random_state=42)

min_val_error = float("inf")
error_going_up = 0
for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(x_train, y_train)
    y_pred = gbrt.predict(x_test)
    val_error = mean_squared_error(y_test, y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:
            break  # early stopping

In [86]:
gbrt.n_estimators

73

In [87]:
print("Minimum validation MSE:", min_val_error)

Minimum validation MSE: 0.0030248830463853182


In [88]:
import xgboost
try:
    import xgboost
except ImportError as ex:
    print('Error: the xgboost is not isntalled')
    xgboost = None

In [90]:
if xgboost is not None:  # not shown in the book
    xgb_reg = xgboost.XGBRegressor(random_state=42)
    xgb_reg.fit(x_train, y_train)
    y_pred = xgb_reg.predict(x_test)
    val_error = mean_squared_error(y_test, y_pred) # Not shown
    print("Validation MSE:", val_error)           # Not shown

Validation MSE: 0.003943980081715639


In [93]:
if xgboost is not None:  # not shown in the book
    xgb_reg.fit(x_train, y_train,
                eval_set=[(X_test, y_test)], early_stopping_rounds=2)
    y_pred = xgb_reg.predict(X_test)
    val_error = mean_squared_error(y_test, y_pred)  # Not shown
    print("Validation MSE:", val_error)            # Not shown

XGBoostError: [23:15:45] C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/learner.cc:1253: Check failed: learner_model_param_.num_feature == p_fmat->Info().num_col_ (2 vs. 1) : Number of columns does not match number of features in booster.

In [95]:
import xgboost
xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(x_train,y_train)
y_pred = xgb_reg.predict(x_test)
mean_squared_error(y_test,y_pred)

0.003943980081715639