In [1]:
from sklearn.model_selection import train_test_split
from sklearn import datasets

X, y = datasets.make_moons(n_samples=500, noise=0.30)
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

log_clf = LogisticRegression(solver="lbfgs")
rnd_clf = RandomForestClassifier(n_estimators=100)
svm_clf = SVC(gamma="scale")

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard')
voting_clf.fit(X_train,y_train)

0,1,2
,estimators,"[('lr', ...), ('rf', ...), ...]"
,voting,'hard'
,weights,
,n_jobs,
,flatten_transform,True
,verbose,False

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [3]:
from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.832
RandomForestClassifier 0.912
SVC 0.92
VotingClassifier 0.92


In [4]:
svm_clf = SVC(gamma="scale",probability=True)
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft')

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.832
RandomForestClassifier 0.92
SVC 0.92
VotingClassifier 0.92


In [7]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500, max_samples=100,
    bootstrap=True, n_jobs=-1
)
bag_clf.fit(X_train,y_train)
y_pred = bag_clf.predict(X_test)

In [8]:
print(accuracy_score(y_test, y_pred))

0.928


In [9]:
tree_clf = DecisionTreeClassifier()
tree_clf.fit(X_train, y_train)
y_pred_tree = tree_clf.predict(X_test)
print(accuracy_score(y_test, y_pred_tree))

0.896


In [12]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,oob_score=True,
    bootstrap=True, n_jobs=-1
)
bag_clf.fit(X_train,y_train)
bag_clf.oob_score_

0.8693333333333333

In [13]:
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test,y_pred)

0.928

In [14]:
bag_clf.oob_decision_function_

array([[0.00526316, 0.99473684],
       [0.39548023, 0.60451977],
       [0.        , 1.        ],
       [0.94797688, 0.05202312],
       [0.09375   , 0.90625   ],
       [1.        , 0.        ],
       [0.5       , 0.5       ],
       [0.        , 1.        ],
       [0.96296296, 0.03703704],
       [1.        , 0.        ],
       [0.93532338, 0.06467662],
       [1.        , 0.        ],
       [0.00520833, 0.99479167],
       [0.41242938, 0.58757062],
       [0.06779661, 0.93220339],
       [1.        , 0.        ],
       [0.00534759, 0.99465241],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.64571429, 0.35428571],
       [0.        , 1.        ],
       [0.11904762, 0.88095238],
       [0.15263158, 0.84736842],
       [0.10582011, 0.89417989],
       [0.98901099, 0.01098901],
       [0.        , 1.        ],
       [0.02793296, 0.97206704],
       [0.        , 1.        ],
       [0.57303371, 0.42696629],
       [0.99404762, 0.00595238],
       [0.

In [15]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=500,max_leaf_nodes=16,n_jobs=-1)
rnd_clf.fit(X_train,y_train)

y_pred_rf = rnd_clf.predict(X_test)

In [16]:
bag_clf = BaggingClassifier(
DecisionTreeClassifier(splitter="random", max_leaf_nodes=16),
n_estimators=500, max_samples=1.0, bootstrap=True, n_jobs=-1)

bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

In [17]:
iris = datasets.load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500,n_jobs=-1)
rnd_clf.fit(iris["data"],iris["target"])
for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
    print(name,score)

sepal length (cm) 0.10575034711907867
sepal width (cm) 0.023656312720972263
petal length (cm) 0.45728572065776735
petal width (cm) 0.41330761950218164


In [19]:
import numpy as np

X = np.random.rand(100, 1) - 0.5
y = 3*X[:, 0]**2 + 0.05 * np.random.randn(100)

In [20]:
from sklearn.tree import DecisionTreeRegressor

tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X, y)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,2
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [21]:
y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X,y2)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,2
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [22]:
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X, y3)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,2
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [23]:
X_new = np.array([[0.8]])

In [24]:
y_pred = sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))

In [25]:
y_pred

array([0.72194043])

In [26]:
from sklearn.ensemble import GradientBoostingRegressor
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(X, y)

0,1,2
,loss,'squared_error'
,learning_rate,1.0
,n_estimators,3
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,2
,min_impurity_decrease,0.0


In [27]:
gbrt.predict(X_new)

array([0.72194043])

In [29]:
from sklearn.metrics import mean_squared_error

X_train, X_val, y_train, y_val = train_test_split(X, y)
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train, y_train)
errors = [mean_squared_error(y_val, y_pred) for y_pred in gbrt.staged_predict(X_val)]

bst_n_estimators = np.argmin(errors) + 1
gbrt_best = GradientBoostingRegressor(max_depth=2,n_estimators=bst_n_estimators)
gbrt_best.fit(X_train,y_train)

0,1,2
,loss,'squared_error'
,learning_rate,0.1
,n_estimators,np.int64(105)
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,2
,min_impurity_decrease,0.0


In [31]:
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True)

min_val_error = float("inf")
error_going_up = 0
for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:
            break

In [32]:
print(gbrt.n_estimators)

64


In [34]:
import xgboost
xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(X_train, y_train)
y_pred = xgb_reg.predict(X_val)
val_error = mean_squared_error(y_val, y_pred)
val_error

0.0048971648811941965

In [39]:
mnist = datasets.fetch_openml("mnist_784",version=1,as_frame=False)
mnist.target = mnist.target.astype(np.uint8)

X_train_val, X_test, y_train_val, y_test = train_test_split(
    mnist.data, mnist.target, test_size=10000)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=10000)

In [40]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier


random_forest_clf = RandomForestClassifier(n_estimators=100)
extra_trees_clf = ExtraTreesClassifier(n_estimators=100)
svm_clf = LinearSVC(max_iter=100, tol=20)
mlp_clf = MLPClassifier()

In [41]:
estimators = [random_forest_clf, extra_trees_clf, svm_clf, mlp_clf]
for estimator in estimators:
    estimator.fit(X_train, y_train)

In [42]:
[estimator.score(X_val, y_val) for estimator in estimators]

[0.97, 0.9737, 0.0979, 0.9648]

In [43]:
named_estimators = [
    ("random_forest_clf", random_forest_clf),
    ("extra_trees_clf", extra_trees_clf),
    ("svm_clf", svm_clf),
    ("mlp_clf", mlp_clf),
]

In [44]:
voting_clf = VotingClassifier(named_estimators)

In [45]:
voting_clf.fit(X_train, y_train)

0,1,2
,estimators,"[('random_forest_clf', ...), ('extra_trees_clf', ...), ...]"
,voting,'hard'
,weights,
,n_jobs,
,flatten_transform,True
,verbose,False

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,False

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,20
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0

0,1,2
,hidden_layer_sizes,"(100,)"
,activation,'relu'
,solver,'adam'
,alpha,0.0001
,batch_size,'auto'
,learning_rate,'constant'
,learning_rate_init,0.001
,power_t,0.5
,max_iter,200
,shuffle,True


In [46]:
voting_clf.score(X_val, y_val)

0.9734

In [47]:
[estimator.score(X_val, y_val) for estimator in voting_clf.estimators_]

[0.9711, 0.9725, 0.0979, 0.9643]

In [48]:
voting_clf.set_params(svm_clf=None)

0,1,2
,estimators,"[('random_forest_clf', ...), ('extra_trees_clf', ...), ...]"
,voting,'hard'
,weights,
,n_jobs,
,flatten_transform,True
,verbose,False

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,False

0,1,2
,hidden_layer_sizes,"(100,)"
,activation,'relu'
,solver,'adam'
,alpha,0.0001
,batch_size,'auto'
,learning_rate,'constant'
,learning_rate_init,0.001
,power_t,0.5
,max_iter,200
,shuffle,True


In [49]:
voting_clf.estimators

[('random_forest_clf', RandomForestClassifier()),
 ('extra_trees_clf', ExtraTreesClassifier()),
 ('svm_clf', None),
 ('mlp_clf', MLPClassifier())]

In [50]:
voting_clf.estimators_

[RandomForestClassifier(),
 ExtraTreesClassifier(),
 LinearSVC(max_iter=100, tol=20),
 MLPClassifier()]

In [51]:
del voting_clf.estimators_[2]

In [52]:
voting_clf.score(X_val, y_val)

0.9752

In [53]:
voting_clf.voting = "soft"

In [54]:
voting_clf.score(X_val, y_val)

0.9704

In [55]:
voting_clf.voting = "hard"
voting_clf.score(X_test, y_test)

0.971

In [56]:
[estimator.score(X_test, y_test) for estimator in voting_clf.estimators_]

[0.9645, 0.9703, 0.9616]

In [57]:
X_val_predictions = np.empty((len(X_val), len(estimators)), dtype=np.float32)

for index, estimator in enumerate(estimators):
    X_val_predictions[:, index] = estimator.predict(X_val)

In [58]:
X_val_predictions

array([[4., 4., 0., 4.],
       [6., 6., 0., 6.],
       [6., 6., 0., 6.],
       ...,
       [7., 7., 0., 7.],
       [9., 9., 0., 9.],
       [0., 0., 0., 0.]], shape=(10000, 4), dtype=float32)

In [59]:
rnd_forest_blender = RandomForestClassifier(n_estimators=200, oob_score=True)
rnd_forest_blender.fit(X_val_predictions, y_val)

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [60]:
rnd_forest_blender.oob_score_

0.972

In [61]:
X_test_predictions = np.empty((len(X_test), len(estimators)), dtype=np.float32)

for index, estimator in enumerate(estimators):
    X_test_predictions[:, index] = estimator.predict(X_test)

In [63]:
y_pred = rnd_forest_blender.predict(X_test_predictions)

In [64]:
accuracy_score(y_test, y_pred)

0.968