In [3]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
x,y = make_moons(n_samples=100,noise=0.15)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

voting_clf = VotingClassifier(
                      estimators=[('lr',log_clf),('rf',rnd_clf),('svc',svm_clf)],
                       voting='hard')
voting_clf.fit(x_train,y_train)
for clf in (log_clf,rnd_clf,svm_clf,voting_clf):
    clf.fit(x_train,y_train)
    y_pred = clf.predict(x_test)
    print(clf.__class__.__name__,accuracy_score(y_test,y_pred))
x_train[0],y_train[0],x_train.shape,y_train.shape

LogisticRegression 0.8
RandomForestClassifier 0.9666666666666667
SVC 0.9333333333333333
VotingClassifier 0.9333333333333333


(array([-1.07803341, -0.11123378]), 0, (70, 2), (70,))

In [3]:
#bagging 算法
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

x,y = make_moons(n_samples=1000,noise=0.15)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,    #False => pasting
    n_jobs=-1)
bag_clf.fit(x_train,y_train)
y_pred = bag_clf.predict(x_test)
print(accuracy_score(y_test,y_pred))

0.9833333333333333


In [4]:
#随机森林
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=500,max_leaf_nodes=16,n_jobs=-1)
rnd_clf.fit(x_train,y_train)
y_pred = bag_clf.predict(x_test)
print(accuracy_score(y_test,y_pred))

bag_clf = BaggingClassifier(DecisionTreeClassifier(splitter='random',max_leaf_nodes=16),n_estimators=500,max_samples=1.0,bootstrap=True,n_jobs=-1)
bag_clf.fit(x_train,y_train)
y_pred = bag_clf.predict(x_test)
print(accuracy_score(y_test,y_pred))

0.9833333333333333
0.9866666666666667


In [5]:
#极端随机树   和随机森林方法可以用网格搜索比较一下模型的好坏
from sklearn.ensemble import ExtraTreesClassifier

rnd_clf = ExtraTreesClassifier(n_estimators=500,max_leaf_nodes=16,n_jobs=-1)
rnd_clf.fit(x_train,y_train)
y_pred = bag_clf.predict(x_test)
print(accuracy_score(y_test,y_pred))

0.9866666666666667


In [6]:
#特征重要性
from sklearn.datasets import load_iris
iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500,n_jobs=-1)
rnd_clf.fit(iris["data"],iris['target'])
for name,score in zip(iris['feature_names'],rnd_clf.feature_importances_):
    print(name,score)

sepal length (cm) 0.09752007834187981
sepal width (cm) 0.025189828375105833
petal length (cm) 0.4336388278300012
petal width (cm) 0.44365126545301314


In [8]:
#AdaBoost算法   修改分类错误实例的权重
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),n_estimators=200,
    algorithm = 'SAMME.R',learning_rate = 0.5)
ada_clf.fit(x_train,y_train)
y_pred = ada_clf.predict(x_test)
print(accuracy_score(y_test,y_pred))

0.9866666666666667


In [67]:
#梯度提升 GBRT
from sklearn.ensemble import GradientBoostingRegressor

grbt = GradientBoostingRegressor(max_depth=2,n_estimators=3,learning_rate=1.0)
grbt.fit(x_train,y_train)
y_pred = grbt.predict(x_test)
# print(grbt.score(x_test,y_test)



#提前停止法 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
x,y = make_moons(n_samples=100,noise=0.15)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

grbt = GradientBoostingRegressor(max_depth=2,n_estimators=200)
grbt.fit(x_train,y_train)
errors = [mean_squared_error(y_val,y_pred) for y_pred in grbt.staged_predict(x_val)]
best_n_estimators = np.argmin(errors)+1
print(best_n_estimators)

grbt_best = GradientBoostingRegressor(max_depth=2,n_estimators=best_n_estimators)
grbt_best.fit(x_train,y_train)
y_pred = grbt_best.predict(x_test)
mean_squared_error(y_test,y_pred)
grbt.score(x_test,y_test)

198


0.8730728938106305