# 集成学习和随机森林

## 投票分类

在moons上使用集成分类器

In [1]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

X, y = make_moons(n_samples=10000, noise=0.3, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

  return f(*args, **kwds)


In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

log_clf = LogisticRegression(solver="liblinear", random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=10, random_state=42)
svm_clf = SVC(gamma="auto", random_state=42)

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard')

voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=42,
                                                 solver='liblinear', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     class_weight=None,
                                                     criterion='gin...
                                        

In [3]:
from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svm_clf, voting_clf): 
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.8588
RandomForestClassifier 0.9064
SVC 0.9196
VotingClassifier 0.9172


In [4]:
# 使用软投票
log_clf = LogisticRegression(solver="liblinear", random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=10, random_state=42)
svm_clf = SVC(gamma="auto", probability=True, random_state=42) # 给SVC增加predict_proba方法

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft') # soft会使用学习器的predict_proba 获得分类概率, 选择概率值最高的一项

voting_clf.fit(X_train, y_train)

for clf in (log_clf, rnd_clf, svm_clf, voting_clf): 
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.8588
RandomForestClassifier 0.9064
SVC 0.9196
VotingClassifier 0.9176


## 随机森林

In [5]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(X_train, y_train)
y_pred = rnd_clf.predict(X_test)

print("random forest acc score: ", accuracy_score(y_test, y_pred))

random forest acc score:  0.92


## boosting提升

In [6]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# 200个深度为1的决策树
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), 
    n_estimators=200,  
    algorithm="SAMME.R",
    learning_rate=0.5)

ada_clf.fit(X_train, y_train)
y_pred = ada_clf.predict(X_test)

print("adaboost acc score: ", accuracy_score(y_test, y_pred))

adaboost acc score:  0.9072


## 习题

### 习题8
导入 MNIST 数据(第三章中介绍),把它切分进一个训练集,一个验证集,和一个测试集(例如 40000 个实例进行训练,10000 个进行验证,10000 个进行测试)。然后训练多个分类器,例如一个随机森林分类器,一个 Extra-Tree 分类器和一个 SVM。接下来, 尝试将它们组合成集成,使用软或硬投票分类器来胜过验证集上的所有集合。一旦找到了,就在测试集上实验。与单个分类器相比,它的性能有多好?

In [7]:
# 引入MNIST数据集
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

mnist = fetch_openml("mnist_784", data_home="./datasets")

X, y = mnist["data"], mnist["target"]

X1, X_test, y1, y_test = train_test_split(X, y, test_size=10000, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X1, y1, test_size=10000, random_state=42)
print(X_train.shape)
print(y_train.shape)

from sklearn.preprocessing import StandardScaler
scale = StandardScaler()

X_train = scale.fit_transform(X_train)
X_valid = scale.transform(X_valid)
X_test = scale.transform(X_test)

(50000, 784)
(50000,)


In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

rnd_clf = RandomForestClassifier(n_estimators=10, random_state=42)
ext_clf = ExtraTreesClassifier(n_estimators=10, random_state=42)
svm_clf = LinearSVC(random_state=42)

clf_list = [("random_forest", rnd_clf), ("extra_trees", ext_clf), ("support_vector", svm_clf)]

for (name, clf) in clf_list:
    clf.fit(X_train, y_train)
    print(name, " acc score: ", clf.score(X_valid, y_valid))

voting_clf = VotingClassifier(clf_list, voting="hard", n_jobs=-1)

voting_clf.fit(X_train, y_train)

print("voting clf acc score: ", voting_clf.score(X_valid, y_valid))

random_forest  acc score:  0.9469
extra_trees  acc score:  0.9492
support_vector  acc score:  0.9032




voting clf acc score:  0.9521


### 习题9
从练习 8 中运行个体分类器来对验证集进行预测,并创建一个新的训练集并生成预测: 每个训练实例是一个向量,包含来自所有分类器的图像的预测集,目标是图像类别。祝贺你,你刚刚训练了一个 blender ,和分类器一起组成了一个叠加组合!现在让我们来评估测试集上的集合。对于测试集中的每个图像,用所有分类器进行预测,然后将预测馈送到 blender 以获得集合的预测。它与你早期训练过的投票分类器相比如何?

In [37]:
import numpy as np

def batch_predict(clf_list, X):
    preds = [clf.predict(X) for (_, clf) in clf_list]
    return np.column_stack(preds)
    
X_train_new = batch_predict(clf_list, X_train)

X_train_new.shape

(50000, 3)

In [44]:
blender = RandomForestClassifier(n_estimators=10, random_state=42)
blender.fit(X_train_new, y_train)

X_valid_new = batch_predict(clf_list, X_valid)

print(X_valid_new.shape)

print("blender acc score: ", blender.score(X_valid_new, y_valid))

(10000, 3)
blender acc score:  0.9486
