## Hard Voting Classifier : Ensembles

In [7]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_moons

# Two interleaving half-moons with substantial noise. The seed keeps the
# generated dataset identical across kernel restarts.
X, y = make_moons(n_samples=10000, noise=0.4, random_state=42)

# Three heterogeneous base learners for the ensemble. Only the forest is
# seeded here because it is the one that draws random samples/features.
log_clf = LogisticRegression()
svc_clf = SVC()
rf_clf = RandomForestClassifier(random_state=42)

# Hard voting: the ensemble predicts the class picked by the majority of the
# base learners.
voting_clf = VotingClassifier(
    estimators=[("lr", log_clf), ("svm", svc_clf), ("rf", rf_clf)],
    voting="hard",
)

In [8]:
from sklearn.model_selection import train_test_split
# Hold out 20% for testing. The seed is enabled (it was previously commented
# out) so the split, and every score computed from it, is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from sklearn.metrics import accuracy_score
# Train every base learner plus the voting ensemble on the same split and
# print each one's accuracy on the held-out set.
for clf in (log_clf, rf_clf, svc_clf, voting_clf):
    # fit() returns the estimator itself, so training and prediction chain.
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    print(type(clf).__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.848
RandomForestClassifier 0.8505
SVC 0.868
VotingClassifier 0.8645


## Bagging Classifier

In [11]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Bag 500 decision trees, each trained on a bootstrap sample of 100 training
# instances; n_jobs=-1 spreads the work over all available cores.
# random_state pins the bootstrap draws so the accuracy below is repeatable.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.8665

In [12]:
### OOB Evaluation

# Same bagging setup, but with oob_score=True each tree is also evaluated on
# the training instances left out of its bootstrap sample, giving a
# validation-style estimate without touching the test set.
# random_state pins the bootstrap draws so the OOB score is repeatable.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=42,
)
bag_clf.fit(X_train, y_train)
bag_clf.oob_score_

0.85975

### Random Forest

In [13]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

# Fit a 500-tree forest on the full iris dataset purely to inspect which
# features the trees rely on. The seed keeps the printed importances stable
# between runs.
iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42)
rnd_clf.fit(iris['data'], iris['target'])

# One line per feature: its name followed by its importance score.
for names, score in zip(iris['feature_names'], rnd_clf.feature_importances_):
    print(names, score)

sepal length (cm) 0.09891613568588883
sepal width (cm) 0.024937877435850644
petal length (cm) 0.4257493352666834
petal width (cm) 0.4503966516115772


### AdaBoost

In [15]:
from sklearn.ensemble import AdaBoostClassifier

# Boosted decision stumps (depth-1 trees), 200 rounds, learning rate 0.5.
# `algorithm` is left at its default: the recorded repr below omits it, so
# 'SAMME.R' was already the default in this environment, and newer
# scikit-learn releases reject that value when it is passed explicitly.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=200,
    learning_rate=0.5,
)
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                   learning_rate=0.5, n_estimators=200)

In [16]:
# Accuracy of the boosted stumps on the held-out split. Arguments follow the
# documented (y_true, y_pred) order, matching the earlier scoring cells.
y_pred = ada_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.863

## Gradient Boosting

In [19]:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the pinned scikit-learn version still ships it, or switch to
# another regression dataset (all recorded MSE values below would change).
boston = load_boston()

In [20]:
# Regression features and targets. This rebinds X / y and the train/test
# names that previously held the moons classification data.
X = boston["data"]
y = boston["target"]

# Seeded 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [28]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Basic Tree
# Baseline: a single depth-2 regression tree, scored by test-set MSE.
tree_reg = DecisionTreeRegressor(max_depth=2)
tree_reg.fit(X_train, y_train)
y_pred = tree_reg.predict(X_test)
# Arguments follow the documented (y_true, y_pred) order; the value is the
# same either way because squared error is symmetric.
print(mean_squared_error(y_test, y_pred))

25.993190895971193


In [33]:
## Gradient Boosting
# Hand-rolled boosting: every tree after the first is fit to the residuals
# left by the trees before it, and the ensemble prediction is the sum of all
# trees' outputs.

# Tree 1: fit the raw targets.
tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X_train, y_train)
y_pred1 = tree_reg1.predict(X_train)

# Tree 2: fit the residuals of tree 1.
y_error1 = y_train - y_pred1
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X_train, y_error1)
y_pred2 = tree_reg2.predict(X_train)

# Tree 3: fit what is still unexplained after trees 1 and 2.
y_error2 = y_error1 - y_pred2
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X_train, y_error2)
y_pred3 = tree_reg3.predict(X_train)

# Built-in sum() adds the per-tree prediction arrays element-wise. Calling
# np.sum() on a generator is deprecated in NumPy, and no visible cell imports
# numpy as `np`.
final_prediction = sum(tree.predict(X_test) for tree in (tree_reg1, tree_reg2, tree_reg3))

# Arguments follow the documented (y_true, y_pred) order.
print(mean_squared_error(y_test, final_prediction))

<IPython.core.display.Javascript object>

21.605190702000193


In [35]:
## Gradient Boosting : Scikit

from sklearn.ensemble import GradientBoostingRegressor

# Library equivalent of a three-tree, depth-2 ensemble with no shrinkage
# (learning_rate=1), scored by test-set MSE.
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1)
gbrt.fit(X_train, y_train)
y_pred = gbrt.predict(X_test)
# Arguments follow the documented (y_true, y_pred) order; the value is
# unchanged because squared error is symmetric.
mean_squared_error(y_test, y_pred)

21.605190702000183

In [39]:
# Three depth-2 trees again, but each tree's contribution is shrunk by
# learning_rate=0.1.
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=0.1)
gbrt.fit(X_train, y_train)
y_pred = gbrt.predict(X_test)
# Arguments follow the documented (y_true, y_pred) order; the value is
# unchanged because squared error is symmetric.
print(mean_squared_error(y_test, y_pred))

51.08895704979148


In [37]:
# 300 depth-2 trees with no shrinkage (learning_rate=1).
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=300, learning_rate=1)
gbrt.fit(X_train, y_train)
y_pred = gbrt.predict(X_test)
# Arguments follow the documented (y_true, y_pred) order; the value is
# unchanged because squared error is symmetric.
print(mean_squared_error(y_test, y_pred))

14.61533125232632


In [38]:
# 300 depth-2 trees, each contribution shrunk by learning_rate=0.1.
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=300, learning_rate=0.1)
gbrt.fit(X_train, y_train)
y_pred = gbrt.predict(X_test)
# Arguments follow the documented (y_true, y_pred) order; the value is
# unchanged because squared error is symmetric.
print(mean_squared_error(y_test, y_pred))

9.8373719804626
