# Ensemble Learning
## Init

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier, BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

In [3]:
# Fraction (0.0-1.0) of the leading rows of each CSV to keep; 1.0 keeps every
# row. Lower it (e.g. 0.1) for quicker experiments.
PERCENTAGE_OF_DATA = 1.0

In [4]:
# Read each split from its header-less CSV, then keep only the leading
# PERCENTAGE_OF_DATA fraction of its rows (all columns are retained).
df_train = pd.read_csv('./mnist_train.csv', header=None)
df_train = df_train.head(int(len(df_train) * PERCENTAGE_OF_DATA))

df_test = pd.read_csv('./mnist_test.csv', header=None)
df_test = df_test.head(int(len(df_test) * PERCENTAGE_OF_DATA))

In [5]:
# Column 0 is used as the target below; give it a readable name in both frames.
df_train, df_test = (
    frame.rename(columns={0: 'digit'}) for frame in (df_train, df_test)
)

# Separate each frame into its feature columns (X) and its 'digit' labels (y).
X_train, y_train = df_train.drop(columns='digit'), df_train['digit']
X_test, y_test = df_test.drop(columns='digit'), df_test['digit']

## Voting classifier

In [7]:
# Three different base learners, plus a voting ensemble (default voting
# settings) that combines them under the names 'SGD', 'Tree' and 'KNN'.
model_1 = SGDClassifier(random_state=0)
model_2 = DecisionTreeClassifier(random_state=0)
model_3 = KNeighborsClassifier()
model_4 = VotingClassifier(
    estimators=[
        ('SGD', model_1),
        ('Tree', model_2),
        ('KNN', model_3),
    ],
)

# Fit every model on the training split and print its score on the test
# split, one "<class name> <score>" line per model.
for model in (model_1, model_2, model_3, model_4):
    model.fit(X_train, y_train)
    print(type(model).__name__, model.score(X_test, y_test))

SGDClassifier 0.8592
DecisionTreeClassifier 0.8781
KNeighborsClassifier 0.9688
VotingClassifier 0.9454


## Bagging

In [9]:
# Wrap the decision tree and the KNN model from the voting section in a
# 100-member bagging ensemble each, and print the test score under the name of
# the wrapped estimator's class.
for estimator in [model_2, model_3]:
    # The wrapped estimator is passed positionally: its keyword was renamed
    # from `base_estimator` to `estimator` in scikit-learn 1.2 and the old
    # spelling was removed in 1.4, so the positional form runs on both.
    # random_state pins the bootstrap sampling so re-runs give the same score,
    # matching the seeding used by the other models in this notebook.
    model = BaggingClassifier(estimator, n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    print(estimator.__class__.__name__, model.score(X_test, y_test))

DecisionTreeClassifier 0.9557


In [39]:
# Random forest with 100 trees and a fixed seed; fit on the training split and
# print "<class name> <test score>".
model = RandomForestClassifier(n_estimators=100, random_state=0)
model.fit(X_train, y_train)
print(type(model).__name__, model.score(X_test, y_test))

RandomForestClassifier 0.922


## Boosting

In [8]:
# AdaBoost over decision trees: 100 boosting rounds, fixed seeds on both the
# booster and the base tree.
# The base tree is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old spelling was
# removed in 1.4, so the positional form runs on both.
# NOTE(review): the recorded score (0.8757) is close to the single tree's
# (0.8781). Presumably the depth-unconstrained tree already fits the training
# data, leaving boosting little to correct; consider trying a shallow
# max_depth on the base tree. Confirm by experiment.
model = AdaBoostClassifier(
    DecisionTreeClassifier(random_state=0),
    n_estimators=100,
    random_state=0,
)
model.fit(X_train, y_train)
print(model.__class__.__name__, model.score(X_test, y_test))

AdaBoostClassifier 0.8757


In [57]:
# Gradient boosting with 100 stages of depth-10 trees and a fixed seed; fit on
# the training split and print "<class name> <test score>".
model = GradientBoostingClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=0,
)
model.fit(X_train, y_train)
print(type(model).__name__, model.score(X_test, y_test))

GradientBoostingClassifier 0.85


## Stacking

In [6]:
# First-level learners for the stack, as (name, estimator) pairs.
# The bagging wrappers receive their wrapped estimator positionally: the
# keyword was renamed from `base_estimator` to `estimator` in scikit-learn 1.2
# and the old spelling was removed in 1.4, so the positional form runs on both.
# They are also seeded (random_state=0) like every other stochastic model here,
# so the stacked score is repeatable.
estimators = [
    ('Forest', RandomForestClassifier(n_estimators=100, random_state=0)),
    ('BG_KNN',
     BaggingClassifier(KNeighborsClassifier(),
                       n_estimators=100,
                       random_state=0)),
    ('BG_Tree',
     BaggingClassifier(DecisionTreeClassifier(random_state=0),
                       n_estimators=100,
                       random_state=0)),
    # SGD is the only learner here that gets standardized inputs.
    ('SGD', make_pipeline(StandardScaler(), SGDClassifier(random_state=0)))
]

# A second random forest combines the first-level learners' predictions.
model = StackingClassifier(
    estimators,
    final_estimator=RandomForestClassifier(n_estimators=100, random_state=0),
)
model.fit(X_train, y_train)
print(model.__class__.__name__, model.score(X_test, y_test))


In [7]:
# Bagging ensemble of 100 default KNN classifiers, evaluated on the test split.
# The wrapped estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old spelling was
# removed in 1.4, so the positional form runs on both.
# random_state pins the bootstrap sampling so the score is repeatable.
model = BaggingClassifier(KNeighborsClassifier(), n_estimators=100, random_state=0)
model.fit(X_train, y_train)
print(model.__class__.__name__, model.score(X_test, y_test))

In [6]:
# Same random-forest configuration as the cell in the Bagging section
# (100 trees, seed 0).
# NOTE(review): the two recorded scores differ (0.922 there, 0.9705 here);
# presumably the runs used different PERCENTAGE_OF_DATA values. Restart and
# run all cells to confirm.
model = RandomForestClassifier(n_estimators=100, random_state=0)
model.fit(X_train, y_train)
print(type(model).__name__, model.score(X_test, y_test))

RandomForestClassifier 0.9705
