In [43]:
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection  import train_test_split
from sklearn.metrics import accuracy_score

### Load Dataset

In [4]:
# Load scikit-learn's bundled digits dataset; later cells read its `.images` and `.target` fields.
data = datasets.load_digits()

In [9]:
# Features are the per-sample image arrays; labels are the corresponding targets.
X_data, y_data = data.images, data.target

In [10]:
# Flatten every image into a single feature vector. Passing -1 lets NumPy infer the
# feature count and keeps this cell safe to re-run: the original indexed
# X_data.shape[2], which no longer exists once X_data has been overwritten with the
# flattened array.
X_data = X_data.reshape(X_data.shape[0], -1)

# Hold out 20% of the samples as a test set; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=7)

### 1. Single Classifier
- A single SVM classifier

In [18]:
# Baseline: a single SVC with the library's default hyperparameters.
# NOTE(review): the recorded repr below shows kernel='rbf' with gamma='auto', and no feature
# scaling is applied before fitting; the low accuracy printed further down (~0.38) is
# presumably caused by that combination. Consider scaling the inputs or gamma='scale' - TODO confirm.
clf = SVC()

In [19]:
# Fit the baseline SVC on the training split.
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [20]:
# Predict labels for the held-out test split with the baseline SVC.
y_pred = clf.predict(X_test)

In [21]:
# accuracy_score's signature is (y_true, y_pred): pass the ground-truth labels first.
# Accuracy is symmetric, so the printed value is unchanged, but the conventional order
# prevents silent mistakes if this call is later swapped for an asymmetric metric.
print(accuracy_score(y_test, y_pred))

0.377777777778


### 2. Bagging (Bootstrap Aggregation)
- Aggregate predictions from many estimators with random bootstrap samples
- Train 100 independent SVM classifiers and aggregate them
- Parameter description
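    - **base_estimator**: the classification algorithm that is trained on each bootstrap sample and whose predictions are aggregated (named `estimator` in scikit-learn 1.2 and later)
    - **n_estimators**: number of bootstrap samples to draw, and therefore the number of classifiers trained (one per sample)
    - **max_samples**: proportion of the training samples drawn to train each classifier
    - **max_features**: proportion of the features used to train each classifier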

In [38]:
# Bagging ensemble of 100 SVCs; each one is trained on a bootstrap sample containing
# half as many rows as the training set, using all features. The seed fixes the sampling.
# The base estimator is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (and the old name was removed in 1.4);
# the positional form works on both old and new releases.
bag_clf = BaggingClassifier(
    SVC(),
    n_estimators=100,
    max_samples=0.5,
    max_features=1.0,
    random_state=5,
)

In [39]:
# Fit the bagging ensemble on the training split.
bag_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=0.5, n_estimators=100, n_jobs=1, oob_score=False,
         random_state=5, verbose=0, warm_start=False)

In [40]:
# Predict test-set labels with the bagging ensemble (rebinds y_pred from the previous section).
y_pred = bag_clf.predict(X_test)

In [41]:
# accuracy_score's signature is (y_true, y_pred): pass the ground-truth labels first.
# The printed accuracy is unchanged because the metric is symmetric.
print(accuracy_score(y_test, y_pred))

0.388888888889


### 3. Voting Classifier
- Combine different classifiers and "vote" using their results 
- Voting methods
    - **Soft voting**: for each data instance, the class label with the greatest summed predicted probability is selected as the final class
    - **Hard voting**: for each data instance, majority of class labels predicted are selected as final class
- The combined classifiers can all use the same algorithm, or they can use different algorithms

#### Hard voting with same classifiers (SVC)

In [44]:
# Three separately constructed SVCs with identical (default) settings.
# Because they are configured identically and trained on the same data, the hard vote
# recorded below scores the same accuracy as the single SVC above.
clf_1, clf_2, clf_3 = SVC(), SVC(), SVC()

In [46]:
# Majority-vote ensemble over the three SVCs; each estimator is registered under a unique name.
hard_vote_clf = VotingClassifier(
    estimators=[
        ('svm1', clf_1),
        ('svm2', clf_2),
        ('svm3', clf_3),
    ],
    voting='hard',
)

In [47]:
# Fit the hard-voting ensemble on the training split.
hard_vote_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('svm1', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)), ('svm2', SVC(C=1.0, cache_size=200, class_weight=None...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))],
         n_jobs=1, voting='hard', weights=None)

In [48]:
# Predict test-set labels with the hard-voting ensemble (rebinds y_pred).
y_pred = hard_vote_clf.predict(X_test)

In [49]:
# accuracy_score's signature is (y_true, y_pred): pass the ground-truth labels first.
# The printed accuracy is unchanged because the metric is symmetric.
print(accuracy_score(y_test, y_pred))

0.377777777778


#### Soft voting with different classifiers (SVC, DT, NB)

In [54]:
# Three different model families for the soft-voting ensemble.
# Seeds are fixed so the cell gives the same result on every run, matching the seeded
# split and bagging cells above: SVC's probability estimates and the decision tree's
# split selection both involve randomness. GaussianNB takes no seed.
# NOTE(review): the accuracy recorded below was produced without these seeds, so the
# number may shift slightly after re-running.
clf_1 = SVC(probability=True, random_state=7)    # probability=True is required so the SVC can supply class probabilities for soft voting
clf_2 = DecisionTreeClassifier(random_state=7)
clf_3 = GaussianNB()

In [55]:
# Soft-voting ensemble that combines the SVC, decision tree and naive Bayes models.
soft_vote_clf = VotingClassifier(
    estimators=[
        ('svm', clf_1),
        ('decision_tree', clf_2),
        ('naive_bayes', clf_3),
    ],
    voting='soft',
)

In [56]:
# Fit the soft-voting ensemble on the training split.
soft_vote_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('svm', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)), ('decision_tree', DecisionTreeClassifier(class_weight=N...      presort=False, random_state=None, splitter='best')), ('naive_bayes', GaussianNB(priors=None))],
         n_jobs=1, voting='soft', weights=None)

In [57]:
# Predict test-set labels with the soft-voting ensemble (rebinds y_pred).
y_pred = soft_vote_clf.predict(X_test)

In [58]:
# accuracy_score's signature is (y_true, y_pred): pass the ground-truth labels first.
# The printed accuracy is unchanged because the metric is symmetric.
print(accuracy_score(y_test, y_pred))

0.886111111111
