# <span style='color:blue'><center>Ensemble Learning</center></span>

Combine multiple classifiers to make another, more robust, classifier. The initial classifiers don't need to be extremely good, just need to guarantee that the error for each one is less than 50% (perform a bit better than a random classifier). Random forests, for example, are a combination of multiple decision trees, each one tunned over a random subset of the training data with some randomly chosen features.

In [23]:
%matplotlib inline
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from scipy.stats import norm, multivariate_normal
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
letter_data = pd.DataFrame.from_csv('letter_recognition.txt', index_col=None)
letter_labels = letter_data['lettr']
letter_data = letter_data.drop(['lettr'], axis=1)

letter_data.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,x-box,y-box,width,high,onpix,x-bar,y-bar,x2bar,y2bar,xybar,x2ybr,xy2br,x-ege,xegvy,y-ege,yegvx
0,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
2,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
3,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
4,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10


In [3]:
encoder = LabelEncoder()
categorical_labels = letter_labels
labels_encoded = encoder.fit_transform(categorical_labels)
print(np.unique(labels_encoded))
print(encoder.classes_)
letter_labels = labels_encoded

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
['A' 'B' 'C' 'D' 'E' 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R'
 'S' 'T' 'U' 'V' 'W' 'X' 'Y' 'Z']


In [4]:
train_set, test_set, train_labels, test_labels = train_test_split(letter_data, letter_labels, test_size = 0.2, random_state = 6969)

print('Training set: ', train_set.shape, ' Test set: ', test_set.shape)

Training set:  (16000, 16)  Test set:  (4000, 16)


In [20]:
logistic_clf = LogisticRegression( solver='lbfgs', multi_class='multinomial', max_iter=500 )
randforest_clf = RandomForestClassifier( n_estimators = 5 )
svm_clf = SVC( gamma='auto', probability = True )

voting_clf = VotingClassifier( estimators = [('lr', logistic_clf), ('rf', randforest_clf), ('svc', svm_clf)], voting = 'soft')

voting_clf.fit( train_set, train_labels )



VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=500, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)), ('rf', ...',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False))],
         flatten_transform=None, n_jobs=None, voting='soft', weights=None)

In [21]:
for each_clf in [logistic_clf, randforest_clf, svm_clf, voting_clf]:
    each_clf.fit(train_set, train_labels)
    acc_clf_train = each_clf.score(train_set, train_labels)
    acc_clf_test = each_clf.score(test_set, test_labels)
    
    print('Algorithm: ', each_clf.__class__.__name__, ' Train accuracy: ', acc_clf_train, ' Test accuracy: ', acc_clf_test)



Algorithm:  LogisticRegression  Train accuracy:  0.7820625  Test accuracy:  0.76575
Algorithm:  RandomForestClassifier  Train accuracy:  0.993375  Test accuracy:  0.9005
Algorithm:  SVC  Train accuracy:  0.995  Test accuracy:  0.973




Algorithm:  VotingClassifier  Train accuracy:  0.99725  Test accuracy:  0.966


### Use Adaboost algorithm to improve a decision tree sequentially:

In [29]:
ada_clf = AdaBoostClassifier( DecisionTreeClassifier(max_depth=7), n_estimators=200, 
                             algorithm="SAMME.R", learning_rate=0.5 )
ada_clf.fit(train_set, train_labels)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=0.5, n_estimators=200, random_state=None)

In [30]:
print('AdaBoost - DecisionTree: ', ' Train accuracy: ', ada_clf.score(train_set, train_labels), ' Test accuracy: ', ada_clf.score(test_set, test_labels) )

AdaBoost - DecisionTree:   Train accuracy:  0.972  Test accuracy:  0.91875
