# Ensemble methods. Exercises


In this section we have only two exercise:

1. Find the best three classifier in the stacking method using the classifiers from scikit-learn package.

2. Build arcing arc-x4 method. 

In [46]:
%store -r data_set
%store -r labels
%store -r test_data_set
%store -r test_labels
%store -r unique_labels

## Exercise 1: Find the best three classifier in the stacking method

Please use the following classifiers:

* Linear regression,
* Nearest Neighbors,
* Linear SVM,
* Decision Tree,
* Naive Bayes,
* QDA.

In [47]:
import numpy as np
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

import itertools

In [48]:
def build_classifiers():
    
    # fill this part
    
    linear_regression = LinearRegression()
    linear_regression.fit(data_set, labels)
    
    neighbors = KNeighborsClassifier(3)
    neighbors.fit(data_set, labels)
    
    svm = SVC(gamma='auto')
    svm.fit(data_set, labels)
    
    decision_tree = DecisionTreeClassifier()
    decision_tree.fit(data_set, labels)
    
    naive_bayes = GaussianNB()
    naive_bayes.fit(data_set, labels)
    
    qda = QuadraticDiscriminantAnalysis()
    qda.fit(data_set, labels)

    return linear_regression, neighbors, svm, decision_tree, naive_bayes, qda

In [49]:
def build_stacked_classifier(classifiers):
    classifiers_three_list = list(itertools.combinations(classifiers, 3))
    best_accuracy = 0
    best_classifiers_three = []
    best_stacked_classifier = None
    
    for classifiers_three in classifiers_three_list:
        output = []
        for classifier in classifiers_three:
            output.append(classifier.predict(data_set))
        output = np.array(output).reshape((130,3))
    
        # stacked classifier part:
        stacked_classifiers_list = (DecisionTreeClassifier(), KNeighborsClassifier(3), 
                                    SVC(gamma='auto'))
        
        for stacked_classifier in stacked_classifiers_list:
            #stacked_classifier = None # set here
            stacked_classifier.fit(output.reshape((130,3)), labels.reshape((130,)))
            stacked_predicted = stacked_classifier.predict(output)
            accuracy = accuracy_score(labels, stacked_predicted)
            #print(stacked_predicted, labels, accuracy)
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_classifiers_three = classifiers_three
                best_stacked_classifier = stacked_classifier
                
    test_set = []
    for classifier in best_classifiers_three:
        test_set.append(classifier.predict(test_data_set))
    test_set = np.array(test_set).reshape((len(test_set[0]),3))
    predicted = best_stacked_classifier.predict(test_set)
    
    print("BEST CLASSIFIERS:")
    for classifier in best_classifiers_three:
        print(classifier)
        
    print("BEST STACKED CLASSIFIER:")
    print(best_stacked_classifier)
    
    return predicted

In [59]:
classifiers = build_classifiers()
predicted = build_stacked_classifier(classifiers)
accuracy = accuracy_score(test_labels, predicted)
print(accuracy)

BEST CLASSIFIERS:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
BEST STACKED CLASSIFIER:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
0

## Exercise 2: 

Use the boosting method and change the code to fullfilt the following requirements:

* the weights should be calculated as:
$w_{n}^{(t+1)}=\frac{1+ I(y_{n}\neq h_{t}(x_{n})}{\sum_{i=1}^{N}1+I(y_{n}\neq h_{t}(x_{n})}$,
* the prediction is done with a voting method.

In [62]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier

# prepare data set

def generate_data(sample_number, feature_number, label_number):
    data_set = np.random.random_sample((sample_number, feature_number))
    labels = np.random.choice(label_number, sample_number)
    return data_set, labels

labels = 2
dimension = 2
test_set_size = 1000
train_set_size = 5000
train_set, train_labels = generate_data(train_set_size, dimension, labels)
test_set, test_labels = generate_data(test_set_size, dimension, labels)

# init weights
number_of_iterations = 10
weights = np.ones((test_set_size,)) / test_set_size


def train_model(classifier, weights):
    return classifier.fit(X=test_set, y=test_labels, sample_weight=weights)

def calculate_error(model):
    predicted = model.predict(test_set)
    I=calculate_accuracy_vector(predicted, test_labels)
    Z=np.sum(I)
    return (1+Z)/1.0

In [63]:
def calculate_accuracy_vector(predicted, labels):
    result = []
    for i in range(len(predicted)):
        if predicted[i] == labels[i]:
            result.append(0)
        else:
            result.append(1)
    return result

Fill the two functions below:

In [64]:
def set_new_weights(model):
    # fill the code here (two lines)
    # there was a misprint in the formula. The sum must be over t, not i.
    # in the original arc-x4 method the formula contains 1+I(...)^4, not I(...)
    weights = calculate_accuracy_vector(model.predict(test_set), test_labels)
    for item in weights:
        item = item + 1.0
    return weights / np.sum(weights)

def calculate_error(model):
    # fill the code here (two lines)
    predicted = model.predict(test_set)
    return np.dot(weights,calculate_accuracy_vector(predicted, test_labels))


Train the classifier with the code below:

In [65]:
classifier = DecisionTreeClassifier(max_depth=1, random_state=1)
classifier.fit(X=train_set, y=train_labels)
alphas = []
classifiers = []
for iteration in range(number_of_iterations):
    model = train_model(classifier, weights)
    weights = set_new_weights(model)
    classifiers.append(model)

print(weights)


validate_x, validate_label = generate_data(1, dimension, labels)

[0.         0.         0.         0.00188324 0.         0.00188324
 0.00188324 0.00188324 0.         0.         0.         0.00188324
 0.         0.00188324 0.00188324 0.         0.00188324 0.
 0.         0.         0.         0.00188324 0.00188324 0.
 0.00188324 0.00188324 0.         0.         0.00188324 0.00188324
 0.00188324 0.         0.         0.         0.         0.00188324
 0.         0.         0.         0.         0.         0.
 0.00188324 0.00188324 0.00188324 0.         0.00188324 0.00188324
 0.00188324 0.00188324 0.00188324 0.         0.         0.
 0.00188324 0.00188324 0.00188324 0.00188324 0.00188324 0.00188324
 0.00188324 0.         0.00188324 0.         0.         0.00188324
 0.         0.         0.00188324 0.         0.         0.00188324
 0.         0.         0.         0.00188324 0.00188324 0.00188324
 0.00188324 0.         0.00188324 0.         0.         0.
 0.00188324 0.         0.00188324 0.         0.         0.00188324
 0.         0.00188324 0.         0

Set the validation data set:

In [66]:
validate_x, validate_label = generate_data(1, dimension, labels)

Fill the prediction code:

In [67]:
def get_prediction(x):
    # fill the code here (5-6 lines)
    # weighted voting method
    # assuming x is a single input vector, not a list of them
    predictions = []
    for i in range(len(classifiers)):
        predicted = classifiers[i].predict(x)
        predictions.append(predicted)
        
    scores = [0.0, 0.0]
    
    for prediction, idx_clf in enumerate(predictions):
        if prediction == 0:
            scores[0] = scores[0] + weights[idx_clf]
        else:
            scores[1] = scores[1] + weights[idx_clf]

    #weighted majority voting
    if scores[0] >= scores[1]:
        return 0
    else:
        return 1


Test it:

In [68]:
prediction = get_prediction(validate_x)

print(prediction)

0
