<a href="https://colab.research.google.com/github/acevedosharp/ensemble-testing-chamber/blob/master/notebooks/ensemble_classifier_combination_tester.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Measuring the effectiveness of ensembles

- assemble all possible ensembles (combinations of base learners) of size $k \in \{1,2,3,4,5\}$
- evaluate every ensemble with 10-fold cross-validation
- determine the single best learner (according to the cross-validation error) for each dataset
- for each $k \in \{2,3,4,5\}$, count how many ensembles improve over the single best learner by at least $0.005$ (absolute and relative counts of improvement)
- for each base learner, determine how many improving ensembles contain it (absolute and relative numbers)

## Experimental setup

In [119]:
import itertools
import numpy as np
import pandas as pd
import sklearn
from datetime import datetime
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# classifiers
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# datasets
from sklearn.datasets import *

from datetime import datetime

In [289]:
# Running all four datasets in a single session can take more than 1.5h (too long
# for Colab), so the experiments may be run per dataset and the results joined later.
ds_names = ["Breast Cancer", "Digits", "Wine", "Madelon"]

# Madelon is read from a local CSV: every column but the last is a feature, the last
# column is the label. Only a random sample of 1000 instances is kept.
madelonValues = pd.read_csv('madelon.csv').values
randomIndices = np.random.choice(madelonValues.shape[0], size=1000, replace=False)
madelonX = madelonValues[:, :-1][randomIndices]
madelonY = madelonValues[:, -1][randomIndices]  # 1-d label vector, not a column vector

# (X, y) pairs, in the same order as ds_names
datasets = [
    loader(return_X_y=True)
    for loader in (
        sklearn.datasets.load_breast_cancer,
        sklearn.datasets.load_digits,
        sklearn.datasets.load_wine,
    )
]
datasets.append((madelonX, madelonY))

# Base learners, keyed by the display name used in the ensemble descriptions
classifiers = {
    "Linear SVC": LinearSVC(),
    "Decission Tree": DecisionTreeClassifier(),  # misspelled key kept on purpose
    "Extra Tree": ExtraTreeClassifier(),
    "Logistic": LogisticRegression(),
    "Passive Aggressive": PassiveAggressiveClassifier(),
    "Perceptron": Perceptron(),
    "Ridge": RidgeClassifier(),
    "SGD": SGDClassifier(),
    "Multi-layer Perceptron": MLPClassifier(),
    "Linear Discriminant": LinearDiscriminantAnalysis(),
    "Quadratic Discriminant": QuadraticDiscriminantAnalysis(),
    "BernoulliNB": BernoulliNB(),
    "MultinomialNB": MultinomialNB(),
    "Nearest Neighbors": KNeighborsClassifier(),
    "Extra Trees": ExtraTreesClassifier(),
    "Random Forest (10 estimators)": RandomForestClassifier(n_estimators=10),
    "Gradient Boosting": GradientBoostingClassifier(),
}

### Execute experiments

In [None]:
# Rows: [ensemble description, ensemble size, fold index, dataset name, error rate]
results = []


def _hard_vote(predictions):
  """Return the majority label of every row of a (n_instances, k) array.

  Ties go to the smallest of the most frequent labels, which is what
  `values[np.argmax(counts)]` on the output of `np.unique` yields.
  """
  ordered = np.sort(predictions, axis=1)
  # counts[i, j] = number of times the label ordered[i, j] occurs in row i
  counts = (ordered[:, :, None] == ordered[:, None, :]).sum(axis=2)
  # argmax returns the first maximum; rows are sorted, so that is the smallest label
  winners = counts.argmax(axis=1)
  return ordered[np.arange(ordered.shape[0]), winners]


# NOTE(review): KFold is used without shuffling, so folds are contiguous slices of each
# dataset in its stored order -- confirm this is intended for class-ordered datasets.
kf = KFold(n_splits=10)
for ds_idx, ds in enumerate(datasets):
  X, Y = ds[0], ds[1]
  #X = StandardScaler().fit_transform(X)
  for fold_index, (train_index, test_index) in enumerate(kf.split(X)):
    fold_start_time = datetime.now()
    print(">>> Fold", fold_index)
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

    # Fit every classifier once per fold and predict the test fold once.
    # The predictions do not depend on the ensemble a classifier is part of, so they
    # are reused by all combinations below instead of being recomputed each time.
    fold_predictions = {}
    for classifier_name, classifier in classifiers.items():
      classifier.fit(X_train, Y_train)
      fold_predictions[classifier_name] = classifier.predict(X_test)

    # Assemble ensembles of size k in {1,2,3,4,5}
    for k in range(1, 6):
      print("\t>>> k:", k)
      for combination in itertools.combinations(classifiers.keys(), k):
        ensemble_description = "-".join(combination)

        # (# test instances, ensemble size)
        predictions = np.column_stack([fold_predictions[name] for name in combination])
        hard_voting_predictions = _hard_vote(predictions)

        # fraction of misclassified test instances (an error rate, lower is better)
        errors = int(np.count_nonzero(hard_voting_predictions != Y_test))
        score = errors / len(X_test)

        # save result (ensemble description, ensemble size, fold index, dataset, score)
        results.append([ensemble_description, k, fold_index, ds_names[ds_idx], score])

    print(">>> fold took:", datetime.now() - fold_start_time)
        

## Analysing results

### Utility functions

In [2]:
def saveResultsAsCsv(filename, headers, rows):
  """Write `headers` followed by `rows` to `filename` as comma-separated text.

  Every cell is converted with `str` and joined with ','; no quoting or escaping
  is applied, so cells must not contain commas or newlines.

  Args:
    filename: path of the file to (over)write.
    headers: sequence of column-name strings.
    rows: iterable of rows, each with exactly `len(headers)` cells. An empty
      iterable produces a file containing only the header line.

  Raises:
    ValueError: if any row's length differs from the number of headers. Nothing
      is written in that case.
  """
  rows = list(rows)
  nColumns = len(headers)
  # validate everything up front so a malformed row never leaves a half-written file
  if any(len(row) != nColumns for row in rows):
    raise ValueError('length of headers does not match length of single rows.')

  with open(filename, 'w') as file:
    file.write(','.join(headers))
    file.write('\n')
    for row in rows:
      file.write(','.join(map(str, row)))
      file.write('\n')

def filterResultsByDataset(datasetName, results):
  """Return the rows of `results` whose third field equals `datasetName`.

  The field at index 2 is compared after conversion to `str`. That is the dataset
  column of aggregated rows ([ensembleDescription, k, dataset, errorRate]); the raw
  per-fold rows keep the dataset name at index 3 instead.
  """
  return [row for row in results if str(row[2]) == datasetName]

def loadResultsFromCsv(filename):
  """Read a comma-separated file and return its data rows.

  Each non-blank line is stripped and split on ','; every cell is returned as a
  string. The first row is treated as the header: it is printed and left out of
  the result.

  Args:
    filename: path of the file to read.

  Returns:
    A list of rows (lists of strings) without the header row. An empty file
    yields an empty list.
  """
  with open(filename) as file:
    # blank lines (e.g. a trailing newline at the end of the file) carry no row
    rows = [line.strip().split(',') for line in file if line.strip()]
  if not rows:
    return []

  headers, loadedResults = rows[0], rows[1:]
  print('Headers are:', headers)
  return loadedResults


### Aggregate data

In [106]:
# Aggregate the per-fold rows of `results`
# ([ensemble description, ensemble size, fold index, dataset, error rate])
# into one mean error rate per (ensemble, dataset) pair.
uniqueEnsembles = []  # ensemble descriptions in order of first appearance
finalResults = []     # rows: [ensemble_description, ensemble_size, dataset, mean_error]

seenEnsembles = set()
# (ensemble_description, dataset) -> [ensemble_size, summed error rate, number of folds]
foldTotals = {}

for entry in results:
  ensemble_description, dataset = entry[0], entry[3]
  if ensemble_description not in seenEnsembles:
    seenEnsembles.add(ensemble_description)
    uniqueEnsembles.append(ensemble_description)

  # Keying on the dataset as well keeps the folds of different datasets apart;
  # the same ensemble description occurs once per dataset.
  totals = foldTotals.setdefault((ensemble_description, dataset), [entry[1], 0.0, 0])
  totals[1] += float(entry[4])
  totals[2] += 1

for (ensemble_description, dataset), (ensemble_size, error_total, n_folds) in foldTotals.items():
  # divide by the folds actually recorded rather than assuming exactly 10
  mean_error = error_total / n_folds
  finalResults.append([ensemble_description, ensemble_size, dataset, mean_error])


In [109]:
# Replace the in-memory finalResults with the rows stored in finalResults.csv.
# Every field comes back as a string; the header row is printed and dropped.
finalResults = loadResultsFromCsv('finalResults.csv')

Headers are: ['ensembleDescription', 'k', 'dataset', 'errorRate']


### Save results

In [118]:
# Persist the aggregated rows; the header order matches the row layout
# [ensembleDescription, k, dataset, errorRate].
saveResultsAsCsv('finalResults.csv', ['ensembleDescription', 'k', 'dataset', 'errorRate'], finalResults)

### Load results

In [131]:
# Load the saved results twice: as a list of string rows (finalResults) and as a
# pandas DataFrame (resultsDf), which is what the metric cells below operate on.
finalResults = loadResultsFromCsv('finalResults.csv')
resultsDf = pd.read_csv('finalResults.csv')

Headers are: ['ensembleDescription', 'k', 'dataset', 'errorRate']


### Metrics

#### Error rates for different sizes of k

In [None]:
# One figure per dataset: a box plot of the error rates for every ensemble size k.
# Grouping by a scalar column name (not a one-element list) keeps the group keys
# plain values; list keys are returned as 1-tuples by pandas >= 2.0.
for dataset, matchingResults in resultsDf.groupby('dataset'):
  ensembleSizes = []
  boxplots = []
  for k, matchingKs in matchingResults.groupby('k'):
    ensembleSizes.append(k)
    boxplots.append(matchingKs['errorRate'].values)
  fig, ax = plt.subplots()
  ax.boxplot(boxplots)
  # label each box with its actual k, which matters if some k is missing for a dataset
  ax.set_xticklabels(ensembleSizes)
  ax.set_title(dataset)
  plt.show()

#### Best single learners for every dataset

In [193]:
# For every dataset, pick the single learner (k == 1) with the lowest error rate.
bestSingleLearners = {} # dataset: (ensembleDescription, errorRate)

# A scalar groupby key keeps `dataset` a plain value (pandas >= 2.0 yields
# 1-tuples for one-element list keys), so it can be used directly as a dict key.
for dataset, matchingResults in resultsDf.groupby('dataset'):
  # build the mask from the group itself rather than from the full frame
  matchingK1s = matchingResults.loc[matchingResults['k'] == 1]
  # idxmin returns an index *label*, so look the row up with .loc, not .iloc
  minIndex = matchingK1s['errorRate'].idxmin()
  minRow = resultsDf.loc[minIndex]
  bestSingleLearners[dataset] = (minRow['ensembleDescription'], minRow['errorRate'])

bestSingleLearners

{'Breast Cancer': ('Extra Trees', 0.06906380242972421),
 'Digits': ('Nearest Neighbors', 0.026151458721291126),
 'Iris': ('Linear Discriminant', 0.07222222222222223),
 'Madelon': ('Nearest Neighbors', 0.298),
 'Wine': ('Ridge', 0.03333333333333334)}

#### For every $k \in \{2,3,4,5\}$, how many ensembles improve over the single best learner by at least $0.005$ (absolute and relative counts of improvement)

In [220]:
# Minimum absolute reduction in error rate, relative to the best single learner,
# for an ensemble to count as an improvement.
improvementMargin = 0.005

improvementEnsembles = {dataset: [] for dataset in bestSingleLearners.keys()} # dataset: (ensembleDescription, errorRate, k)

# Scalar groupby keys keep `dataset` and `k` plain values. With one-element list
# keys, pandas >= 2.0 yields 1-tuples, for which `k != 1` would always be true and
# the single learners would be counted as ensembles.
for dataset, matchingResults in resultsDf.groupby('dataset'):
  bestSingleError = bestSingleLearners[dataset][1]
  for k, matchingKs in matchingResults.groupby('k'):
    if k == 1: # single learners are the baseline, not candidates
      continue
    improved = matchingKs[(matchingKs['errorRate'] + improvementMargin) < bestSingleError]
    for description, errorRate in zip(improved['ensembleDescription'], improved['errorRate']):
      improvementEnsembles[dataset].append((description, errorRate, k))

In [None]:
# Per dataset: how many ensembles (k >= 2) beat the best single learner by the
# margin, in absolute numbers and as a share of all evaluated ensembles, followed
# by a breakdown per ensemble size.
for dataset, improved in improvementEnsembles.items():
  print(f"\n{dataset}'s best single learner had an error rate of: {bestSingleLearners[dataset][1]}")
  nImproved = len(improved)
  nCandidates = int(((resultsDf['dataset'] == dataset) & (resultsDf['k'] != 1)).sum())
  # no evaluated ensembles means nothing could improve; avoid dividing by zero
  pctImproved = round(nImproved / nCandidates * 100, 2) if nCandidates else 0.0
  print(f"\t{nImproved} ({pctImproved}%) ensembles improved by at least {improvementMargin}")
  for k in range(2, 6):
    errorRates = [errorRate for _, errorRate, ensembleK in improved if ensembleK == k]
    # np.mean([]) would print nan too, but with a RuntimeWarning
    meanErrorRate = np.mean(errorRates) if errorRates else float('nan')
    print(f"\t\t{len(errorRates)} with k={k} - mean error rate: {meanErrorRate}")

#### For each base learner, in how many improving ensembles is it contained (absolute and relative numbers)

In [None]:
# Per dataset: for every base learner, count the improving ensembles that contain
# it (absolute and as a share of all improving ensembles), most frequent first.
for dataset in ds_names:
  print(f'Dataset: {dataset}')
  improving = improvementEnsembles[dataset]
  improvers = [] # classifier, timesAppearedInImprovements, timesAppearedInImprovementsRelative
  for classifier in classifiers.keys():
    # Ensemble descriptions are classifier names joined by '-'. A plain substring
    # test would also count "Extra Tree" inside "Extra Trees" and "Perceptron"
    # inside "Multi-layer Perceptron", so match the name only between delimiters.
    # Splitting on '-' is not an option because "Multi-layer Perceptron" contains one.
    marker = f"-{classifier}-"
    timesAppearedInImprovements = len([row for row in improving if marker in f"-{row[0]}-"])
    if improving:
      improvers.append((classifier, timesAppearedInImprovements, timesAppearedInImprovements/len(improving)))
    else:
      improvers.append((classifier, 0, 0))

  for entry in sorted(improvers, key=lambda tup: tup[1], reverse=True):
    print(f"\t{entry[0]}: {entry[1]} ({entry[2]*100}%)")
  print()