# Primeiro de Inteligência Artificial e Sistemas Inteligentes

Rebeca Cecco de Oliveira

## Load packages

In [1]:
import pickle
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as st
import seaborn as sns
from sklearn import datasets
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, GridSearchCV, RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

## Implementation of Heterogeneous Pooling

In [None]:
def HeterogenousPoolingClassifier():
    """Placeholder factory for the heterogeneous pooling ensemble classifier.

    The original definition had no body, which is a syntax error. Until the
    ensemble is actually implemented this stub fails loudly instead of
    handing back an unusable object.

    NOTE(review): the hyper-parameter grid in this notebook lists an
    ``n_samples`` setting for this model, so the final implementation
    presumably needs to expose ``n_samples`` as a tunable estimator
    parameter -- confirm against the assignment specification.

    Raises:
        NotImplementedError: always, because no implementation exists yet.
    """
    raise NotImplementedError(
        "HeterogenousPoolingClassifier has not been implemented yet."
    )

## Search parameters

In [4]:
# Ensemble classifiers under comparison, keyed by display name.
# Entries must be separated by commas; the original used ';', which is a
# syntax error inside a dict literal.
# NOTE: building this dict calls HeterogenousPoolingClassifier(), so that
# name must be defined (and callable without arguments) before this cell runs.
models = {
    'Random Forest': RandomForestClassifier(),
    'Bagging': BaggingClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'HeterogenousPooling': HeterogenousPoolingClassifier(),
}

# Hyper-parameter grids for the grid search. Keys must match the keys of
# `models` exactly, because both dicts are indexed with the same name.
# The original listed 'RandomForest' (no space) where the 'AdaBoost' grid
# belongs, so looking up parameters['AdaBoost'] raised KeyError.
parameters = {
    'Random Forest': {'n_estimators': [10, 25, 50, 100]},
    'Bagging': {'n_estimators': [10, 25, 50, 100]},
    'AdaBoost': {'n_estimators': [10, 25, 50, 100]},
    'HeterogenousPooling': {'n_samples': [1, 3, 5, 7]},
}

## Experimental

In [12]:
def experimental(base, filename):
    """Benchmark the ensemble models on one dataset with nested cross-validation.

    For each model name, a pipeline of ``StandardScaler`` followed by a
    ``GridSearchCV`` (inner loop: 4-fold, 3 repeats) is evaluated with
    ``cross_validate`` (outer loop: 10-fold, 3 repeats), scoring accuracy.
    The elapsed CPU time per model is printed.

    Args:
        base: dataset object exposing ``.data`` and ``.target`` (as returned
            by the ``sklearn.datasets.load_*`` functions).
        filename: dataset label; printed as a header and used as the path of
            the pickle file the scores are written to.

    Returns:
        dict mapping model name to the ``cross_validate`` result dict.

    Side effects:
        Writes the result dict to ``filename`` with ``pickle`` so that the
        results cells can load it back.
    """
    # Use the dataset that was passed in; the original always read the
    # global `iris`, so every dataset was silently evaluated on iris.
    X, y = base.data, base.target
    print()
    print(filename)
    ans = {}
    for c in ['Bagging', 'AdaBoost', 'Random Forest']:
        start = time.process_time()
        # Inner loop: hyper-parameter search. It is left unfitted on purpose:
        # cross_validate clones and refits the estimator on every outer fold,
        # so fitting it on the full data beforehand is discarded work.
        grid_search = GridSearchCV(
            models[c], param_grid=parameters[c], scoring='accuracy',
            cv=RepeatedStratifiedKFold(n_splits=4, n_repeats=3, random_state=36851234))
        # One scaler is enough; the original wrapped an already-scaled
        # pipeline in a second StandardScaler.
        pipeline = Pipeline([('transformer', StandardScaler()), ('estimator', grid_search)])
        # Outer loop: unbiased performance estimate of the tuned model.
        ans[c] = cross_validate(
            pipeline, X, y,
            cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=36851234),
            scoring='accuracy')
        print('Elapsed time of {} is {:.6f} seconds.'.format(c, time.process_time() - start))

    # Persist the scores; previously `ans` was computed and then thrown away.
    with open(filename, 'wb') as fh:
        pickle.dump(ans, fh)
    return ans



### Execution

In [5]:
# Load the four scikit-learn datasets used in the comparison.
iris = datasets.load_iris()
digits = datasets.load_digits()
wine = datasets.load_wine()
cancer = datasets.load_breast_cancer()

# Run the experiment once per dataset, in a fixed order, passing the dataset
# object together with its label.
names = ('iris', 'digits', 'wine', 'cancer')
for k, base in zip(names, (iris, digits, wine, cancer)):
    experimental(base, k)

NameError: name 'experimental' is not defined

### Results

In [None]:
import pickle

# Load the pickled results for one dataset. The following cells read
# ans[<model name>]['test_score'], so the file is expected to hold a dict of
# per-model result dicts.
# SECURITY: pickle.load can execute arbitrary code; only open files that
# were produced by this notebook.
filename = "iris"
# Use a context manager so the file handle is closed deterministically
# (the original left the handle returned by open() unclosed).
with open(filename, "rb") as fh:
    ans = pickle.load(fh)

In [None]:
# One column per model, one row per cross-validation fold, holding the
# 'test_score' array of each result entry.
DF = pd.DataFrame({name: result['test_score'] for name, result in ans.items()})
DF.head()

In [None]:
# Per-model summary: mean, standard deviation and the 95% Student-t
# confidence interval of the scores in each column of DF.
# st.t.interval returns a (lower, upper) pair: index 0 is the lower bound
# and index 1 the upper bound. The original used [0] for both columns, so
# 'confit_sup' repeated the lower bound.
pd.DataFrame({'mean': DF.mean(), 'std': DF.std(),
              'confit_inf': DF.apply(lambda x: st.t.interval(0.95, len(x)-1, loc=np.mean(x), scale=st.sem(x))[0]),
              'confit_sup': DF.apply(lambda x: st.t.interval(0.95, len(x)-1, loc=np.mean(x), scale=st.sem(x))[1])})

In [None]:
# Box plot of the score distribution of every model (one box per DF column).
fig, ax = plt.subplots()
sns.boxplot(data=DF, ax=ax)
# Rotate the model names so long labels do not overlap.
ax.tick_params(axis='x', labelrotation=90)
ax.set_ylabel('Value')
fig.tight_layout()
plt.show()

In [None]:
import scipy.stats as stats

# Pairwise significance matrix between the models (columns of DF):
#   * above the diagonal: p-value of the paired t-test,
#   * below the diagonal: p-value of the Wilcoxon signed-rank test,
#   * -1 marks cells that were not computed (the diagonal, and Wilcoxon
#     cells whose two score vectors are identical).
# Size the matrix from DF, since DF is what gets indexed and labelled below.
n_models = DF.shape[1]
M = [[-1] * n_models for _ in range(n_models)]

for i in range(n_models):
    for j in range(i + 1, n_models):
        scores_i, scores_j = DF.iloc[:, i], DF.iloc[:, j]
        # Calculate the t-test on TWO RELATED samples of scores.
        M[i][j] = stats.ttest_rel(scores_i, scores_j).pvalue
        # Only run the Wilcoxon test when at least one paired difference is
        # non-zero; otherwise the cell keeps its -1 marker.
        if (scores_i - scores_j).ne(0).any():
            # Calculate the Wilcoxon signed-rank test.
            M[j][i] = stats.wilcoxon(scores_i, scores_j).pvalue

# Decision rule: reject the null hypothesis (no difference between the two
# models) when the p-value is LESS than 0.05.
pd.DataFrame(M, index=DF.columns, columns=DF.columns)