# Primeiro Trabalho de Inteligência Artificial e Sistemas Inteligentes

Rebeca Cecco de Oliveira

## Load packages

In [1]:
import time
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as st
import pickle
from sklearn import datasets
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate, GridSearchCV, RepeatedStratifiedKFold
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import resample

## Implementation of Heterogeneous Pooling

In [12]:
class HeterogenousPoolingClassifier(BaseEstimator, ClassifierMixin):
    """Majority-vote ensemble of Gaussian naive Bayes, decision tree and k-NN.

    ``fit`` trains ``n_samples`` rounds of the three base classifiers. Round 0
    uses the training data as given; every later round uses a bootstrap
    resample seeded with ``round - 1``. ``predict`` returns, for each row, the
    class that received the most votes; ties are resolved in favour of the
    class that was most frequent in the training data.

    Parameters
    ----------
    n_samples : int, default=3
        Number of training rounds; the ensemble holds ``3 * n_samples``
        fitted base classifiers.
    """

    def __init__(self, n_samples=3):
        self.n_samples = n_samples

    def fit(self, X, y):
        """Fit every base classifier and record the class frequencies.

        Parameters
        ----------
        X : array-like
            Training features, validated with ``check_X_y``.
        y : array-like
            Training labels.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``n_samples`` is smaller than 1.
        """
        # Check that X and y have correct shape
        X, y = check_X_y(X, y)
        if self.n_samples < 1:
            raise ValueError(
                'n_samples must be at least 1, got {!r}'.format(self.n_samples))

        # Store the classes seen during fit together with their frequencies
        self.classes_, counts = np.unique(y, return_counts=True)
        self.n_classes_ = len(self.classes_)

        self.X_ = X
        self.y_ = y

        # Classes ordered from most to least frequent. The stable sort keeps
        # np.unique's ascending label order among equally frequent classes,
        # so tie-breaking is deterministic.
        order = np.argsort(-counts, kind='stable')
        self._priority = self.classes_[order]
        self._classes = pd.DataFrame({'values': self._priority,
                                      'counts': counts[order]})

        self._clfs = {
            'GaussianNB': [GaussianNB() for _ in range(self.n_samples)],
            'DecisionTreeClassifier': [DecisionTreeClassifier() for _ in range(self.n_samples)],
            'KNeighborsClassifier': [KNeighborsClassifier() for _ in range(self.n_samples)],
        }

        for sample in range(self.n_samples):
            if sample == 0:
                # The first round always sees the original training set.
                X_train, y_train = X, y
            else:
                X_train, y_train = resample(X, y, random_state=sample - 1)

            for round_clfs in self._clfs.values():
                round_clfs[sample].fit(X_train, y_train)

        return self

    def predict(self, X, y=None):
        """Predict the majority-vote class for every row of ``X``.

        Parameters
        ----------
        X : array-like
            Samples to classify, validated with ``check_array``.
        y : ignored
            Accepted only so existing ``predict(X, y)`` calls keep working;
            callers such as ``GridSearchCV`` invoke ``predict(X)``.

        Returns
        -------
        numpy.ndarray
            One label per row, drawn from ``classes_``.
        """
        # Check if fit has been called
        check_is_fitted(self)

        # Input validation
        X = check_array(X)

        # One column per fitted base classifier.
        predictions = np.column_stack([
            clf.predict(X)
            for round_clfs in self._clfs.values()
            for clf in round_clfs
        ])

        # votes[r, k] = number of classifiers that chose self._priority[k]
        # for row r. Columns follow the frequency order, and argmax returns
        # the first maximum, so a tie goes to the most frequent training class.
        votes = np.column_stack([
            (predictions == label).sum(axis=1) for label in self._priority
        ])
        return self._priority[votes.argmax(axis=1)]

In [6]:
# Smoke test: fit the ensemble with three rounds and predict on the same data.
# fit() returns the estimator itself, so construction and fitting are chained.
cls = HeterogenousPoolingClassifier(3).fit(X, y)
cls.predict(X, y)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

## Search parameters

In [13]:
# Dictionary of candidate models, keyed by display name.
models = {}
models['Random Forest'] = RandomForestClassifier()
models['Bagging'] = BaggingClassifier()
models['AdaBoost'] = AdaBoostClassifier()
models['HeterogenousPooling'] = HeterogenousPoolingClassifier()

# Hyper-parameter grids given by the professor in the assignment specification.
# The three sklearn ensembles share the same n_estimators grid; each gets its
# own list object.
parameters = {
    name: {'n_estimators': [10, 25, 50, 100]}
    for name in ('Random Forest', 'Bagging', 'AdaBoost')
}
parameters['HeterogenousPooling'] = {'n_samples': [1, 3, 5, 7]}

## Experimental

In [9]:
def experimental(base, filename):
    """Run the nested cross-validation experiment for every model on one dataset.

    For each entry of ``models`` a ``GridSearchCV`` (3 x 4-fold) tunes the
    grid in ``parameters``; that search, preceded by a ``StandardScaler``, is
    scored with 3 x 10-fold ``cross_validate`` on accuracy. The elapsed CPU
    time per model is printed.

    Parameters
    ----------
    base : object with ``data`` and ``target`` attributes
        The dataset to evaluate (e.g. a ``sklearn.datasets.load_*`` result).
    filename : str
        Label printed for the run and path the results are pickled to.

    Returns
    -------
    dict
        Maps each model name to its ``cross_validate`` result dictionary.
    """
    # Use the dataset that was passed in, not a notebook-level global.
    X, y = base.data, base.target
    print()
    print(filename)
    ans = {}
    for c in ['Random Forest', 'Bagging', 'AdaBoost', 'HeterogenousPooling']:
        start = time.process_time()
        # Inner loop: hyper-parameter search.
        search = GridSearchCV(
            models[c], param_grid=parameters[c], scoring='accuracy',
            cv=RepeatedStratifiedKFold(n_splits=4, n_repeats=3, random_state=36851234))
        # cross_validate clones and refits the pipeline on every outer fold,
        # so fitting the search on the full data beforehand is wasted work,
        # and a single scaler is enough.
        pipeline = Pipeline([('transformer', StandardScaler()), ('estimator', search)])
        # Outer loop: unbiased estimate of the tuned model's accuracy.
        ans[c] = cross_validate(
            pipeline, X, y,
            cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=36851234),
            scoring='accuracy')
        print('Elapsed time of {} is {:.6f} seconds.'.format(c, time.process_time() - start))

    # Persist the scores so the results cells can reload them by name.
    with open(filename, 'wb') as fh:
        pickle.dump(ans, fh)
    return ans



### Execution

In [None]:
# Load the three benchmark datasets shipped with scikit-learn.
digits = datasets.load_digits()
wine = datasets.load_wine()
cancer = datasets.load_breast_cancer()

# Run the experiment once per dataset, in this fixed order; the name doubles
# as the `filename` argument of experimental().
for k, base in (('digits', digits), ('wine', wine), ('cancer', cancer)):
    experimental(base, k)

### Results

In [None]:
import pickle

# Name of the pickled results file to analyse.
filename = "iris"
# NOTE(review): pickle executes arbitrary code on load; only open files this
# notebook produced itself.
# The context manager closes the file handle once the results are read.
with open(filename, "rb") as fh:
    ans = pickle.load(fh)

In [None]:
# One column of test scores per model, in the order the models appear in `ans`.
DF = pd.DataFrame({model: ans[model]['test_score'] for model in ans})
DF.head()

In [None]:
# Per-model summary: mean, standard deviation and the 95% Student-t confidence
# interval of the mean score. st.t.interval returns (lower, upper), so the
# upper bound is element [1].
pd.DataFrame({'mean': DF.mean(), 'std': DF.std(),
              'confit_inf': DF.apply(lambda x: st.t.interval(0.95, len(x)-1, loc=np.mean(x), scale=st.sem(x))[0]),
              'confit_sup': DF.apply(lambda x: st.t.interval(0.95, len(x)-1, loc=np.mean(x), scale=st.sem(x))[1])})

In [None]:
# Box plot of the score distribution of every model.
plt.figure()
ax = sns.boxplot(data=DF)
# Rotate the model names so long labels do not overlap.
plt.xticks(rotation=90)
ax.set_ylabel('Value')
plt.tight_layout()
plt.show()

In [None]:
import scipy.stats as stats
from itertools import combinations

# Pairwise significance matrix over the models (columns of DF):
#   upper triangle -> p-value of the paired t-test,
#   lower triangle -> p-value of the Wilcoxon signed-rank test,
#   -1             -> no test computed (diagonal, or Wilcoxon skipped).
# Sized from DF itself so the matrix always matches the columns it indexes.
n_models = DF.shape[1]
M = [[-1] * n_models for _ in range(n_models)]

for i, j in combinations(range(n_models), 2):
    scores_i, scores_j = DF.iloc[:, i], DF.iloc[:, j]
    # Calculate the t-test on TWO RELATED samples of scores, a and b.
    M[i][j] = stats.ttest_rel(scores_i, scores_j).pvalue
    # Skip Wilcoxon when the two score columns are identical.
    if ((scores_i - scores_j) != 0).any():
        # Calculate the Wilcoxon signed-rank test.
        M[j][i] = stats.wilcoxon(scores_i, scores_j).pvalue
pd.DataFrame(M, index=DF.columns, columns=DF.columns)
# Reject the null hypothesis (no difference between the two models) when the
# p-value is less than 0.05.