In [1]:
from mlaut.data import Data
import pydataset
from mlaut.experiments import Orchestrator
from mlaut.estimators.baseline_estimators import Baseline_Classifier
from mlaut.estimators.bayes_estimators import Gaussian_Naive_Bayes
from mlaut.estimators.decision_trees import Decision_Tree_Classifier
from mlaut.resampling import Single_Split
from sklearn.model_selection import train_test_split
from mlaut.experiments.data import DatasetHDF5
from mlaut.experiments.data import ResultHDF5

from sklearn import preprocessing

import os
import shutil


In [2]:
from mlaut.experiments.analysis import AnalyseResults
from mlaut.experiments.scores import ScoreAccuracy


In [3]:
# Create the scratch directory that holds the HDF5 files used below.
# exist_ok=True keeps this cell re-runnable: a plain os.mkdir raises
# FileExistsError when 'data' is left over from a run that stopped before
# the final cleanup cell.
os.makedirs('data', exist_ok=True)

### Get the data and organise it

In [4]:
# Fetch the two example datasets from pydataset.
aids = pydataset.data('aids')
uis = pydataset.data('uis')

# Metadata passed alongside each dataset: target column name, source label
# and dataset name (presumably the name it is stored under -- the next cell
# reads 'pydata/aids' and 'pydata/uis').
aids_meta = dict(target='adult', source='pydataset', dataset_name='aids')
uis_meta = dict(target='IV3', source='pydataset', dataset_name='uis')

# Datasets and their metadata are handed over as parallel lists (same order).
datasets = [aids, uis]
metadata = [aids_meta, uis_meta]

# Point the Data helper at the input/output HDF5 files and write the
# datasets using the 'pydata' group.
data = Data(hdf5_datasets_group='pydata')
data.set_io(
    input_data='data/test_input.h5',
    output_data='data/test_output.h5',
)
data.pandas_to_db(datasets=datasets, dts_metadata=metadata)

### Orchestrate the experiments

In [5]:
# Handles to the two datasets inside the input HDF5 file.
aids_data = DatasetHDF5(
    hdf5_path='data/test_input.h5',
    dataset_path='pydata/aids',
)
uis_data = DatasetHDF5(
    hdf5_path='data/test_input.h5',
    dataset_path='pydata/uis',
)
datasets = [aids_data, uis_data]

# Resampling scheme: a single split that delegates to sklearn's
# train_test_split.
cv = Single_Split(cv=train_test_split)

# Estimators to compare on every dataset.
strategies = [
    Baseline_Classifier(),
    Decision_Tree_Classifier(),
    Gaussian_Naive_Bayes(),
]

# Destination for predictions and trained estimators.
result = ResultHDF5(
    hdf5_path='data/test_result.h5',
    predictions_save_path='predictions',
    trained_strategies_save_path='data/trained_estimators',
)

# Wire everything together and run all strategy/dataset combinations.
orchestrator = Orchestrator(
    datasets=datasets,
    strategies=strategies,
    cv=cv,
    result=result,
)
orchestrator.run()


INFO:root:fitting: {'estimator_family': ['Baseline'], 'tasks': ['Classification'], 'name': 'DummyClassifier'} on dataset: aids
INFO:root:fitting: {'estimator_family': ['Decision_Tree'], 'tasks': ['Classification'], 'name': 'DecisionTreeClassifier'} on dataset: aids
INFO:root:fitting: {'estimator_family': ['Naive_Bayes'], 'tasks': ['Classification'], 'name': 'GaussianNaiveBayes'} on dataset: aids
INFO:root:fitting: {'estimator_family': ['Baseline'], 'tasks': ['Classification'], 'name': 'DummyClassifier'} on dataset: uis
INFO:root:fitting: {'estimator_family': ['Decision_Tree'], 'tasks': ['Classification'], 'name': 'DecisionTreeClassifier'} on dataset: uis
INFO:root:fitting: {'estimator_family': ['Naive_Bayes'], 'tasks': ['Classification'], 'name': 'GaussianNaiveBayes'} on dataset: uis


### Analyse the results of the experiments

In [6]:
# Build the analysis helper on top of the results object produced by the
# orchestrator cell.
analyse = AnalyseResults(result)
# Accuracy is the score used to evaluate the predictions.
score_accuracy = ScoreAccuracy()
# prediction_errors returns two objects; judging by the names, a dict and a
# pandas object -- TODO confirm against the mlaut documentation.
loss_dict, loss_pd = analyse.prediction_errors(score_accuracy)


#### t-test

In [7]:
# Run the t-test on the losses computed in the previous cell. Two objects are
# returned; the second one is displayed as the cell output.
t_test, t_test_df = analyse.t_test(loss_dict)
# Bare last expression so the notebook renders the table.
t_test_df

Unnamed: 0_level_0,DecisionTreeClassifier,DecisionTreeClassifier,DummyClassifier,DummyClassifier,GaussianNaiveBayes,GaussianNaiveBayes
Unnamed: 0_level_1,t_stat,p_val,t_stat,p_val,t_stat,p_val
DecisionTreeClassifier,0.0,1.0,1.682613,0.234479,-0.393935,0.731662
DummyClassifier,-1.682613,0.234479,0.0,1.0,-1.765798,0.219471
GaussianNaiveBayes,0.393935,0.731662,1.765798,0.219471,0.0,1.0


Other available statistical tests that can be performed include:

* `mlaut.experiments.analysis.ranks()`
* `mlaut.experiments.analysis.t_test()`
* `mlaut.experiments.analysis.sign_test()`
* `mlaut.experiments.analysis.ranksum_test()`
* `mlaut.experiments.analysis.t_test_with_bonferroni_correction()`
* `mlaut.experiments.analysis.wilcoxon_test()`
* `mlaut.experiments.analysis.friedman_test()`
* `mlaut.experiments.analysis.nemenyi()`


In [8]:
# Clean up: remove the scratch 'data' directory and everything written into it.
shutil.rmtree('data')
