# Imports, etc.

In [None]:
from AugBoost import AugBoostClassifier as ABC
from AugBoost import AugBoostRegressor as ABR

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import log_loss

import pandas as pd
import numpy as np

import pickle
import datetime
import gc

In [None]:
# Input and output directories, relative to the notebook's working directory.
# Both keep their trailing slash because file names are appended to them by
# plain string concatenation further down.
result_path = './results/'
data_path = './data/'

# Classification experiments

In [None]:
# Read the pre-processed classification datasets from disk.
# NOTE: pickle executes arbitrary code on load, so only open files you trust.
with open(''.join((data_path, 'classification_data_processed.pkl')), mode='rb') as f:
    classification_datasets = pickle.load(f)

In [None]:
# Tag embedded in the name of the results file written by the experiment loop.
experiment_details = 'classification_nn_150_estimators_3_subsets_10_trees-between-updates'
# Shuffled 3-fold splitter. No random_state is passed, so the fold assignment
# is presumably different on every run (the test indices are stored with the
# results instead).
kf = KFold(shuffle=True, n_splits=3)

In [None]:
# Cross-validate an AugBoost classifier on every classification dataset.
# For each fold we record (validation log-loss, validation row positions,
# training time in seconds); after each dataset the accumulated list of
# (dataset index, dataset name, per-fold records) is re-written to disk so a
# crash part-way through does not lose the finished datasets.
classification_datasets_docs = []
prev_time = datetime.datetime.now()
for i, dataset in enumerate(classification_datasets):
    # Unpacks the three values of the dataset mapping in their stored order.
    X, y, dataset_name = dataset.values()
    print('**********', 'Dataset: ', dataset_name, '**********')
    folds_docs = []
    for train_index, test_index in kf.split(X):
        print('~~~ new fold ~~~')
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_val, y_val = X.iloc[test_index], y.iloc[test_index]

        # Time only model construction + fitting.
        prev_time = datetime.datetime.now()
        model = ABC(
            n_estimators=150,
            max_epochs=1000,
            learning_rate=0.1,
            # A third of the columns per subset (the "3_subsets" in the tag).
            n_features_per_subset=round(len(X_train.columns) / 3),
            trees_between_feature_update=10,
            augmentation_method='nn',
            save_mid_experiment_accuracy_results=False,
        )
        model.fit(X=X_train, y=y_train, X_val=X_val, y_val=y_val)
        new_time = datetime.datetime.now()
        training_secs = (new_time - prev_time).total_seconds()

        # Labels come from the full target so every class is known to the
        # metric even if a class is absent from this validation fold.
        score = log_loss(
            y_val,
            model.predict_proba(X_val),
            labels=y.unique(),
        )
        folds_docs.append((score, test_index, training_secs))

        # Drop the fitted model before the next fold to keep memory down.
        del model
        gc.collect()

    classification_datasets_docs.append((i, dataset_name, folds_docs))
    # Checkpoint everything gathered so far.
    with open(''.join([result_path, 'datasets_docs_', experiment_details, '.pkl']), 'wb') as f:
        pickle.dump(classification_datasets_docs, f)

In [None]:
# Summarise the classification experiment: one row per dataset holding the
# mean and standard deviation of the per-fold validation log-loss.
# Each entry of classification_datasets_docs is (index, name, folds), and each
# fold record starts with its score.
results = []
for _, name, folds in classification_datasets_docs:
    fold_losses = [fold[0] for fold in folds]
    results.append([name, np.mean(fold_losses), np.std(fold_losses)])
# Passing the column names to the constructor (rather than assigning
# .columns afterwards) also yields a correctly labelled empty table when no
# dataset was processed, instead of raising a length-mismatch error.
results = pd.DataFrame(
    results, columns=['dataset', 'mean (log-loss)', 'std (log-loss)']
)

In [None]:
# Bare expression: shows the classification summary table built in the
# previous cell (presumably rendered as the notebook cell's output).
results

# Regression experiments

In [None]:
# Read the pre-processed regression datasets from disk.
# NOTE: pickle executes arbitrary code on load, so only open files you trust.
with open(''.join((data_path, 'regression_data_processed.pkl')), mode='rb') as f:
    regression_datasets = pickle.load(f)

In [None]:
# Tag embedded in the name of the results file written by the experiment loop.
experiment_details = 'regression_nn_150_estimators_3_subsets_10_trees-between-updates'
# Shuffled 3-fold splitter. No random_state is passed, so the fold assignment
# is presumably different on every run (the test indices are stored with the
# results instead).
kf = KFold(shuffle=True, n_splits=3)

In [None]:
regression_datasets_docs = []
prev_time = datetime.datetime.now()
for i, dataset in enumerate(regression_datasets):
    X, y, dataset_name = dataset.values()
    print('**********', 'Dataset: ', dataset_name, '**********')
    folds_docs = []
    for train_index, test_index in kf.split(X):
        print('~~~ new fold ~~~')
        X_train, X_val = X.iloc[train_index], X.iloc[test_index]
        y_train, y_val = y.iloc[train_index], y.iloc[test_index]
        prev_time = datetime.datetime.now()
        model = ABR(n_estimators = 150, max_epochs = 1000, learning_rate = 0.1, \
            n_features_per_subset = round(len(X_train.columns)/3), trees_between_feature_update = 10,\
            augmentation_method = 'nn', save_mid_experiment_accuracy_results = False)
        model.fit(X = X_train, y = y_train, X_val = X_val, y_val = y_val) 
        new_time = datetime.datetime.now()
        training_secs = (new_time - prev_time).total_seconds()
        score = log_loss(y_val, model.predict_proba(X_val), labels = y.unique())
        folds_docs.append((score, test_index, training_secs))
        del model
        gc.collect()
    regression_datasets_docs.append((i, dataset_name, folds_docs))
    with open(result_path + 'datasets_docs_' + experiment_details + '.pkl', 'wb') as f:
        pickle.dump(regression_datasets_docs, f)

In [None]:
results = []
for dataset in regression_datasets_docs:
    temp_losses = []
    for j in range(len(dataset[2])):
        temp_losses.append(dataset[2][j][0])
    results.append([dataset[1], np.mean(temp_losses), np.std(temp_losses)])
results = pd.DataFrame(results)
results.columns = ['dataset', 'mean (log-loss)', 'std (log-loss)']

In [None]:
# Bare expression: shows the regression summary table built in the previous
# cell (presumably rendered as the notebook cell's output).
results