In [None]:
# Load IPython's autoreload extension; the reload mode is selected by the `%autoreload 2` cell below.
%load_ext autoreload

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import const
import utils
import utils.models_utils as models

In [None]:
# Mode 2: reload imported modules before each cell executes, so edits to the
# local `utils` / `const` modules are picked up without restarting the kernel.
%autoreload 2

In [None]:
# Load the dataset. NOTE(review): the meaning of the argument 10 is not visible
# in this notebook — confirm against utils.load_dataset.
dataset = utils.load_dataset(10)


def _fit_scaler_pair(x_train):
    """Fit a fresh StandardScaler and a fresh MinMaxScaler on ``x_train``.

    Returns the two fitted scalers as ``(standard_scaler, minmax_scaler)``.
    """
    return StandardScaler().fit(x_train), MinMaxScaler().fit(x_train)


# One (standard, minmax) scaler pair per feature set, each fitted on that
# set's x_train_* data only.
base_set_standard_scaler, base_set_minmax_scaler = _fit_scaler_pair(dataset.x_train_base)
complete_set_standard_scaler, complete_set_minmax_scaler = _fit_scaler_pair(dataset.x_train_complete)
sub_set_standard_scaler, sub_set_minmax_scaler = _fit_scaler_pair(dataset.x_train_subset)


In [None]:
# Hyper-parameter grid for KNeighborsRegressor: candidate neighbourhood sizes.
knn_params = dict(
    n_neighbors=[10, 20, 25, 30, 50, 60, 100, 150, 250],
)

In [None]:
# KNN on the base feature set. Both fitted scalers are handed to train_models
# together with the unscaled train/test data.
# NOTE: `base_results` is rebound by the random forest and SGD sections further
# down, so the cells must be run in order.
base_results = models.train_models(
    KNeighborsRegressor(),
    dataset.x_train_base,
    dataset.y_train,
    dataset.x_test_base,
    dataset.y_test,
    knn_params,
    base_set_standard_scaler,
    base_set_minmax_scaler,
)
# Tag every row with its feature set so the per-set frames can be combined later.
base_results['set'] = 'base'

base_results

In [None]:
# KNN on the subset feature set, using the scalers fitted on x_train_subset.
sub_results = models.train_models(
    KNeighborsRegressor(),
    dataset.x_train_subset,
    dataset.y_train,
    dataset.x_test_subset,
    dataset.y_test,
    knn_params,
    sub_set_standard_scaler,
    sub_set_minmax_scaler,
)
# Tag every row with its feature set so the per-set frames can be combined later.
sub_results['set'] = 'sub'

sub_results

In [None]:
# KNN on the complete feature set, using the scalers fitted on x_train_complete.
complete_results = models.train_models(
    KNeighborsRegressor(),
    dataset.x_train_complete,
    dataset.y_train,
    dataset.x_test_complete,
    dataset.y_test,
    knn_params,
    complete_set_standard_scaler,
    complete_set_minmax_scaler,
)
# Tag every row with its feature set so the per-set frames can be combined later.
complete_results['set'] = 'complete'

complete_results

In [None]:
# Stack the three per-set KNN result frames; the original row indices are kept.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
knn_results = pd.concat([base_results, sub_results, complete_results])
knn_results['model'] = 'knn'

# Compare test r2 per feature set, one hue per preprocessing variant.
utils.plot_scores(x='set', y='r2_test', hue='preprocessing', data=knn_results, title='knn scores')

In [None]:
# Hyper-parameter grid for RandomForestRegressor: candidate forest sizes.
rf_params = dict(
    n_estimators=[200, 300, 500],
)

In [None]:
# Random forest on the base feature set, with the scalers fitted on x_train_base.
# A fixed random_state makes the forest's bootstrap and feature sampling
# repeatable, so the reported scores survive a kernel restart and full re-run.
base_results = models.train_models(
    RandomForestRegressor(random_state=0),
    dataset.x_train_base,
    dataset.y_train,
    dataset.x_test_base,
    dataset.y_test,
    rf_params,
    base_set_standard_scaler,
    base_set_minmax_scaler,
)
# Tag every row with its feature set so the per-set frames can be combined later.
base_results['set'] = 'base'

base_results

In [None]:
# Random forest on the subset feature set, with the scalers fitted on x_train_subset.
# A fixed random_state makes the forest's bootstrap and feature sampling
# repeatable, so the reported scores survive a kernel restart and full re-run.
sub_results = models.train_models(
    RandomForestRegressor(random_state=0),
    dataset.x_train_subset,
    dataset.y_train,
    dataset.x_test_subset,
    dataset.y_test,
    rf_params,
    sub_set_standard_scaler,
    sub_set_minmax_scaler,
)
# Tag every row with its feature set so the per-set frames can be combined later.
sub_results['set'] = 'sub'

sub_results

In [None]:
# Random forest on the complete feature set, with the scalers fitted on x_train_complete.
# A fixed random_state makes the forest's bootstrap and feature sampling
# repeatable, so the reported scores survive a kernel restart and full re-run.
complete_results = models.train_models(
    RandomForestRegressor(random_state=0),
    dataset.x_train_complete,
    dataset.y_train,
    dataset.x_test_complete,
    dataset.y_test,
    rf_params,
    complete_set_standard_scaler,
    complete_set_minmax_scaler,
)
# Tag every row with its feature set so the per-set frames can be combined later.
complete_results['set'] = 'complete'

complete_results

In [None]:
# Stack the three per-set random forest result frames; row indices are kept.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
rf_results = pd.concat([base_results, sub_results, complete_results])
rf_results['model'] = 'rf'

# Compare test r2 per feature set, one hue per preprocessing variant.
utils.plot_scores(x='set', y='r2_test', hue='preprocessing', data=rf_results, title='random forest scores')

In [None]:
# Hyper-parameter grid for SGDRegressor: early stopping on/off and
# regularisation strengths 10^0 down to 10^-6.
sgd_params = dict(
    early_stopping=[True, False],
    alpha=[10.0 ** -exponent for exponent in range(7)],
)

In [None]:
# SGD regression on the base feature set, with the scalers fitted on x_train_base.
# A fixed random_state pins the data shuffling (and the validation split used
# when early_stopping=True), so the scores are reproducible across re-runs.
base_results = models.train_models(
    SGDRegressor(random_state=0),
    dataset.x_train_base,
    dataset.y_train,
    dataset.x_test_base,
    dataset.y_test,
    sgd_params,
    base_set_standard_scaler,
    base_set_minmax_scaler,
)
# Tag every row with its feature set so the per-set frames can be combined later.
base_results['set'] = 'base'

base_results

In [None]:
# SGD regression on the subset feature set, with the scalers fitted on x_train_subset.
# A fixed random_state pins the data shuffling (and the validation split used
# when early_stopping=True), so the scores are reproducible across re-runs.
sub_results = models.train_models(
    SGDRegressor(random_state=0),
    dataset.x_train_subset,
    dataset.y_train,
    dataset.x_test_subset,
    dataset.y_test,
    sgd_params,
    sub_set_standard_scaler,
    sub_set_minmax_scaler,
)
# Tag every row with its feature set so the per-set frames can be combined later.
sub_results['set'] = 'sub'

sub_results

In [None]:
# SGD regression on the complete feature set, with the scalers fitted on x_train_complete.
# A fixed random_state pins the data shuffling (and the validation split used
# when early_stopping=True), so the scores are reproducible across re-runs.
complete_results = models.train_models(
    SGDRegressor(random_state=0),
    dataset.x_train_complete,
    dataset.y_train,
    dataset.x_test_complete,
    dataset.y_test,
    sgd_params,
    complete_set_standard_scaler,
    complete_set_minmax_scaler,
)
# Tag every row with its feature set so the per-set frames can be combined later.
complete_results['set'] = 'complete'

complete_results

In [None]:
# Stack the three per-set SGD result frames; the original row indices are kept.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
sgd_results = pd.concat([base_results, sub_results, complete_results])
sgd_results['model'] = 'sgd'

# Rows whose preprocessing is 'no' are left out of the plot only;
# sgd_results itself keeps them.
utils.plot_scores(
    x='set',
    y='r2_test',
    hue='preprocessing',
    data=sgd_results[sgd_results['preprocessing'] != 'no'],
    title='sgd scores',
)

In [None]:
# Neural nets on the base feature set; the base min-max scaler is passed to
# utils.get_nn_dataset. Each grid entry lists the candidate values forwarded
# to models.train_neural_nets.
nn_params = dict(
    hidden_sizes=[5, 10, 20, 50, 100],
    nums_layers=[1, 2],
    num_epochs=[500, 1000],
    batch_sizes=[512],
    learning_rates=[0.1],
    gamma=[0.05],  # lr decay
    dropout=[0, 0.2],
)

train_data_nn, test_data_nn = utils.get_nn_dataset(
    dataset.x_train_base,
    dataset.x_test_base,
    dataset.y_train,
    dataset.y_test,
    scaler=base_set_minmax_scaler,
)

base_nn_results = models.train_neural_nets(train_data_nn, test_data_nn, nn_params)
# Tag the rows so they can be combined with the other sets' results later.
base_nn_results['set'] = "base"
base_nn_results['preprocessing'] = 'minmax'

# Hand the full result table to utils.res_to_csv before showing only the top rows.
utils.res_to_csv(base_nn_results, f'{const.csv_results_folder}base_nn_results')

# Display the three rows with the highest r2_test.
base_nn_results.sort_values('r2_test', ascending=False).head(3)


In [None]:
# Neural nets on the subset feature set; the subset min-max scaler is passed
# to utils.get_nn_dataset. Each grid entry lists the candidate values
# forwarded to models.train_neural_nets.
nn_params = dict(
    hidden_sizes=[15, 20, 50, 100, 200],
    nums_layers=[1, 2],
    num_epochs=[500, 1000],
    batch_sizes=[512],
    learning_rates=[0.1],
    gamma=[0.05],  # lr decay
    dropout=[0, 0.2],
)

train_data_nn, test_data_nn = utils.get_nn_dataset(
    dataset.x_train_subset,
    dataset.x_test_subset,
    dataset.y_train,
    dataset.y_test,
    scaler=sub_set_minmax_scaler,
)

sub_nn_results = models.train_neural_nets(train_data_nn, test_data_nn, nn_params)
# Tag the rows so they can be combined with the other sets' results later.
sub_nn_results['set'] = "sub"
sub_nn_results['preprocessing'] = 'minmax'

# Hand the full result table to utils.res_to_csv before showing only the top rows.
utils.res_to_csv(sub_nn_results, f'{const.csv_results_folder}subset_nn_results')

# Display the three rows with the highest r2_test.
sub_nn_results.sort_values('r2_test', ascending=False).head(3)


In [None]:
# Neural nets on the complete feature set; the complete-set min-max scaler is
# passed to utils.get_nn_dataset. Each grid entry lists the candidate values
# forwarded to models.train_neural_nets.
nn_params = dict(
    hidden_sizes=[20, 50, 100, 200],
    nums_layers=[1, 2],
    num_epochs=[500, 1000],
    batch_sizes=[512],
    learning_rates=[0.1],
    gamma=[0.05],  # lr decay
    dropout=[0, 0.2],
)

train_data_nn, test_data_nn = utils.get_nn_dataset(
    dataset.x_train_complete,
    dataset.x_test_complete,
    dataset.y_train,
    dataset.y_test,
    scaler=complete_set_minmax_scaler,
)

complete_nn_results = models.train_neural_nets(train_data_nn, test_data_nn, nn_params)
# Tag the rows so they can be combined with the other sets' results later.
complete_nn_results['set'] = "complete"
complete_nn_results['preprocessing'] = 'minmax'

# Hand the full result table to utils.res_to_csv before showing only the top rows.
utils.res_to_csv(complete_nn_results, f'{const.csv_results_folder}complete_nn_results')

# Display the three rows with the highest r2_test.
complete_nn_results.sort_values('r2_test', ascending=False).head(3)

In [None]:
# Stack the per-set neural-net results and order them best-first by test r2.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
nn_results = pd.concat(
    [base_nn_results, sub_nn_results, complete_nn_results]
).sort_values('r2_test', ascending=False)
nn_results['model'] = 'nn'

# The frame is already sorted best-first, so head(1) per group keeps the
# top-scoring row for every (set, preprocessing) combination.
nn_best_results = nn_results.groupby(['set', 'preprocessing']).head(1)

utils.plot_scores('set', 'r2_test', 'preprocessing', nn_best_results, "best neural net scores")

nn_best_results

In [None]:
# Combine the results of every model family into one frame.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
total_results = pd.concat([knn_results, rf_results, sgd_results, nn_best_results])

# Sort best-first by test r2, then keep the first (best) row of each model.
total_best_results = (
    total_results
    .sort_values(by=['r2_test'], ascending=False)
    .groupby('model')
    .head(1)
)

# NOTE(review): the title 'mh' looks like a placeholder — confirm the intended wording.
utils.plot_scores('model', 'r2_test', None, total_best_results, 'mh')

total_best_results