# Results Analysis

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Directory prefix prepended to every "<name>_results.csv" file that is loaded.
path = './results/'

# Classic ML model identifiers; they name the result files and are matched
# against the 'model_type' column.
ml_models = [
    "knn",
    "rf",
    "sgd",
]

# Dataset variants, matched against the 'dataset_type' column.
subset_types = [
    "base",
    "complete",
    "sub",
]

# Preprocessing variants, used as group labels in the train/test bar plots.
preprocessing_types = [
    "no_scaling",
    "standard_scaling",
    "minmax_scaling",
]

# dpi = 100

# init useful functions
def grouped_bar_plot(x, y, hue, data, model_name, legend_pos='lower right'):
    """Draw a seaborn grouped bar chart of one score column.

    Args:
        x: Name of the column in ``data`` placed on the x axis.
        y: Name of the column in ``data`` whose values set the bar heights.
        hue: Name of the column in ``data`` used to split each x group into
            coloured bars.
        data: Table passed straight to ``sns.barplot``.
        model_name: Human-readable model name, used in the plot title.
        legend_pos: Matplotlib legend location string.
    """
    sns.set_style(style='darkgrid')
    sns.barplot(data=data, x=x, y=y, hue=hue)
    plt.legend(loc=legend_pos)

    # Column names use underscores; show them as plain words on the axes.
    x_label = x.replace('_', ' ')
    y_label = f"{y.replace('_', ' ')} value"

    plt.title(f"{model_name} scores")
    plt.xlabel(x_label)
    plt.ylabel(y_label)

def train_test_bar_plot(train_values, test_values, groups, score_type, title):
    """Draw paired train/test bars, one pair per entry in ``groups``.

    Args:
        train_values: Bar heights for the train bars, one per group.
        test_values: Bar heights for the test bars, one per group.
        groups: Tick labels; its length sets how many bar pairs are drawn.
        score_type: Text for the y-axis label.
        title: Text for the plot title.
    """
    bar_width = 0.20

    # One slot every 0.5 x-units, as many slots as there are groups.
    slots = np.arange(0, len(groups)/2, 0.5)

    figure = plt.figure()
    # Axes rectangle covering the whole figure area.
    axes = figure.add_axes([0, 0, 1, 1])

    # Train bar sits on the slot, test bar immediately to its right.
    axes.bar(slots + 0.00, train_values, width=bar_width)
    axes.bar(slots + 0.20, test_values, width=bar_width)
    axes.legend(labels=['Train', 'Test'])

    # Centre each tick between the two bars of its pair.
    plt.xticks(slots + 0.10, groups)
    plt.xlabel('Preprocessing type')
    plt.ylabel(score_type)
    plt.title(title)


## Classic machine learning models


In [None]:
# Load the result file of every classic ML model and stack them into one
# table. The frames are collected first and combined with pd.concat, because
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
ml_frames = []
for model_name in ml_models:
    ml_frames.append(pd.read_csv(f'{path}{model_name}_results.csv'))

ml_results = pd.concat(ml_frames)

# "Unnamed: 0" is the index column written into the CSV files; discard it and
# renumber the rows of the combined table from 0.
ml_results = ml_results.drop("Unnamed: 0", axis=1).reset_index(drop=True)

ml_results.head(5)

### K-Nearest Neighbors Regressor

In [None]:
# Keep only the rows produced by the KNN model.
knn_results = ml_results[ml_results['model_type'] == 'knn']

# Test R2 per dataset type, one bar per preprocessing type.
grouped_bar_plot(
    x='dataset_type',
    y='r2_test',
    hue='preprocessing_type',
    data=knn_results,
    model_name='KNN regressor',
)

In [None]:

# One train-vs-test R2 chart per dataset subset for the KNN model.
for set_type in subset_types:
    # Filter the KNN rows with a boolean mask so that `result` is a DataFrame
    # whose 'r2_train' / 'r2_test' columns can be selected by name.
    result = knn_results[knn_results['dataset_type'] == set_type]
    train_test_bar_plot(
        result['r2_train'],
        result['r2_test'],
        preprocessing_types,
        'R2 score',
        f'KNN on {set_type} set R2 on train/test',
    )
    

### Random forest regressor

In [None]:
# Keep only the rows produced by the random forest model.
rf_results = ml_results[ml_results['model_type'] == 'rf']

# Test R2 per dataset type, one bar per preprocessing type.
grouped_bar_plot(
    x='dataset_type',
    y='r2_test',
    hue='preprocessing_type',
    data=rf_results,
    model_name='Random forest regressor',
)

In [None]:
# One train-vs-test R2 chart per dataset subset for the random forest model.
for set_type in subset_types:
    result = rf_results[rf_results['dataset_type'] == set_type]
    train_test_bar_plot(
        result['r2_train'],
        result['r2_test'],
        preprocessing_types,
        'R2 score',
        # The title names the model whose rows are plotted (random forest).
        f'RF on {set_type} set R2 on train/test',
    )

### Stochastic gradient descent regressor

In [None]:
# Keep only the rows produced by the SGD model.
sgd_results = ml_results[ml_results['model_type'] == 'sgd']

# Test R2 per dataset type, one bar per preprocessing type; this chart places
# its legend in the upper-left corner.
grouped_bar_plot(
    x='dataset_type',
    y='r2_test',
    hue='preprocessing_type',
    data=sgd_results,
    model_name='SGD regressor',
    legend_pos="upper left",
)

In [None]:
# One train-vs-test R2 chart per dataset subset for the SGD model.
for set_type in subset_types:
    result = sgd_results[sgd_results['dataset_type'] == set_type]
    train_test_bar_plot(
        train_values=result['r2_train'],
        test_values=result['r2_test'],
        groups=preprocessing_types,
        score_type='R2 score',
        title=f'SGD on {set_type} set R2 on train/test',
    )

## Neural networks results

In [None]:
# Load one neural-network result file per dataset subset into nn_results,
# leaving the classic-model table (ml_results) untouched.
# NOTE(review): the file-name pattern '<subset_type>_results.csv' is an
# assumption based on the loop variable; confirm it against the files that
# are actually present in the results directory.
nn_frames = []
for subset_type in subset_types:
    nn_frames.append(pd.read_csv(f'{path}{subset_type}_results.csv'))

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
nn_results = pd.concat(nn_frames)

# "Unnamed: 0" is the index column written into the CSV files; discard it and
# renumber the rows of the combined table from 0.
nn_results = nn_results.drop("Unnamed: 0", axis=1).reset_index(drop=True)

nn_results.head(5)