In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px

In [None]:
# Read the four processed summary CSVs (dropna/imputed x all/selected),
# keeping each one available under its own name.
ver1, ver2, ver3, ver4 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/summary_dropna_all.csv',
        '../data/processed/summary_dropna_selected.csv',
        '../data/processed/summary_imputed_all.csv',
        '../data/processed/summary_imputed_selected.csv',
    )
)

# Stack the summaries row-wise (original index labels are kept) and order
# the rows by ascending Test RMSE.
df_all_ver = pd.concat([ver1, ver2, ver3, ver4], axis=0).sort_values(by='Test RMSE')
df_all_ver.head()

In [None]:
# Load the model summary from the Excel workbook in the notebook's working
# directory. This rebinds df_all_ver, so any value previously held under
# that name is discarded.
df_all_ver = pd.read_excel('Model_Summary.xlsx')

In [None]:
# Show the distinct 'Dataset Version' labels present in the summary table.
df_all_ver['Dataset Version'].unique()

array(['dropna + all + unscaled', 'dropna + all + scaled',
       'imputed + all + scaled', 'imputed + all + unscaled',
       'imputed + selected + scaled', 'imputed + selected + unscaled',
       'dropna + selected + unscaled', 'dropna + selected + scaled'],
      dtype=object)

In [None]:
# Per-model mean of the three RMSE columns over every row of df_all_ver,
# ordered by ascending Test RMSE and rounded to 4 decimals for the labels.
grouped = (
    df_all_ver
    .groupby(['Model'], as_index=False)[['Train RMSE', 'CV RMSE', 'Test RMSE']]
    .mean()
    .sort_values(by='Test RMSE')
    .round(4)
)

# One bar series per metric, in Train -> CV -> Test order; each bar is
# annotated with its own value.
fig = go.Figure(data=[
    go.Bar(
        name=metric,
        x=grouped['Model'],
        y=grouped[metric],
        text=grouped[metric],
        textposition='auto',
    )
    for metric in ('Train RMSE', 'CV RMSE', 'Test RMSE')
])

fig.update_layout(
    title='Average RMSE Score',
    xaxis=dict(title='Model'),
    yaxis=dict(title='RMSE Score'),
)
fig.show()

In [None]:
# Explicit ordering for the 'Dataset Version' colour groups: the four
# "all" variants first, then the four "selected" variants.
order_all = {
    'Dataset Version': [
        'dropna + all + unscaled',
        'dropna + all + scaled',
        'imputed + all + unscaled',
        'imputed + all + scaled',
        'dropna + selected + unscaled',
        'dropna + selected + scaled',
        'imputed + selected + unscaled',
        'imputed + selected + scaled',
    ],
}

# Grouped bars of Test RMSE per model, one bar per dataset version, each
# labelled with its value.
fig = px.bar(
    df_all_ver,
    x='Model',
    y='Test RMSE',
    text='Test RMSE',
    color='Dataset Version',
    category_orders=order_all,
    barmode='group',
).update_layout(title='RMSE Score on Test Set')

fig.show()

In [None]:
# Grouped bars of the 'Fit Time' column per model and dataset version,
# drawn on a logarithmic y axis and labelled with the raw values.
fig = px.bar(
    df_all_ver,
    x='Model',
    y='Fit Time',
    text='Fit Time',
    color='Dataset Version',
    category_orders=order_all,
    barmode='group',
    log_y=True,
).update_layout(title='Training Time on Cross Validation 5 Folds')

fig.show()

In [None]:
# The dataset versions that use all features, in the order they should
# appear in the legend and within each bar group.
order = {'Dataset Version': ['dropna + all + unscaled', 'dropna + all + scaled',
                             'imputed + all + unscaled', 'imputed + all + scaled']}

# Keep only rows whose label is one of the versions listed above. Exact
# membership is used instead of a substring/regex match on 'all' so that
# the filter and the category order share a single source of truth,
# unrelated labels that merely contain "all" are not picked up, and rows
# with a missing label are excluded rather than breaking the boolean mask.
filtered = df_all_ver[df_all_ver['Dataset Version'].isin(order['Dataset Version'])]

# Grouped bars of Test RMSE per model for the all-features versions only,
# each bar labelled with its value.
fig = px.bar(filtered, x='Model', y='Test RMSE', color='Dataset Version',
             barmode='group', category_orders=order, text='Test RMSE')

fig.update_layout(title='Model RMSE Score on Test Set (all features only)')

fig.show()