In [6]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio

pio.templates.default = "presentation"
pio.templates

Templates configuration
-----------------------
    Default template: 'presentation'
    Available templates:
        ['ggplot2', 'seaborn', 'simple_white', 'plotly',
         'plotly_white', 'plotly_dark', 'presentation', 'xgridoff',
         'ygridoff', 'gridon', 'none']

In [109]:
ver1 = pd.read_csv('../data/processed/summary_dropna_all.csv')
ver2 = pd.read_csv('../data/processed/summary_dropna_selected.csv')
ver3 = pd.read_csv('../data/processed/summary_imputed_all.csv')
ver4 = pd.read_csv('../data/processed/summary_imputed_selected.csv')

df_all_ver = pd.concat([ver1, ver2, ver3, ver4], axis=0)
df_all_ver.sort_values(by='Test RMSE', inplace=True)
df_all_ver = round(df_all_ver, 3)
df_all_ver.head()

Unnamed: 0,Model,Train R2,CV R2,Test R2,Train RMSE,CV RMSE,Test RMSE,Train MAE,CV MAE,Test MAE,Dataset Version
0,CatBoostRegressor,0.992,0.917,0.927,0.973,3.061,3.174,0.647,1.15,1.211,dropna + all + unscaled
7,CatBoostRegressor,0.992,0.917,0.927,0.973,3.061,3.174,0.647,1.15,1.211,dropna + all + scaled
7,CatBoostRegressor,0.989,0.915,0.923,1.13,3.141,3.254,0.723,1.213,1.258,dropna + selected + scaled
0,CatBoostRegressor,0.989,0.915,0.923,1.13,3.141,3.254,0.723,1.213,1.258,dropna + selected + unscaled
7,CatBoostRegressor,0.993,0.924,0.912,0.956,3.038,3.389,0.633,1.179,1.158,imputed + all + scaled


In [112]:
df_all_ver['Dataset Version'].unique()

array(['dropna + all + unscaled', 'dropna + all + scaled',
       'dropna + selected + scaled', 'dropna + selected + unscaled',
       'imputed + all + scaled', 'imputed + all + unscaled',
       'imputed + selected + unscaled', 'imputed + selected + scaled'],
      dtype=object)

In [113]:
df_all_ver["Model"] = df_all_ver["Model"].str.replace("Regressor", "")

In [115]:
grouped = df_all_ver.groupby(['Model'], as_index=False)[['Train RMSE', 'CV RMSE', 'Test RMSE']].mean()
grouped = round(grouped.sort_values(by='Test RMSE'), 3)

fig = go.Figure()

fig.add_trace(go.Bar(
    x=grouped['Model'], y=grouped['Train RMSE'],
    text=grouped['Train RMSE'], textposition='auto',
    name='Train RMSE'
))
fig.add_trace(go.Bar(
    x=grouped['Model'], y=grouped['CV RMSE'],
    text=grouped['CV RMSE'], textposition='auto',
    name='CV RMSE'
))
fig.add_trace(go.Bar(
    x=grouped['Model'], y=grouped['Test RMSE'],
    text=grouped['Test RMSE'], textposition='auto',
    name='Test RMSE'
))
fig.update_layout(
    title='Average RMSE Score <br>(7 Models with 8 Dataset Version)',
    xaxis=None,
    yaxis=dict(title='RMSE Score'),
    height=500,
    width=1200,
    margin=dict(l=100, r=100, t=100, b=100),
    font_size=17
)
fig.show()

In [40]:
order_all = {'Dataset Version': [
    'dropna + all + unscaled', 'dropna + all + scaled',
    'imputed + all + unscaled', 'imputed + all + scaled',
    'dropna + selected + unscaled', 'dropna + selected + scaled',
    'imputed + selected + unscaled', 'imputed + selected + scaled'
]}

fig = px.bar(
    df_all_ver, x='Model', y='Test RMSE', color='Dataset Version',
    barmode='group', category_orders=order_all, text='Test RMSE'
)

fig.update_layout(title='RMSE Score on Test Set')

fig.show()

In [41]:
fig = px.bar(
    df_all_ver, x='Model', y='Fit Time', color='Dataset Version',
    barmode='group', category_orders=order_all, text='Fit Time',
    log_y=True
)

fig.update_layout(title='Training Time on Cross Validation 5 Folds')

fig.show()

ValueError: Value of 'y' is not the name of a column in 'data_frame'. Expected one of ['Model', 'Train R2', 'CV R2', 'Test R2', 'Train RMSE', 'CV RMSE', 'Test RMSE', 'Train MAE', 'CV MAE', 'Test MAE', 'Dataset Version'] but received: Fit Time

In [42]:
filtered = df_all_ver[df_all_ver['Dataset Version'].str.contains('all')]

order = {'Dataset Version': ['dropna + all + unscaled', 'dropna + all + scaled',
                             'imputed + all + unscaled', 'imputed + all + scaled']}

fig = px.bar(filtered, x='Model', y='Test RMSE', color='Dataset Version',
             barmode='group', category_orders=order, text='Test RMSE')

fig.update_layout(title='Model RMSE Score on Test Set (all features only)')

fig.show()

## Best 5 Models

In [116]:
df_all_ver[df_all_ver["Dataset Version"].str.contains("unscaled")].head(10)

Unnamed: 0,Model,Train R2,CV R2,Test R2,Train RMSE,CV RMSE,Test RMSE,Train MAE,CV MAE,Test MAE,Dataset Version
0,CatBoost,0.992,0.917,0.927,0.973,3.061,3.174,0.647,1.15,1.211,dropna + all + unscaled
0,CatBoost,0.989,0.915,0.923,1.13,3.141,3.254,0.723,1.213,1.258,dropna + selected + unscaled
0,CatBoost,0.993,0.924,0.912,0.956,3.038,3.389,0.633,1.179,1.158,imputed + all + unscaled
1,LGBM,0.971,0.916,0.91,1.882,3.197,3.427,0.829,1.269,1.258,imputed + all + unscaled
0,LGBM,0.966,0.91,0.909,2.041,3.304,3.451,0.899,1.286,1.292,imputed + selected + unscaled
1,CatBoost,0.99,0.917,0.907,1.114,3.18,3.487,0.717,1.231,1.257,imputed + selected + unscaled
1,LGBM,0.971,0.916,0.908,1.86,3.13,3.567,0.803,1.208,1.333,dropna + all + unscaled
2,RandomForest,0.988,0.91,0.902,1.234,3.314,3.582,0.487,1.328,1.378,imputed + all + unscaled
3,XGB,0.998,0.909,0.894,0.541,3.322,3.707,0.375,1.26,1.272,imputed + all + unscaled
1,LGBM,0.966,0.906,0.897,2.028,3.317,3.775,0.87,1.257,1.369,dropna + selected + unscaled


In [117]:
def replace_ver(data):
    if data == "dropna + all + unscaled":
        return "Ver1"
    elif data == "imputed + all + unscaled":
        return "Ver2"
    elif data == "dropna + selected + unscaled":
        return "Ver3"
    else:
        return "Ver4"

In [118]:
best_10_before = df_all_ver[df_all_ver["Dataset Version"].str.contains("unscaled")].head(10)
best_10_before["Dataset Version"] = best_10_before["Dataset Version"].apply(replace_ver)
best_10_before["Model"] = best_10_before["Model"] + " (" + best_10_before["Dataset Version"] + ")"
best_10_before.drop(columns=["Dataset Version"], inplace=True)
best_10_before

Unnamed: 0,Model,Train R2,CV R2,Test R2,Train RMSE,CV RMSE,Test RMSE,Train MAE,CV MAE,Test MAE
0,CatBoost (Ver1),0.992,0.917,0.927,0.973,3.061,3.174,0.647,1.15,1.211
0,CatBoost (Ver3),0.989,0.915,0.923,1.13,3.141,3.254,0.723,1.213,1.258
0,CatBoost (Ver2),0.993,0.924,0.912,0.956,3.038,3.389,0.633,1.179,1.158
1,LGBM (Ver2),0.971,0.916,0.91,1.882,3.197,3.427,0.829,1.269,1.258
0,LGBM (Ver4),0.966,0.91,0.909,2.041,3.304,3.451,0.899,1.286,1.292
1,CatBoost (Ver4),0.99,0.917,0.907,1.114,3.18,3.487,0.717,1.231,1.257
1,LGBM (Ver1),0.971,0.916,0.908,1.86,3.13,3.567,0.803,1.208,1.333
2,RandomForest (Ver2),0.988,0.91,0.902,1.234,3.314,3.582,0.487,1.328,1.378
3,XGB (Ver2),0.998,0.909,0.894,0.541,3.322,3.707,0.375,1.26,1.272
1,LGBM (Ver3),0.966,0.906,0.897,2.028,3.317,3.775,0.87,1.257,1.369


In [119]:
best_10_before = df_all_ver[df_all_ver["Dataset Version"].str.contains("unscaled")].head(5)
best_10_before["Dataset Version"] = best_10_before["Dataset Version"].apply(replace_ver)
best_10_before["Model"] = best_10_before["Model"] + " (" + best_10_before["Dataset Version"] + ")"
best_10_before.drop(columns=["Dataset Version"], inplace=True)

fig = go.Figure()

fig.add_trace(go.Bar(
    x=best_10_before['Model'], y=best_10_before['Train RMSE'],
    text=best_10_before['Train RMSE'], textposition='auto',
    name='Train RMSE'
))
fig.add_trace(go.Bar(
    x=best_10_before['Model'], y=best_10_before['CV RMSE'],
    text=best_10_before['CV RMSE'], textposition='auto',
    name='CV RMSE'
))
fig.add_trace(go.Bar(
    x=best_10_before['Model'], y=best_10_before['Test RMSE'],
    text=best_10_before['Test RMSE'], textposition='auto',
    name='Test RMSE'
))
fig.update_layout(
    title='Best 5 Model\'s RMSE Score',
    xaxis=None,
    yaxis=dict(title='RMSE Score'),
    height=500,
    width=600,
    margin=dict(l=100, r=100, t=100, b=120),
    font_size=17,
    
)
fig.show()

In [120]:
fig = go.Figure()

fig.add_trace(go.Bar(
    x=best_10_before['Model'], y=best_10_before['Train R2'],
    text=best_10_before['Train R2'], textposition='auto',
    name='Train R2'
))
fig.add_trace(go.Bar(
    x=best_10_before['Model'], y=best_10_before['CV R2'],
    text=best_10_before['CV R2'], textposition='auto',
    name='CV R2'
))
fig.add_trace(go.Bar(
    x=best_10_before['Model'], y=best_10_before['Test R2'],
    text=best_10_before['Test R2'], textposition='auto',
    name='Test R2'
))
fig.update_layout(
    title='Best 5 Model\'s R-Squared Score',
    xaxis=None,
    yaxis=dict(title='R2 Score'),
    height=500,
    width=600,
    margin=dict(l=100, r=100, t=100, b=120),
    font_size=17,
    
)
fig.show()