In [3]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio

# Make the built-in "presentation" template the default for every figure below.
pio.templates.default = "presentation"
# Echo the templates configuration (current default and the available templates).
pio.templates

Templates configuration
-----------------------
    Default template: 'presentation'
    Available templates:
        ['ggplot2', 'seaborn', 'simple_white', 'plotly',
         'plotly_white', 'plotly_dark', 'presentation', 'xgridoff',
         'ygridoff', 'gridon', 'none']

In [13]:
# Load the four per-version result summaries (dropna/imputed x all/selected).
ver1, ver2, ver3, ver4 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/summary_dropna_all.csv',
        '../data/processed/summary_dropna_selected.csv',
        '../data/processed/summary_imputed_all.csv',
        '../data/processed/summary_imputed_selected.csv',
    )
)

# Stack the summaries, order them by cross-validated RMSE (ascending), round
# to 3 decimals, and finally discard any row whose CV or test RMSE exceeds 100.
df_all_ver = (
    pd.concat([ver1, ver2, ver3, ver4], axis=0)
    .sort_values(by='CV RMSE')
    .round(3)
    .loc[lambda d: ~((d["CV RMSE"] > 100) | (d["Test RMSE"] > 100))]
)
df_all_ver.head(10)

Unnamed: 0,Model,Fit Time,Train R2,CV R2,Test R2,Train RMSE,CV RMSE,Test RMSE,Dataset Version
0,CatBoost,3.48,0.992,0.925,0.925,0.95,2.939,3.207,dropna + all + unscaled
8,CatBoost,3.275,0.992,0.925,0.925,0.95,2.939,3.207,dropna + all + scaled
0,CatBoost,3.53,0.992,0.923,0.912,0.99,3.078,3.385,imputed + all + unscaled
8,CatBoost,3.249,0.992,0.923,0.912,0.99,3.078,3.385,imputed + all + scaled
1,LightGBM,0.219,0.973,0.919,0.904,1.793,3.078,3.648,dropna + all + unscaled
9,LightGBM,0.23,0.973,0.919,0.903,1.788,3.083,3.649,dropna + all + scaled
2,XGBoost,0.406,0.998,0.914,0.895,0.544,3.151,3.809,dropna + all + unscaled
10,XGBoost,0.519,0.998,0.914,0.895,0.544,3.151,3.811,dropna + all + scaled
11,RandomForest,2.929,0.988,0.914,0.902,1.21,3.177,3.674,dropna + all + scaled
3,RandomForest,2.914,0.988,0.913,0.902,1.207,3.2,3.675,dropna + all + unscaled


In [14]:
# List the distinct dataset-version labels that survived the RMSE filter.
df_all_ver['Dataset Version'].unique()

array(['dropna + all + unscaled', 'dropna + all + scaled',
       'imputed + all + unscaled', 'imputed + all + scaled',
       'dropna + selected + unscaled', 'dropna + selected + scaled',
       'imputed + selected + unscaled', 'imputed + selected + scaled'],
      dtype=object)

In [23]:
# Mean train / CV / test RMSE per model over the rows kept in df_all_ver,
# ordered by mean test RMSE (ascending) and rounded to 3 decimals.
grouped = (
    df_all_ver
    .groupby(['Model'], as_index=False)[['Train RMSE', 'CV RMSE', 'Test RMSE']]
    .mean()
    .sort_values(by='Test RMSE')
    .round(3)
)

# One bar series per metric, in train -> CV -> test order.
fig = go.Figure(data=[
    go.Bar(
        x=grouped['Model'], y=grouped[metric],
        text=grouped[metric], textposition='auto',
        name=metric
    )
    for metric in ('Train RMSE', 'CV RMSE', 'Test RMSE')
])
fig.update_layout(
    title='Average RMSE Score of Base Model (on 8 dataset version each)',
    xaxis=None,
    yaxis=dict(title='RMSE Score'),
    height=500,
    width=1200,
    margin=dict(l=100, r=100, t=100, b=100),
    font_size=17
)
fig.show()

In [19]:
# Fixed legend/bar ordering for the eight dataset versions.
order_all = {
    'Dataset Version': [
        'dropna + all + unscaled',
        'dropna + all + scaled',
        'imputed + all + unscaled',
        'imputed + all + scaled',
        'dropna + selected + unscaled',
        'dropna + selected + scaled',
        'imputed + selected + unscaled',
        'imputed + selected + scaled',
    ]
}

# Test RMSE per model, one bar per dataset version, labelled with its value.
fig = px.bar(
    df_all_ver,
    x='Model',
    y='Test RMSE',
    color='Dataset Version',
    text='Test RMSE',
    barmode='group',
    category_orders=order_all,
).update_layout(title='RMSE Score on Test Set', height=500, width=1200)

fig.show()

In [25]:
# Fit time per model and dataset version; the y axis is logarithmic.
fig = px.bar(
    df_all_ver,
    x='Model',
    y='Fit Time',
    color='Dataset Version',
    text='Fit Time',
    barmode='group',
    category_orders=order_all,
    log_y=True,
).update_layout(
    title='Training Time on Cross Validation 5 Folds',
    height=500,
    width=1200,
)

fig.show()

In [26]:
# Keep only the runs whose dataset-version label contains 'all'.
uses_all_features = df_all_ver['Dataset Version'].str.contains('all')
filtered = df_all_ver[uses_all_features]

# Legend/bar ordering for the four all-feature dataset versions.
order = {
    'Dataset Version': [
        'dropna + all + unscaled',
        'dropna + all + scaled',
        'imputed + all + unscaled',
        'imputed + all + scaled',
    ]
}

fig = px.bar(
    filtered,
    x='Model',
    y='Test RMSE',
    color='Dataset Version',
    text='Test RMSE',
    barmode='group',
    category_orders=order,
).update_layout(
    title='Model RMSE Score on Test Set (all features only)',
    height=500,
    width=1200,
)

fig.show()

## Best 5 Models

In [27]:
# First ten rows among the runs whose dataset-version label contains "unscaled".
df_all_ver.loc[lambda d: d["Dataset Version"].str.contains("unscaled")].head(10)

Unnamed: 0,Model,Fit Time,Train R2,CV R2,Test R2,Train RMSE,CV RMSE,Test RMSE,Dataset Version
0,CatBoost,3.48,0.992,0.925,0.925,0.95,2.939,3.207,dropna + all + unscaled
0,CatBoost,3.53,0.992,0.923,0.912,0.99,3.078,3.385,imputed + all + unscaled
1,LightGBM,0.219,0.973,0.919,0.904,1.793,3.078,3.648,dropna + all + unscaled
2,XGBoost,0.406,0.998,0.914,0.895,0.544,3.151,3.809,dropna + all + unscaled
3,RandomForest,2.914,0.988,0.913,0.902,1.207,3.2,3.675,dropna + all + unscaled
1,RandomForest,3.192,0.988,0.913,0.912,1.232,3.255,3.392,imputed + all + unscaled
2,LightGBM,0.233,0.972,0.913,0.908,1.851,3.264,3.454,imputed + all + unscaled
3,XGBoost,0.468,0.997,0.911,0.906,0.564,3.296,3.505,imputed + all + unscaled
0,LightGBM,0.177,0.956,0.903,0.882,2.295,3.388,4.027,dropna + selected + unscaled
1,CatBoost,3.455,0.98,0.902,0.912,1.555,3.388,3.485,dropna + selected + unscaled


In [28]:
def replace_ver(data):
    """Translate a dataset-version label into its short "VerN" tag.

    Three labels are recognised explicitly:

    * "dropna + all + unscaled"      -> "Ver1"
    * "imputed + all + unscaled"     -> "Ver2"
    * "dropna + selected + unscaled" -> "Ver3"

    Every other value, whatever it is, falls through to "Ver4".
    """
    known_tags = (
        ("dropna + all + unscaled", "Ver1"),
        ("imputed + all + unscaled", "Ver2"),
        ("dropna + selected + unscaled", "Ver3"),
    )
    for label, tag in known_tags:
        if data == label:
            return tag
    # Catch-all: anything not listed above is reported as Ver4.
    return "Ver4"

In [30]:
# Ten best unscaled runs. df_all_ver is already sorted by CV RMSE, so head(10)
# picks the lowest cross-validated errors.
#
# The table is built as one chain (.assign/.drop return new frames) instead of
# assigning columns onto a filtered slice and dropping in place; that avoids
# pandas' SettingWithCopyWarning and makes the cell safe to re-run.
best_10_before = (
    df_all_ver.loc[df_all_ver["Dataset Version"].str.contains("unscaled")]
    .head(10)
    # Fold the short version tag into the model label, e.g. "CatBoost (Ver1)".
    .assign(
        Model=lambda d: d["Model"] + " (" + d["Dataset Version"].apply(replace_ver) + ")"
    )
    # The version now lives inside the label, so the raw column is redundant.
    .drop(columns=["Dataset Version"])
)
best_10_before

Unnamed: 0,Model,Fit Time,Train R2,CV R2,Test R2,Train RMSE,CV RMSE,Test RMSE
0,CatBoost (Ver1),3.48,0.992,0.925,0.925,0.95,2.939,3.207
0,CatBoost (Ver2),3.53,0.992,0.923,0.912,0.99,3.078,3.385
1,LightGBM (Ver1),0.219,0.973,0.919,0.904,1.793,3.078,3.648
2,XGBoost (Ver1),0.406,0.998,0.914,0.895,0.544,3.151,3.809
3,RandomForest (Ver1),2.914,0.988,0.913,0.902,1.207,3.2,3.675
1,RandomForest (Ver2),3.192,0.988,0.913,0.912,1.232,3.255,3.392
2,LightGBM (Ver2),0.233,0.972,0.913,0.908,1.851,3.264,3.454
3,XGBoost (Ver2),0.468,0.997,0.911,0.906,0.564,3.296,3.505
0,LightGBM (Ver3),0.177,0.956,0.903,0.882,2.295,3.388,4.027
1,CatBoost (Ver3),3.455,0.98,0.902,0.912,1.555,3.388,3.485


In [31]:
# Five best unscaled runs (df_all_ver is already sorted by CV RMSE).
# NOTE: the variable is still called `best_10_before` although it holds only
# five rows here, because the R-squared chart cell that follows reads it.
#
# Built as one chain (.assign/.drop return new frames) rather than assigning
# onto a filtered slice and dropping in place; that avoids pandas'
# SettingWithCopyWarning and makes the cell safe to re-run.
best_10_before = (
    df_all_ver.loc[df_all_ver["Dataset Version"].str.contains("unscaled")]
    .head(5)
    # Fold the short version tag into the model label, e.g. "CatBoost (Ver1)".
    .assign(
        Model=lambda d: d["Model"] + " (" + d["Dataset Version"].apply(replace_ver) + ")"
    )
    .drop(columns=["Dataset Version"])
)

# One bar series per RMSE metric, in train -> CV -> test order.
fig = go.Figure(data=[
    go.Bar(
        x=best_10_before['Model'], y=best_10_before[metric],
        text=best_10_before[metric], textposition='auto',
        name=metric
    )
    for metric in ('Train RMSE', 'CV RMSE', 'Test RMSE')
])
fig.update_layout(
    title='Best 5 Model\'s RMSE Score',
    xaxis=None,
    yaxis=dict(title='RMSE Score'),
    height=500,
    width=600,
    # Extra bottom margin compared with the wider charts in this notebook.
    margin=dict(l=100, r=100, t=100, b=120),
    font_size=17,
)
fig.show()

In [32]:
# R-squared counterpart of the RMSE chart: one bar series per split
# (train -> CV -> test) for the rows currently held in best_10_before.
fig = go.Figure(data=[
    go.Bar(
        x=best_10_before['Model'], y=best_10_before[metric],
        text=best_10_before[metric], textposition='auto',
        name=metric
    )
    for metric in ('Train R2', 'CV R2', 'Test R2')
])
fig.update_layout(
    title='Best 5 Model\'s R-Squared Score',
    xaxis=None,
    yaxis=dict(title='R2 Score'),
    height=500,
    width=600,
    margin=dict(l=100, r=100, t=100, b=120),
    font_size=17,
)
fig.show()