In [2]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio

# Make "presentation" the default template for every figure created below.
pio.templates.default = "presentation"
# Bare expression: show the active template configuration as the cell output.
pio.templates

Templates configuration
-----------------------
    Default template: 'presentation'
    Available templates:
        ['ggplot2', 'seaborn', 'simple_white', 'plotly',
         'plotly_white', 'plotly_dark', 'presentation', 'xgridoff',
         'ygridoff', 'gridon', 'none']

In [3]:
# Load the four per-dataset summary tables (dropna/imputed x all/selected features).
summary_paths = (
    '../data/processed/summary_dropna_all.csv',
    '../data/processed/summary_dropna_selected.csv',
    '../data/processed/summary_imputed_all.csv',
    '../data/processed/summary_imputed_selected.csv',
)
ver1, ver2, ver3, ver4 = map(pd.read_csv, summary_paths)

# Stack them, order by cross-validation RMSE (lowest first) and round for display.
df_all_ver = (
    pd.concat([ver1, ver2, ver3, ver4], axis=0)
    .sort_values(by='CV RMSE')
    .round(3)
)

# Drop rows whose CV or Test RMSE exceeds 100
# (presumably runs that blew up and would dwarf the charts; TODO confirm).
rmse_over_100 = df_all_ver["CV RMSE"].gt(100) | df_all_ver["Test RMSE"].gt(100)
df_all_ver = df_all_ver[~rmse_over_100]
df_all_ver.head(10)

Unnamed: 0,Model,Fit Time,Train R2,CV R2,Test R2,Train RMSE,CV RMSE,Test RMSE,Dataset Version
0,CatBoost,3.48,0.992,0.925,0.925,0.95,2.939,3.207,dropna + all + unscaled
8,CatBoost,3.275,0.992,0.925,0.925,0.95,2.939,3.207,dropna + all + scaled
0,CatBoost,3.53,0.992,0.923,0.912,0.99,3.078,3.385,imputed + all + unscaled
8,CatBoost,3.249,0.992,0.923,0.912,0.99,3.078,3.385,imputed + all + scaled
1,LightGBM,0.219,0.973,0.919,0.904,1.793,3.078,3.648,dropna + all + unscaled
9,LightGBM,0.23,0.973,0.919,0.903,1.788,3.083,3.649,dropna + all + scaled
2,XGBoost,0.406,0.998,0.914,0.895,0.544,3.151,3.809,dropna + all + unscaled
10,XGBoost,0.519,0.998,0.914,0.895,0.544,3.151,3.811,dropna + all + scaled
11,RandomForest,2.929,0.988,0.914,0.902,1.21,3.177,3.674,dropna + all + scaled
3,RandomForest,2.914,0.988,0.913,0.902,1.207,3.2,3.675,dropna + all + unscaled


In [4]:
# List the distinct dataset-version labels that remain in the filtered table.
df_all_ver['Dataset Version'].unique()

array(['dropna + all + unscaled', 'dropna + all + scaled',
       'imputed + all + unscaled', 'imputed + all + scaled',
       'dropna + selected + unscaled', 'dropna + selected + scaled',
       'imputed + selected + unscaled', 'imputed + selected + scaled'],
      dtype=object)

In [5]:
# Mean Train/CV/Test RMSE per model over all dataset versions,
# ordered by mean test RMSE (lowest first) and rounded for the bar labels.
rmse_columns = ['Train RMSE', 'CV RMSE', 'Test RMSE']
grouped = (
    df_all_ver
    .groupby(['Model'], as_index=False)[rmse_columns]
    .mean()
    .sort_values(by='Test RMSE')
    .round(3)
)

# One bar series per RMSE column, labelled with its own values.
fig = go.Figure(data=[
    go.Bar(
        x=grouped['Model'],
        y=grouped[rmse_column],
        text=grouped[rmse_column],
        textposition='auto',
        name=rmse_column,
    )
    for rmse_column in rmse_columns
])
fig.update_layout(
    title='Average RMSE Score of Base Model (on 8 dataset version each)',
    xaxis=None,
    yaxis={'title': 'RMSE Score'},
    width=1200,
    height=500,
    margin={'l': 100, 'r': 100, 't': 100, 'b': 100},
    font_size=17,
)
fig.show()

In [6]:
# Display order of the dataset versions in grouped bars and legend.
# Also read by the fit-time chart in the next cell.
order_all = {
    'Dataset Version': [
        'dropna + all + unscaled',
        'dropna + all + scaled',
        'imputed + all + unscaled',
        'imputed + all + scaled',
        'dropna + selected + unscaled',
        'dropna + selected + scaled',
        'imputed + selected + unscaled',
        'imputed + selected + scaled',
    ]
}

# Test RMSE of every model, one bar per dataset version.
fig = px.bar(
    data_frame=df_all_ver,
    x='Model',
    y='Test RMSE',
    text='Test RMSE',
    color='Dataset Version',
    category_orders=order_all,
    barmode='group',
).update_layout(title='RMSE Score on Test Set', width=1200, height=500)

fig.show()

In [7]:
# Fit time of every model per dataset version; log-scaled y axis so that
# fast and slow models stay readable on the same chart.
fig = px.bar(
    data_frame=df_all_ver,
    x='Model',
    y='Fit Time',
    text='Fit Time',
    color='Dataset Version',
    category_orders=order_all,
    barmode='group',
    log_y=True,
).update_layout(
    title='Training Time on Cross Validation 5 Folds',
    width=1200,
    height=500,
)

fig.show()

In [8]:
# Keep only the rows whose dataset-version label contains 'all'
# (i.e. the "all features" variants).
uses_all_features = df_all_ver['Dataset Version'].str.contains('all')
filtered = df_all_ver[uses_all_features]

# Display order for the four "all features" dataset versions.
order = {
    'Dataset Version': [
        'dropna + all + unscaled',
        'dropna + all + scaled',
        'imputed + all + unscaled',
        'imputed + all + scaled',
    ]
}

fig = px.bar(
    data_frame=filtered,
    x='Model',
    y='Test RMSE',
    text='Test RMSE',
    color='Dataset Version',
    category_orders=order,
    barmode='group',
).update_layout(
    title='Model RMSE Score on Test Set (all features only)',
    width=1200,
    height=500,
)

fig.show()

## Best 5 Models

### Before Tuning

In [9]:
# Preview the ten lowest-CV-RMSE rows among the unscaled dataset versions
# (df_all_ver was sorted by 'CV RMSE' when it was built).
df_all_ver[df_all_ver["Dataset Version"].str.contains("unscaled")].head(10)

Unnamed: 0,Model,Fit Time,Train R2,CV R2,Test R2,Train RMSE,CV RMSE,Test RMSE,Dataset Version
0,CatBoost,3.48,0.992,0.925,0.925,0.95,2.939,3.207,dropna + all + unscaled
0,CatBoost,3.53,0.992,0.923,0.912,0.99,3.078,3.385,imputed + all + unscaled
1,LightGBM,0.219,0.973,0.919,0.904,1.793,3.078,3.648,dropna + all + unscaled
2,XGBoost,0.406,0.998,0.914,0.895,0.544,3.151,3.809,dropna + all + unscaled
3,RandomForest,2.914,0.988,0.913,0.902,1.207,3.2,3.675,dropna + all + unscaled
1,RandomForest,3.192,0.988,0.913,0.912,1.232,3.255,3.392,imputed + all + unscaled
2,LightGBM,0.233,0.972,0.913,0.908,1.851,3.264,3.454,imputed + all + unscaled
3,XGBoost,0.468,0.997,0.911,0.906,0.564,3.296,3.505,imputed + all + unscaled
0,LightGBM,0.177,0.956,0.903,0.882,2.295,3.388,4.027,dropna + selected + unscaled
1,CatBoost,3.455,0.98,0.902,0.912,1.555,3.388,3.485,dropna + selected + unscaled


In [10]:
def replace_ver(data):
    """Map a dataset-version label to its short "VerN" tag.

    Args:
        data: Dataset-version label such as "dropna + all + unscaled".

    Returns:
        "Ver1" for "dropna + all + unscaled", "Ver2" for
        "imputed + all + unscaled", "Ver3" for "dropna + selected + unscaled",
        and "Ver4" for every other value. "Ver4" is a catch-all: any label
        not listed above (including scaled variants) is tagged "Ver4".
    """
    known_versions = (
        ("dropna + all + unscaled", "Ver1"),
        ("imputed + all + unscaled", "Ver2"),
        ("dropna + selected + unscaled", "Ver3"),
    )
    for label, tag in known_versions:
        if data == label:
            return tag
    # Fallback for everything that is not one of the three labels above.
    return "Ver4"

In [11]:
# Ten best (lowest CV RMSE) rows among the unscaled dataset versions.
# The version tag is folded into the model name ("CatBoost (Ver1)") and the
# original 'Dataset Version' column is then dropped.
best_10_before = (
    df_all_ver[df_all_ver["Dataset Version"].str.contains("unscaled")]
    .head(10)
    .assign(
        Model=lambda rows: (
            rows["Model"] + " (" + rows["Dataset Version"].apply(replace_ver) + ")"
        )
    )
    .drop(columns=["Dataset Version"])
)
best_10_before

Unnamed: 0,Model,Fit Time,Train R2,CV R2,Test R2,Train RMSE,CV RMSE,Test RMSE
0,CatBoost (Ver1),3.48,0.992,0.925,0.925,0.95,2.939,3.207
0,CatBoost (Ver2),3.53,0.992,0.923,0.912,0.99,3.078,3.385
1,LightGBM (Ver1),0.219,0.973,0.919,0.904,1.793,3.078,3.648
2,XGBoost (Ver1),0.406,0.998,0.914,0.895,0.544,3.151,3.809
3,RandomForest (Ver1),2.914,0.988,0.913,0.902,1.207,3.2,3.675
1,RandomForest (Ver2),3.192,0.988,0.913,0.912,1.232,3.255,3.392
2,LightGBM (Ver2),0.233,0.972,0.913,0.908,1.851,3.264,3.454
3,XGBoost (Ver2),0.468,0.997,0.911,0.906,0.564,3.296,3.505
0,LightGBM (Ver3),0.177,0.956,0.903,0.882,2.295,3.388,4.027
1,CatBoost (Ver3),3.455,0.98,0.902,0.912,1.555,3.388,3.485


In [76]:
# Rebuild the "before tuning" table, this time keeping only the five best rows.
# NOTE: the name best_10_before is kept although it now holds five rows,
# because the cells that follow read the table under this name.
best_10_before = (
    df_all_ver[df_all_ver["Dataset Version"].str.contains("unscaled")]
    .head(5)
    .assign(
        Model=lambda rows: (
            rows["Model"] + " (" + rows["Dataset Version"].apply(replace_ver) + ")"
        )
    )
    .drop(columns=["Dataset Version"])
)

# Train / CV / Test RMSE side by side for each of the five models.
fig = go.Figure(data=[
    go.Bar(
        x=best_10_before['Model'],
        y=best_10_before[score],
        text=best_10_before[score],
        textposition='auto',
        name=score,
    )
    for score in ('Train RMSE', 'CV RMSE', 'Test RMSE')
])
fig.update_layout(
    title='Best 5 Model\'s RMSE Score',
    xaxis=None,
    yaxis={'title': 'RMSE Score'},
    width=600,
    height=500,
    margin={'l': 100, 'r': 100, 't': 100, 'b': 120},
    font_size=17,
)
fig.show()

In [13]:
# Train / CV / Test R2 side by side for the models in best_10_before.
fig = go.Figure(data=[
    go.Bar(
        x=best_10_before['Model'],
        y=best_10_before[score],
        text=best_10_before[score],
        textposition='auto',
        name=score,
    )
    for score in ('Train R2', 'CV R2', 'Test R2')
])
fig.update_layout(
    title='Best 5 Model\'s R-Squared Score',
    xaxis=None,
    yaxis={'title': 'R2 Score'},
    width=600,
    height=500,
    margin={'l': 100, 'r': 100, 't': 100, 'b': 120},
    font_size=17,
)
fig.show()

### After Tuning

In [54]:
# Load the hyper-parameter tuning results. The file name points to XGB and LGB
# runs on the "dropna_all" data; presumably this is dataset Ver1 (TODO confirm).
df_tuned = pd.read_csv("../data/processed/tuning_dropna_all (XGB+LGB).csv")
# Bare expression: show the whole tuning table as the cell output.
df_tuned

Unnamed: 0,Model,Fit Time,Train R2,CV R2,Test R2,Train RMSE,CV RMSE,Test RMSE
0,LGBMRegressor (1244) {'objective': 'regression...,5.9132,0.9887,0.9288,0.9373,1.1671,2.8661,2.9412
1,LGBMRegressor (1244) {'objective': 'regression...,5.9058,0.9886,0.9281,0.9371,1.1696,2.8803,2.9456
2,XGBRegressor (1354) {'alpha': 0.00949506479294...,5.2654,0.9827,0.9252,0.936,1.4419,2.9362,2.9712
3,XGBRegressor (1279) {'alpha': 0.00949506479294...,5.0459,0.9807,0.9248,0.9348,1.5216,2.9431,2.998
4,LGBMRegressor (178) {'objective': 'regression'...,0.61,0.993,0.9214,0.932,0.9174,3.0128,3.0624
5,XGBRegressor (131) {'alpha': 3.028612934937252...,0.6253,0.9805,0.9171,0.9261,1.5295,3.1138,3.1928


In [55]:
# Take rows 0 and 2 of the tuning table; in the table displayed above these
# are the first LGBMRegressor and the first XGBRegressor rows. The selection
# is positional, so it depends on the row order of the CSV.
# The long "Model" text is cut down to its first word, tagged as tuned on
# Ver1, and renamed to match the model names used in the other tables.
df_tuned_top = (
    df_tuned.iloc[[0, 2]]
    .round(3)
    .assign(
        Model=lambda top: (
            top["Model"].apply(lambda full_name: full_name.split()[0]) + "_Tuned (Ver1)"
        )
        .str.replace("LGBMRegressor", "LightGBM")
        .str.replace("XGBRegressor", "XGBoost")
    )
)
df_tuned_top

Unnamed: 0,Model,Fit Time,Train R2,CV R2,Test R2,Train RMSE,CV RMSE,Test RMSE
0,LightGBM_Tuned (Ver1),5.913,0.989,0.929,0.937,1.167,2.866,2.941
2,XGBoost_Tuned (Ver1),5.265,0.983,0.925,0.936,1.442,2.936,2.971


In [56]:
# Pool the untuned best models with the two tuned ones and keep the five
# rows with the lowest CV RMSE, renumbered from 0.
best_5_after = (
    pd.concat([best_10_before, df_tuned_top], axis=0)
    .sort_values(by="CV RMSE")
    .head(5)
    .reset_index(drop=True)
)
best_5_after

Unnamed: 0,Model,Fit Time,Train R2,CV R2,Test R2,Train RMSE,CV RMSE,Test RMSE
0,LightGBM_Tuned (Ver1),5.913,0.989,0.929,0.937,1.167,2.866,2.941
1,XGBoost_Tuned (Ver1),5.265,0.983,0.925,0.936,1.442,2.936,2.971
2,CatBoost (Ver1),3.48,0.992,0.925,0.925,0.95,2.939,3.207
3,CatBoost (Ver2),3.53,0.992,0.923,0.912,0.99,3.078,3.385
4,LightGBM (Ver1),0.219,0.973,0.919,0.904,1.793,3.078,3.648


In [71]:
# Switch the default template to "plotly" for the figures created from here on.
# NOTE(review): this changes global plotly state; any earlier cell re-run after
# this one will use "plotly" instead of the "presentation" template set at the top.
pio.templates.default = "plotly"
# Bare expression: show the active template configuration as the cell output.
pio.templates

Templates configuration
-----------------------
    Default template: 'plotly'
    Available templates:
        ['ggplot2', 'seaborn', 'simple_white', 'plotly',
         'plotly_white', 'plotly_dark', 'presentation', 'xgridoff',
         'ygridoff', 'gridon', 'none']

In [72]:
# Train / CV / Test RMSE side by side for the five models in best_5_after.
fig = go.Figure(data=[
    go.Bar(
        x=best_5_after['Model'],
        y=best_5_after[score],
        text=best_5_after[score],
        textposition='auto',
        name=score,
    )
    for score in ('Train RMSE', 'CV RMSE', 'Test RMSE')
])
fig.update_layout(
    title='Best 5 Model\'s RMSE Score',
    xaxis=None,
    yaxis={'title': 'RMSE Score'},
    width=600,
    height=500,
    margin={'l': 100, 'r': 100, 't': 100, 'b': 120},
    font_size=17,
)
fig.show()

In [73]:
# Train / CV / Test R2 side by side for the five models in best_5_after.
fig = go.Figure(data=[
    go.Bar(
        x=best_5_after['Model'],
        y=best_5_after[score],
        text=best_5_after[score],
        textposition='auto',
        name=score,
    )
    for score in ('Train R2', 'CV R2', 'Test R2')
])
fig.update_layout(
    title='Best 5 Model\'s R-Squared Score',
    xaxis=None,
    yaxis={'title': 'R2 Score'},
    width=600,
    height=500,
    margin={'l': 100, 'r': 100, 't': 100, 'b': 120},
    font_size=17,
)
fig.show()