In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio

# Use the "presentation" template as the default for every Plotly figure below.
pio.templates.default = "presentation"
# Echo the templates configuration (current default and the available names).
pio.templates

Templates configuration
-----------------------
    Default template: 'presentation'
    Available templates:
        ['ggplot2', 'seaborn', 'simple_white', 'plotly',
         'plotly_white', 'plotly_dark', 'presentation', 'xgridoff',
         'ygridoff', 'gridon', 'none']

In [2]:
# Baseline result summaries: four CSVs (dropna/imputed x all/selected, per the
# file names), read in the same order as the ver1..ver4 names.
ver1, ver2, ver3, ver4 = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/summary_dropna_all.csv',
        '../data/processed/summary_dropna_selected.csv',
        '../data/processed/summary_imputed_all.csv',
        '../data/processed/summary_imputed_selected.csv',
    )
]

# Stack all runs, order them by cross-validated RMSE (ascending) and round to
# three decimals for display.
df_all_ver = (
    pd.concat([ver1, ver2, ver3, ver4], axis=0)
    .sort_values(by='CV RMSE')
    .round(3)
)

# Discard runs whose CV RMSE or test RMSE is above 100.
rmse_too_high = (df_all_ver["CV RMSE"] > 100) | (df_all_ver["Test RMSE"] > 100)
df_all_ver = df_all_ver[~rmse_too_high]
df_all_ver.head(10)

Unnamed: 0,Model,Fit Time,Train R2,CV R2,Test R2,Train RMSE,CV RMSE,Test RMSE,Dataset Version
0,CatBoost,3.48,0.992,0.925,0.925,0.95,2.939,3.207,dropna + all + unscaled
8,CatBoost,3.275,0.992,0.925,0.925,0.95,2.939,3.207,dropna + all + scaled
0,CatBoost,3.53,0.992,0.923,0.912,0.99,3.078,3.385,imputed + all + unscaled
8,CatBoost,3.249,0.992,0.923,0.912,0.99,3.078,3.385,imputed + all + scaled
1,LightGBM,0.219,0.973,0.919,0.904,1.793,3.078,3.648,dropna + all + unscaled
9,LightGBM,0.23,0.973,0.919,0.903,1.788,3.083,3.649,dropna + all + scaled
2,XGBoost,0.406,0.998,0.914,0.895,0.544,3.151,3.809,dropna + all + unscaled
10,XGBoost,0.519,0.998,0.914,0.895,0.544,3.151,3.811,dropna + all + scaled
11,RandomForest,2.929,0.988,0.914,0.902,1.21,3.177,3.674,dropna + all + scaled
3,RandomForest,2.914,0.988,0.913,0.902,1.207,3.2,3.675,dropna + all + unscaled


In [3]:
# List the distinct dataset-version labels present after filtering.
df_all_ver['Dataset Version'].unique()

array(['dropna + all + unscaled', 'dropna + all + scaled',
       'imputed + all + unscaled', 'imputed + all + scaled',
       'dropna + selected + unscaled', 'dropna + selected + scaled',
       'imputed + selected + unscaled', 'imputed + selected + scaled'],
      dtype=object)

In [4]:
# Mean train / CV / test RMSE per model over the rows left in df_all_ver,
# ordered by mean test RMSE and rounded to three decimals for the bar labels.
rmse_columns = ['Train RMSE', 'CV RMSE', 'Test RMSE']
grouped = df_all_ver.groupby(['Model'], as_index=False)[rmse_columns].mean()
grouped = grouped.sort_values(by='Test RMSE').round(3)

fig = go.Figure()

# One bar series per RMSE metric, labelled with its value.
for rmse_metric in rmse_columns:
    fig.add_trace(go.Bar(
        x=grouped['Model'],
        y=grouped[rmse_metric],
        text=grouped[rmse_metric],
        textposition='auto',
        name=rmse_metric,
    ))

fig.update_layout(
    title='Average RMSE Score of Base Model (on 8 dataset version each)',
    xaxis=None,
    yaxis={'title': 'RMSE Score'},
    height=500,
    width=1200,
    margin={'l': 100, 'r': 100, 't': 100, 'b': 100},
    font_size=17,
)

# Outline rectangles at hard-coded axis coordinates.
# NOTE(review): presumably these frame specific groups of models; the x
# positions depend on the bar order produced by the sort above - confirm
# after any data change.
for left, right, top, outline in (
    (0.55, 1.45, 3.8, "Red"),
    (4.55, 7.45, 6.1, "Purple"),
):
    fig.add_shape(
        type="rect",
        xref="x", yref="y",
        x0=left, y0=0,
        x1=right, y1=top,
        line={'color': outline, 'width': 3},
    )

fig.show()

In [5]:
# Legend / bar order for the eight dataset versions; order_all is reused by
# later plotting cells.
version_sequence = [
    'dropna + all + unscaled', 'dropna + all + scaled',
    'imputed + all + unscaled', 'imputed + all + scaled',
    'dropna + selected + unscaled', 'dropna + selected + scaled',
    'imputed + selected + unscaled', 'imputed + selected + scaled',
]
order_all = {'Dataset Version': version_sequence}

# Grouped bars: test RMSE per model, one bar per dataset version.
fig = px.bar(
    df_all_ver,
    x='Model',
    y='Test RMSE',
    text='Test RMSE',
    color='Dataset Version',
    category_orders=order_all,
    barmode='group',
)
fig.update_layout(title='RMSE Score on Test Set', height=500, width=1200)
fig.show()

In [6]:
# Grouped bars of fit time per model and dataset version, drawn on a
# logarithmic y axis.
fig = px.bar(
    df_all_ver,
    x='Model',
    y='Fit Time',
    text='Fit Time',
    color='Dataset Version',
    category_orders=order_all,
    barmode='group',
    log_y=True,
)
fig.update_layout(
    title='Training Time on Cross Validation 5 Folds', height=500, width=1200
)
fig.show()

In [7]:
# Keep only the runs whose dataset-version label contains "all".
uses_all_features = df_all_ver['Dataset Version'].str.contains('all')
filtered = df_all_ver.loc[uses_all_features]

# Bar / legend order for the four "all features" dataset versions.
order = {
    'Dataset Version': [
        'dropna + all + unscaled',
        'dropna + all + scaled',
        'imputed + all + unscaled',
        'imputed + all + scaled',
    ]
}

fig = px.bar(
    filtered,
    x='Model',
    y='Test RMSE',
    text='Test RMSE',
    color='Dataset Version',
    category_orders=order,
    barmode='group',
)
fig.update_layout(
    title='Model RMSE Score on Test Set (all features only)',
    height=500,
    width=1200,
)
fig.show()

## Best 5 Models

### Before Tuning

In [8]:
# First ten runs whose dataset-version label contains "unscaled". df_all_ver was
# sorted by CV RMSE when it was built, so these are the lowest-CV-RMSE unscaled runs.
df_all_ver[df_all_ver["Dataset Version"].str.contains("unscaled")].head(10)

Unnamed: 0,Model,Fit Time,Train R2,CV R2,Test R2,Train RMSE,CV RMSE,Test RMSE,Dataset Version
0,CatBoost,3.48,0.992,0.925,0.925,0.95,2.939,3.207,dropna + all + unscaled
0,CatBoost,3.53,0.992,0.923,0.912,0.99,3.078,3.385,imputed + all + unscaled
1,LightGBM,0.219,0.973,0.919,0.904,1.793,3.078,3.648,dropna + all + unscaled
2,XGBoost,0.406,0.998,0.914,0.895,0.544,3.151,3.809,dropna + all + unscaled
3,RandomForest,2.914,0.988,0.913,0.902,1.207,3.2,3.675,dropna + all + unscaled
1,RandomForest,3.192,0.988,0.913,0.912,1.232,3.255,3.392,imputed + all + unscaled
2,LightGBM,0.233,0.972,0.913,0.908,1.851,3.264,3.454,imputed + all + unscaled
3,XGBoost,0.468,0.997,0.911,0.906,0.564,3.296,3.505,imputed + all + unscaled
0,LightGBM,0.177,0.956,0.903,0.882,2.295,3.388,4.027,dropna + selected + unscaled
1,CatBoost,3.455,0.98,0.902,0.912,1.555,3.388,3.485,dropna + selected + unscaled


In [9]:
def replace_ver(data):
    """Translate a dataset-version label into its short "VerN" tag.

    Args:
        data: A dataset-version label such as "dropna + all + unscaled".

    Returns:
        "Ver1", "Ver2" or "Ver3" for the three labels listed below, and
        "Ver4" for every other value.
    """
    known_versions = (
        ("dropna + all + unscaled", "Ver1"),
        ("imputed + all + unscaled", "Ver2"),
        ("dropna + selected + unscaled", "Ver3"),
    )
    for label, tag in known_versions:
        if data == label:
            return tag
    # Anything that is not one of the labels above falls through to Ver4.
    return "Ver4"

In [10]:
# Ten best unscaled baseline runs by CV RMSE (df_all_ver is sorted on that
# column when it is built). .copy() makes this an independent frame, so the
# column assignment below cannot trigger SettingWithCopyWarning or write
# through to df_all_ver.
best_10_before = (
    df_all_ver[df_all_ver["Dataset Version"].str.contains("unscaled")]
    .head(10)
    .copy()
)
# Label each run "<Model> (VerN)", then drop the now-redundant version column
# without mutating in place.
best_10_before["Model"] = (
    best_10_before["Model"]
    + " ("
    + best_10_before["Dataset Version"].apply(replace_ver)
    + ")"
)
best_10_before = best_10_before.drop(columns=["Dataset Version"])
best_10_before

Unnamed: 0,Model,Fit Time,Train R2,CV R2,Test R2,Train RMSE,CV RMSE,Test RMSE
0,CatBoost (Ver1),3.48,0.992,0.925,0.925,0.95,2.939,3.207
0,CatBoost (Ver2),3.53,0.992,0.923,0.912,0.99,3.078,3.385
1,LightGBM (Ver1),0.219,0.973,0.919,0.904,1.793,3.078,3.648
2,XGBoost (Ver1),0.406,0.998,0.914,0.895,0.544,3.151,3.809
3,RandomForest (Ver1),2.914,0.988,0.913,0.902,1.207,3.2,3.675
1,RandomForest (Ver2),3.192,0.988,0.913,0.912,1.232,3.255,3.392
2,LightGBM (Ver2),0.233,0.972,0.913,0.908,1.851,3.264,3.454
3,XGBoost (Ver2),0.468,0.997,0.911,0.906,0.564,3.296,3.505
0,LightGBM (Ver3),0.177,0.956,0.903,0.882,2.295,3.388,4.027
1,CatBoost (Ver3),3.455,0.98,0.902,0.912,1.555,3.388,3.485


In [11]:
# Seven best unscaled baseline runs by CV RMSE (df_all_ver is sorted on that
# column when it is built). .copy() makes this an independent frame, so the
# column assignment below cannot trigger SettingWithCopyWarning or write
# through to df_all_ver.
best_before = (
    df_all_ver[df_all_ver["Dataset Version"].str.contains("unscaled")]
    .head(7)
    .copy()
)
# Label each run "<Model> (VerN)", then drop the now-redundant version column
# without mutating in place.
best_before["Model"] = (
    best_before["Model"]
    + " ("
    + best_before["Dataset Version"].apply(replace_ver)
    + ")"
)
best_before = best_before.drop(columns=["Dataset Version"])

fig = go.Figure()

# One bar series per RMSE metric, labelled with its value.
for rmse_metric in ['Train RMSE', 'CV RMSE', 'Test RMSE']:
    fig.add_trace(go.Bar(
        x=best_before['Model'], y=best_before[rmse_metric],
        text=best_before[rmse_metric], textposition='auto',
        name=rmse_metric
    ))

fig.update_layout(
    title='Before Tuning',
    xaxis=None,
    yaxis=dict(title='RMSE Score'),
    height=500*1.2,
    width=600*1.2,
    margin=dict(l=100, r=100, t=70, b=120),
    font_size=15,
    showlegend=False
)
fig.show()

In [12]:
# Train / CV / test R2 of the best baseline runs (best_before), one bar series
# per metric, each bar labelled with its value.
fig = go.Figure()

for r2_metric in ['Train R2', 'CV R2', 'Test R2']:
    fig.add_trace(go.Bar(
        x=best_before['Model'], y=best_before[r2_metric],
        text=best_before[r2_metric], textposition='auto',
        name=r2_metric
    ))

fig.update_layout(
    title='Before Tuning',
    xaxis=None,
    # The bars are R2 values, so the axis must not be labelled as RMSE.
    yaxis=dict(title='R-Squared Score'),
    height=500*1.2,
    width=600*1.2,
    margin=dict(l=100, r=100, t=100, b=120),
    font_size=15,
)
fig.show()

### After Tuning

In [13]:
# Load the tuning results for the "dropna + all" dataset (XGB + LGB per the
# file name) and display them.
ver1_tuned = pd.read_csv("../data/processed/tuning_dropna_all (XGB+LGB).csv")
ver1_tuned

Unnamed: 0,Model,Fit Time,Train R2,CV R2,Test R2,Train RMSE,CV RMSE,Test RMSE
0,LGBMRegressor (1244) {'objective': 'regression...,5.9132,0.9887,0.9288,0.9373,1.1671,2.8661,2.9412
1,LGBMRegressor (1244) {'objective': 'regression...,5.9058,0.9886,0.9281,0.9371,1.1696,2.8803,2.9456
2,XGBRegressor (1354) {'alpha': 0.00949506479294...,5.2654,0.9827,0.9252,0.936,1.4419,2.9362,2.9712
3,XGBRegressor (1279) {'alpha': 0.00949506479294...,5.0459,0.9807,0.9248,0.9348,1.5216,2.9431,2.998
4,LGBMRegressor (178) {'objective': 'regression'...,0.61,0.993,0.9214,0.932,0.9174,3.0128,3.0624
5,XGBRegressor (131) {'alpha': 3.028612934937252...,0.6253,0.9805,0.9171,0.9261,1.5295,3.1138,3.1928


In [14]:
# Take the rows at positions 0 and 2 of the tuning table (in the displayed
# output: the first LGBMRegressor and the first XGBRegressor entries) and
# round to three decimals.
ver1_tuned_top = ver1_tuned.iloc[[0, 2]].round(3)

# Reduce the long model description to its first token, tag it as tuned on
# Ver1, and swap the estimator class names for the library names used elsewhere.
short_names = (
    ver1_tuned_top["Model"].map(lambda label: label.split()[0]) + "_Tuned (Ver1)"
)
ver1_tuned_top["Model"] = (
    short_names
    .str.replace("LGBMRegressor", "LightGBM")
    .str.replace("XGBRegressor", "XGBoost")
)
ver1_tuned_top

Unnamed: 0,Model,Fit Time,Train R2,CV R2,Test R2,Train RMSE,CV RMSE,Test RMSE
0,LightGBM_Tuned (Ver1),5.913,0.989,0.929,0.937,1.167,2.866,2.941
2,XGBoost_Tuned (Ver1),5.265,0.983,0.925,0.936,1.442,2.936,2.971


In [15]:
# Load the tuning results for the "imputed + all" dataset (XGB + LGB per the
# file name) and display them.
ver2_tuned = pd.read_csv("../data/processed/tuning_imputed_all (XGB+LGB).csv")
ver2_tuned

Unnamed: 0,Model,Fit Time,Train R2,CV R2,Test R2,Train RMSE,CV RMSE,Test RMSE
0,XGBRegressor (1757) {'objective': 'reg:squared...,10.0355,0.9961,0.9259,0.9087,0.696,2.9985,3.4499
1,XGBRegressor (2000) {'objective': 'reg:squared...,11.6454,0.9968,0.9258,0.9085,0.6248,2.9999,3.4523
2,LGBMRegressor (940) {'objective': 'regression'...,5.3027,0.9907,0.9258,0.9095,1.0706,3.0011,3.4347
3,LGBMRegressor (942) {'objective': 'regression'...,5.3318,0.9909,0.9255,0.9084,1.0593,3.0069,3.4556
4,LGBMRegressor (331) {'objective': 'regression'...,1.4528,0.9924,0.9232,0.9116,0.9643,3.0562,3.3936
5,XGBRegressor (1064) {'objective': 'reg:squared...,2.2883,0.9854,0.9224,0.9129,1.3406,3.0743,3.3692


In [16]:
# Take the rows at positions 0 and 2 of the tuning table (in the displayed
# output: the first XGBRegressor and the first LGBMRegressor entries) and
# round to three decimals.
ver2_tuned_top = ver2_tuned.iloc[[0, 2]].round(3)

# Reduce the long model description to its first token, tag it as tuned on
# Ver2, and swap the estimator class names for the library names used elsewhere.
short_names = (
    ver2_tuned_top["Model"].map(lambda label: label.split()[0]) + "_Tuned (Ver2)"
)
ver2_tuned_top["Model"] = (
    short_names
    .str.replace("LGBMRegressor", "LightGBM")
    .str.replace("XGBRegressor", "XGBoost")
)
ver2_tuned_top

Unnamed: 0,Model,Fit Time,Train R2,CV R2,Test R2,Train RMSE,CV RMSE,Test RMSE
0,XGBoost_Tuned (Ver2),10.036,0.996,0.926,0.909,0.696,2.998,3.45
2,LightGBM_Tuned (Ver2),5.303,0.991,0.926,0.91,1.071,3.001,3.435


In [17]:
# Pool the baseline top ten with the tuned Ver1 / Ver2 models, then keep the
# seven rows with the lowest CV RMSE under a fresh 0..6 index.
tuning_candidates = pd.concat(
    [best_10_before, ver1_tuned_top, ver2_tuned_top], axis=0
)
best_after = (
    tuning_candidates
    .sort_values(by="CV RMSE")
    .head(7)
    .reset_index(drop=True)
)
best_after

Unnamed: 0,Model,Fit Time,Train R2,CV R2,Test R2,Train RMSE,CV RMSE,Test RMSE
0,LightGBM_Tuned (Ver1),5.913,0.989,0.929,0.937,1.167,2.866,2.941
1,XGBoost_Tuned (Ver1),5.265,0.983,0.925,0.936,1.442,2.936,2.971
2,CatBoost (Ver1),3.48,0.992,0.925,0.925,0.95,2.939,3.207
3,XGBoost_Tuned (Ver2),10.036,0.996,0.926,0.909,0.696,2.998,3.45
4,LightGBM_Tuned (Ver2),5.303,0.991,0.926,0.91,1.071,3.001,3.435
5,CatBoost (Ver2),3.53,0.992,0.923,0.912,0.99,3.078,3.385
6,LightGBM (Ver1),0.219,0.973,0.919,0.904,1.793,3.078,3.648


In [18]:
# Print the hex codes of the two qualitative palettes for reference.
for palette_label, palette in (
    ("D3:", px.colors.qualitative.D3),
    ("Plotly:", px.colors.qualitative.Plotly),
):
    print(palette_label, palette)

D3: ['#1F77B4', '#FF7F0E', '#2CA02C', '#D62728', '#9467BD', '#8C564B', '#E377C2', '#7F7F7F', '#BCBD22', '#17BECF']
Plotly: ['#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A', '#19D3F3', '#FF6692', '#B6E880', '#FF97FF', '#FECB52']


In [19]:
# Per-bar colour lists: seven copies each of the first three Plotly palette colours.
colors1 = ['#636EFA'] * 7
colors2 = ['#EF553B'] * 7
colors3 = ['#00CC96'] * 7

In [20]:
# Train / CV / test RMSE for the seven models in best_after, one bar series per
# metric, each bar labelled with its value.
fig = go.Figure()

for rmse_metric in ('Train RMSE', 'CV RMSE', 'Test RMSE'):
    fig.add_trace(go.Bar(
        x=best_after['Model'],
        y=best_after[rmse_metric],
        text=best_after[rmse_metric],
        textposition='auto',
        name=rmse_metric,
    ))

fig.update_layout(
    title='After Tuning',
    xaxis=None,
    yaxis={'title': None, 'range': [0, 4]},
    height=500*1.2,
    width=600*1.2,
    margin={'l': 70, 'r': 100, 't': 70, 'b': 120},
    font_size=15,
)

# Outline rectangles at hard-coded axis coordinates.
# NOTE(review): presumably these frame the tuned Ver1 and tuned Ver2 models;
# the x positions depend on the row order of best_after - confirm after any
# data change.
for left, right, top, outline in (
    (-0.47, 1.47, 3.03, "Red"),
    (2.53, 4.47, 3.5, "Purple"),
):
    fig.add_shape(
        type="rect",
        xref="x", yref="y",
        x0=left, y0=0,
        x1=right, y1=top,
        line={'color': outline, 'width': 3},
    )

fig.show()

In [21]:
# Train / CV / test R2 for the seven models in best_after, one bar series per
# metric, each bar labelled with its value.
fig = go.Figure()

for r2_metric in ('Train R2', 'CV R2', 'Test R2'):
    fig.add_trace(go.Bar(
        x=best_after['Model'],
        y=best_after[r2_metric],
        text=best_after[r2_metric],
        textposition='auto',
        name=r2_metric,
    ))

fig.update_layout(
    title='After Tuning',
    xaxis=None,
    yaxis=None,
    height=500*1.2,
    width=600*1.2,
    margin={'l': 100, 'r': 100, 't': 70, 'b': 120},
    font_size=16,
)
fig.show()

In [32]:
# Select the row labelled 1 of best_after as a one-row DataFrame. Indexing with
# a list keeps every column's dtype; the previous Series -> DataFrame -> .T
# round trip coerced all columns to object dtype.
# NOTE(review): label 1 is the second row after the CV-RMSE sort in best_after
# (not the top row); presumably a deliberate choice of final model - confirm.
final_model = best_after.loc[[1]]

In [33]:
# Display the single-row frame holding the selected final model.
final_model

Unnamed: 0,Model,Fit Time,Train R2,CV R2,Test R2,Train RMSE,CV RMSE,Test RMSE
1,XGBoost_Tuned (Ver1),5.265,0.983,0.925,0.936,1.442,2.936,2.971


In [40]:
# Train / CV / test RMSE of the selected final model, one labelled bar per metric.
fig = go.Figure()

for rmse_metric in ('Train RMSE', 'CV RMSE', 'Test RMSE'):
    fig.add_trace(go.Bar(
        x=final_model['Model'],
        y=final_model[rmse_metric],
        text=final_model[rmse_metric],
        textposition='auto',
        name=rmse_metric,
    ))

fig.update_layout(
    title='RMSE Score',
    xaxis=None,
    yaxis=None,
    height=500*1.2,
    width=300*1.2,
    margin={'l': 70, 'r': 100, 't': 70, 'b': 120},
    font_size=15,
    showlegend=False,
)
fig.show()

In [42]:
# Train / CV / test R2 of the selected final model, one labelled bar per metric.
fig = go.Figure()

for r2_metric in ('Train R2', 'CV R2', 'Test R2'):
    fig.add_trace(go.Bar(
        x=final_model['Model'],
        y=final_model[r2_metric],
        text=final_model[r2_metric],
        textposition='auto',
        name=r2_metric,
    ))

fig.update_layout(
    title='R-Squared Score',
    xaxis=None,
    yaxis=None,
    height=500*1.2,
    width=300*1.2,
    margin={'l': 70, 'r': 100, 't': 70, 'b': 120},
    font_size=15,
    showlegend=False,
)
fig.show()