# 2. Load Dataset

In [1]:
%load_ext autoreload
%autoreload 2 

In [2]:
import pandas as pd 
import numpy as np 
from joblib import dump 

In [7]:
from draft.data.sets import load_sets

In [9]:
import os 

# Resolve the processed-data folder relative to the current working
# directory: <cwd>/../data/processed.
notebooks_dir = os.getcwd()
project_dir = os.path.dirname(notebooks_dir)
datasets_dir = os.path.join(project_dir, "data", "processed")

# Load all splits with one call, then pull features and targets out by key.
sets = load_sets(load_dir=datasets_dir)

X_train, X_val, X_test = (sets.get(key) for key in ("X_train", "X_val", "X_test"))
y_train, y_val, y_test = (sets.get(key) for key in ("y_train", "y_val", "y_test"))

# 3. Train baseline Linear Regression model

In [10]:
from sklearn.linear_model import LinearRegression

In [11]:
# Ordinary least-squares baseline with default settings; this same instance
# is later passed to AdaBoostRegressor as its base estimator.
reg = LinearRegression()

In [12]:
# Fit the baseline on the training split only.
reg.fit(X_train, y_train)

In [13]:
# Persist the fitted baseline so it can be reloaded without retraining.
# NOTE(review): `reg` is a LinearRegression, yet the file is named
# "adaboost_default.joblib" — confirm the intended name before renaming,
# since other code may load this exact path.
models_dir = os.path.join(project_dir, "models")
os.makedirs(models_dir, exist_ok=True)  # open() below fails if the folder is missing
with open(os.path.join(models_dir, "adaboost_default.joblib"), 'wb') as f:
    dump(reg, f)

In [14]:
# Baseline predictions on the training and validation features.
# NOTE(review): neither array is referenced again in the cells visible here;
# the scoring helper below is given the model and features directly.
y_train_preds = reg.predict(X_train) 
y_val_preds = reg.predict(X_val) 

In [15]:
from draft.model.eval import assess_regressor_set 

In [21]:
# Accumulator for the score records of each model evaluated in this notebook.
results = [] 

In [16]:
# Score the linear baseline on the training split, reporting RMSE and MAE.
train_scores = assess_regressor_set(
    set_name="Linear default train",
    model=reg,
    features=X_train,
    target=y_train,
    metrics=["root_mean_squared_error", "mean_absolute_error"],
)

Regression metrics for Linear default train
--------------------------------------------------
root_mean_squared_error: 11424.212732783762
mean_absolute_error: 3056.5137926117104


In [22]:
# Record the baseline's training scores. Re-running this cell appends a
# duplicate entry; re-run the `results = []` cell first to start clean.
results.append(train_scores)

In [17]:
# Score the linear baseline on the validation split, reporting RMSE and MAE.
val_scores = assess_regressor_set(
    set_name="Linear default val",
    model=reg,
    features=X_val,
    target=y_val,
    metrics=["root_mean_squared_error", "mean_absolute_error"],
)

Regression metrics for Linear default val
--------------------------------------------------
root_mean_squared_error: 10537.362088824131
mean_absolute_error: 3047.9544366946498


In [23]:
# Record the baseline's validation scores. Re-running this cell appends a
# duplicate entry; re-run the `results = []` cell first to start clean.
results.append(val_scores)

# 4. Hyperparameter Tuning

In [18]:
from draft.model.eval import fit_assess_regressor

**[4.2]** Train an AdaBoost model (using the linear regression `reg` as its base estimator) with random state 0, n_estimators 100 and learning_rate 0.05, and print its scores on the training and validation sets.

In [25]:
from sklearn.ensemble import AdaBoostRegressor 

# Boosted ensemble for task 4.2: 100 estimators, learning rate 0.05, seed 0.
# The base estimator is `reg`, the LinearRegression instance defined earlier.
ada_1 = AdaBoostRegressor(
    estimator=reg,
    n_estimators=100,
    learning_rate=0.05,
    random_state=0,
)

# The project helper prints the train and validation metrics for this model.
ada_1_results = fit_assess_regressor(
    model=ada_1, name="ada-1",
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
)

Regression metrics for AdaBoostRegressor(estimator=LinearRegression(), learning_rate=0.05,
                  n_estimators=100, random_state=0)_train
--------------------------------------------------
mean_squared_error: 383852063.08375233
root_mean_squared_error: 19592.142891571417
mean_absolute_error: 12990.600443389601
r2_score: -1.8722215888591536
Regression metrics for AdaBoostRegressor(estimator=LinearRegression(), learning_rate=0.05,
                  n_estimators=100, random_state=0)_val
--------------------------------------------------
mean_squared_error: 366829024.04333746
root_mean_squared_error: 19152.78110466826
mean_absolute_error: 12893.583870228957
r2_score: -2.191579611578733


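The RMSE and MAE are actually much higher than those of the default linear regression model, and the negative R² on both sets means the boosted model does worse than simply predicting the mean.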

In [26]:
# Record the scores of the first AdaBoost variant. Re-running this cell
# appends a duplicate entry to `results`.
results.append(ada_1_results)

In [27]:
# Second variant: identical to ada_1 except for the squared loss function.
ada_2 = AdaBoostRegressor(
    estimator=reg,
    n_estimators=100,
    learning_rate=0.05,
    loss="square",
    random_state=0,
)

# The project helper prints the train and validation metrics for this model.
ada_2_results = fit_assess_regressor(
    model=ada_2, name="ada-2",
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
)

Regression metrics for AdaBoostRegressor(estimator=LinearRegression(), learning_rate=0.05,
                  loss='square', n_estimators=100, random_state=0)_train
--------------------------------------------------
mean_squared_error: 4330674891.841657
root_mean_squared_error: 65807.86344990738
mean_absolute_error: 47744.5941344484
r2_score: -31.404822365026362
Regression metrics for AdaBoostRegressor(estimator=LinearRegression(), learning_rate=0.05,
                  loss='square', n_estimators=100, random_state=0)_val
--------------------------------------------------
mean_squared_error: 4288721483.8907223
root_mean_squared_error: 65488.3308986473
mean_absolute_error: 47503.738334727524
r2_score: -36.3138305602237


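With the square loss the error is far worse: validation RMSE rises from about 19,153 (ada-1) to about 65,488, and R² drops to about -36.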

In [29]:
# Persist the second AdaBoost variant (square loss).
# NOTE(review): `ada_2` scored worse than both the linear baseline and `ada_1`
# in the outputs above — confirm this is the model meant to be saved as
# "ada.joblib".
models_dir = os.path.join(project_dir, "models")
os.makedirs(models_dir, exist_ok=True)  # avoid FileNotFoundError when the folder is missing
with open(os.path.join(models_dir, "ada.joblib"), 'wb') as f:
    dump(ada_2, f)