In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import time

from joblib import dump
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, VotingRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [2]:
# Switch matplotlib to the 'ggplot' style sheet for every figure drawn after this cell.
plt.style.use('ggplot')

# 8-ensemble-models

In [3]:
# Load the pre-processed training split: feature table and target table.
# NOTE(review): the target is read with default options and stays a DataFrame;
# presumably it holds a single column — confirm against the CSV.
X_train = pd.read_csv('../data/processed/train_features.csv')
y_train = pd.read_csv('../data/processed/train_target.csv')

In [4]:
# Load the pre-processed held-out split, mirroring the training files.
# NOTE(review): the target is read with default options and stays a DataFrame;
# presumably it holds a single column — confirm against the CSV.
X_test = pd.read_csv('../data/processed/test_features.csv')
y_test = pd.read_csv('../data/processed/test_target.csv')

In [5]:
# Restrict both feature tables to their numeric columns. The same filter is applied to
# train and test so the two splits keep matching columns.
X_train = X_train.select_dtypes(include='number')
X_test = X_test.select_dtypes(include='number')

In [6]:
# Remember the feature names as a plain Python list.
# NOTE(review): `cols` is not read by any cell visible here; presumably it is kept because
# the scaling step rebinds X_train to the scaler's output — confirm it is used later.
cols = X_train.columns.tolist()

In [7]:
# Sanity check: train and test must expose the same feature columns in the same order,
# since everything fitted on the training features is later applied to the test features.
assert X_train.columns.tolist() == X_test.columns.tolist()

In [8]:
# Two independent scalers: one for the features and one for the target. Keeping the target
# scaler separate lets the evaluation cells map predictions back to the original units
# via `target_scaler.inverse_transform`.
feature_scaler = StandardScaler()
target_scaler = StandardScaler()

In [9]:
# Learn the scaling statistics from the training features only, then apply that same
# fitted transform to both splits so the test set never influences the scaler.
# NOTE: X_train / X_test are overwritten; re-running this cell without reloading the data
# would scale values that are already scaled.
X_train = feature_scaler.fit(X_train).transform(X_train)
X_test = feature_scaler.transform(X_test)

In [10]:
# Standardise the target as well, with statistics taken from the training target only.
# The evaluation cells undo this with `target_scaler.inverse_transform`.
# NOTE: y_train / y_test are overwritten; re-running this cell without reloading the data
# would scale values that are already scaled.
y_train = target_scaler.fit(y_train).transform(y_train)
y_test = target_scaler.transform(y_test)

In [11]:
# Shuffled 10-fold splitter with a fixed seed, so the folds are reproducible.
# NOTE(review): the grid searches visible in this notebook pass `cv=2` rather than this
# object — confirm whether `cv` is meant to be used there.
cv = KFold(n_splits=10, shuffle=True, random_state=42)

## Sample from training set

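Optionally subsample the training set for performance reasons. The sampling code below is currently commented out, so the models in the following sections are fit on the full training set.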

In [12]:
# import numpy as np
# np.random.seed(42)
# # mask = np.random.choice(np.arange(X_train.shape[0]), size=5 * 10 ** 4, replace=False)
# # mask = np.ones(shape=(50000,))

## AdaBoost

In [13]:
# Search space shared by both AdaBoost grid searches:
#   n_estimators  - 20, 40, 60, 80, 100 boosting rounds
#   learning_rate - powers of ten from 10**-4 up to 10**1
param_grid = dict(
    n_estimators=list(range(20, 101, 20)),
    learning_rate=[10 ** exponent for exponent in range(-4, 2)],
)

### With linear regression

In [14]:
# Base learner to be boosted: a LinearRegression with default settings.
est = LinearRegression()

In [15]:
# AdaBoost around the base learner `est`, seeded for reproducibility. `n_estimators` and
# `learning_rate` are left at their defaults here and tuned by the grid search that follows.
mdl = AdaBoostRegressor(est, random_state=42)

In [16]:
# Tune AdaBoost's `n_estimators` and `learning_rate` over `param_grid` with 2-fold
# cross-validation on all available cores, and report how long the search takes.
# The duration is measured with the monotonic `time.perf_counter()`: `time.time()` follows
# the wall clock, which can be adjusted mid-run and distort a measurement lasting minutes.
t1 = time.perf_counter()
gscv = GridSearchCV(mdl, n_jobs=-1, cv=2, param_grid=param_grid, verbose=1)
# Fit on the full training set; `.ravel()` flattens the 2-D scaled target into the 1-D
# vector the regressor is given.
gscv.fit(X_train, y_train.ravel())
t2 = time.perf_counter()
print(t2 - t1)  # elapsed seconds

Fitting 2 folds for each of 30 candidates, totalling 60 fits
1061.8125150203705


In [17]:
# Keep the best configuration found by the search and display it. The search was built
# without a `refit` argument, so GridSearchCV's default refit on the whole training data applies.
mdl = gscv.best_estimator_
mdl

AdaBoostRegressor(base_estimator=LinearRegression(), learning_rate=0.0001,
                  n_estimators=20, random_state=42)

In [18]:
# Persist the tuned AdaBoost / linear-regression model so it can be reloaded without
# repeating the grid search.
dump(value=mdl, filename='../models/ada_linear.joblib')

['../models/ada_linear.joblib']

In [19]:
# Predict the (scaled) test target with the tuned model.
# NOTE(review): `y_pred` is not read by the evaluation cells that follow; they call
# `mdl.predict` again themselves.
y_pred = mdl.predict(X_test)

In [20]:
# Training RMSE in the target's original units: true and predicted values are both mapped
# back through `target_scaler` before the error is computed.
# The square root is taken explicitly instead of passing `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6, whereas this form runs on every version.
mean_squared_error(
    target_scaler.inverse_transform(y_train),
    target_scaler.inverse_transform(mdl.predict(X_train).reshape(-1, 1)),
) ** 0.5

6127.8302183551705

In [21]:
# Test RMSE in the target's original units: true and predicted values are both mapped
# back through `target_scaler` before the error is computed.
# The square root is taken explicitly instead of passing `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6, whereas this form runs on every version.
mean_squared_error(
    target_scaler.inverse_transform(y_test),
    target_scaler.inverse_transform(mdl.predict(X_test).reshape(-1, 1)),
) ** 0.5

6155.544662694408

### With a shallow decision tree (max_depth=2)

In [22]:
# Weak base learner: a regression tree capped at depth 2 (one level deeper than a
# single-split stump), seeded for reproducibility.
est = DecisionTreeRegressor(max_depth=2, random_state=42)

In [23]:
# AdaBoost around the shallow tree `est`, seeded for reproducibility. `n_estimators` and
# `learning_rate` are left at their defaults here and tuned by the grid search that follows.
mdl = AdaBoostRegressor(est, random_state=42)

In [24]:
# Tune AdaBoost's `n_estimators` and `learning_rate` over `param_grid` with 2-fold
# cross-validation on 8 worker processes, and report how long the search takes.
# The duration is measured with the monotonic `time.perf_counter()`: `time.time()` follows
# the wall clock, which can be adjusted mid-run and distort a measurement lasting minutes.
t1 = time.perf_counter()
gscv = GridSearchCV(mdl, n_jobs=8, cv=2, param_grid=param_grid, verbose=1)
# Fit on the full training set; `.ravel()` flattens the 2-D scaled target into the 1-D
# vector the regressor is given.
gscv.fit(X_train, y_train.ravel())
t2 = time.perf_counter()
print(t2 - t1)  # elapsed seconds

Fitting 2 folds for each of 30 candidates, totalling 60 fits
199.77643728256226


In [25]:
# Keep the best configuration found by the search and display it. The search was built
# without a `refit` argument, so GridSearchCV's default refit on the whole training data applies.
mdl = gscv.best_estimator_
mdl

AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_depth=2,
                                                       random_state=42),
                  learning_rate=0.1, n_estimators=60, random_state=42)

In [26]:
# Persist the tuned AdaBoost / shallow-tree model so it can be reloaded without
# repeating the grid search.
dump(value=mdl, filename='../models/ada_stump.joblib')

['../models/ada_stump.joblib']

In [27]:
# Predict the (scaled) test target with the tuned model.
# NOTE(review): `y_pred` is not read by the evaluation cells that follow; they call
# `mdl.predict` again themselves.
y_pred = mdl.predict(X_test)

In [28]:
# Training RMSE in the target's original units: true and predicted values are both mapped
# back through `target_scaler` before the error is computed.
# The square root is taken explicitly instead of passing `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6, whereas this form runs on every version.
mean_squared_error(
    target_scaler.inverse_transform(y_train),
    target_scaler.inverse_transform(mdl.predict(X_train).reshape(-1, 1)),
) ** 0.5

7782.929549234288

In [29]:
# Test RMSE in the target's original units: true and predicted values are both mapped
# back through `target_scaler` before the error is computed.
# The square root is taken explicitly instead of passing `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6, whereas this form runs on every version.
mean_squared_error(
    target_scaler.inverse_transform(y_test),
    target_scaler.inverse_transform(mdl.predict(X_test).reshape(-1, 1)),
) ** 0.5

7763.976955728294

## Voting Regressor

In [30]:
# Blend an ElasticNet with a random forest and scan the blend weight.
# `w` is the ElasticNet weight and `1 - w` the forest weight, for w = 0.1, 0.2, ..., 0.9.
# Each iteration prints: fit time in seconds, w, and the test RMSE in original target units;
# the (w, rmse) pairs are also collected in `scores`.
# NOTE(review): every weight is scored on the test set, so choosing the best `w` from
# `scores` would tune on test data — consider a validation split or cross-validation.
# NOTE(review): the blend weights presumably only affect how predictions are averaged, not
# how the two sub-models are fitted, so the refit per weight looks redundant — confirm.
weights = [i/10 for i in range(1, 10)]
scores = list()

for w in weights:
    vote = VotingRegressor(
        estimators=[
            ('elastic', ElasticNet(random_state=42, max_iter=10_000)),
            ('forest', RandomForestRegressor(max_depth=6, n_estimators=100, random_state=42))
        ],
        weights=[w, 1 - w]
    )

    # Time the fit only, using a monotonic clock so wall-clock adjustments cannot skew it.
    t1 = time.perf_counter()
    vote.fit(X_train, y_train.ravel())
    t2 = time.perf_counter()

    # Persist the model only once it is fitted: dumping it before `fit` would save an
    # untrained estimator that cannot predict after being reloaded.
    dump(vote, f'../models/vote{w}.joblib')

    y_pred = vote.predict(X_test)
    # The root is taken explicitly instead of passing `squared=False`, which was deprecated
    # in scikit-learn 1.4 and removed in 1.6; this form runs on every version.
    rmse = mean_squared_error(
        target_scaler.inverse_transform(y_test),
        target_scaler.inverse_transform(y_pred.reshape(-1, 1)),
    ) ** 0.5

    scores.append((w, rmse))
    print(f'{t2 - t1:5.3f} {w:5.3f} {rmse:5.3f}')

21.470 0.100 6327.966
21.536 0.200 6522.458
21.603 0.300 6806.104
21.339 0.400 7168.328
21.374 0.500 7597.900
22.281 0.600 8084.091
21.664 0.700 8617.323
21.313 0.800 9189.411
21.397 0.900 9793.547
