### Regression ensembling

In [None]:
import pandas as pd
import numpy as np

def _read_predictions(csv_path):
    """Return the 'Prediction' column of the CSV file at ``csv_path`` as a Series."""
    return pd.read_csv(csv_path)['Prediction']


# Reference values; used as the regression target `y` further down.
test = _read_predictions('log/ensemble/test_true.csv')

# Predictions of the individual base models.
# NOTE(review): rows are presumably aligned with `test` row by row -- confirm
# against the code that wrote these files.
bfm_preds = _read_predictions('log/ensemble/bfm_preds.csv')
als_preds = _read_predictions('log/ensemble/als_preds.csv')
funk_pred = _read_predictions('log/ensemble/funk_preds.csv')

# Similarity-based predictors, one file per (weighting, neighbourhood size) setting.
sim_preds2 = _read_predictions('log/ensemble/sim_preds_w_none_n_30.csv')
sim_preds3 = _read_predictions('log/ensemble/sim_preds_w_none_n_10000.csv')
sim_preds4 = _read_predictions('log/ensemble/sim_preds_w_normal_n_30.csv')
sim_preds5 = _read_predictions('log/ensemble/sim_preds_w_normal_n_10000.csv')
sim_preds6 = _read_predictions('log/ensemble/sim_preds_w_normal_n_30_improved.csv')
sim_cosine = _read_predictions('log/ensemble/sim_cosine.csv')
sim_sigra = _read_predictions('log/ensemble/sim_sigra.csv')

# DeepRec predictions (two saved variants).
deeprec = _read_predictions('log/ensemble/deeprec.csv')
deeprec300 = _read_predictions('log/ensemble/deeprec_300.csv')

In [None]:
import numpy as np

# One column per base model. The column order is significant: the coefficients
# of any model fitted on X refer to these positions, so the matrix passed to
# predict() later (X_test) has to list the base models in exactly this order.
ensemble_inputs = [bfm_preds, sim_preds4, sim_preds3, sim_preds2, sim_preds5, sim_preds6]
X = np.column_stack(ensemble_inputs)

# Regression target as a plain NumPy array.
y = test.values

# Quick sanity check of the dimensions.
print(X.shape, y.shape)

In [None]:
from sklearn.model_selection import RepeatedKFold, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor

# 5-fold cross-validation repeated 3 times (15 train/validation splits per
# model); the fixed seed keeps the splits identical between runs.
kf = RepeatedKFold(n_repeats=3, n_splits=5, random_state=42)

# Accumulators filled by the evaluation loop: one array of per-split scores
# per model, and the matching model names for the plot.
results = []
names = []

def get_ensemble():
    """Build an (unfitted) voting ensemble of a linear regression and an MLP.

    Returns:
        VotingRegressor combining ``LinearRegression()`` (named 'LR') and
        ``MLPRegressor(random_state=42, max_iter=1000)`` (named 'MLP'),
        configured with ``n_jobs=-1``.
    """
    members = [
        ('LR', LinearRegression()),
        ('MLP', MLPRegressor(random_state=42, max_iter=1000)),
    ]
    return VotingRegressor(estimators=members, n_jobs=-1)

# Candidate second-level regressors, keyed by the label used in the printed
# report and in the box plot. Insertion order is the evaluation order.
models = dict(
    LinearReg=LinearRegression(),
    Lasso=Lasso(alpha=0.001),
    Ridge=Ridge(alpha=0.01),
    XGBoost=XGBRegressor(n_estimators=100, max_depth=7, n_jobs=-1),
    MLP=MLPRegressor(random_state=42, max_iter=1000),
    RF=RandomForestRegressor(max_depth=2, random_state=0, n_jobs=-1),
    Ensemble=get_ensemble(),
)

In [None]:
# Start from empty accumulators so that re-running this cell does not append
# duplicate entries to the box plot below.
results, names = [], []

# Models whose fitted coefficients (the blend weights) are printed with the score.
models_with_coef = ("LinearReg", "Lasso")

for name, model in models.items():
    scores = cross_val_score(model, X, y, scoring='neg_root_mean_squared_error', cv=kf, n_jobs=-1)
    rmse = -scores  # the scorer returns negated RMSE; flip the sign back
    results.append(rmse)
    names.append(name)

    coef_report = ""
    if name in models_with_coef:
        # cross_val_score fits clones of the estimator, so `model` itself is
        # still unfitted here; fit it on all of (X, y) to expose coef_.
        model.fit(X, y)
        coef_report = "Coef: " + " ".join("%0.5f" % c for c in model.coef_)

    print(name, ': %.6f (%.6f)' % (np.mean(rmse), np.std(rmse)), coef_report)

    # NOTE(review): this break stops after the first entry of `models`
    # ("LinearReg"), so the remaining candidates are never scored. It looks
    # like a leftover from a quick run -- remove it to compare all models.
    break

# Plot model performance for comparison. Tick labels are set through the Axes
# API rather than boxplot's `labels=` keyword, which newer matplotlib releases
# rename to `tick_labels`.
fig, ax = plt.subplots()
ax.boxplot(results, showmeans=True)
ax.set_xticklabels(names)
ax.set_ylabel("RMSE")
ax.set_title("Cross-validated RMSE per model")
plt.show()

| Models | RMSE | Std |
|---|---|---|
| BFM | 0.970204 | 0.002051 |
| BFM + PCC_30 | 0.969686 | 0.002034 |
| BFM + all_PCC | **0.969395** | 0.002066 |
| BFM + PCC_30 + SiGra + Cosine | 0.969561 | 0.002044 |
| BFM + all_similarities | 0.969404 | 0.002067 |
| BFM + DeepRec + PCC_30 | 0.969691 | 0.002037 |

## Generate final submission
Load each base model's predictions for the test set.

In [None]:
from utils.utils import generate_submission

# Base-model predictions for the submission set.
# NOTE: these assignments reuse (and overwrite) the names that held the
# predictions loaded from 'log/ensemble/' earlier, so cells that build X must
# not be re-run after this one without reloading those files first.
(
    bfm_preds,
    sim_preds2,
    sim_preds3,
    sim_preds4,
    sim_preds5,
    sim_preds6,
    deeprec,
) = (
    pd.read_csv(csv_path)['Prediction']
    for csv_path in (
        'log/ensemble_test/bfm.csv',
        'log/ensemble_test/sim_preds_w_none_n_30.csv',
        'log/ensemble_test/sim_preds_w_none_n_10000.csv',
        'log/ensemble_test/sim_preds_w_normal_n_30.csv',
        'log/ensemble_test/sim_preds_w_normal_n_10000.csv',
        'log/ensemble_test/sim_preds_w_normal_n_30_improved.csv',
        'log/ensemble_test/deeprec.csv',
    )
)

Define the regression model and fit it with training data.

In [None]:
# Final blender: ordinary least squares over the base-model columns of X.
# The learned coefficients are tied to the column order of X.
regressor = LinearRegression()
regressor.fit(X=X, y=y)

Predict results for test data.

In [None]:
# The columns must follow the order used when the regressor was fitted on X:
# (bfm, sim_preds4, sim_preds3, sim_preds2, sim_preds5, sim_preds6).
# Listing them as 2, 3, 4 instead would apply the weight learned for
# sim_preds4 to sim_preds2 and vice versa, silently skewing the blend.
X_test = np.stack((bfm_preds, sim_preds4, sim_preds3, sim_preds2, sim_preds5, sim_preds6), axis=1)
regressor_preds = regressor.predict(X_test)

Save the final submissions.

In [None]:
# Hand the blended predictions to the project helper together with the sample
# submission file and the name of the archive to produce.
# NOTE(review): generate_submission lives in utils.utils and is not shown here;
# presumably it uses the sample file as a template -- confirm.
generate_submission(
    regressor_preds,
    'data/sampleSubmission.csv',
    name="ensemble_BFM_all_sim.zip",
)

Submit to Kaggle.

In [None]:
from utils.utils import submit_on_kaggle
# NOTE(review): the message says sim_preds_w_None_n_10000 is excluded, but
# sim_preds3 (loaded from sim_preds_w_none_n_10000.csv) is one of the columns
# of the feature matrix -- confirm which of the two is intended.
submit_on_kaggle(
    name="ensemble_BFM_all_sim.zip",
    message="Linear Regression ensembling, BFM + all similarities except sim_preds_w_None_n_10000",
)