In [1]:
import numpy as np
import pandas as pd
import glob
import os

from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import KFold, cross_validate

In [2]:
# prepare data
# Builds two frames with one column per base model:
#   df      - predictions on the validation set, plus the TARGET column
#   df_test - predictions on the test set (read from the submission files)
# Assumes every prediction CSV holds an index column followed by exactly one
# prediction column (otherwise the column renaming below fails) - TODO confirm.

def _model_name(path):
    """Return the model name encoded in a prediction file path.

    The name is the file's base name up to its first dot,
    e.g. 'some/dir/lgb_dart.csv' -> 'lgb_dart'.
    """
    # os.path.basename understands the platform's path separator; the previous
    # split('\\') only worked on Windows-style paths and raised IndexError elsewhere.
    return os.path.basename(path).split('.')[0]


# validation predictions
# glob order depends on the file system, so sort for a reproducible column order
files = sorted(glob.glob('./training_files/valid_preds/*.csv'))
modelnames = [_model_name(f) for f in files]
dfs = [pd.read_csv(f, index_col=0).reset_index(drop=True) for f in files]

df = pd.concat(dfs, axis=1)
df.columns = modelnames

# test predictions
# the stacking output is written to this same folder, so it is skipped here
# to keep it from being used as one of its own inputs
files = sorted(glob.glob('../submissions/*.csv'))
files = [f for f in files if _model_name(f) != 'stacking']
modelnames = [_model_name(f) for f in files]
dfs_test = [pd.read_csv(f, index_col=0).reset_index(drop=True) for f in files]

df_test = pd.concat(dfs_test, axis=1)
df_test.columns = modelnames

# load validation set
valid = pd.read_csv('../data/final/valid.csv')
TARGET = 'MedHouseVal'
FEATURES = modelnames

# every model with test predictions also needs validation predictions,
# otherwise selecting df[FEATURES] fails later with a less helpful message
missing = [name for name in FEATURES if name not in df.columns]
if missing:
    raise KeyError(f'No validation predictions found for: {missing}')

df[TARGET] = valid[TARGET]

print('Stacking dataset info:')
print(f'Valid shape: {df.shape}\tTest shape: {df_test.shape}')
df.head()

Stacking dataset info:
Valid shape: (14855, 6)	Test shape: (24759, 5)


Unnamed: 0,catboost,lgb,lgb_dart,randomforest,xtrees,MedHouseVal
0,1.339446,1.434472,1.288268,1.426714,1.444429,0.946
1,2.654929,2.230659,1.979263,2.373727,2.319483,1.576
2,3.038709,2.960533,2.64676,3.0872,3.136938,2.291
3,1.254343,1.21273,1.10209,1.220702,1.304687,1.393
4,3.243742,3.521457,3.019146,3.225335,3.168902,2.092


In [3]:
class cfg:
    """Settings shared by the cross-validation splitter and the stacking model."""

    seed: int = 42    # passed as random_state to KFold and ElasticNetCV
    nfolds: int = 5   # number of KFold splits
    njobs: int = 4    # passed as n_jobs to ElasticNetCV

# Shuffled K-fold splitter; the fixed random_state keeps the fold assignment
# the same on every run.
cv = KFold(
    n_splits=cfg.nfolds,
    shuffle=True,
    random_state=cfg.seed,
)

In [4]:
# stacking linear model
# Elastic net fitted without an intercept on the base-model predictions;
# the l1_ratio candidates and 200 alphas are searched using the `cv` splitter.
model = ElasticNetCV(
    l1_ratio=[.1, .2, .3, .5, .7, .8, .9, .95, .99, 1],
    n_alphas=200,
    fit_intercept=False,
    cv=cv,
    random_state=cfg.seed,
    n_jobs=cfg.njobs,
)

# Outer cross-validation of the whole model-selection procedure.
X_valid, y_valid = df[FEATURES], df[TARGET]
scores = cross_validate(
    model,
    X_valid,
    y_valid,
    cv=cv,
    scoring='neg_mean_squared_error',
)

# The scorer returns negated MSE per fold: flip the sign, take the root,
# then average the per-fold RMSE values.
fold_rmse = np.sqrt(-scores['test_score'])
score = fold_rmse.mean()
print(f'Stacking linear model, CV RMSE: {score:.4f}')

Stacking linear model, CV RMSE: 0.5467


In [5]:
# final model and predictions
# Refit the stacking model on all validation-set predictions and show the
# weight it assigns to each base model (one coefficient per FEATURES entry).
final_model = model.fit(df[FEATURES], df[TARGET])
print(final_model.coef_)

# Select the test columns through FEATURES, exactly as for training, so the
# prediction matrix always has the same columns in the same order as the fit.
final_preds = final_model.predict(df_test[FEATURES])

[0.12933931 0.86781725 0.         0.         0.        ]


In [6]:
# save predictions
# Use the sample submission as a template (its first column becomes the index)
# and overwrite the target column with the stacked predictions.
sub = pd.read_csv('../data/raw/sample_submission.csv', index_col=0)
sub[TARGET] = final_preds

out_path = '../submissions/'
os.makedirs(out_path, exist_ok=True)
sub.to_csv(os.path.join(out_path, 'stacking.csv'))

# Last expression of the cell so the preview is actually rendered.
sub.head()