In [7]:
import os
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import model_selection
from sklearn.metrics import mean_squared_error
import re
from sklearn.linear_model import ElasticNet
import pickle
import warnings
warnings.filterwarnings('ignore')

In [8]:
train_input = pd.read_csv('data/3_train_model_input.csv')
test_input = pd.read_csv('data/3_test_model_input.csv')

X_train = train_input.iloc[:,0:6]
y_train = train_input.iloc[:,-1]
X_test = test_input.iloc[:,0:6]
y_test = test_input.iloc[:,-1]

In [9]:
def stack():
    # define the base models
    level0 = list()
    level0.append(('XGB', XGBRegressor()))
    level0.append(('SVR', SVR()))
    level0.append(('GBR', GradientBoostingRegressor()))
    # level0.append(('CATB', CatBoostRegressor())),
    level0.append(('ElNET', ElasticNet())),
    level0.append(('LGBM', LGBMRegressor()))
    # define meta learner model
    level1 = LinearRegression()
    # define the stacking ensemble
    model = StackingRegressor(estimators=level0, final_estimator=level1, cv=10)
    return model

In [10]:
def validate_model(X_train, y_train, X_test, y_test) -> pd.DataFrame:
    
    dfs = []
    models = [
         ('Stack', stack()),
          ('XGB', XGBRegressor(n_estimators = 100, max_depth = 10, learning_rate = 0.05)), 
          ('LGBM', LGBMRegressor(num_leaves = 300, feature_fraction = 1, bagging_freq = 100, learning = 0.05)),
          ('GBR', GradientBoostingRegressor(n_estimators = 200, max_depth = 5,learning_rate=0.05)),
          ('SVR', SVR(C = 1000)),
          ('ElNet', ElasticNet(alpha=6.428073117284319e-05, l1_ratio=0.8, max_iter=5))
#          ('CATB', CatBoostRegressor(n_estimators = 500, max_depth = 4, learning_rate = 0.1)),)
          ]
    results = []
    names = []
    scoring = ['neg_mean_squared_error', 'neg_mean_absolute_error', 'r2']

    for name, model in models:
        kfold = model_selection.RepeatedKFold(n_splits=5, n_repeats = 3, random_state=90210)
        cv_results = model_selection.cross_validate(model, X_train, y_train, cv=kfold, scoring=scoring)
        clf = model.fit(X_train, y_train)
        f = 'models/{}.sav'.format(name)
        pickle.dump(model, open(f, 'wb'))
        y_pred = clf.predict(X_test)
        print(name)
        print(mean_squared_error(y_test, y_pred, squared = False))
        
        results.append(cv_results)
        names.append(name)

        this_df = pd.DataFrame(cv_results)
        this_df['model'] = name
        dfs.append(this_df)

        final = pd.concat(dfs, ignore_index=True)
        

    return final

In [None]:
%%timeit
final_result = validate_model(X_train, y_train, X_test, y_test)

Stack
23.87749073129018
XGB
23.88521840491415
LGBM
24.004477448471974
GBR
24.022670287738382
SVR
24.103654683664708
ElNet
26.260690082555275


| **Model** | **RMSE** |
|---|---|
| Light Gradient Boosting Regressor | 24.0045 |
| Gradient Boosting Regressor | 24.0235 |
| Support Vector Regressor (Radial) | 24.1036 |
| XGBoost Regressor | 23.8852 |
| Elastic Net | 26.2607 |
| Stacked Ensemble |23.8775  |

In [None]:
final_result.to_csv('data/4_validation_result.csv', index = False)  