## Library imports

In [25]:
import pandas as pd
pd.set_option('display.max_columns', 100)
import numpy as np
np.set_printoptions(legacy="1.21")
import seaborn as sns
import matplotlib.pyplot as plt
import math
import scipy

from sklearn import set_config
set_config(transform_output='pandas')

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import (root_mean_squared_log_error, mean_absolute_error, mean_squared_error, r2_score, 
                             mean_absolute_percentage_error, root_mean_squared_error)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler, PolynomialFeatures, FunctionTransformer
from sklearn.tree import DecisionTreeRegressor, plot_tree

import category_encoders as ce
from category_encoders.hashing import HashingEncoder
from category_encoders.ordinal import OrdinalEncoder

import xgboost as xgb

import joblib

import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category = ConvergenceWarning) # Ignore ConvergenceWarning

from sklearn.linear_model import RidgeCV
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor

import multiprocessing

# def porch_func(df):
#     df_porch = df.copy()
#     df_porch["Porch_sum"] = 0
#     for porch in porch_cols:
#         df_porch["Porch_sum"] = df_porch["Porch_sum"] + df_porch[porch]
#     df_porch.drop(porch_cols, axis=1, inplace=True)
#     return df_porch

# porch_cols = ["ScreenPorch",
#             "3SsnPorch",
#             "WoodDeckSF",
#             "OpenPorchSF",
#             "EnclosedPorch"]

def porch_func(X):
    return pd.DataFrame(X.sum(axis=1), columns=['porch_cols'])

# Load Dataset, split, and pre-process

In [26]:
file_path = "../../house-prices-advanced-regression-techniques/input/train.csv"
houses = pd.read_csv(file_path)

X = houses.drop(columns = ["Id", "MiscFeature", "MSSubClass", "SalePrice"])
y = houses["SalePrice"].copy()
y = np.log(y) # taking logarithm for a more normally distributed target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

saved_preprocessor = joblib.load("preprocessor.joblib")

# Transform Data
saved_preprocessor.fit(X_train)
X_train_proc = saved_preprocessor.transform(X_train)
X_test_proc = saved_preprocessor.transform(X_test)

# Begin model creating and testing

In [27]:
def train_test_RMSLE(models_list, X_train, X_test, y_train, y_test):
    metrics = ["Training RMSLE", "Test RMSLE", "Variance"]
    results = pd.DataFrame(columns=metrics, index=[models_list])
    for model in models_list:
        model.fit(X_train, y_train)
        train_pred = model.predict(X_train)
        trainRMSLE = root_mean_squared_log_error(np.clip(y_train, a_min=1, a_max=np.max(y_train)), np.clip(train_pred, a_min=1, a_max=np.max(train_pred)))
        test_pred = model.predict(X_test)
        testRMSLE = root_mean_squared_log_error(np.clip(y_test, a_min=1, a_max=np.max(y_test)), np.clip(test_pred, a_min=1, a_max=np.max(test_pred)))
        results.loc[model, "Training RMSLE"] = trainRMSLE
        results.loc[model, "Test RMSLE"] = testRMSLE
        results.loc[model, "Variance"] = np.abs(trainRMSLE - testRMSLE)

    return results

In [28]:
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train_proc, y_train)
dt_depths = range(1, 10*dt.get_depth()+1)
dt_params = {'max_depth': dt_depths}
dt_gs = GridSearchCV(dt, dt_params, n_jobs=-1)
dt_gs.fit(X_train_proc, y_train)
best_depth_dt = dt_gs.best_estimator_
best_depth_dt

In [44]:
from sklearn.ensemble import AdaBoostRegressor


adb = AdaBoostRegressor(best_depth_dt, random_state=42)
adb.fit(X_train_proc, y_train)

In [29]:
# poly_pipe = Pipeline([("poly", PolynomialFeatures()),
#                     ("linear", LinearRegression())])
# poly_params = {'poly__degree': range(1, 4)}
# poly_gs = GridSearchCV(poly_pipe, poly_params, n_jobs=-1)
# poly_gs.fit(X_train_proc, y_train)
# best_degree_poly = poly_gs.best_estimator_
# best_degree_poly
# # since this is degree 1, it is just linear

In [30]:
lr = LinearRegression()
lr.fit(X_train_proc, y_train)

In [31]:
lasso = Lasso(random_state=42)
lasso.fit(X_train_proc, y_train)
lasso_params = {'alpha': [10**x for x in range(-4,5)],
                'max_iter': [1000, 1200, 1500]}
lasso_gs = GridSearchCV(lasso, lasso_params, n_jobs=-1)
lasso_gs.fit(X_train_proc, y_train)
best_alpha_lasso = lasso_gs.best_estimator_
best_alpha_lasso

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [32]:
ridge = Ridge(random_state=42)
ridge.fit(X_train_proc, y_train)
ridge_params = {'alpha': [10**x for x in range(-4,5)]}
ridge_gs = GridSearchCV(ridge, ridge_params, n_jobs=-1)
ridge_gs.fit(X_train_proc, y_train)
best_alpha_ridge = ridge_gs.best_estimator_
best_alpha_ridge

In [33]:
en = ElasticNet(random_state=42)
en.fit(X_train_proc, y_train)
en_params = {"alpha": [10**x for x in range(-4,5)],
             "l1_ratio": [x/10 for x in range(1,11)],
             "max_iter": [10000]}
en_gs = GridSearchCV(en, en_params, n_jobs=-1)
en_gs.fit(X_train_proc, y_train)
best_alpha_l1_en = en_gs.best_estimator_
best_alpha_l1_en

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [34]:
xgb_model = xgb.XGBRegressor(objective="reg:squaredlogerror",
                             n_jobs=multiprocessing.cpu_count() // 2,
                             eval_metric="rmsle")
xgb_model_params = {"max_depth": [None, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89],
              "eta": [x/10 for x in range(11)],
              "n_estimators": [50, 100, 200, 500, 1000]}
xgb_gs = GridSearchCV(xgb_model, xgb_model_params)

xgb_gs.fit(X_train_proc, y_train)
best_xgb = xgb_gs.best_estimator_
best_xgb

# n_jobs=multiprocessing.cpu_count() // 2
# tree_method="hist", 

In [42]:
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train_proc, y_train)
rf.get_params()


{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [43]:
rf_params = {'max_depth': [1, 2, 3, 5, 8, 13, 21, 34, 55, 89],
             "max_features": ["sqrt", "log2", None, 1.0],
             'n_estimators': [10, 50, 100, 200]} 
rf_gs = GridSearchCV(rf, rf_params, n_jobs=-1)
rf_gs.fit(X_train_proc, y_train)
best_rf = rf_gs.best_estimator_
best_rf

In [46]:
from sklearn.neighbors import KNeighborsRegressor


knr = KNeighborsRegressor()
knr.fit(X_train_proc, y_train)
knr.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [47]:
knr_params = {'n_neighbors': [3, 5, 8, 13],
              'leaf_size': [10, 20, 30]}
knr_gs = GridSearchCV(knr, knr_params, n_jobs=-1)
knr_gs.fit(X_train_proc, y_train)
best_knr = knr_gs.best_estimator_
best_knr

In [49]:
from sklearn.ensemble import GradientBoostingRegressor


gbb = GradientBoostingRegressor(random_state=42)
gbb.fit(X_train_proc, y_train)
gbb.get_params()

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'squared_error',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': 42,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [50]:
gbb_params = {'max_depth': [1, 2, 3, 5, 8, 13, 21, 34, 55, 89],
              "max_features": ["sqrt", "log2", None, 1.0]}
gbb_gs = GridSearchCV(gbb, gbb_params, n_jobs=-1)
gbb_gs.fit(X_train_proc, y_train)
best_gbb = gbb_gs.best_estimator_
best_gbb

In [51]:
from sklearn.linear_model import BayesianRidge


br = BayesianRidge()
br.fit(X_train_proc, y_train)


In [52]:
from sklearn.linear_model import LassoLars


llars = LassoLars()
llars.fit(X_train_proc, y_train)


In [96]:
import lightgbm as lgb


lgb_model = lgb.LGBMRegressor(random_state=42, force_row_wise=True)
lgb_model.fit(X_train_proc, y_train)


[LightGBM] [Info] Total Bins 3009
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 83
[LightGBM] [Info] Start training from score 12.030652


In [97]:
list_of_best_estimators = [best_depth_dt,
                           adb,
                           best_alpha_lasso, 
                           best_alpha_ridge, 
                           best_alpha_l1_en, 
                           best_xgb,
                           best_rf,
                           best_knr,
                           best_gbb,
                           br,
                           llars,
                           lr,
                           lgb_model]

train_test_RMSLE(list_of_best_estimators, X_train_proc, X_test_proc, y_train, y_test).sort_values(by="Test RMSLE")

[LightGBM] [Info] Total Bins 3009
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 83
[LightGBM] [Info] Start training from score 12.030652


Unnamed: 0,Training RMSLE,Test RMSLE,Variance
"LGBMRegressor(force_row_wise=True, random_state=42)",0.003477,0.011188,0.007711
"([DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)], [DecisionTreeRegressor(criterion='friedman_mse', max_depth=5,\n max_features='sqrt',\n random_state=RandomState(MT19937) at 0x16B7FC540)])",0.003919,0.011316,0.007396
BayesianRidge(),0.010459,0.011716,0.001257
"ElasticNet(alpha=0.01, l1_ratio=0.1, max_iter=10000, random_state=42)",0.010534,0.011753,0.001219
"Lasso(alpha=0.001, random_state=42)",0.010464,0.011769,0.001304
"Ridge(alpha=10, random_state=42)",0.01031,0.011837,0.001526
"XGBRegressor(base_score=None, booster=None, callbacks=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, device=None, early_stopping_rounds=None,\n enable_categorical=False, eta=0.1, eval_metric='rmsle',\n feature_types=None, feature_weights=None, gamma=None,\n grow_policy=None, importance_type=None,\n interaction_constraints=None, learning_rate=None, max_bin=None,\n max_cat_threshold=None, max_cat_to_onehot=None,\n max_delta_step=None, max_depth=2, max_leaves=None,\n min_child_weight=None, missing=nan, monotone_constraints=None,\n multi_strategy=None, n_estimators=1000, n_jobs=5, ...)",0.008529,0.011952,0.003424
LinearRegression(),0.010227,0.01199,0.001763
"(DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=1608637542), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=1273642419), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=1935803228), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=787846414), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=996406378), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=1201263687), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=423734972), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=415968276), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=670094950), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=1914837113), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=669991378), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=429389014), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=249467210), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=1972458954), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=1572714583), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=1433267572), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=434285667), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=613608295), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=893664919), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=648061058), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=88409749), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=242285876), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=2018247425), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=953477463), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=1427830251), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=1883569565), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=911989541), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=3344769), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=780932287), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=2114032571), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=787716372), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=504579232), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=1306710475), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=479546681), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=106328085), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=30349564), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=1855189739), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=99052376), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=1250819632), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=106406362), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=480404538), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=1717389822), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=599121577), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=200427519), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=1254751707), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=2034764475), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=1573512143), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=999745294), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=1958805693), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=389151677), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=1224821422), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=508464061), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=857592370), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=1642661739), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=61136438), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=2075460851), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=396917567), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=2004731384), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=199502978), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=1545932260), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=461901618), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=774414982), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=732395540), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=1934879560), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=279394470), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=56972561), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=1927948675), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=1899242072), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=1999874363), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=271820813), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=1324556529), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=1655351289), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=1308306184), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=68574553), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=419498548), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=991681409), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=791274835), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=1035196507), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=1890440558), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=787110843), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=524150214), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=472432043), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=2126768636), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=1431061255), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=147697582), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=744595490), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=1758017741), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=1679592528), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=1111451555), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=782698033), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=698027879), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=1096768899), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=1338788865), DecisionTreeRegressor(max_depth=21, max_features='sqrt',\n random_state=1826030589), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=86191493), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=893102645), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=200619113), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=290770691), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=793943861), DecisionTreeRegressor(max_depth=21, max_features='sqrt', random_state=134489564), ...)",0.004027,0.012125,0.008098
"(DecisionTreeRegressor(max_depth=5, random_state=1608637542), DecisionTreeRegressor(max_depth=5, random_state=829499329), DecisionTreeRegressor(max_depth=5, random_state=1585070673), DecisionTreeRegressor(max_depth=5, random_state=442551518), DecisionTreeRegressor(max_depth=5, random_state=1751393329), DecisionTreeRegressor(max_depth=5, random_state=123700147), DecisionTreeRegressor(max_depth=5, random_state=1313855609), DecisionTreeRegressor(max_depth=5, random_state=1581823662), DecisionTreeRegressor(max_depth=5, random_state=649389236), DecisionTreeRegressor(max_depth=5, random_state=248260616), DecisionTreeRegressor(max_depth=5, random_state=1992285976), DecisionTreeRegressor(max_depth=5, random_state=1487997034), DecisionTreeRegressor(max_depth=5, random_state=1506255036), DecisionTreeRegressor(max_depth=5, random_state=873098516), DecisionTreeRegressor(max_depth=5, random_state=1475246069), DecisionTreeRegressor(max_depth=5, random_state=1955978330), DecisionTreeRegressor(max_depth=5, random_state=1554391404), DecisionTreeRegressor(max_depth=5, random_state=1151797810), DecisionTreeRegressor(max_depth=5, random_state=63580194), DecisionTreeRegressor(max_depth=5, random_state=87759803), DecisionTreeRegressor(max_depth=5, random_state=93767173), DecisionTreeRegressor(max_depth=5, random_state=705913839), DecisionTreeRegressor(max_depth=5, random_state=642124732), DecisionTreeRegressor(max_depth=5, random_state=404365017), DecisionTreeRegressor(max_depth=5, random_state=752562654), DecisionTreeRegressor(max_depth=5, random_state=846097625), DecisionTreeRegressor(max_depth=5, random_state=708269083), DecisionTreeRegressor(max_depth=5, random_state=1963827238), DecisionTreeRegressor(max_depth=5, random_state=425536186), DecisionTreeRegressor(max_depth=5, random_state=1032956233), DecisionTreeRegressor(max_depth=5, random_state=1621547958), DecisionTreeRegressor(max_depth=5, random_state=1807488083), DecisionTreeRegressor(max_depth=5, random_state=52542812), DecisionTreeRegressor(max_depth=5, random_state=1978851827), DecisionTreeRegressor(max_depth=5, random_state=1564023527), DecisionTreeRegressor(max_depth=5, random_state=516530012), DecisionTreeRegressor(max_depth=5, random_state=1496045851), DecisionTreeRegressor(max_depth=5, random_state=1949765396), DecisionTreeRegressor(max_depth=5, random_state=610047358), DecisionTreeRegressor(max_depth=5, random_state=1960467697), DecisionTreeRegressor(max_depth=5, random_state=542532488), DecisionTreeRegressor(max_depth=5, random_state=1228678302), DecisionTreeRegressor(max_depth=5, random_state=981662970), DecisionTreeRegressor(max_depth=5, random_state=956748525), DecisionTreeRegressor(max_depth=5, random_state=175162603), DecisionTreeRegressor(max_depth=5, random_state=1937748223), DecisionTreeRegressor(max_depth=5, random_state=1521290213), DecisionTreeRegressor(max_depth=5, random_state=362113584), DecisionTreeRegressor(max_depth=5, random_state=897677894), DecisionTreeRegressor(max_depth=5, random_state=1141829407))",0.007257,0.01252,0.005263


In [98]:
estimators = [("linear_regression", lr),
              ("lasso", best_alpha_lasso),
              ("elastic_net", best_alpha_l1_en),
              ("xgboost", best_xgb),
              ("random_forest", best_rf),
              ("bayesian", br),
              ("ridge", best_alpha_ridge),
              ("gradient boosting", gbb),
              ("LARS", llars),
              ("lightGBM", lgb_model)
              ]
# listing all the best estimators and dropping dt (lowest RMSLE)
best_stacked = StackingRegressor(estimators=estimators).fit(X_train_proc, y_train)

[LightGBM] [Info] Total Bins 3009
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 83
[LightGBM] [Info] Start training from score 12.030652
[LightGBM] [Info] Total Bins 2845
[LightGBM] [Info] Number of data points in the train set: 934, number of used features: 82
[LightGBM] [Info] Start training from score 12.029308
[LightGBM] [Info] Total Bins 2830
[LightGBM] [Info] Number of data points in the train set: 934, number of used features: 81
[LightGBM] [Info] Start training from score 12.020792
[LightGBM] [Info] Total Bins 2843
[LightGBM] [Info] Number of data points in the train set: 934, number of used features: 82
[LightGBM] [Info] Start training from score 12.041222
[LightGBM] [Info] Total Bins 2837
[LightGBM] [Info] Number of data points in the train set: 935, number of used features: 81
[LightGBM] [Info] Start training from score 12.034284
[LightGBM] [Info] Total Bins 2843
[LightGBM] [Info] Number of data points in the train set: 935, number 

In [99]:
train_test_RMSLE([best_stacked], X_train_proc, X_test_proc, y_train, y_test).sort_values(by="Test RMSLE")

[LightGBM] [Info] Total Bins 3009
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 83
[LightGBM] [Info] Start training from score 12.030652
[LightGBM] [Info] Total Bins 2845
[LightGBM] [Info] Number of data points in the train set: 934, number of used features: 82
[LightGBM] [Info] Start training from score 12.029308
[LightGBM] [Info] Total Bins 2830
[LightGBM] [Info] Number of data points in the train set: 934, number of used features: 81
[LightGBM] [Info] Start training from score 12.020792
[LightGBM] [Info] Total Bins 2843
[LightGBM] [Info] Number of data points in the train set: 934, number of used features: 82
[LightGBM] [Info] Start training from score 12.041222
[LightGBM] [Info] Total Bins 2837
[LightGBM] [Info] Number of data points in the train set: 935, number of used features: 81
[LightGBM] [Info] Start training from score 12.034284
[LightGBM] [Info] Total Bins 2843
[LightGBM] [Info] Number of data points in the train set: 935, number 



Unnamed: 0,Training RMSLE,Test RMSLE,Variance
"StackingRegressor(estimators=[('linear_regression', LinearRegression()),\n ('lasso', Lasso(alpha=0.001, random_state=42)),\n ('elastic_net',\n ElasticNet(alpha=0.01, l1_ratio=0.1,\n max_iter=10000, random_state=42)),\n ('xgboost',\n XGBRegressor(base_score=None, booster=None,\n callbacks=None,\n colsample_bylevel=None,\n colsample_bynode=None,\n colsample_bytree=None, device=None,\n earl...\n n_estimators=1000, n_jobs=5, ...)),\n ('random_forest',\n RandomForestRegressor(max_depth=21,\n max_features='sqrt',\n n_estimators=200,\n random_state=42)),\n ('bayesian', BayesianRidge()),\n ('ridge', Ridge(alpha=10, random_state=42)),\n ('gradient boosting',\n GradientBoostingRegressor(random_state=42)),\n ('LARS', LassoLars()),\n ('lightGBM',\n LGBMRegressor(force_row_wise=True,\n random_state=42))])",0.004811,0.010655,0.005845


# Submission staging

In [100]:
test_file_path = "../../house-prices-advanced-regression-techniques/input/test.csv"
test_data = pd.read_csv(test_file_path)
ids = test_data.pop("Id")

test_data.drop(columns=["MiscFeature", "MSSubClass"], inplace=True)
test_data_proc = saved_preprocessor.transform(test_data)

preds = best_stacked.predict(test_data_proc)
preds = np.exp(preds)

output = pd.DataFrame({"Id": ids,
                       "SalePrice": preds})

output.head()



Unnamed: 0,Id,SalePrice
0,1461,120795.365938
1,1462,157366.832163
2,1463,184363.673844
3,1464,192473.210166
4,1465,195618.402427


# Submission export

In [101]:
sample_submission_file_path = "../../house-prices-advanced-regression-techniques/input/sample_submission.csv"
sample_submission_df = pd.read_csv(sample_submission_file_path)
sample_submission_df["SalePrice"] = preds
sample_submission_df.to_csv("submissions/" + "02t_lgbm.csv", index=False)
sample_submission_df.head()

Unnamed: 0,Id,SalePrice
0,1461,120795.365938
1,1462,157366.832163
2,1463,184363.673844
3,1464,192473.210166
4,1465,195618.402427


# Save model fo future use

In [102]:
# Fitted preprocessor on all data
joblib.dump(best_stacked, "model.joblib")

['model.joblib']