## Library imports

In [40]:
import pandas as pd
pd.set_option('display.max_columns', 100)
import numpy as np
np.set_printoptions(legacy="1.25")
import seaborn as sns
import matplotlib.pyplot as plt
import math
import scipy

from sklearn import set_config
set_config(transform_output='pandas')

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import (root_mean_squared_log_error, mean_absolute_error, mean_squared_error, r2_score, 
                             mean_absolute_percentage_error, root_mean_squared_error)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler, PolynomialFeatures, FunctionTransformer
from sklearn.tree import DecisionTreeRegressor, plot_tree

import category_encoders as ce
from category_encoders.hashing import HashingEncoder
from category_encoders.ordinal import OrdinalEncoder

import xgboost as xgb

import joblib

# importing stacking lib
from vecstack import stacking, StackingTransformer

import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category = ConvergenceWarning) # Ignore ConvergenceWarning

from sklearn.linear_model import RidgeCV
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor

import multiprocessing


# Load Dataset, split, and pre-process

In [7]:
file_path = "../../house-prices-advanced-regression-techniques/input/train.csv"
houses = pd.read_csv(file_path)

X = houses.drop(columns = "SalePrice")
y = houses["SalePrice"].copy()
y = np.log(y) # taking logarithm for a more normally distributed target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# column selection
ohe_cols = ["MoSold",
            "GarageFinish",
            "CentralAir",
            "Street",
            "Alley",
            "LotShape",
            "LandContour",
            "Utilities",
            "LotConfig",
            "LandSlope"]
num_cols = ["MiscVal",
            "YrSold",
            "PoolArea",
            "GarageCars",
            "GarageArea",
            "GarageYrBlt",
            "Fireplaces",
            "TotRmsAbvGrd",
            "1stFlrSF",
            "2ndFlrSF",
            "LowQualFinSF",
            "GrLivArea",
            "BsmtFullBath",
            "BsmtHalfBath",
            "FullBath",
            "HalfBath",
            "BedroomAbvGr",
            "KitchenAbvGr",
            "BsmtFinSF1",
            "BsmtFinSF2",
            "BsmtUnfSF",
            "TotalBsmtSF",
            "MasVnrArea",
            "LotFrontage",
            "LotArea",
            "YearBuilt",
            "YearRemodAdd",
            "OverallQual",
            "OverallCond"]
porch_cols = ["ScreenPorch",
            "3SsnPorch",
            "WoodDeckSF",
            "OpenPorchSF",
            "EnclosedPorch"]
hash_cols = ["SaleCondition",
            "SaleType",
            "GarageType",
            "Heating",
            "MSSubClass",
            "MSZoning",
            "Neighborhood",		
            "Condition1",
            "Condition2",
            "BldgType",
            "HouseStyle",
            "RoofStyle",
            "RoofMatl",
            "Exterior1st",
            "Exterior2nd",
            "MasVnrType",
            "Foundation",
            "BsmtFinType1",
            "BsmtFinType2"]
ord_cols = ["Fence",
            "PavedDrive",
            "Functional",
            "Electrical",
            "BsmtExposure",
            "HeatingQC",
            "KitchenQual",
            "GarageCond",
            "GarageQual",
            "FireplaceQu",
            "BsmtQual",
            "BsmtCond",
            "ExterCond",
            "ExterQual",
            "PoolQC"]

# Porch sum function
def porch_func(df):
    df = df.copy()
    df["Porch_sum"] = 0
    for porch in porch_cols:
        df["Porch_sum"] = df["Porch_sum"] + df[porch]
    df.drop(porch_cols, axis=1, inplace=True)
    return df

# find hash components needed and add one for safety
hash_n = math.ceil(math.log2(max(houses.select_dtypes(include="object").nunique()))) + 1

# ordinal map
ord_map = [{"col": "Fence", "mapping": {"Missing":0, "MnWw":1, "GdWo":2, "MnPrv":3, "GdPrv":4}},
             {"col": "PavedDrive", "mapping": {"N":1, "P":2, "Y":3}},
             {"col": "Functional", "mapping": {"Sal":-7, "Sev":-6, "Maj2":-5, "Maj1":-4, "Mod":-3, "Min2":-2, "Min1":-1, "Missing": 0, "Typ":0}},
             {"col": "Electrical", "mapping": {"Missing":0, "FuseP": -2, "FuseF":-1, "Mix":0, "FuseA":1, "SBrkr":2}},
             {"col": "BsmtExposure", "mapping": {"Missing":0, "No":1, "Mn":2, "Av":3, "Gd":4}},
             {"col": "HeatingQC", "mapping": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "Missing": 0}},
             {"col": "KitchenQual", "mapping": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "Missing": 0}},
             {"col": "GarageCond", "mapping": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "Missing": 0}},
             {"col": "GarageQual", "mapping": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "Missing": 0}},
             {"col": "FireplaceQu", "mapping": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "Missing": 0}},
             {"col": "BsmtQual", "mapping": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "Missing": 0}},
             {"col": "BsmtCond", "mapping": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "Missing": 0}},
             {"col": "ExterCond", "mapping": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "Missing": 0}},
             {"col": "ExterQual", "mapping": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "Missing": 0}},
             {"col": "PoolQC", "mapping": {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "Missing": 0}}]

# Instantiate Transformers
zero_imputer = SimpleImputer(strategy="constant", fill_value=0)
missing_imputer = SimpleImputer(strategy="constant", fill_value="Missing")

ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop="if_binary")
porch_sum = FunctionTransformer(porch_func)
hash = HashingEncoder(cols=hash_cols, n_components=hash_n)
ord = OrdinalEncoder(cols=ord_cols, mapping=ord_map)

# pipelines
ohe_pipe = Pipeline([("Missing Imputer", missing_imputer),
                     ("One Hot Encoder", ohe)])
hash_pipe = Pipeline([("Missing Imputer", missing_imputer),
                     ("Hashing Encoder", hash)])
ord_pipe = Pipeline([("Missing Imputer", missing_imputer),
                     ("Ordinal Encoder", ord)])


# Column Transformer Tuples
num_tuple = ("Numeric Imputation", zero_imputer, num_cols)
ohe_tuple = ("One Hot Encoder", ohe_pipe, ohe_cols)
porch_tuple = ("Sum of Porches", porch_sum, porch_cols)
hash_tuple = ("Hashing Encoder", hash_pipe, hash_cols)
ord_tuple = ("Ordinal Encoder", ord_pipe, ord_cols)

# Column Selector
preprocessor = ColumnTransformer([num_tuple, ohe_tuple, porch_tuple, hash_tuple, ord_tuple], remainder='drop',
                                 verbose_feature_names_out=False)

# Transform Data
preprocessor.fit(X_train)
X_train_proc = preprocessor.transform(X_train)
X_test_proc = preprocessor.transform(X_test)

# Begin model creating and testing

In [None]:
def train_test_RMSLE(models_list, X_train, X_test, y_train, y_test):
    metrics = ["Training RMSLE", "Test RMSLE", "Variance"]
    results = pd.DataFrame(columns=metrics, index=[models_list])
    for model in models_list:
        model.fit(X_train, y_train)
        train_pred = model.predict(X_train)
        trainRMSLE = root_mean_squared_log_error(np.clip(y_train, a_min=1, a_max=np.max(y_train)), np.clip(train_pred, a_min=1, a_max=np.max(train_pred)))
        test_pred = model.predict(X_test)
        testRMSLE = root_mean_squared_log_error(np.clip(y_test, a_min=1, a_max=np.max(y_test)), np.clip(test_pred, a_min=1, a_max=np.max(test_pred)))
        results.loc[model, "Training RMSLE"] = trainRMSLE
        results.loc[model, "Test RMSLE"] = testRMSLE
        results.loc[model, "Variance"] = np.abs(trainRMSLE - testRMSLE)

    return results

In [9]:
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train_proc, y_train)
dt_depths = range(1, dt.get_depth()+1)
dt_params = {'max_depth': dt_depths}
dt_gs = GridSearchCV(dt, dt_params)
dt_gs.fit(X_train_proc, y_train)
best_depth_dt = dt_gs.best_estimator_
best_depth_dt

In [10]:
poly_pipe = Pipeline([("poly", PolynomialFeatures()),
                    ("linear", LinearRegression())])
poly_params = {'poly__degree': range(1, 4)}
poly_gs = GridSearchCV(poly_pipe, poly_params)
poly_gs.fit(X_train_proc, y_train)
best_degree_poly = poly_gs.best_estimator_
best_degree_poly
# since this is degree 1, it is just linear

In [76]:
lr = LinearRegression()
lr.fit(X_train_proc, y_train)

In [61]:
lasso = Lasso(random_state=42)
lasso_params = {'alpha': [1000, 100, 10, 1, .1, .01, .001, .0001]}
lasso_gs = GridSearchCV(lasso, lasso_params)
lasso_gs.fit(X_train_proc, y_train)
best_alpha_lasso = lasso_gs.best_estimator_
best_alpha_lasso

In [25]:
ridge = Ridge(random_state=42)
ridge_params = {'alpha': [1000, 100, 10, 1, .1, .01, .001, .0001]}
ridge_gs = GridSearchCV(ridge, ridge_params)
ridge_gs.fit(X_train_proc, y_train)
best_alpha_ridge = ridge_gs.best_estimator_
best_alpha_ridge

In [24]:
en = ElasticNet(random_state=42)
en_params = {"alpha": [.0001, .001, .01, .1, 1, 10, 100, 1000],
                 "l1_ratio": [x/100 for x in range(0, 101, 5)]}
en_gs = GridSearchCV(en, en_params)
en_gs.fit(X_train_proc, y_train)
best_alpha_l1_en = en_gs.best_estimator_
best_alpha_l1_en

In [49]:
xgb_model = xgb.XGBRegressor(n_jobs=multiprocessing.cpu_count() // 2, 
                             tree_method="hist", 
                             objective="reg:squaredlogerror")
xgb_model_params = {"max_depth": range(11),
                    "learning_rate": [x/10 for x in range(11)],
                    "n_estimators": [50, 100, 200]}
xgb_model_gs = GridSearchCV(xgb_model, xgb_model_params, n_jobs=2)

xgb_model_gs.fit(X_train_proc, y_train)
best_xgb = xgb_model_gs.best_estimator_
best_xgb

In [63]:
rf = RandomForestRegressor(random_state=42)
rf_params = {'max_depth': range(11),
             "max_features": [x/10 for x in range(11)]}
rf_gs = GridSearchCV(rf, rf_params)
rf_gs.fit(X_train_proc, y_train)
best_rf = rf_gs.best_estimator_
best_rf

105 fits failed out of a total of 605.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
55 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/bharat/Documents/GitHub/kaggle/.conda/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/bharat/Documents/GitHub/kaggle/.conda/lib/python3.12/site-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/Users/bharat/Documents/GitHub/kaggle/.conda/lib/python3.12/site-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/Users/bharat/Documents/GitHub/kaggle/.conda/lib/pytho

In [77]:
list_of_best_estimators = [best_depth_dt, 
                           best_alpha_lasso, 
                           best_alpha_ridge, 
                           best_alpha_l1_en, 
                           best_xgb,
                           best_rf,
                           lr]

train_test_RMSLE(list_of_best_estimators, X_train_proc, X_test_proc, y_train, y_test).sort_values(by="Test RMSLE")

Unnamed: 0,Training RMSLE,Test RMSLE,Variance
"Lasso(alpha=0.001, random_state=42)",0.010463,0.149857,0.139394
"ElasticNet(alpha=0.01, l1_ratio=0.05, random_state=42)",0.010412,0.149942,0.13953
"Ridge(alpha=10, random_state=42)",0.010311,0.151036,0.140725
"(DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=1608637542), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=1273642419), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=1935803228), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=787846414), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=996406378), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=1201263687), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=423734972), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=415968276), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=670094950), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=1914837113), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=669991378), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=429389014), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=249467210), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=1972458954), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=1572714583), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=1433267572), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=434285667), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=613608295), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=893664919), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=648061058), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=88409749), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=242285876), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=2018247425), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=953477463), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=1427830251), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=1883569565), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=911989541), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=3344769), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=780932287), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=2114032571), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=787716372), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=504579232), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=1306710475), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=479546681), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=106328085), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=30349564), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=1855189739), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=99052376), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=1250819632), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=106406362), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=480404538), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=1717389822), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=599121577), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=200427519), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=1254751707), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=2034764475), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=1573512143), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=999745294), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=1958805693), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=389151677), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=1224821422), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=508464061), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=857592370), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=1642661739), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=61136438), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=2075460851), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=396917567), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=2004731384), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=199502978), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=1545932260), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=461901618), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=774414982), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=732395540), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=1934879560), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=279394470), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=56972561), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=1927948675), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=1899242072), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=1999874363), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=271820813), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=1324556529), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=1655351289), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=1308306184), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=68574553), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=419498548), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=991681409), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=791274835), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=1035196507), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=1890440558), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=787110843), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=524150214), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=472432043), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=2126768636), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=1431061255), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=147697582), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=744595490), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=1758017741), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=1679592528), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=1111451555), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=782698033), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=698027879), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=1096768899), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=1338788865), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=1826030589), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=86191493), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=893102645), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=200619113), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=290770691), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=793943861), DecisionTreeRegressor(max_depth=10, max_features=0.3, random_state=134489564))",0.004911,0.15108,0.146169
LinearRegression(),0.010224,0.152658,0.142434
"XGBRegressor(base_score=None, booster=None, callbacks=None,\n colsample_bylevel=None, colsample_bynode=None,\n colsample_bytree=None, device=None, early_stopping_rounds=None,\n enable_categorical=False, eval_metric=None, feature_types=None,\n gamma=None, grow_policy=None, importance_type=None,\n interaction_constraints=None, learning_rate=0.3, max_bin=None,\n max_cat_threshold=None, max_cat_to_onehot=None,\n max_delta_step=None, max_depth=3, max_leaves=None,\n min_child_weight=None, missing=nan, monotone_constraints=None,\n multi_strategy=None, n_estimators=200, n_jobs=5,\n num_parallel_tree=None, objective='reg:squaredlogerror', ...)",0.008555,0.156331,0.147776
"DecisionTreeRegressor(max_depth=5, random_state=42)",0.012213,0.196248,0.184035


In [78]:
estimators = [("xgb", best_xgb), # listing all the best estimators and dropping dt and ridge
                ("linear_regression", lr),
                ("lasso", best_alpha_lasso),
                ("elastic_net", best_alpha_l1_en),
                ("random_forest", best_rf)]

best_stacked = StackingRegressor(estimators=estimators)

In [79]:
train_test_RMSLE([best_stacked], X_train_proc, X_test_proc, y_train, y_test).sort_values(by="Test RMSLE")



Unnamed: 0,Training RMSLE,Test RMSLE,Variance
"StackingRegressor(estimators=[('xgb',\n XGBRegressor(base_score=None, booster=None,\n callbacks=None,\n colsample_bylevel=None,\n colsample_bynode=None,\n colsample_bytree=None, device=None,\n early_stopping_rounds=None,\n enable_categorical=False,\n eval_metric=None,\n feature_types=None, gamma=None,\n grow_policy=None,\n importance_type=None,\n interaction_constraints=None,\n learning_ra...\n multi_strategy=None,\n n_estimators=200, n_jobs=5,\n num_parallel_tree=None,\n objective='reg:squaredlogerror', ...)),\n ('linear_regression', LinearRegression()),\n ('lasso', Lasso(alpha=0.001, random_state=42)),\n ('elastic_net',\n ElasticNet(alpha=0.01, l1_ratio=0.05,\n random_state=42)),\n ('random_forest',\n RandomForestRegressor(max_depth=10,\n max_features=0.3,\n random_state=42))])",0.005631,0.140311,0.13468


# Submission staging

In [80]:
test_file_path = "../../house-prices-advanced-regression-techniques/input/test.csv"
test_data = pd.read_csv(test_file_path)
ids = test_data.pop("Id")

test_data.drop(columns="MiscFeature", inplace=True)
test_data_proc = preprocessor.transform(test_data)

preds = best_stacked.predict(test_data_proc)
preds = np.exp(preds)

output = pd.DataFrame({"Id": ids,
                       "SalePrice": preds})

output.head()



Unnamed: 0,Id,SalePrice
0,1461,120517.278512
1,1462,160098.994987
2,1463,175956.526083
3,1464,191864.036516
4,1465,197522.387249


# Submission export

In [72]:
sample_submission_file_path = "../../house-prices-advanced-regression-techniques/input/sample_submission.csv"
sample_submission_df = pd.read_csv(sample_submission_file_path)
sample_submission_df["SalePrice"] = preds
sample_submission_df.to_csv("02e_stacked_model_submission_with_rf.csv", index=False)
sample_submission_df.head()

Unnamed: 0,Id,SalePrice
0,1461,120517.278512
1,1462,160098.994987
2,1463,175956.526083
3,1464,191864.036516
4,1465,197522.38725


In [None]:
"""
to_save = {"knn" : knn,
           "decision_tree": best_dt,
           "logistic_regression": log_reg,
           "X_train": X_train} # can add the data if wanted

filename = "class_algos.joblib"

joblib.dump(to_save, filename)
"""

In [None]:
"""
saved = joblib.load(filename)
dt_saved = saved["decision_tree"]
dt_saved.predict(X_test)
"""