## Library imports

In [54]:
import pandas as pd
pd.set_option('display.max_columns', 100)
import numpy as np
np.set_printoptions(legacy="1.25")
import seaborn as sns
import matplotlib.pyplot as plt
import math
import scipy

from sklearn import set_config
set_config(transform_output='pandas')

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import (mean_absolute_error, mean_squared_error, r2_score, 
                             mean_absolute_percentage_error, root_mean_squared_error)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler, PolynomialFeatures, FunctionTransformer
from sklearn.tree import DecisionTreeRegressor, plot_tree

import category_encoders as ce
from category_encoders.hashing import HashingEncoder
from category_encoders.ordinal import OrdinalEncoder

from xgboost import XGBRegressor

import joblib

# importing stacking lib
from vecstack import stacking, StackingTransformer

import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category = ConvergenceWarning) # Ignore ConvergenceWarning

from sklearn.linear_model import RidgeCV
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor

# Load Dataset, split, and pre-process

In [38]:
file_path = "../../house-prices-advanced-regression-techniques/input/train.csv"
houses = pd.read_csv(file_path)

# Separate the target column from the predictors, then hold out 20% of the rows
y = houses["SalePrice"].copy()
X = houses.drop(columns="SalePrice")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# column selection
# Each list names the raw columns routed to one branch of the preprocessing
# ColumnTransformer; the order inside every list is kept as originally written.

# -> one-hot encoded
ohe_cols = [
    "MoSold", "GarageFinish", "CentralAir", "Street", "Alley",
    "LotShape", "LandContour", "Utilities", "LotConfig", "LandSlope",
]

# -> numeric columns (missing values filled with 0)
num_cols = [
    "MiscVal", "YrSold", "PoolArea",
    "GarageCars", "GarageArea", "GarageYrBlt",
    "Fireplaces", "TotRmsAbvGrd",
    "1stFlrSF", "2ndFlrSF", "LowQualFinSF", "GrLivArea",
    "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath",
    "BedroomAbvGr", "KitchenAbvGr",
    "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF",
    "MasVnrArea", "LotFrontage", "LotArea",
    "YearBuilt", "YearRemodAdd",
    "OverallQual", "OverallCond",
]

# -> summed into a single porch feature
porch_cols = ["ScreenPorch", "3SsnPorch", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch"]

# -> hashing encoder
hash_cols = [
    "SaleCondition", "SaleType", "GarageType", "Heating",
    "MSSubClass", "MSZoning", "Neighborhood",
    "Condition1", "Condition2", "BldgType", "HouseStyle",
    "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd",
    "MasVnrType", "Foundation", "BsmtFinType1", "BsmtFinType2",
]

# -> ordinal encoder (explicit mapping per column)
ord_cols = [
    "Fence", "PavedDrive", "Functional", "Electrical", "BsmtExposure",
    "HeatingQC", "KitchenQual", "GarageCond", "GarageQual", "FireplaceQu",
    "BsmtQual", "BsmtCond", "ExterCond", "ExterQual", "PoolQC",
]

# Porch sum function
def porch_func(df):
    """Replace the individual porch columns with their row-wise total.

    Reads the module-level ``porch_cols`` list. The input frame is not modified;
    a new frame is returned in which every column named in ``porch_cols`` has been
    removed and a ``Porch_sum`` column holds their sum. A missing value in any
    porch column makes that row's ``Porch_sum`` missing as well, because the
    columns are combined with plain ``+``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in ``porch_cols``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without the porch columns and with ``Porch_sum`` added.
    """
    # Start from an all-zero column so the total exists even with no porch columns
    zero_start = pd.Series(0, index=df.index)
    total_area = sum((df[name] for name in porch_cols), zero_start)
    return df.drop(columns=porch_cols).assign(Porch_sum=total_area)

# find hash components needed and add one for safety
# Distinct-value count of every object-dtype column in the full training file
object_cardinalities = houses.select_dtypes(include="object").nunique()
# Bits needed to tell apart the values of the widest column, plus one spare
hash_n = 1 + math.ceil(math.log2(max(object_cardinalities)))

# ordinal map
# Ten columns share one quality scale; "Missing" is the fill value used for absent entries.
quality_scale = {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "Missing": 0}
quality_scale_cols = ["HeatingQC", "KitchenQual", "GarageCond", "GarageQual", "FireplaceQu",
                      "BsmtQual", "BsmtCond", "ExterCond", "ExterQual", "PoolQC"]

# Columns whose levels need their own hand-written ordering
custom_scales = {
    "Fence": {"Missing":0, "MnWw":1, "GdWo":2, "MnPrv":3, "GdPrv":4},
    "PavedDrive": {"N":1, "P":2, "Y":3},
    "Functional": {"Sal":-7, "Sev":-6, "Maj2":-5, "Maj1":-4, "Mod":-3, "Min2":-2, "Min1":-1, "Missing": 0, "Typ":0},
    "Electrical": {"Missing":0, "FuseP": -2, "FuseF":-1, "Mix":0, "FuseA":1, "SBrkr":2},
    "BsmtExposure": {"Missing":0, "No":1, "Mn":2, "Av":3, "Gd":4},
}

ord_map = [{"col": column, "mapping": scale} for column, scale in custom_scales.items()]
# Every quality column gets its own copy of the scale, so no dict is shared between entries
ord_map.extend({"col": column, "mapping": dict(quality_scale)} for column in quality_scale_cols)

# Instantiate Transformers
zero_imputer = SimpleImputer(strategy="constant", fill_value=0)
missing_imputer = SimpleImputer(strategy="constant", fill_value="Missing")

ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop="if_binary")
porch_sum = FunctionTransformer(porch_func)
# Named *_encoder so the Python built-ins hash() and ord() are not shadowed
hash_encoder = HashingEncoder(cols=hash_cols, n_components=hash_n)
ord_encoder = OrdinalEncoder(cols=ord_cols, mapping=ord_map)

# pipelines
# Each categorical branch first fills gaps with the literal "Missing", then encodes
ohe_pipe = Pipeline([("Missing Imputer", missing_imputer),
                     ("One Hot Encoder", ohe)])
hash_pipe = Pipeline([("Missing Imputer", missing_imputer),
                      ("Hashing Encoder", hash_encoder)])
ord_pipe = Pipeline([("Missing Imputer", missing_imputer),
                     ("Ordinal Encoder", ord_encoder)])


# Column Transformer Tuples
# (name, transformer, columns) — one tuple per group of columns
num_tuple = ("Numeric Imputation", zero_imputer, num_cols)
ohe_tuple = ("One Hot Encoder", ohe_pipe, ohe_cols)
porch_tuple = ("Sum of Porches", porch_sum, porch_cols)
hash_tuple = ("Hashing Encoder", hash_pipe, hash_cols)
ord_tuple = ("Ordinal Encoder", ord_pipe, ord_cols)

# Column Selector
# remainder='drop': any column not named in one of the lists is discarded
preprocessor = ColumnTransformer([num_tuple, ohe_tuple, porch_tuple, hash_tuple, ord_tuple], remainder='drop',
                                 verbose_feature_names_out=False)

# Transform Data
# Fit on the training split only, then apply the fitted transformer to both splits
preprocessor.fit(X_train)
X_train_proc = preprocessor.transform(X_train)
X_test_proc = preprocessor.transform(X_test)

# Model creation and testing

In [7]:
def logRMSE(models_list, X_train, X_test, y_train, y_test):
    """Fit every model and tabulate its RMSE on log1p-transformed values.

    Each estimator in ``models_list`` is fitted in place on the training data and
    then scored on both splits. Targets and predictions are floored at 0 before
    ``np.log1p`` is applied, so a negative prediction cannot produce NaN.

    Parameters
    ----------
    models_list : list
        Estimators exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, X_test : array-like
        Feature matrices passed to ``fit`` / ``predict``.
    y_train, y_test : array-like
        Target values matching ``X_train`` and ``X_test``.

    Returns
    -------
    pandas.DataFrame
        One row per model, in input order, labelled with ``repr(model)``. Columns:
        "Training log RMSE", "Test log RMSE" and "Distance from training" (the
        absolute difference between the two scores).
    """
    metrics = ["Training log RMSE", "Test log RMSE", "Distance from training"]

    def _log_rmse(actual, predicted):
        # Lower bound only: using the array's own maximum as an upper bound changes
        # nothing for normal input and turns an all-negative array into NaN scores.
        return root_mean_squared_error(np.log1p(np.maximum(actual, 0)),
                                       np.log1p(np.maximum(predicted, 0)))

    labels, rows = [], []
    for model in models_list:
        model.fit(X_train, y_train)
        train_rmse = _log_rmse(y_train, model.predict(X_train))
        test_rmse = _log_rmse(y_test, model.predict(X_test))
        # Label rows with the estimator's repr rather than the live object, so
        # sequence-like estimators (e.g. a Pipeline) are not unpacked into tuples.
        labels.append(repr(model))
        rows.append([train_rmse, test_rmse, np.abs(train_rmse - test_rmse)])

    # Build the table once from the collected rows instead of growing it cell by cell
    return pd.DataFrame(rows, index=labels, columns=metrics)

In [8]:
# Grow an unconstrained tree first: its depth is the upper end of the search grid
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train_proc, y_train)
deepest_level = dt.get_depth()

# Try every depth from a single split up to the fully grown tree
depths = range(1, deepest_level + 1)
dt_params = {'max_depth': depths}
dt_gs = GridSearchCV(estimator=dt, param_grid=dt_params)
dt_gs.fit(X_train_proc, y_train)

best_depth_dt = dt_gs.best_estimator_
best_depth_dt

In [9]:
# Polynomial expansion feeding an ordinary least-squares fit; the step names
# are what the 'poly__degree' grid key refers to
poly_steps = [
    ("poly", PolynomialFeatures()),
    ("linear", LinearRegression()),
]
poly_pipe = Pipeline(steps=poly_steps)

# Candidate degrees: 1, 2 and 3
poly_params = {'poly__degree': range(1, 4)}
poly_gs = GridSearchCV(estimator=poly_pipe, param_grid=poly_params)
poly_gs.fit(X_train_proc, y_train)

best_degree_poly = poly_gs.best_estimator_
best_degree_poly

In [None]:
# max_iter is raised to 100000 so coordinate descent has room to converge.
# The results tables saved further down report
# Lasso(alpha=100, max_iter=100000, random_state=42), so this is the configuration
# those scores came from; without it a fresh run would not reproduce them.
lasso = Lasso(max_iter=100000, random_state=42)
lasso.fit(X_train_proc, y_train)

# Candidate regularisation strengths, from 100 down to 0.001
lasso_params = {'alpha': [100, 10, 1, .1, .01, .001]}
lasso_gs = GridSearchCV(lasso, lasso_params)
lasso_gs.fit(X_train_proc, y_train)

best_alpha_lasso = lasso_gs.best_estimator_
best_alpha_lasso

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [12]:
ridge = Ridge(random_state=42)
ridge.fit(X_train_proc, y_train)

# Candidate regularisation strengths, from 1000 down to 0.0001
ridge_alphas = [1000, 100, 10, 1, 0.1, 0.01, 0.001, 0.0001]
ridge_params = {'alpha': ridge_alphas}
ridge_gs = GridSearchCV(estimator=ridge, param_grid=ridge_params)
ridge_gs.fit(X_train_proc, y_train)

best_alpha_ridge = ridge_gs.best_estimator_
best_alpha_ridge

In [18]:
en = ElasticNet(random_state=42)
en.fit(X_train_proc, y_train)

# Grid: eight alphas (0.0001 ... 1000) crossed with l1_ratio 0.00, 0.05, ..., 1.00
en_alphas = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
en_l1_ratios = [percent / 100 for percent in range(0, 101, 5)]
en_params = {"alpha": en_alphas,
             "l1_ratio": en_l1_ratios}
en_gs = GridSearchCV(estimator=en, param_grid=en_params)
en_gs.fit(X_train_proc, y_train)

best_alpha_l1_en = en_gs.best_estimator_
best_alpha_l1_en

In [None]:
# The winner of each grid search, evaluated side by side
list_of_best_estimators = [
    best_depth_dt,
    best_degree_poly,
    best_alpha_lasso,
    best_alpha_ridge,
    best_alpha_l1_en,
]

best_estimator_scores = logRMSE(list_of_best_estimators, X_train_proc, X_test_proc, y_train, y_test)
# Lowest test error first
best_estimator_scores.sort_values(by="Test log RMSE")

Unnamed: 0,Training log RMSE,Test log RMSE,Distance from training
"DecisionTreeRegressor(max_depth=9, random_state=42)",0.078888,0.184906,0.106018
"Ridge(alpha=100, random_state=42)",0.35057,0.186941,0.163629
"Lasso(alpha=100, max_iter=100000, random_state=42)",0.352352,0.189304,0.163049
"ElasticNet(alpha=0.1, l1_ratio=0.4, random_state=42)",0.350717,0.190194,0.160523
"(PolynomialFeatures(degree=1), LinearRegression())",0.353535,0.195056,0.158479


In [59]:
# listing all the best estimators as (name, estimator) pairs for the stack
estimators = [
    ("decision_tree", best_depth_dt),
    ("poly", best_degree_poly),
    ("lasso", best_alpha_lasso),
    ("ridge", best_alpha_ridge),
    ("elastic_net", best_alpha_l1_en),  # key was misspelled "elasctic_net"
]

# using a random forest regressor to combine the best estimators
best_stacked = StackingRegressor(
    estimators=estimators,
    final_estimator=RandomForestRegressor(random_state=42),
)

In [60]:
# logRMSE fits the stacked model as part of scoring it
stacked_scores = logRMSE([best_stacked], X_train_proc, X_test_proc, y_train, y_test)
stacked_scores.sort_values(by="Test log RMSE")



Unnamed: 0,Training log RMSE,Test log RMSE,Distance from training
"StackingRegressor(estimators=[('decision_tree',\n DecisionTreeRegressor(max_depth=9,\n random_state=42)),\n ('poly',\n Pipeline(steps=[('poly',\n PolynomialFeatures(degree=1)),\n ('linear',\n LinearRegression())])),\n ('lasso',\n Lasso(alpha=100, max_iter=100000,\n random_state=42)),\n ('ridge', Ridge(alpha=100, random_state=42)),\n ('elasctic_net',\n ElasticNet(alpha=0.1, l1_ratio=0.4,\n random_state=42))],\n final_estimator=RandomForestRegressor(random_state=42))",0.11763,0.149076,0.031446


# New Submission Time!

In [61]:
test_file_path = "../../house-prices-advanced-regression-techniques/input/test.csv"
test_data = pd.read_csv(test_file_path)

# Take the Id column out of the features; it is kept only to label the predictions
ids = test_data.pop("Id")
test_data = test_data.drop(columns="MiscFeature")

# Apply the already-fitted preprocessor, then predict with the stacked model
test_data_proc = preprocessor.transform(test_data)
preds = best_stacked.predict(test_data_proc)

output = pd.DataFrame({"Id": ids, "SalePrice": preds})
output.head()



Unnamed: 0,Id,SalePrice
0,1461,133293.0
1,1462,150849.5
2,1463,157020.0
3,1464,185845.76
4,1465,205968.05


In [65]:
sample_submission_file_path = "../../house-prices-advanced-regression-techniques/input/sample_submission.csv"
sample_submission_df = pd.read_csv(sample_submission_file_path)

# preds follow the row order of test.csv. They are written into this frame by
# position, so confirm it lists the same Ids in the same order first; otherwise
# prices would silently be attached to the wrong houses.
if not np.array_equal(sample_submission_df["Id"].to_numpy(), ids.to_numpy()):
    raise ValueError("sample_submission.csv Ids do not match the Ids read from test.csv")

sample_submission_df["SalePrice"] = preds
sample_submission_df.to_csv("02_stacked_model_submission.csv", index=False)
sample_submission_df.head()

Unnamed: 0,Id,SalePrice
0,1461,133293.0
1,1462,150849.5
2,1463,157020.0
3,1464,185845.76
4,1465,205968.05


In [None]:
"""
to_save = {"knn" : knn,
           "decision_tree": best_dt,
           "logistic_regression": log_reg,
           "X_train": X_train} # can add the data if wanted

filename = "class_algos.joblib"

joblib.dump(to_save, filename)
"""

In [None]:
"""
saved = joblib.load(filename)
dt_saved = saved["decision_tree"]
dt_saved.predict(X_test)
"""