In [None]:
import sys
import os

# Add project root (the parent of the current working directory) to the
# import path. The membership check keeps this cell idempotent: re-running
# it does not pile up duplicate entries in sys.path.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

In [None]:
import numpy as np
import pandas as pd

from src.data.load_data import load_data
from src.features.engineering import feature_engineering
from src.features.preprocess import pipeline
from src.utils import model_summary

In [None]:
# Load data
# NOTE(review): this is an absolute path ('/data/...'), whereas the test set
# and the submission file in this notebook use paths relative to the notebook
# ('../data/...', '../predictions/...'). Confirm how load_data resolves it.
data_path = '/data/train/train.csv'
data = load_data(data_path)

In [None]:
# Run the project's feature-engineering step on the loaded training data.
data_train = feature_engineering(data)

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor

In [None]:
# Columns handed to the one-hot encoding branch of the preprocessing pipeline.
ohe_cols = [
    'Street',
    'Neighborhood',
    'Condition1',
    'Condition2',
    'RoofMatl',
    'Exterior1st',
    'Exterior2nd',
    'Foundation',
    'Heating',
    'Electrical',
    'GarageType',
    'RoofStyle',
    'SaleType',
    'SaleCondition',
    'LotConfig',
    'BldgType',
    'HouseStyle',
    'MSZoning',
]

# Columns handed to the ordinal encoding branch of the preprocessing pipeline.
ore_cols = [
    'LotShape',
    'LandContour',
    'Utilities',
    'LandSlope',
    'ExterQual',
    'ExterCond',
    'BsmtQual',
    'BsmtExposure',
    'BsmtFinType1',
    'HeatingQC',
    'CentralAir',
    'KitchenQual',
    'Functional',
    'FireplaceQu',
    'GarageFinish',
    'PavedDrive',
]

# Names of all numeric-dtype columns, with the target column removed.
number_cols = (
    data_train
    .select_dtypes(include=['number'])
    .columns
    .drop('SalePrice')
)

In [None]:
# Separate the target column from the feature columns.
y = data_train['SalePrice']
X = data_train.drop(columns='SalePrice')

In [None]:
# Build the preprocessing pipeline from the three column groups, passed in
# the order: numeric columns, ordinal-encoded columns, one-hot-encoded columns.
dspipeline = pipeline(number_cols, ore_cols, ohe_cols)

In [None]:
# Fit the preprocessing pipeline on the full feature matrix and transform it.
# NOTE(review): the pipeline is fitted before train_test_split is applied, so
# whatever it learns during fit also sees the rows later used as the hold-out
# set. Confirm this is intended (potential data leakage).
X_prepro = dspipeline.fit_transform(X)

In [None]:
# Hold out 20% of the preprocessed rows for evaluation; the fixed
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_prepro, y, test_size=0.2, random_state=25)

In [None]:
# Plain linear regression: fit on the training split, predict the hold-out split.
# (fit() returns the estimator itself, so construction and fitting are chained.)
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

In [None]:
# Report RMSE rather than raw MSE so the linear-regression score is on the
# same scale as the RMSE values reported for the other models in this notebook.
np.sqrt(mean_squared_error(y_test, y_pred_lr))

In [None]:
# Random forest: exhaustive 5-fold grid search over tree depth, forest size
# and the minimum number of samples required to split a node.
param_grid_rfr = dict(
    max_depth=[12, 15, 18],
    n_estimators=[250, 500, 750],
    min_samples_split=[3, 5, 10],
)
rfr = RandomForestRegressor(random_state=13)

rfr_cv = GridSearchCV(
    estimator=rfr,
    param_grid=param_grid_rfr,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
rfr_cv.fit(X_train, y_train)

In [None]:
# Summarise the tuned random-forest search on the hold-out split.
# The result is stored under `rfr_rmse` (not `xgb_rmse`) so the random-forest
# score has its own name and is not silently replaced by the XGBoost cell.
# NOTE(review): the returned value is presumably the test RMSE; confirm in src.utils.
rfr_rmse = model_summary("RFR", rfr_cv, X_test, y_test)

# Output recorded from earlier runs ("MRSE" presumably means RMSE):
# Best RFR MRSE result: 0.1488
# Test RFR MRSE result: 0.1345
# Best RFR MRSE result: 0.14
# Test RFR MRSE result: 0.1396


In [None]:
from xgboost import XGBRegressor

In [None]:
# XGBoost: 3-fold grid search. Tree count and depth each have a single
# candidate value; the remaining hyper-parameters are varied.
param_grid_xgb = dict(
    learning_rate=[0.05, 0.1, 0.2],
    n_estimators=[300],
    max_depth=[3],
    min_child_weight=[1, 2, 3],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9, 1.0],
    colsample_bytree=[0.8, 0.9, 1.0],
)
xgb = XGBRegressor(random_state=13)

xgb_cv = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid_xgb,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
)
xgb_cv.fit(X_train, y_train)

In [None]:
# Summarise the tuned XGBoost search on the hold-out split and keep the
# returned score.
# NOTE(review): the returned value is presumably the test RMSE; confirm in src.utils.
xgb_rmse = model_summary("XGB", xgb_cv, X_test, y_test)

# Output recorded from earlier runs ("MRSE" presumably means RMSE):
# Best XGB MRSE result: 0.1349
# Test XGB MRSE result: 0.1201
# Best XGB MRSE result: 0.1147
# Test XGB MRSE result: 0.1222


In [None]:
# Ridge regression: 5-fold grid search over regularisation strength and solver.
param_grid_ridge = dict(
    alpha=[0.05, 0.1, 1, 3, 5, 10],
    solver=['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag'],
)
ridge = Ridge()

ridge_cv = GridSearchCV(
    estimator=ridge,
    param_grid=param_grid_ridge,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
ridge_cv.fit(X_train, y_train)

In [None]:
# Summarise the tuned Ridge search on the hold-out split and keep the
# returned score.
# NOTE(review): the returned value is presumably the test RMSE; confirm in src.utils.
ridge_rmse = model_summary("Ridge", ridge_cv, X_test, y_test)

# Output recorded from earlier runs ("MRSE" presumably means RMSE):
# Best Ridge MRSE result: 0.1494
# Test Ridge MRSE result: 0.1219
# Best Ridge MRSE result: 0.1104
# Test Ridge MRSE result: 0.1236


In [None]:
# Gradient boosting: 5-fold grid search.
# A fixed random_state is set because the grid samples a fraction of the
# features per split (max_features < 1), which makes fitting stochastic.
# Seeding it keeps the search repeatable, matching the seeded RFR and XGB cells.
gbr = GradientBoostingRegressor(random_state=13)

param_grid_gbr = {
    'max_depth': [12, 15, 20],
    'n_estimators': [200, 300, 1000],
    'min_samples_leaf': [10, 25, 50],
    'learning_rate': [0.001, 0.01, 0.1],
    'max_features': [0.01, 0.1, 0.7]
}

gbr_cv = GridSearchCV(gbr, param_grid_gbr, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
gbr_cv.fit(X_train, y_train)

In [None]:
# Summarise the tuned gradient-boosting search on the hold-out split and keep
# the returned score.
# NOTE(review): the returned value is presumably the test RMSE; confirm in src.utils.
gbr_rmse = model_summary("GBR", gbr_cv, X_test, y_test)

# Output recorded from earlier runs ("MRSE" presumably means RMSE):
# Best GBR MRSE result: 0.1302
# Test GBR MRSE result: 0.1195
# Best GBR MRSE result: 0.1166
# Test GBR MRSE result: 0.1218


In [None]:
import lightgbm as lgb

In [None]:
# LightGBM: 3-fold grid search over boosting type, leaf count, learning rate
# and number of boosting rounds.
param_grid_lgbm = dict(
    boosting_type=['gbdt', 'dart'],
    num_leaves=[20, 30, 40],
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[100, 200, 300],
)
lgbm_regressor = lgb.LGBMRegressor()

lgbm_cv = GridSearchCV(
    estimator=lgbm_regressor,
    param_grid=param_grid_lgbm,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
)
lgbm_cv.fit(X_train, y_train)

In [None]:
# Summarise the tuned LightGBM search on the hold-out split and keep the
# returned score.
# NOTE(review): the returned value is presumably the test RMSE; confirm in src.utils.
lgbm_rmse = model_summary("LGBM", lgbm_cv, X_test, y_test)

# Output recorded from earlier runs ("MRSE" presumably means RMSE):
# Best LGBM MRSE result: 0.1403
# Test LGBM MRSE result: 0.1276
# Best LGBM MRSE result: 0.1283
# Test LGBM MRSE result: 0.1253


In [None]:
from catboost import CatBoostRegressor

In [None]:
# CatBoost: 3-fold grid search over iteration count, tree depth and learning
# rate. Training output is silenced via verbose=False.
param_grid_cat = dict(
    iterations=[100, 500, 1000],
    depth=[4, 6, 8, 10],
    learning_rate=[0.01, 0.05, 0.1, 0.5],
)
catboost = CatBoostRegressor(loss_function='RMSE', verbose=False)

cat_cv = GridSearchCV(
    estimator=catboost,
    param_grid=param_grid_cat,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
)
cat_cv.fit(X_train, y_train)

In [None]:
# Summarise the tuned CatBoost search on the hold-out split and keep the
# returned score.
# NOTE(review): the returned value is presumably the test RMSE; confirm in src.utils.
cat_cv_rmse = model_summary("CatBoost", cat_cv, X_test, y_test)

# Output recorded from earlier runs ("MRSE" presumably means RMSE):
# Best CatBoost MRSE result: 0.1308
# Test CatBoost MRSE result: 0.1138
# Best CatBoost MRSE result: 0.1128
# Test CatBoost MRSE result: 0.1188


In [None]:
#Print RMSE values
# One line per model, rounded to four decimals, in a fixed order.
for metric_name, metric_value in (
    ("cat_cv_rmse", cat_cv_rmse),
    ("lgbm_rmse", lgbm_rmse),
    ("xgb_rmse", xgb_rmse),
    ("ridge_rmse", ridge_rmse),
    ("gbr_rmse", gbr_rmse),
):
    print(f"{metric_name}: {round(metric_value, 4)}")

In [None]:
# Weighted voting ensemble of the best GBR, XGB and Ridge estimators found by
# the grid searches; GBR carries twice the weight of each of the other two.
vr = VotingRegressor(
    estimators=[
        ('gbr', gbr_cv.best_estimator_),
        ('xgb', xgb_cv.best_estimator_),
        ('ridge', ridge_cv.best_estimator_),
    ],
    weights=[2, 1, 1],
)

vr.fit(X_train, y_train)

In [None]:
# Hold-out RMSE of the voting ensemble.
# The root is taken explicitly with np.sqrt because the `squared=False`
# keyword of mean_squared_error was deprecated in scikit-learn 1.4 and removed
# in 1.6; this form works on both older and newer releases.
y_pred_vr = vr.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred_vr))

# Scores recorded for other weightings tried earlier:
# weights=[3,2,1] :: 0.1141, 0.1149
# weights=[2,2,1] :: 0.1156, 0.1139
# weights=[2,1,2] :: 0.1146, 0.1145

In [None]:
# (name, estimator) pairs for the stacking ensemble: the best estimator from
# each of the five tree-based grid searches, in a fixed order.
estimators = [
    (label, search.best_estimator_)
    for label, search in (
        ('gbr', gbr_cv),
        ('xgb', xgb_cv),
        ('cat', cat_cv),
        ('lgb', lgbm_cv),
        ('rfr', rfr_cv),
    )
]

In [None]:
# Stacking ensemble: the tuned base models feed their predictions into the
# voting regressor, which acts as the final estimator.
stackreg = StackingRegressor(estimators=estimators, final_estimator=vr)

In [None]:
# Fit the stacking ensemble on the training split.
stackreg.fit(X_train, y_train)

In [None]:
# Hold-out RMSE of the stacking ensemble.
# The root is taken explicitly with np.sqrt because the `squared=False`
# keyword of mean_squared_error was deprecated in scikit-learn 1.4 and removed
# in 1.6; this form works on both older and newer releases.
y_pred_stack = stackreg.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred_stack))

In [None]:
# Read the test set and run it through the same feature-engineering function
# that was applied to the training data.
# NOTE(review): this file is read directly with pd.read_csv, whereas the
# training file goes through load_data. Confirm both yield equivalent frames.
dtest = pd.read_csv('../data/test/test.csv')
data_test_f = feature_engineering(dtest)

In [None]:
# Apply the already-fitted preprocessing pipeline (transform only, no refit).
X_test_f = dspipeline.transform(data_test_f)

In [None]:
# Predict with the stacking ensemble and pass the predictions through exp().
# NOTE(review): this assumes the target was natural-log transformed upstream
# (presumably inside feature_engineering). Confirm; if log1p was used there,
# the matching inverse would be expm1.
y_test_f = np.exp(stackreg.predict(X_test_f))


In [None]:
# Two-column output frame: the test-set Id next to the predicted SalePrice.
result = dtest[['Id']].assign(SalePrice=y_test_f)

In [None]:
# Write the Id / SalePrice frame to CSV without the index column.
result.to_csv('../predictions/submission_v2_20250114.csv', index=False)

# Scores recorded for earlier submissions.
# NOTE(review): labelled "Mean Squared Error", while the rest of this notebook
# reports RMSE. Confirm which metric these numbers are.
# Scored Mean Squared Error (with data cleaning) => 0.13110
# Scored Mean Squared Error (no data cleaning) => 0.13011