In [1]:
import sys
import os

# Add the project root (the parent of the current working directory) to the
# import path so that the `src` package used by the imports below resolves.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from src.data.load_data import load_data
from src.features.engineering import feature_engineering
from src.features.preprocess import pipeline
from src.utils import model_summary

In [3]:
# Load the raw training data through the project's loader.
# NOTE(review): the leading slash makes this look like a filesystem-absolute
# path, whereas the test CSV later in this notebook is read from the relative
# '../data/test/test.csv' — confirm how load_data resolves this argument.
data_path = '/data/train/train.csv'
data = load_data(data_path)

In [4]:
# Build the modelling frame from the raw data with the project's feature step.
# NOTE(review): the LightGBM logs further down start from a score of ~12 and
# the final predictions are passed through np.exp, so SalePrice is presumably
# log-transformed in here — confirm in src.features.engineering.
data_train = feature_engineering(data)

In [5]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor
from xgboost import XGBRegressor
import lightgbm as lgb
from catboost import CatBoostRegressor

In [6]:
# Categorical columns are split by encoder: some go through a OneHotEncoder,
# the others through an OrdinalEncoder.
# Note: 'GarageQual' and 'MasVnrType' were dropped from these lists here,
# although the other author kept them.

# Columns for the OneHotEncoder.
ohe_cols = [
    'Street', 'Neighborhood', 'Condition1', 'Condition2',
    'RoofMatl', 'Exterior1st', 'Exterior2nd', 'Foundation',
    'Heating', 'Electrical', 'GarageType', 'RoofStyle',
    'SaleType', 'SaleCondition', 'LotConfig', 'BldgType',
    'HouseStyle', 'MSZoning',
]

# Columns for the OrdinalEncoder.
ore_cols = [
    'LotShape', 'LandContour', 'Utilities', 'LandSlope',
    'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtExposure',
    'BsmtFinType1', 'HeatingQC', 'CentralAir', 'KitchenQual',
    'Functional', 'FireplaceQu', 'GarageFinish', 'PavedDrive',
]

# Numeric feature columns: every numeric column of the engineered frame
# except the SalePrice target.
number_cols = (
    data_train
    .select_dtypes(include=['number'])
    .columns
    .drop('SalePrice')
)

In [7]:
# Separate the target vector from the feature matrix.
y_train = data_train['SalePrice']
X = data_train.drop(columns='SalePrice')

In [8]:
# Build the preprocessing pipeline from the three column groups (numeric,
# ordinal-encoded, one-hot-encoded). The actual steps live in
# src.features.preprocess.pipeline and are not visible from this notebook.
dspipeline = pipeline(number_cols, ore_cols, ohe_cols)

In [9]:
# Fit the preprocessing pipeline on the full training frame and transform it.
# NOTE(review): the pipeline is fitted once here, before the grid searches
# below, so anything it learns comes from all rows — including the rows each
# CV fold later validates on. CV scores may therefore be slightly optimistic.
X_train = dspipeline.fit_transform(X)

In [10]:
# Tune a random forest with an exhaustive grid search: 3 x 3 x 3 = 27
# candidates, 5-fold CV, scored on negative MSE, parallelised with n_jobs=-1.
rfr = RandomForestRegressor(random_state=13)
param_grid_rfr = dict(
    max_depth=[12, 15, 18],
    n_estimators=[250, 500, 750],
    min_samples_split=[3, 5, 10],
)
rfr_cv = GridSearchCV(
    estimator=rfr,
    param_grid=param_grid_rfr,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
rfr_cv.fit(X_train, y_train)

# Note: an earlier run selected
#   RandomForestRegressor(max_depth=15, min_samples_split=3, n_estimators=500, random_state=13)
# and a later run selected
#   RandomForestRegressor(max_depth=18, min_samples_split=3, n_estimators=750, random_state=13)
# Open question: how much do the two differ in results and run time?

In [11]:
# Summarise the random-forest search. model_summary prints the "Best"/"Test"
# lines seen in the cell output and returns a score, which is kept here.
# The result is stored as rfr_rmse rather than xgb_rmse: xgb_rmse belongs to
# the XGBoost summary further down, which would otherwise silently overwrite
# the random-forest score.
# NOTE(review): the "Test" figure is computed from the X_train/y_train passed
# here — the same data the search was fitted on — so it is presumably a
# training-set error, not a held-out score; confirm in src.utils.model_summary.
rfr_rmse = model_summary("RFR", rfr_cv, X_train, y_train)
# The string literal below records scores from earlier runs; as the cell's
# last expression it is also echoed as the cell output.
'''
Best RFR MRSE result: 0.14
Test RFR MRSE result: 0.1396

Best RFR MRSE result: 0.1323
'''

Best RFR MRSE result: 0.1323
Test RFR MRSE result: 0.0512


'\nBest RFR MRSE result: 0.14\nTest RFR MRSE result: 0.1396\n\nBest RFR MRSE result: 0.1323\n'

In [12]:
# Tune XGBoost with an exhaustive grid search. Tree count and depth are held
# fixed; the five remaining parameters have three values each (243 candidates),
# evaluated with 3-fold CV on negative MSE.
xgb = XGBRegressor(random_state=13)

param_grid_xgb = dict(
    learning_rate=[0.05, 0.1, 0.2],
    n_estimators=[300],
    max_depth=[3],
    min_child_weight=[1, 2, 3],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9, 1.0],
    colsample_bytree=[0.8, 0.9, 1.0],
)

xgb_cv = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid_xgb,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
)

xgb_cv.fit(X_train, y_train)

In [13]:
# Summarise the XGBoost search: model_summary prints the "Best"/"Test" lines
# seen in the cell output, and its return value is kept as xgb_rmse.
# NOTE(review): the "Test" figure is computed from the X_train/y_train passed
# here — the same data the search was fitted on — so it is presumably a
# training-set error, not a held-out score; confirm in src.utils.model_summary.
xgb_rmse = model_summary("XGB", xgb_cv, X_train, y_train)
# The string literal below records scores from earlier runs; as the cell's
# last expression it is also echoed as the cell output.
'''
Best XGB MRSE result: 0.1147
Test XGB MRSE result: 0.1222

Best XGB MRSE result: 0.1169
'''

Best XGB MRSE result: 0.1169
Test XGB MRSE result: 0.0575


'\nBest XGB MRSE result: 0.1147\nTest XGB MRSE result: 0.1222\n\nBest XGB MRSE result: 0.1169\n'

In [14]:
# Tune Ridge regression over regularisation strength and solver
# (6 x 6 = 36 candidates, 5-fold CV, negative MSE).
# The grid includes the stochastic 'sag' solver, which shuffles the data, so
# the estimator is seeded to keep the search reproducible between runs. The
# seed matches the one used by the random-forest and XGBoost estimators in
# this notebook; deterministic solvers ignore it.
ridge = Ridge(random_state=13)

param_grid_ridge = {
    'alpha': [0.05, 0.1, 1, 3, 5, 10],
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag']
}
ridge_cv = GridSearchCV(ridge, param_grid_ridge, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

ridge_cv.fit(X_train, y_train)



In [15]:
# Summarise the Ridge search: model_summary prints the "Best"/"Test" lines
# seen in the cell output, and its return value is kept as ridge_rmse.
# NOTE(review): the "Test" figure is computed from the X_train/y_train passed
# here — the same data the search was fitted on — so it is presumably a
# training-set error, not a held-out score; confirm in src.utils.model_summary.
ridge_rmse = model_summary("Ridge", ridge_cv, X_train, y_train)

# The string literal below records scores from earlier runs; as the cell's
# last expression it is also echoed as the cell output.
'''
Best Ridge MRSE result: 0.1104
Test Ridge MRSE result: 0.1236

Best Ridge MRSE result: 0.1124
'''

Best Ridge MRSE result: 0.1124
Test Ridge MRSE result: 0.1006


'\nBest Ridge MRSE result: 0.1104\nTest Ridge MRSE result: 0.1236\n\nBest Ridge MRSE result: 0.1124\n'

In [16]:
# Tune gradient boosting with an exhaustive grid search
# (3^5 = 243 candidates, 5-fold CV, negative MSE).
# Every max_features value in the grid is below 1.0, so each split considers a
# random subset of features; without a seed the selected model and its scores
# change from run to run. The estimator is therefore seeded with the same
# value used by the random-forest and XGBoost estimators in this notebook.
gbr = GradientBoostingRegressor(random_state=13)

param_grid_gbr = {
    'max_depth': [12, 15, 20],
    'n_estimators': [200, 300, 1000],
    'min_samples_leaf': [10, 25, 50],
    'learning_rate': [0.001, 0.01, 0.1],
    'max_features': [0.01, 0.1, 0.7]
}

gbr_cv = GridSearchCV(gbr, param_grid_gbr, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
gbr_cv.fit(X_train, y_train)

In [17]:
# Summarise the gradient-boosting search: model_summary prints the
# "Best"/"Test" lines seen in the cell output, and its return value is kept
# as gbr_rmse.
# NOTE(review): the "Test" figure is computed from the X_train/y_train passed
# here — the same data the search was fitted on — so it is presumably a
# training-set error, not a held-out score; confirm in src.utils.model_summary.
gbr_rmse = model_summary("GBR", gbr_cv, X_train, y_train)

# The string literal below records scores from earlier runs; as the cell's
# last expression it is also echoed as the cell output.
'''
Best GBR MRSE result: 0.1166
Test GBR MRSE result: 0.1218

Best GBR MRSE result: 0.1163
'''

Best GBR MRSE result: 0.1165
Test GBR MRSE result: 0.063


'\nBest GBR MRSE result: 0.1166\nTest GBR MRSE result: 0.1218\n\nBest GBR MRSE result: 0.1163\n'

In [18]:
# Tune LightGBM with an exhaustive grid search over booster type, leaf count,
# learning rate and tree count (2 x 3 x 3 x 3 = 54 candidates, 3-fold CV,
# negative MSE).
lgbm_regressor = lgb.LGBMRegressor()

param_grid_lgbm = dict(
    boosting_type=['gbdt', 'dart'],
    num_leaves=[20, 30, 40],
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[100, 200, 300],
)

lgbm_cv = GridSearchCV(
    estimator=lgbm_regressor,
    param_grid=param_grid_lgbm,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
)

lgbm_cv.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005017 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1631
[LightGBM] [Info] Number of data points in the train set: 961, number of used features: 102
[LightGBM] [Info] Start training from score 12.018491
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015554 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1636
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008511 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1636
[LightGBM] [Info] Number of data points in the train set: 961, number of used features: 102
[LightGBM] [Info] Start training from score 12

In [19]:
# Summarise the LightGBM search: model_summary prints the "Best"/"Test" lines
# seen in the cell output, and its return value is kept as lgbm_rmse.
# NOTE(review): the "Test" figure is computed from the X_train/y_train passed
# here — the same data the search was fitted on — so it is presumably a
# training-set error, not a held-out score; confirm in src.utils.model_summary.
lgbm_rmse = model_summary("LGBM", lgbm_cv, X_train, y_train)

# The string literal below records scores from an earlier run; as the cell's
# last expression it is also echoed as the cell output.
'''
Best LGBM MRSE result: 0.1283
Test LGBM MRSE result: 0.1253
'''

Best LGBM MRSE result: 0.1274
Test LGBM MRSE result: 0.0577


'\nBest LGBM MRSE result: 0.1283\nTest LGBM MRSE result: 0.1253\n'

In [20]:
# Tune CatBoost (RMSE loss, training log silenced) with an exhaustive grid
# search over iterations, depth and learning rate (3 x 4 x 4 = 48 candidates,
# 3-fold CV, negative MSE).
catboost = CatBoostRegressor(loss_function='RMSE', verbose=False)

param_grid_cat = dict(
    iterations=[100, 500, 1000],
    depth=[4, 6, 8, 10],
    learning_rate=[0.01, 0.05, 0.1, 0.5],
)

cat_cv = GridSearchCV(
    estimator=catboost,
    param_grid=param_grid_cat,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
)
cat_cv.fit(X_train, y_train)



In [21]:
# Summarise the CatBoost search: model_summary prints the "Best"/"Test" lines
# seen in the cell output, and its return value is kept as cat_cv_rmse.
# NOTE(review): the "Test" figure is computed from the X_train/y_train passed
# here — the same data the search was fitted on — so it is presumably a
# training-set error, not a held-out score; confirm in src.utils.model_summary.
cat_cv_rmse = model_summary("CatBoost", cat_cv, X_train, y_train)

# The string literal below records scores from an earlier run; as the cell's
# last expression it is also echoed as the cell output.
'''
Best CatBoost MRSE result: 0.1128
Test CatBoost MRSE result: 0.1188
'''

Best CatBoost MRSE result: 0.115
Test CatBoost MRSE result: 0.0562


'\nBest CatBoost MRSE result: 0.1128\nTest CatBoost MRSE result: 0.1188\n'

In [22]:
# Side-by-side comparison of the scores returned by model_summary, rounded to
# four decimals. The random forest is not included in this comparison.
print(f"cat_cv_rmse: {round(cat_cv_rmse, 4)}")
print(f"lgbm_rmse: {round(lgbm_rmse, 4)}")
print(f"xgb_rmse: {round(xgb_rmse, 4)}")
print(f"ridge_rmse: {round(ridge_rmse, 4)}")
print(f"gbr_rmse: {round(gbr_rmse, 4)}")

# The string literal below records the ranking from an earlier run (lowest
# first); as the cell's last expression it is also echoed as the cell output.
'''
cat_cv_rmse: 0.1188
gbr_rmse: 0.1218
xgb_rmse: 0.1222
ridge_rmse: 0.1236
lgbm_rmse: 0.1253
'''


Best CatBoost MRSE result: 0.115
Best Ridge MRSE result: 0.1124
Best XGB MRSE result: 0.1169
Best GBR MRSE result: 0.1165
Best LGBM MRSE result: 0.1274
Best RFR MRSE result: 0.1323

cat_cv_rmse: 0.0562
lgbm_rmse: 0.0577
xgb_rmse: 0.0575
ridge_rmse: 0.1006
gbr_rmse: 0.063


'\ncat_cv_rmse: 0.1188\ngbr_rmse: 0.1218\nxgb_rmse: 0.1222\nridge_rmse: 0.1236\nlgbm_rmse: 0.1253\n'

In [35]:
# Weighted-average ensemble of three tuned models; gradient boosting gets
# twice the weight of XGBoost and Ridge.
vr = VotingRegressor(
    estimators=[
        ('gbr', gbr_cv.best_estimator_),
        ('xgb', xgb_cv.best_estimator_),
        ('ridge', ridge_cv.best_estimator_),
    ],
    weights=[2, 1, 1],
)

vr.fit(X_train, y_train)

In [36]:
# RMSE of the voting ensemble on the data it was fitted on (X_train), so this
# is a training-set error rather than a held-out score.
# The root is taken explicitly with np.sqrt: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas this form works on every version.
y_pred_vr = vr.predict(X_train)
np.sqrt(mean_squared_error(y_train, y_pred_vr))

# Scores noted from earlier runs:
# weights=[3,2,1] :: 0.1141, 0.1149
# weights=[2,2,1] :: 0.1156, 0.1139
# weights=[2,1,2] :: 0.1146, 0.1145
# 0.11694689658197516
# 0.06728136313385406



0.0672134755844182

In [37]:
# Base learners for the stacked ensemble: the best estimator found by each of
# the five tree-based grid searches, paired with a short label.
estimators = [
    (label, search.best_estimator_)
    for label, search in (
        ('gbr', gbr_cv),
        ('xgb', xgb_cv),
        ('cat', cat_cv),
        ('lgb', lgbm_cv),
        ('rfr', rfr_cv),
    )
]

In [38]:
# Stack the base learners, with the voting regressor supplied as meta-learner.
# NOTE(review): `vr` was fitted on the preprocessed features, but as
# final_estimator it is presumably re-fitted on the base learners'
# predictions instead — confirm this is the intended meta-model.
stackreg = StackingRegressor(estimators=estimators, final_estimator=vr)

In [40]:
# Fit the stacked ensemble on the full training set.
# The LightGBM log below shows one fit on 1441 rows followed by fits on
# ~1152 rows, so the base learners are also refitted on subsets of the data —
# presumably the internal CV that produces the final estimator's inputs.
stackreg.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001868 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1804
[LightGBM] [Info] Number of data points in the train set: 1441, number of used features: 117
[LightGBM] [Info] Start training from score 12.016632
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001642 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1713
[LightGBM] [Info] Number of data points in the train set: 1152, number of used features: 109
[LightGBM] [Info] Start training from score 12.013256
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002504 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1722
[LightGBM] [Info] Number of data points in the train set: 1153, number of used features: 108
[LightGBM] [Info] Start t

In [41]:
# RMSE of the stacked ensemble on the data it was fitted on (X_train), so this
# is a training-set error rather than a held-out score.
# The root is taken explicitly with np.sqrt: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas this form works on every version.
y_pred_stack = stackreg.predict(X_train)
np.sqrt(mean_squared_error(y_train, y_pred_stack))

# Score noted from an earlier run: 0.11742040344241811



0.06351155565188486

In [42]:
# Read the competition test set and run it through the same feature step that
# was applied to the training data.
# NOTE(review): the training data goes through load_data(), whereas this file
# is read directly with pd.read_csv — confirm load_data does no extra
# cleaning that the test set would then be missing.
dtest = pd.read_csv('../data/test/test.csv')
data_test_f = feature_engineering(dtest)

In [43]:
# Apply the already-fitted preprocessing pipeline to the test features
# (transform only, so nothing is re-fitted on the test data).
X_test_f = dspipeline.transform(data_test_f)

In [44]:
# Predict with the stacked model and exponentiate the result.
# NOTE(review): np.exp assumes the target was transformed with a plain natural
# log; if feature_engineering uses log1p, this should be np.expm1 — confirm.
y_test_f = np.exp(stackreg.predict(X_test_f))


In [45]:
# Submission frame: the test Ids alongside the predicted sale prices.
result = dtest[['Id']].assign(SalePrice=y_test_f)

In [46]:
# Write the submission CSV without the DataFrame index.
result.to_csv('../predictions/submission_v1_wholeds_20250115.csv', index=False)

# Score recorded for this submission: 0.13011 => 21%
# (presumably the leaderboard score and percentile — confirm).