In [1]:
import sys
import os

# Add project root to pythonpath
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import zscore
import seaborn as sns

from src.data.load_data import load_data
from src.features.engineering import feature_engineering
from src.features.preprocess import pipeline
from src.utils import model_summary

In [3]:
# Load data
data_path = '/data/train/train.csv'
data = load_data(data_path)

In [4]:
data_train = feature_engineering(data)

In [5]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor

In [6]:
# Algunos serán OneHotEncoder otros OrdinalEncoder
# Yo borré estas y el chico, no  'GarageQual', 'MasVnrType'
# OneHotEncoder:
ohe_cols = ['Street','Neighborhood','Condition1','Condition2','RoofMatl','Exterior1st',
               'Exterior2nd','Foundation','Heating','Electrical','GarageType', 'RoofStyle',
               'SaleType','SaleCondition','LotConfig','BldgType','HouseStyle', 'MSZoning']

# OrdinalEncoder 
ore_cols = ['LotShape','LandContour','Utilities','LandSlope','ExterQual','ExterCond',
               'BsmtQual','BsmtExposure','BsmtFinType1','HeatingQC','CentralAir','KitchenQual',
               'Functional','FireplaceQu','GarageFinish','PavedDrive']

number_cols = data_train.select_dtypes(include=['number']).columns
number_cols = number_cols.drop('SalePrice')

In [7]:
X = data_train.drop('SalePrice', axis=1)
y = data_train['SalePrice']

In [45]:
dspipeline = pipeline(number_cols, ore_cols, ohe_cols)

In [46]:
X_prepro = dspipeline.fit_transform(X)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_prepro, y, test_size=0.2, random_state=25)

In [11]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

In [12]:
mean_squared_error(y_test, y_pred_lr) #0.013550181855841515

3.2141785865053786e+17

In [13]:
rfr = RandomForestRegressor(random_state=13)
param_grid_rfr = {
    'max_depth': [12, 15, 18],
    'n_estimators': [250, 500, 750],
    'min_samples_split': [3, 5, 10]
}
rfr_cv = GridSearchCV(rfr, param_grid_rfr, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
rfr_cv.fit(X_train, y_train)

# Ojo, primero me daba: RandomForestRegressor(max_depth=15, min_samples_split=3, n_estimators=500,random_state=13)
# Y despues: RandomForestRegressor(max_depth=18, min_samples_split=3, n_estimators=750,random_state=13)
# Que diferencia de resultados y rendimiento equivale?

In [14]:
xgb_rmse = model_summary("RFR", rfr_cv, X_test, y_test)
'''
Best RFR MRSE result: 0.1488
Test RFR MRSE result: 0.1345

Best RFR MRSE result: 0.14
Test RFR MRSE result: 0.1396
'''

Best RFR MRSE result: 0.14
Test RFR MRSE result: 0.1396


In [15]:
from xgboost import XGBRegressor

In [16]:
xgb = XGBRegressor(random_state=13)

param_grid_xgb = {
    'learning_rate': [0.05, 0.1, 0.2],
    'n_estimators': [300],
    'max_depth': [3],
    'min_child_weight': [1,2,3],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
}

xgb_cv = GridSearchCV(
    xgb, 
    param_grid_xgb, 
    cv=3, 
    scoring='neg_mean_squared_error', 
    n_jobs=-1)

xgb_cv.fit(X_train, y_train)

In [18]:
xgb_rmse = model_summary("XGB", xgb_cv, X_test, y_test)
'''
Best XGB MRSE result: 0.1349
Test XGB MRSE result: 0.1201

Best XGB MRSE result: 0.1147
Test XGB MRSE result: 0.1222
'''

Best XGB MRSE result: 0.1147
Test XGB MRSE result: 0.1222


'\nBest XGB MRSE result: 0.1349\nTest XGB MRSE result: 0.1201\n'

In [19]:
ridge = Ridge()

param_grid_ridge = {
    'alpha': [0.05, 0.1, 1, 3, 5, 10],
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag']
}
ridge_cv = GridSearchCV(ridge, param_grid_ridge, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

ridge_cv.fit(X_train, y_train)



In [20]:
ridge_rmse = model_summary("Ridge", ridge_cv, X_test, y_test)

'''
Best Ridge MRSE result: 0.1494
Test Ridge MRSE result: 0.1219

Best Ridge MRSE result: 0.1104
Test Ridge MRSE result: 0.1236
'''

Best Ridge MRSE result: 0.1104
Test Ridge MRSE result: 0.1236


'\nBest Ridge MRSE result: 0.1494\nTest Ridge MRSE result: 0.1219\n'

In [21]:
gbr = GradientBoostingRegressor()

param_grid_gbr = {
    'max_depth': [12, 15, 20],
    'n_estimators': [200, 300, 1000],
    'min_samples_leaf': [10, 25, 50],
    'learning_rate': [0.001, 0.01, 0.1], 
    'max_features': [0.01, 0.1, 0.7]
}

gbr_cv = GridSearchCV(gbr, param_grid_gbr, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
gbr_cv.fit(X_train, y_train)

In [22]:
gbr_rmse = model_summary("GBR", gbr_cv, X_test, y_test)

'''
Best GBR MRSE result: 0.1302
Test GBR MRSE result: 0.1195

Best GBR MRSE result: 0.1166
Test GBR MRSE result: 0.1218
'''

Best GBR MRSE result: 0.1166
Test GBR MRSE result: 0.1218


'\nBest GBR MRSE result: 0.1302\nTest GBR MRSE result: 0.1195\n'

In [23]:
import lightgbm as lgb

In [24]:
lgbm_regressor = lgb.LGBMRegressor()

param_grid_lgbm = {
    'boosting_type': ['gbdt', 'dart'],
    'num_leaves': [20, 30, 40],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300]
}

lgbm_cv = GridSearchCV(
    lgbm_regressor, 
    param_grid_lgbm, 
    cv=3, 
    scoring='neg_mean_squared_error', 
    n_jobs=-1
)

lgbm_cv.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020139 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1541
[LightGBM] [Info] Number of data points in the train set: 768, number of used features: 99
[LightGBM] [Info] Start training from score 12.008633
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029037 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1546
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021318 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1547
[LightGBM] [Info] Number of data points in the train set: 768, number of us

In [25]:
lgbm_rmse = model_summary("LGBM", lgbm_cv, X_test, y_test)

'''
Best LGBM MRSE result: 0.1403
Test LGBM MRSE result: 0.1276

Best LGBM MRSE result: 0.1283
Test LGBM MRSE result: 0.1253
'''

Best LGBM MRSE result: 0.1283
Test LGBM MRSE result: 0.1253


'\nBest LGBM MRSE result: 0.1403\nTest LGBM MRSE result: 0.1276\n'

In [26]:
from catboost import CatBoostRegressor

In [27]:
catboost = CatBoostRegressor(loss_function='RMSE', verbose=False)

param_grid_cat ={
    'iterations': [100, 500, 1000],
    'depth': [4, 6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.5]
}

cat_cv = GridSearchCV(catboost, param_grid_cat, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
cat_cv.fit(X_train, y_train)



  _data = np.array(data, dtype=dtype, copy=copy,


In [28]:
cat_cv_rmse = model_summary("CatBoost", cat_cv, X_test, y_test)

'''
Best CatBoost MRSE result: 0.1308
Test CatBoost MRSE result: 0.1138

Best CatBoost MRSE result: 0.1128
Test CatBoost MRSE result: 0.1188
'''

Best CatBoost MRSE result: 0.1128
Test CatBoost MRSE result: 0.1188


'\nBest CatBoost MRSE result: 0.1308\nTest CatBoost MRSE result: 0.1138\n'

In [31]:
print(f"cat_cv_rmse: {round(cat_cv_rmse, 4)}")
print(f"lgbm_rmse: {round(lgbm_rmse, 4)}")
print(f"xgb_rmse: {round(xgb_rmse, 4)}")
print(f"ridge_rmse: {round(ridge_rmse, 4)}")
print(f"gbr_rmse: {round(gbr_rmse, 4)}")

'''
cat_cv_rmse: 0.1138
gbr_rmse: 0.1195
xgb_rmse: 0.1201
ridge_rmse: 0.1219
lgbm_rmse: 0.1276

cat_cv_rmse: 0.1188
gbr_rmse: 0.1218
xgb_rmse: 0.1222
ridge_rmse: 0.1236
lgbm_rmse: 0.1253
'''

cat_cv_rmse: 0.1188
lgbm_rmse: 0.1253
xgb_rmse: 0.1222
ridge_rmse: 0.1236
gbr_rmse: 0.1218


'\ncat_cv_rmse: 0.1138\ngbr_rmse: 0.1195\nxgb_rmse: 0.1201\nridge_rmse: 0.1219\nlgbm_rmse: 0.1276\n\ncat_cv_rmse: 0.1188\ngbr_rmse: 0.1218\nxgb_rmse: 0.1222\nridge_rmse: 0.1236\nlgbm_rmse: 0.1253\n'

In [32]:
vr = VotingRegressor([('gbr', gbr_cv.best_estimator_),
                      ('xgb', xgb_cv.best_estimator_),
                      ('ridge', ridge_cv.best_estimator_),
                      ],
                    weights=[2,1,1])

vr.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032180 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1547
[LightGBM] [Info] Number of data points in the train set: 768, number of used features: 96
[LightGBM] [Info] Start training from score 12.015949
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.038259 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1547
[LightGBM] [Info] Number of data points in the train set: 768, number of used features: 96
[LightGBM] [Info] Start training from score 12.015949
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.035109 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough,

In [34]:
y_pred_vr = vr.predict(X_test)
mean_squared_error(y_test, y_pred_vr, squared=False)

# weights=[3,2,1] :: 0.1141, 0.1149
# weights=[2,2,1] :: 0.1156, 0.1139
# weights=[2,1,2] :: 0.1146, 0.1145
# 0.11694689658197516



0.11694689658197516

In [35]:
estimators = [
    ('gbr', gbr_cv.best_estimator_),
    ('xgb', xgb_cv.best_estimator_),
    ('cat', cat_cv.best_estimator_),
    ('lgb', lgbm_cv.best_estimator_),
    ('rfr', rfr_cv.best_estimator_),
]

In [36]:
stackreg = StackingRegressor(
            estimators = estimators,
            final_estimator = vr
)

In [37]:
stackreg.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004323 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1703
[LightGBM] [Info] Number of data points in the train set: 1152, number of used features: 107
[LightGBM] [Info] Start training from score 12.009198
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003473 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1625
[LightGBM] [Info] Number of data points in the train set: 921, number of used features: 101
[LightGBM] [Info] Start training from score 12.011340
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002787 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1617
[LightGBM] [Info] Number of data points in the train se

In [38]:
y_pred_stack = stackreg.predict(X_test)
mean_squared_error(y_test, y_pred_stack, squared=False)

#0.1190100021681962
#0.11742040344241811



0.11742040344241811

In [39]:
dtest = pd.read_csv('../data/test/test.csv')
data_test_f = feature_engineering(dtest)

In [47]:
X_test_f = dspipeline.transform(data_test_f)

In [48]:
y_test_f = np.exp(stackreg.predict(X_test_f))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['SalePrice'] = y_test_f


In [52]:
result = dtest[['Id']].copy()
result.loc[:, 'SalePrice'] = y_test_f

In [53]:
result.to_csv('../predictions/submission_v2_20250114.csv', index=False)

#score:  0.13011 => 21%