### 1. Import Libraries

Importing essential libraries for data processing, visualization, and modeling.


In [1]:
# Import libraries for data manipulation, visualization, preprocessing, and modeling


import pandas as pd
import numpy as np
from scipy import stats
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import lightgbm as lgb


### 2. Load Training Data

Loading the training dataset from the Kaggle competition input path.


In [2]:
# Read the competition training set from the Kaggle input directory.
train_df = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv') 

### 3. Load Test Data

Loading the test dataset for final predictions and submission.


In [3]:
# Read the competition test set (used only for the final submission predictions).
test_df = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')


### 4. Define Outlier Indices

Outliers were identified in a separate visual analysis; their `Id` values are stored in a list.


In [4]:
# Ids of the training rows flagged as outliers. They are matched against the
# `Id` column (not the positional index) when the rows are removed.
# Duplicate entries from the hand-written list (186, 598, 955) were dropped;
# the set of Ids is unchanged.

values = [186, 198, 250, 314, 336, 379, 441, 496, 524, 598, 636, 692,
          707, 739, 935, 955, 1062, 1183, 1191, 1299, 1338]

### 5. Remove Outliers

Removing the outlier rows from the training dataset using their IDs.


In [5]:
# Drop the training rows whose Id is listed in `values` (the outliers).
# `~mask` replaces the `== False` comparison, and `.copy()` makes train_df an
# independent frame so the column assignments in later cells do not operate on
# a slice of the original (avoids pandas' SettingWithCopyWarning).
train_df = train_df[~train_df['Id'].isin(values)].copy()

### 6. Handle Missing 'Alley' Values


In [6]:

# A missing 'Alley' is read as "no alley access": label it 'No' in the training and test frames.
for frame in (train_df, test_df):
    frame['Alley'] = frame['Alley'].fillna('No')


### 7. Handle Missing 'Fence' Values


In [7]:
# A missing 'Fence' is read as "no fence": label it 'No' in both frames.
for frame in (train_df, test_df):
    frame['Fence'] = frame['Fence'].fillna('No')


### 8. Handle Missing 'MasVnrType' Values


In [8]:
# A missing 'MasVnrType' is read as "no masonry veneer": label it 'No' in both frames.
for frame in (train_df, test_df):
    frame['MasVnrType'] = frame['MasVnrType'].fillna('No')

### 9. Handle Missing 'MasVnrArea' Values


In [9]:
# No masonry veneer means zero veneer area: fill missing 'MasVnrArea' with 0 in both frames.
for frame in (train_df, test_df):
    frame['MasVnrArea'] = frame['MasVnrArea'].fillna(0)

### 10. Handle Missing 'FireplaceQu' Values


In [10]:
# A missing 'FireplaceQu' is read as "no fireplace": label it 'No' in both frames.
for frame in (train_df, test_df):
    frame['FireplaceQu'] = frame['FireplaceQu'].fillna('No')

### 11. Handle Missing 'LotFrontage' Values


In [11]:
# Treat a missing 'LotFrontage' as no measurable frontage and fill it with 0 in both frames.
for frame in (train_df, test_df):
    frame['LotFrontage'] = frame['LotFrontage'].fillna(0)

### 12. Handle Missing 'GarageCond' Values


In [12]:
# A missing 'GarageCond' carries no garage-condition info: label it 'No' in both frames.
# (This column is dropped again in the "Drop Sparse or Irrelevant Columns" cell.)
for frame in (train_df, test_df):
    frame['GarageCond'] = frame['GarageCond'].fillna('No')

### 13. Handle Missing 'GarageType' Values


In [13]:
# A missing 'GarageType' is read as "no garage": label it 'No' in both frames.
for frame in (train_df, test_df):
    frame['GarageType'] = frame['GarageType'].fillna('No')

### 14. Handle Missing 'GarageFinish' Values


In [14]:
# A missing 'GarageFinish' is read as "no garage finish": label it 'No' in both frames.
for frame in (train_df, test_df):
    frame['GarageFinish'] = frame['GarageFinish'].fillna('No')

### 15. Handle Missing 'GarageQual' Values


In [15]:
# A missing 'GarageQual' carries no garage-quality info: label it 'No' in both frames.
for frame in (train_df, test_df):
    frame['GarageQual'] = frame['GarageQual'].fillna('No')

### 16. Handle Missing 'BsmtFinType2' Values


In [16]:
# Fill missing 'BsmtFinType2' with 'Unf' (treated as an unfinished basement) in both frames.
# NOTE(review): a missing value may instead mean "no basement" — confirm 'Unf' is the intended label.
for frame in (train_df, test_df):
    frame['BsmtFinType2'] = frame['BsmtFinType2'].fillna('Unf')

### 17. Handle Missing 'BsmtExposure' Values


In [17]:
# A missing 'BsmtExposure' is read as "no basement exposure": label it 'No' in both frames.
# NOTE(review): confirm 'No' is not already a legitimate level of this column,
# otherwise missing and genuine values become indistinguishable.
for frame in (train_df, test_df):
    frame['BsmtExposure'] = frame['BsmtExposure'].fillna('No')

### 18. Handle Missing 'BsmtQual' Values


In [18]:
# A missing 'BsmtQual' carries no basement-quality info: label it 'No' in both frames.
for frame in (train_df, test_df):
    frame['BsmtQual'] = frame['BsmtQual'].fillna('No')

### 19. Handle Missing 'BsmtCond' Values


In [19]:
# A missing 'BsmtCond' carries no basement-condition info: label it 'No' in both frames.
for frame in (train_df, test_df):
    frame['BsmtCond'] = frame['BsmtCond'].fillna('No')

### 20. Handle Missing 'BsmtFinType1' Values


In [20]:
# Fill missing 'BsmtFinType1' with 'Unf' (treated as an unfinished basement) in both frames.
# NOTE(review): a missing value may instead mean "no basement" — confirm 'Unf' is the intended label.
for frame in (train_df, test_df):
    frame['BsmtFinType1'] = frame['BsmtFinType1'].fillna('Unf')

### 21. Re-handle 'MasVnrArea' Values


In [21]:
# Repeat of the earlier 'MasVnrArea' step: set any remaining missing veneer area to 0.
# If the earlier cell has already run, this leaves both frames unchanged.
for frame in (train_df, test_df):
    frame['MasVnrArea'] = frame['MasVnrArea'].fillna(0)

### 22. Handle Missing 'Electrical' Values


In [22]:
# Fill missing 'Electrical' with 'SBrkr' (chosen as the most common electrical system) in both frames.
for frame in (train_df, test_df):
    frame['Electrical'] = frame['Electrical'].fillna('SBrkr')

### 23. Drop Sparse or Irrelevant Columns


In [23]:
# Columns judged too sparse or too weakly predictive to keep; the same set is
# removed from the training and test frames.
sparse_cols = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'GarageYrBlt', 'GarageCond', 'BsmtFinType2']

train_df, test_df = (frame.drop(columns=sparse_cols) for frame in (train_df, test_df))

## Feature Engineering


### 24. Create 'houseage' Feature


In [24]:
# House age at sale time: sale year minus construction year, added to both frames.
for frame in (train_df, test_df):
    frame['houseage'] = frame['YrSold'] - frame['YearBuilt']

### 25. Create 'houseremodelage' Feature


In [25]:
# Years between the last remodel and the sale, added to both frames.
for frame in (train_df, test_df):
    frame['houseremodelage'] = frame['YrSold'] - frame['YearRemodAdd']

### 26. Create 'totalsf' Feature


In [26]:
# Total finished square footage: both floors plus the two finished-basement areas.
# Plain `+` is used so a missing component makes the total missing as well.
for frame in (train_df, test_df):
    floors = frame['1stFlrSF'] + frame['2ndFlrSF']
    frame['totalsf'] = floors + frame['BsmtFinSF1'] + frame['BsmtFinSF2']

### 27. Create 'totalarea' Feature


In [27]:
# Above-ground living area plus total basement area, added to both frames.
for frame in (train_df, test_df):
    frame['totalarea'] = frame['GrLivArea'] + frame['TotalBsmtSF']

### 28. Create 'totalbaths' Feature


In [28]:
# Bathroom count including the basement: full baths count 1, half baths count 0.5.
for frame in (train_df, test_df):
    full_baths = frame['BsmtFullBath'] + frame['FullBath']
    half_baths = frame['BsmtHalfBath'] + frame['HalfBath']
    frame['totalbaths'] = full_baths + 0.5 * half_baths

### 29. Create 'totalporchsf' Feature


In [29]:
# Total outdoor area: the four porch types plus the wood deck, added to both frames.
for frame in (train_df, test_df):
    frame['totalporchsf'] = (
        frame['OpenPorchSF']
        + frame['3SsnPorch']
        + frame['EnclosedPorch']
        + frame['ScreenPorch']
        + frame['WoodDeckSF']
    )

### 30. Drop Redundant Features


In [30]:
# Raw columns that were folded into the engineered features above.
engineered_sources = [
    'YrSold', 'YearBuilt', 'YearRemodAdd',
    '1stFlrSF', '2ndFlrSF', 'BsmtFinSF1', 'BsmtFinSF2',
    'GrLivArea', 'TotalBsmtSF',
    'BsmtFullBath', 'FullBath', 'BsmtHalfBath', 'HalfBath',
    'OpenPorchSF', '3SsnPorch', 'EnclosedPorch', 'ScreenPorch', 'WoodDeckSF',
]

# 'Id' is removed from the training frame only; the test frame keeps it
# because the submission cell reads test_df[['Id']].
train_df = train_df.drop(columns=['Id'] + engineered_sources)
test_df = test_df.drop(columns=engineered_sources)

### 31. Drop 'GarageArea'


In [31]:
# 'GarageArea' is removed from both frames (stated reason: high correlation with other features).
train_df, test_df = (frame.drop(columns=['GarageArea']) for frame in (train_df, test_df))

### 32. Log-Transform Target Variable 'SalePrice'


In [32]:
# Replace 'SalePrice' with log(1 + SalePrice) to reduce skewness and stabilize variance.
# All model scores below are therefore on the log1p scale, and predictions must
# be mapped back with the inverse of log1p.
# NOTE: the column is overwritten in place, so re-running this cell would
# transform the target a second time.

train_df['SalePrice'] = np.log1p(train_df['SalePrice'])

### 33. Define Categorical Columns for Ordinal Encoding


In [33]:
# Categorical columns treated as ordered; these are routed to the OrdinalEncoder pipeline.
ode_cols = [
    'LotShape', 'LandContour', 'Utilities', 'LandSlope',
    'BsmtQual', 'BsmtFinType1', 'CentralAir', 'Functional',
    'FireplaceQu', 'GarageFinish', 'GarageQual', 'PavedDrive',
    'ExterCond', 'KitchenQual', 'BsmtExposure', 'HeatingQC',
    'ExterQual', 'BsmtCond',
]

### 34. Define Categorical Columns for One-Hot Encoding


In [34]:
# Categorical columns with no natural order; these are routed to the OneHotEncoder pipeline.
ohe_cols = [
    'Street', 'LotConfig', 'Neighborhood', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle', 'RoofStyle', 'Exterior1st', 'Exterior2nd',
    'MasVnrType', 'Foundation', 'Electrical', 'SaleType', 'MSZoning',
    'SaleCondition', 'Heating', 'GarageType', 'RoofMatl',
]

### 35. Select Numerical Features (Excluding Target)


In [35]:
# Names of the int64/float64 columns of the training frame, with the target
# 'SalePrice' excluded so it is never used as an input feature.
numeric_frame = train_df.select_dtypes(include=['int64', 'float64'])
num_cols = numeric_frame.columns.drop('SalePrice')

### 36. Create Pipeline for Numerical Features


In [36]:
# Numeric branch of the preprocessing: fill missing values with the column
# mean, then standardize each feature with StandardScaler.


num_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

### 37. Create Pipeline for Ordinal Categorical Features


In [37]:
# Ordinal branch: fill missing values with the most frequent category, then
# map categories to integer codes. Categories not seen during fit are encoded as -1.
# NOTE(review): no `categories` argument is passed, so the integer codes follow
# the encoder's default ordering rather than an explicit quality ranking —
# confirm this is intended for the columns in ode_cols.

ode_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ode', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

### 38. Create Pipeline for One-Hot Encoded Features


In [38]:
# One-hot branch: fill missing values with the most frequent category, then
# one-hot encode into a dense array. Unknown categories at transform time are
# ignored rather than raising.


ohe_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

### 39. Combine Preprocessing Steps with ColumnTransformer


In [39]:
# Route each column group to its pipeline: num_cols -> num_pipeline,
# ode_cols -> ode_pipeline, ohe_cols -> ohe_pipeline.
# remainder='passthrough' keeps any column not named in those lists unchanged;
# n_jobs=-1 runs the branches in parallel on all available cores.



col_trans = ColumnTransformer(transformers=[
    ('num_p', num_pipeline, num_cols),
    ('ode_p', ode_pipeline, ode_cols),
    ('ohe_p', ohe_pipeline, ohe_cols),
    ],
    remainder='passthrough', 
    n_jobs=-1)

### 40. Create Main Preprocessing Pipeline


In [40]:
# Wrap the column transformer in a single-step Pipeline. This `pipeline` object
# is what gets fitted on the training features and later applied to the test set.

pipeline = Pipeline(steps=[
    ('preprocessing', col_trans)
])

### 41. Split Features and Target


In [41]:
# Target vector (log1p-transformed 'SalePrice') and feature frame (every other column).
y = train_df['SalePrice']
X = train_df.drop(columns='SalePrice')

### 42. Preprocess Features with Pipeline


In [42]:
# Fit the preprocessing pipeline on X and return the transformed feature matrix.
# NOTE(review): the pipeline is fitted on all of X before the train/validation
# split in the next cell, so the imputer, scaler and encoder statistics include
# the validation rows. Splitting first and fitting on the training part only
# would avoid that leakage.

X_preprocessed = pipeline.fit_transform(X)

### 43. Train-Test Split


In [43]:
# Hold out 20% of the preprocessed rows as a validation set (named X_test/y_test
# here; this is not the Kaggle test set). random_state=25 fixes the split.


X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=25)

## Model Building


### 44. Initialize Linear Regression Model


In [44]:
# Plain (unregularized) linear regression, used as a first baseline model.

lr = LinearRegression()

### 45. Fit Linear Regression Model


In [45]:
# Fit the linear baseline on the training split (X_train, y_train).

lr.fit(X_train, y_train)

### 46. Predict with Linear Regression


In [46]:
# Predict the log1p-scale target for the held-out validation rows.

y_pred_lr = lr.predict(X_test)

### 47. Evaluate Linear Regression (MSE)


In [47]:
# Mean squared error of the linear baseline on the validation split (log1p scale).
# NOTE(review): the recorded output (~6.7e18) is many orders of magnitude above
# the other models' errors, which suggests a numerically unstable fit —
# presumably from collinear one-hot columns; TODO confirm.

mean_squared_error(y_test, y_pred_lr)

6.668901938823623e+18

### 48. Initialize Random Forest Regressor


In [48]:
# Base random-forest regressor for the grid search; random_state=13 makes it reproducible.

RFR = RandomForestRegressor(random_state=13)

### 49. Define Hyperparameter Grid for Random Forest


In [49]:
# Random-forest search space: tree depth, forest size, and the minimum number
# of samples required to split a node (3 x 3 x 3 = 27 combinations).
param_grid_RFR = dict(
    max_depth=[5, 10, 15],
    n_estimators=[100, 250, 500],
    min_samples_split=[3, 5, 10],
)

### 50. Set Up Grid Search for Random Forest


In [50]:
# Exhaustive search over param_grid_RFR with 5-fold cross-validation.
# Scoring is the negated MSE (higher is better); n_jobs=-1 uses all cores.


rfr_cv = GridSearchCV(RFR, param_grid_RFR, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

### 51. Fit Grid Search for Random Forest


In [51]:
# Run the random-forest grid search on the training split.

rfr_cv.fit(X_train, y_train)

### 52. Evaluate Best Random Forest Model (Cross-Validated RMSE)


In [52]:
# best_score_ is a negated MSE (scoring='neg_mean_squared_error'), so flip the
# sign and take the square root to get the cross-validated RMSE on the log1p scale.

np.sqrt(-1 * rfr_cv.best_score_)

0.13395680935228135

### 53. Display Best Hyperparameters for Random Forest


In [53]:
# Display the parameter combination with the best cross-validation score.


rfr_cv.best_params_

{'max_depth': 15, 'min_samples_split': 3, 'n_estimators': 500}

### 54. Initialize XGBoost Regressor


In [54]:
# Base XGBoost regressor for the grid search; random_state=13 makes it reproducible.

XGB = XGBRegressor(random_state=13)

### 55. Define Hyperparameter Grid for XGBoost


In [55]:
# XGBoost search space. Tree count and depth are held fixed; learning rate,
# child weight, gamma, and row/column subsampling are varied (3^5 = 243 combinations).
param_grid_XGB = dict(
    learning_rate=[0.05, 0.1, 0.2],
    n_estimators=[300],
    max_depth=[3],
    min_child_weight=[1, 2, 3],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9, 1.0],
    colsample_bytree=[0.8, 0.9, 1.0],
)

### 56. Set Up Grid Search for XGBoost


In [56]:
# Exhaustive search over param_grid_XGB with 3-fold cross-validation.
# Scoring is the negated MSE (higher is better); n_jobs=-1 uses all cores.

xgb_cv = GridSearchCV(XGB, param_grid_XGB, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)

### 57. Fit Grid Search for XGBoost


In [57]:
# Run the XGBoost grid search on the training split.

xgb_cv.fit(X_train, y_train)

### 58. Save Best XGBoost Parameters


In [83]:
# Unfitted XGBoost model with hard-coded hyperparameters, so the ensembles
# below can be rebuilt without re-running the grid search.
# NOTE(review): presumably these mirror xgb_cv.best_params_ — TODO confirm.
# The seed (42) also differs from the one used during the search (13).


XGB_best = XGBRegressor(
    learning_rate=0.05,
    n_estimators=300,
    max_depth=3,
    min_child_weight=2,
    gamma=0,
    subsample=0.9,
    colsample_bytree=0.8,
    random_state=42
)


### 59. Evaluate Best XGBoost Model (Cross-Validated RMSE)


In [58]:
# best_score_ is a negated MSE, so flip the sign and take the square root to
# get the cross-validated RMSE (log1p scale) of the best XGBoost configuration.

np.sqrt(-1 * xgb_cv.best_score_)         

0.11933683622521173

### 60. Initialize Ridge Regression Model


In [59]:
# Base ridge (L2-regularized linear) regressor with default settings; alpha and
# solver are tuned by the grid search that follows.

ridge = Ridge()

### 61. Define Hyperparameter Grid for Ridge Regression


In [60]:
# Ridge search space: regularization strength (alpha) crossed with solver choice
# (6 x 6 = 36 combinations).
param_grid_ridge = dict(
    alpha=[0.05, 0.1, 1, 3, 5, 10],
    solver=['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag'],
)

### 62. Set Up Grid Search for Ridge Regression


In [61]:
# Exhaustive search over param_grid_ridge with 5-fold cross-validation.
# Scoring is the negated MSE (higher is better); n_jobs=-1 uses all cores.


ridge_cv = GridSearchCV(ridge, param_grid_ridge, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

### 63. Fit Grid Search for Ridge Regression


In [62]:
# Run the ridge grid search on the training split.

ridge_cv.fit(X_train, y_train)

### 64. Save Best Ridge Regression Parameters


In [85]:
# Unfitted ridge model with hard-coded hyperparameters, so the ensembles below
# can be rebuilt without re-running the grid search.
# NOTE(review): presumably these mirror ridge_cv.best_params_ — TODO confirm.

Ridge_best = Ridge(
    alpha=0.1,            
    solver='auto',        
    random_state=42       
)



### 65. Evaluate Best Ridge Regression Model (Cross-Validated RMSE)


In [63]:
# best_score_ is a negated MSE, so flip the sign and take the square root to
# get the cross-validated RMSE (log1p scale) of the best ridge configuration.

np.sqrt(-1 * ridge_cv.best_score_)

0.10909197852328456

### 66. Initialize Gradient Boosting Regressor


In [64]:
# Base gradient-boosting regressor for the grid search.
# A fixed random_state (the same seed used for RFR and XGB) keeps the search
# reproducible: the grid samples features via max_features < 1, which makes
# the fit stochastic without a seed.


GBR = GradientBoostingRegressor(random_state=13)

### 67. Define Hyperparameter Grid for Gradient Boosting


In [66]:
# Gradient-boosting search space: tree depth, number of trees, minimum leaf size,
# learning rate, and the fraction of features considered per split
# (3^5 = 243 combinations).
param_grid_GBR = dict(
    max_depth=[12, 15, 20],
    n_estimators=[200, 300, 1000],
    min_samples_leaf=[10, 25, 50],
    learning_rate=[0.001, 0.01, 0.1],
    max_features=[0.01, 0.1, 0.7],
)

### 68. Set Up Grid Search for Gradient Boosting


In [67]:
 # Exhaustive search over param_grid_GBR with 5-fold cross-validation.
 # Scoring is the negated MSE (higher is better); n_jobs=-1 uses all cores.

GBR_cv = GridSearchCV(GBR, param_grid_GBR, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

### 69. Fit Grid Search for Gradient Boosting


In [68]:
# Run the gradient-boosting grid search on the training split.
# This fits every combination in param_grid_GBR once per fold, so it is the
# slowest search in the notebook.

GBR_cv.fit(X_train, y_train) 

### 70. Save Best Gradient Boosting Parameters (Manually Set)


In [69]:
# Unfitted gradient-boosting model with manually chosen ("likely optimal")
# hyperparameters, kept so the ensembles below do not depend on the long grid search.
# NOTE(review): these were set by hand — confirm against GBR_cv.best_params_.

GBR_best = GradientBoostingRegressor(
    max_depth=12,           
    n_estimators=200,       
    min_samples_leaf=10,    
    learning_rate=0.1,      
    max_features=0.7,       
    random_state=42         
)





### 71. Evaluate Best Gradient Boosting Model (Cross-Validated RMSE)


In [70]:
# best_score_ is a negated MSE, so flip the sign and take the square root to
# get the cross-validated RMSE (log1p scale) of the best gradient-boosting configuration.

np.sqrt(-1 * GBR_cv.best_score_)

0.11340695631838857

### 72. Initialize LightGBM Regressor


In [71]:
# Base LightGBM regressor with default settings; its hyperparameters are tuned
# by the grid search that follows.


lgbm_regressor = lgb.LGBMRegressor()

### 73. Define Hyperparameter Grid for LightGBM


In [72]:
# LightGBM search space: boosting type, leaves per tree, learning rate, and
# number of trees (2 x 3 x 3 x 3 = 54 combinations).
param_grid_lgbm = dict(
    boosting_type=['gbdt', 'dart'],
    num_leaves=[20, 30, 40],
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[100, 200, 300],
)

### 74. Set Up Grid Search for LightGBM


In [73]:
# Exhaustive search over param_grid_lgbm with 3-fold cross-validation.
# Scoring is the negated MSE (higher is better); n_jobs=-1 uses all cores.


lgbm_cv = GridSearchCV(lgbm_regressor, param_grid_lgbm, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)

### 75. Fit Grid Search for LightGBM


In [74]:
# Run the LightGBM grid search on the training split.

lgbm_cv.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000664 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1994
[LightGBM] [Info] Number of data points in the train set: 1151, number of used features: 112
[LightGBM] [Info] Start training from score 12.023259


### 76. Evaluate Best LightGBM Model (Cross-Validated RMSE)


In [75]:
# best_score_ is a negated MSE, so flip the sign and take the square root to
# get the cross-validated RMSE (log1p scale) of the best LightGBM configuration.

np.sqrt(-1 * lgbm_cv.best_score_)

0.12728980697034

### 77. Initialize CatBoost Regressor


In [76]:
# Base CatBoost regressor trained with the RMSE loss; verbose=False silences
# the per-iteration training log.


catboost = CatBoostRegressor(loss_function='RMSE', verbose=False)

### 78. Define Hyperparameter Grid for CatBoost


In [77]:
# CatBoost search space, kept small for speed: a fixed iteration count with two
# tree depths and two learning rates (4 combinations).
param_grid_cat = dict(
    iterations=[500],
    depth=[6, 8],
    learning_rate=[0.05, 0.1],
)

### 79. Set Up Grid Search for CatBoost


In [78]:
# Exhaustive search over param_grid_cat with only 2-fold cross-validation.
# Scoring is the negated MSE (higher is better); n_jobs=-1 uses all cores.

cat_cv = GridSearchCV(catboost, param_grid_cat, cv=2, scoring='neg_mean_squared_error', n_jobs=-1)

### 80. Fit Grid Search for CatBoost


In [79]:
# Run the CatBoost grid search (2-fold CV over param_grid_cat) on the training split.
cat_cv.fit(X_train, y_train)

**Result:**  
GridSearchCV successfully ran for the CatBoost Regressor using 2-fold cross-validation.  
It tested the parameter grid:
- `depth`: [6, 8]  
- `iterations`: [500]  
- `learning_rate`: [0.05, 0.1]  

This selected the best-scoring hyperparameters within this small grid for house price prediction.


### 81. Evaluate Best CatBoost Model (Cross-Validated RMSE)


In [80]:
# best_score_ is a negated MSE, so flip the sign and take the square root to
# get the cross-validated RMSE (log1p scale) of the best CatBoost configuration.

np.sqrt(-1 * cat_cv.best_score_)

0.1176743641321184

In [81]:
# Weighted voting ensemble built from the grid-search winners (GBR, XGBoost, Ridge).
# NOTE: `vr` is reassigned in the next cell, before it is ever fitted, so this
# definition has no effect on the results when the notebook is run top to bottom.
vr = VotingRegressor([('gbr', GBR_cv.best_estimator_),
                      ('xgb', xgb_cv.best_estimator_),
                      ('ridge', ridge_cv.best_estimator_)],
                    weights=[2,3,1])



### 83. Define Voting Regressor with Saved Parameters


In [86]:
# Weighted voting ensemble over the hard-coded models GBR_best, XGB_best and Ridge_best.
# Predictions are averaged with weights 2 : 3 : 1 (gbr : xgb : ridge).

vr = VotingRegressor([
    ('gbr', GBR_best),
    ('xgb', XGB_best),
    ('ridge', Ridge_best)
], weights=[2, 3, 1])


### 84. Fit Voting Regressor


In [87]:
# Fit the voting ensemble on the training split.
vr.fit(X_train, y_train)

**Result:**  
The Voting Regressor was successfully trained using Gradient Boosting, XGBoost, and Ridge.  
- Weights `[2, 3, 1]` emphasize XGBoost and Gradient Boosting, while including Ridge for regularization.  
- This ensemble approach balances bias and variance, enhancing model stability and predictive power.


### 85. Predict with Voting Regressor


In [88]:
# Predict the log1p-scale target for the validation rows with the voting ensemble.


y_pred_vr = vr.predict(X_test)

### 86. Evaluate Voting Regressor (Validation RMSE)


In [89]:
# Validation RMSE (log1p scale) of the voting ensemble.
# The square root is taken explicitly because the `squared=False` flag of
# mean_squared_error is deprecated and removed in newer scikit-learn releases;
# this form works on every version and matches the np.sqrt(...) cells above.

np.sqrt(mean_squared_error(y_test, y_pred_vr))

0.12120598069926954

**Result:**  
Voting Regressor achieved a validation RMSE of **0.12121**, showing competitive performance and validating the ensemble’s balanced approach.


### 87. Save Final Set of Tuned Regressors


In [90]:
# (name, model) pairs with hard-coded hyperparameters for five tree-based models.
# NOTE: `estimators` is reassigned in the next cell and is not passed to any
# model in this notebook (the stacking model uses `s_estimators`).
# NOTE(review): the XGBoost settings here (min_child_weight=1, gamma=0.1) differ
# from XGB_best (min_child_weight=2, gamma=0) — confirm which set is intended.


estimators = [
    ('gbr', GradientBoostingRegressor(
        max_depth=12, n_estimators=200, min_samples_leaf=10, 
        learning_rate=0.1, max_features=0.7, random_state=42)),
    
    ('xgb', XGBRegressor(
        learning_rate=0.05, n_estimators=300, max_depth=3, 
        min_child_weight=1, gamma=0.1, subsample=0.9, colsample_bytree=0.8, random_state=42)),
    
    ('cat', CatBoostRegressor(
        iterations=500, depth=8, learning_rate=0.1, 
        loss_function='RMSE', verbose=False, random_state=42)),
    
    ('lgb', lgb.LGBMRegressor(
        boosting_type='gbdt', num_leaves=30, learning_rate=0.05, 
        n_estimators=200, random_state=42)),
    
    ('rfr', RandomForestRegressor(
        max_depth=15, n_estimators=500, min_samples_split=3, random_state=42)),
]


### 88. Save Best Estimators from GridSearchCV


In [91]:
# (name, model) pairs taken directly from the fitted grid searches.
# This overwrites the hard-coded `estimators` list from the previous cell.
# NOTE: `estimators` is not used by any later cell shown here; the stacking
# model is built from `s_estimators` instead.

estimators = [
    ('gbr', GBR_cv.best_estimator_),
    ('xgb', xgb_cv.best_estimator_),
    ('cat', cat_cv.best_estimator_),
    ('lgb', lgbm_cv.best_estimator_),
    ('rfr', rfr_cv.best_estimator_),
]

### 89. Save Static Estimators for Deployment or Reuse


In [92]:
# Base learners for the stacking model, all with fixed hyperparameters and
# seeds so the result does not depend on re-running the grid searches.
# GBR_best and XGB_best are the same objects used in the voting ensemble `vr`;
# the CatBoost, LightGBM and random-forest models are created here.


s_estimators = [
    ('gbr', GBR_best),
    ('xgb', XGB_best),
    ('cat', CatBoostRegressor(
        iterations=500, depth=8, learning_rate=0.1, 
        loss_function='RMSE', verbose=False, random_state=42)),
    ('lgb', lgb.LGBMRegressor(
        boosting_type='gbdt', num_leaves=30, learning_rate=0.05, 
        n_estimators=200, random_state=42)),
    ('rfr', RandomForestRegressor(
        max_depth=15, n_estimators=500, min_samples_split=3, random_state=42)),
]


### 90. Initialize Stacking Regressor


In [93]:
# Stacking ensemble: the five models in s_estimators are the base learners and
# the voting regressor `vr` is the final estimator that combines their predictions.
# No `cv` argument is passed, so StackingRegressor's default cross-validation is used.


stackreg = StackingRegressor(
            estimators = s_estimators,
            final_estimator = vr
)

### 91. Fit Stacking Regressor


In [94]:
# Fit the stacking ensemble (base learners plus the voting final estimator) on the training split.

stackreg.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000828 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1994
[LightGBM] [Info] Number of data points in the train set: 1151, number of used features: 112
[LightGBM] [Info] Start training from score 12.023259
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000872 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1896
[LightGBM] [Info] Number of data points in the train set: 920, number of used features: 106
[LightGBM] [Info] Start training from score 12.032469
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000994 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enou

**Result:**  
Stacking Regressor successfully trained using 5 diverse base models and the Voting Regressor as meta-learner.  
This approach combines model diversity with ensemble stability to improve predictive performance.


### 92. Predict with Stacking Regressor


In [95]:
# Predict the log1p-scale target for the validation rows with the stacking ensemble.

y_pred_stack = stackreg.predict(X_test)

### 93. Evaluate Stacking Regressor (Validation RMSE)


In [96]:
# Validation RMSE (log1p scale) of the stacking ensemble.
# The square root is taken explicitly because the `squared=False` flag of
# mean_squared_error is deprecated and removed in newer scikit-learn releases;
# this form works on every version and matches the np.sqrt(...) cells above.

np.sqrt(mean_squared_error(y_test, y_pred_stack))

0.12608905097053247

**Result:**  
Stacking Regressor achieved a validation RMSE of **0.12609**, slightly higher than the Voting Regressor's 0.12121 on the same split.


### 94. Preprocess Final Test Data


In [97]:
# Apply the already-fitted preprocessing pipeline to the Kaggle test set
# (transform only, no refit) so it receives the same transformations as the training data.
# NOTE(review): test_df still contains 'Id', which was dropped from the training
# frame before fitting — confirm the transformer ignores this extra column.

df_test_preprocess = pipeline.transform(test_df)

### 95. Generate Final Predictions and Submission File


In [98]:
# Predict with the stacking model, undo the target transform, and write the submission CSV.


# 'SalePrice' was transformed with np.log1p before training, so np.expm1 is the
# exact inverse (np.exp would overstate every prediction by 1).
y_stacking = np.expm1(stackreg.predict(df_test_preprocess))

# Submission frame: the test-set Ids alongside the predicted prices.
df_y_stacking_out = test_df[['Id']].copy()
df_y_stacking_out.loc[:, 'SalePrice'] = y_stacking

df_y_stacking_out.to_csv('submission.csv', index=False)
