In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import warnings

In [40]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

print(f"Train data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")

Train data shape: (1460, 81)
Test data shape: (1459, 80)


In [41]:
train_df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [42]:
test_df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [43]:
train_IDs = train_df['Id']
test_IDs = test_df['Id']

In [44]:
y_train = np.log1p(train_df['SalePrice'])

In [45]:
train_features = train_df.drop(['Id', 'SalePrice'], axis=1)
test_features = test_df.drop(['Id'], axis=1)

In [46]:
all_features = pd.concat([train_features, test_features], axis=0)
print(f"Combined features shape: {all_features.shape}")

Combined features shape: (2919, 79)


In [47]:
numeric_features = all_features.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = all_features.select_dtypes(include=['object']).columns.tolist()

In [None]:
print(f"Number of numeric features: {len(numeric_features)}")
print(f"Number of categorical features: {len(categorical_features)}")

Number of numeric features: 36
Number of categorical features: 43


In [49]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

In [50]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [51]:
# Root Mean Squared Logarithmic Error (RMSLE)

def rmsle(y_true, y_pred):
    y_pred = np.maximum(0, y_pred)
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [52]:
rmsle_scorer = make_scorer(rmsle, greater_is_better=False)

In [53]:
# XGBoost Pipeline
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', xgb.XGBRegressor(objective='reg:squarederror', random_state=42))
])

# LightGBM Pipeline
lgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', lgb.LGBMRegressor(objective='regression', random_state=42))
])

# CatBoost Pipeline
cb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', cb.CatBoostRegressor(loss_function='RMSE', random_seed=42, verbose=False))
])

In [54]:
xgb_param_grid = {
    'model__n_estimators': [100, 500],
    'model__learning_rate': [0.01, 0.05],
    'model__max_depth': [3, 5],
    'model__subsample': [0.8],
    'model__colsample_bytree': [0.8]
}

lgb_param_grid = {
    'model__n_estimators': [100, 500],
    'model__learning_rate': [0.01, 0.05],
    'model__num_leaves': [31, 50],
    'model__subsample': [0.8]
}

cb_param_grid = {
    'model__iterations': [100, 500],
    'model__learning_rate': [0.01, 0.05],
    'model__depth': [6, 8]
}

In [55]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [57]:
def run_grid_search(pipeline, param_grid, name):
    print(f"\nTuning {name} model...")
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=kf,
        scoring=rmsle_scorer,
        n_jobs=-1,
        verbose=1
    )

    grid_search.fit(train_features, y_train)

    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best cross-validation score: {-grid_search.best_score_:.4f} RMSLE")

    return grid_search.best_estimator_

In [58]:
xgb_pipeline.set_params(
    model__n_estimators=500,
    model__learning_rate=0.05,
    model__max_depth=5,
    model__subsample=0.8,
    model__colsample_bytree=0.8
)
xgb_scores = cross_val_score(xgb_pipeline, train_features, y_train, cv=kf, scoring=rmsle_scorer)
print(f"XGBoost CV RMSLE: {-np.mean(xgb_scores):.4f} (±{np.std(xgb_scores):.4f})")
xgb_pipeline.fit(train_features, y_train)

XGBoost CV RMSLE: 0.1285 (±0.0196)


In [59]:
lgb_pipeline.set_params(
    model__n_estimators=500,
    model__learning_rate=0.05,
    model__num_leaves=31,
    model__subsample=0.8
)
lgb_scores = cross_val_score(lgb_pipeline, train_features, y_train, cv=kf, scoring=rmsle_scorer)
print(f"LightGBM CV RMSLE: {-np.mean(lgb_scores):.4f} (±{np.std(lgb_scores):.4f})")
lgb_pipeline.fit(train_features, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000717 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3233
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 193
[LightGBM] [Info] Start training from score 12.030658
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000480 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3233
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 196
[LightGBM] [Info] Start training from score 12.016898
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000983 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3215
[LightGBM] [Info] Number of data points in the train s

In [60]:
cb_pipeline.set_params(
    model__iterations=500,
    model__learning_rate=0.05,
    model__depth=6
)
cb_scores = cross_val_score(cb_pipeline, train_features, y_train, cv=kf, scoring=rmsle_scorer)
print(f"CatBoost CV RMSLE: {-np.mean(cb_scores):.4f} (±{np.std(cb_scores):.4f})")
cb_pipeline.fit(train_features, y_train)

CatBoost CV RMSLE: 0.1243 (±0.0166)


In [61]:
xgb_preds = np.expm1(xgb_pipeline.predict(test_features))
lgb_preds = np.expm1(lgb_pipeline.predict(test_features))
cb_preds = np.expm1(cb_pipeline.predict(test_features))