In [74]:
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [75]:
df_cleaned = pd.read_csv('./data/cleaned_train.csv')
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1357 entries, 0 to 1356
Columns: 248 entries, Id to SaleCondition_Partial
dtypes: bool(207), float64(3), int64(38)
memory usage: 709.1 KB


In [76]:
df_test = pd.read_csv('./data/cleaned_test.csv')
df_test.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1461,20,80.0,11622,5,6,1961,1961,0.0,468.0,...,False,False,False,False,True,False,False,False,True,False
1,1462,20,81.0,14267,6,6,1958,1958,108.0,923.0,...,False,False,False,False,True,False,False,False,True,False
2,1463,60,74.0,13830,5,5,1997,1998,0.0,791.0,...,False,False,False,False,True,False,False,False,True,False
3,1464,60,78.0,9978,6,6,1998,1998,20.0,602.0,...,False,False,False,False,True,False,False,False,True,False
4,1465,120,43.0,5005,8,5,1992,1992,0.0,263.0,...,False,False,False,False,True,False,False,False,True,False


In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor

# ---- Features & data ----
features = [
    "OverallQual",
    "GrLivArea",
    "GarageCars",
    "GarageArea",
    "TotalBsmtSF",
    "1stFlrSF",
    "FullBath",
    "TotRmsAbvGrd",
    "YearBuilt",
    "YearRemodAdd",
    "MasVnrArea",
    "Fireplaces",
    "HasMasVnr",
    "BsmtFinSF1",
    "2ndFlrSF",
    "WoodDeckSF",
    "LotFrontage",
    "OpenPorchSF",
    "HalfBath",
    "LotArea",
    "GarageYrBlt",
    "GarageYrBltMissing",
    "HasGarage",
    "BsmtUnfSF",
    "BedroomAbvGr",
    "EnclosedPorch",
    "KitchenAbvGr",
    "ScreenPorch",
    "PoolArea",
    "OverallCond",
    "MSSubClass",
    "3SsnPorch",
    "MoSold",
    "LowQualFinSF",
    "YrSold",
    "BsmtHalfBath",
    "MiscVal",
    "BsmtFinSF2"
]

columns = features[:16]

X = df_cleaned[columns]
y = df_cleaned["SalePrice"]

X_train_val, X_test_val, y_train_val, y_test_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ---- Base pipeline ----
pipe = Pipeline([
    ("gboost", GradientBoostingRegressor(random_state=45))
])

# ---- GridSearch over GBoost hyperparameters ----
param_grid = {
    "gboost__n_estimators": [500, 1000, 1500],
    "gboost__learning_rate": [0.01, 0.03, 0.1],
    "gboost__max_depth": [3, 4, 5],
    "gboost__subsample": [0.8, 1.0],
    "gboost__min_samples_leaf": [1, 3, 5],
}

grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    scoring="neg_mean_squared_error",  # or "r2"
    n_jobs=-1,
    verbose=2,
)

# Fit GridSearch on training part
grid_search.fit(X_train_val, y_train_val)

print("Best params:", grid_search.best_params_)
print("Best CV score (RMSE):", (-grid_search.best_score_)**0.5)

# ---- Evaluate best model on held-out validation set ----
best_model = grid_search.best_estimator_

y_pred_val = best_model.predict(X_test_val)

val_mse = mean_squared_error(y_test_val, y_pred_val)
val_r2 = r2_score(y_test_val, y_pred_val)

print(f"Validation R-squared: {val_r2:.4f}")
print(f"Validation MSE: {val_mse:.2f}")
print(f"Validation RMSE: {val_mse**0.5:.2f}")

# Refit best model on ALL available labeled data
best_model.fit(X, y)

# ---- Predict on test data ----
X_test_final = df_test[columns]
test_predictions = best_model.predict(X_test_final)

df_test["SalePrice_Pred"] = test_predictions
print(df_test[["SalePrice_Pred"]].head())

submission = pd.DataFrame({
    "Id": df_test["Id"],
    "SalePrice": test_predictions
})

submission.to_csv("results/submission.csv", index=False)
print(len(submission))

Fitting 5 folds for each of 162 candidates, totalling 810 fits
[CV] END gboost__learning_rate=0.01, gboost__max_depth=3, gboost__min_samples_leaf=1, gboost__n_estimators=500, gboost__subsample=0.8; total time=   0.6s
[CV] END gboost__learning_rate=0.01, gboost__max_depth=3, gboost__min_samples_leaf=1, gboost__n_estimators=500, gboost__subsample=0.8; total time=   0.6s
[CV] END gboost__learning_rate=0.01, gboost__max_depth=3, gboost__min_samples_leaf=1, gboost__n_estimators=500, gboost__subsample=0.8; total time=   0.6s
[CV] END gboost__learning_rate=0.01, gboost__max_depth=3, gboost__min_samples_leaf=1, gboost__n_estimators=500, gboost__subsample=0.8; total time=   0.6s
[CV] END gboost__learning_rate=0.01, gboost__max_depth=3, gboost__min_samples_leaf=1, gboost__n_estimators=500, gboost__subsample=0.8; total time=   0.6s
[CV] END gboost__learning_rate=0.01, gboost__max_depth=3, gboost__min_samples_leaf=1, gboost__n_estimators=500, gboost__subsample=1.0; total time=   0.7s
[CV] END gboo