In [98]:
import os
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
import time
from catboost import CatBoostRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge
from tqdm import tqdm  # ✅ Progress bar
import warnings
warnings.filterwarnings("ignore")
start_time = time.time()  # Start timer


In [99]:

# -------------------------------
# 1. Load Data (Including `training_extra.csv`)
# -------------------------------
seed = 87

train_path = 'processed/train_eng.csv'
test_path = 'processed/test_eng.csv'

df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)



In [100]:

# 2. Apply Subsampling (Faster Training)
# -------------------------------
subsample_fraction = 0.2  # Train on 50% of the data (adjust as needed)
df_train = df_train.sample(frac=subsample_fraction, random_state=seed)

# df_train['MSSubClass'] = df_train['MSSubClass'].astype(str)
# df_test['MSSubClass'] = df_test['MSSubClass'].astype(str)

# Remove Id columns
if "Id" in df_train.columns:
    df_train.drop(columns=["Id"], inplace=True)

if "Id" in df_test.columns:
    test_ids = df_test["Id"]
    df_test.drop(columns=["Id"], inplace=True)
# else:
#     raise ValueError("The test.csv file must contain an 'id' column.")


In [102]:

# -------------------------------
# 2. Data Preprocessing
# -------------------------------
categorical_cols = df_train.select_dtypes(include=["object"]).columns
numerical_cols = df_train.select_dtypes(exclude=["object"]).columns

# Remove target column from numerical columns
if "SalePrice" in numerical_cols:
    numerical_cols = numerical_cols.drop("SalePrice")

# Handle missing values
df_train[categorical_cols] = df_train[categorical_cols].fillna("Unknown")
df_test[categorical_cols] = df_test[categorical_cols].fillna("Unknown")

imputer = SimpleImputer(strategy="mean")
df_train[numerical_cols] = imputer.fit_transform(df_train[numerical_cols])
df_test[numerical_cols] = imputer.transform(df_test[numerical_cols])

# One-hot encoding (Ensures consistent feature columns)
df_train = pd.get_dummies(df_train, columns=categorical_cols, drop_first=True)
df_test = pd.get_dummies(df_test, columns=categorical_cols, drop_first=True)

# Ensure train & test have the same columns (align columns)
df_test = df_test.reindex(columns=df_train.columns.drop("SalePrice"), fill_value=0)  # ✅ Ensures matching columns


KeyError: "['MSZoning_C (all)', 'MSZoning_FV', 'MSZoning_RH', 'MSZoning_RL', 'MSZoning_RM', 'Street_Grvl', 'Street_Pave', 'LotShape_IR1', 'LotShape_IR2', 'LotShape_IR3', 'LotShape_Reg', 'LandContour_Bnk', 'LandContour_HLS', 'LandContour_Low', 'LandContour_Lvl', 'Utilities_AllPub', 'Utilities_NoSeWa', 'LotConfig_Corner', 'LotConfig_CulDSac', 'LotConfig_FR2', 'LotConfig_FR3', 'LotConfig_Inside', 'LandSlope_Gtl', 'LandSlope_Mod', 'LandSlope_Sev', 'Neighborhood_Blmngtn', 'Neighborhood_Blueste', 'Neighborhood_BrDale', 'Neighborhood_BrkSide', 'Neighborhood_ClearCr', 'Neighborhood_CollgCr', 'Neighborhood_Crawfor', 'Neighborhood_Edwards', 'Neighborhood_Gilbert', 'Neighborhood_IDOTRR', 'Neighborhood_MeadowV', 'Neighborhood_Mitchel', 'Neighborhood_NAmes', 'Neighborhood_NPkVill', 'Neighborhood_NWAmes', 'Neighborhood_NoRidge', 'Neighborhood_NridgHt', 'Neighborhood_OldTown', 'Neighborhood_SWISU', 'Neighborhood_Sawyer', 'Neighborhood_SawyerW', 'Neighborhood_Somerst', 'Neighborhood_StoneBr', 'Neighborhood_Timber', 'Neighborhood_Veenker', 'Condition1_Artery', 'Condition1_Feedr', 'Condition1_Norm', 'Condition1_PosA', 'Condition1_PosN', 'Condition1_RRAe', 'Condition1_RRAn', 'Condition1_RRNe', 'Condition1_RRNn', 'Condition2_Artery', 'Condition2_Feedr', 'Condition2_Norm', 'Condition2_PosA', 'Condition2_PosN', 'Condition2_RRAe', 'Condition2_RRAn', 'Condition2_RRNn', 'BldgType_1Fam', 'BldgType_2fmCon', 'BldgType_Duplex', 'BldgType_Twnhs', 'BldgType_TwnhsE', 'HouseStyle_1.5Fin', 'HouseStyle_1.5Unf', 'HouseStyle_1Story', 'HouseStyle_2.5Fin', 'HouseStyle_2.5Unf', 'HouseStyle_2Story', 'HouseStyle_SFoyer', 'HouseStyle_SLvl', 'RoofStyle_Flat', 'RoofStyle_Gable', 'RoofStyle_Gambrel', 'RoofStyle_Hip', 'RoofStyle_Mansard', 'RoofStyle_Shed', 'RoofMatl_CompShg', 'RoofMatl_Membran', 'RoofMatl_Metal', 'RoofMatl_Roll', 'RoofMatl_Tar&Grv', 'RoofMatl_WdShake', 'RoofMatl_WdShngl', 'Exterior1st_AsbShng', 'Exterior1st_AsphShn', 'Exterior1st_BrkComm', 'Exterior1st_BrkFace', 'Exterior1st_CBlock', 'Exterior1st_CemntBd', 'Exterior1st_HdBoard', 'Exterior1st_ImStucc', 'Exterior1st_MetalSd', 'Exterior1st_Plywood', 'Exterior1st_Stone', 'Exterior1st_Stucco', 'Exterior1st_VinylSd', 'Exterior1st_Wd Sdng', 'Exterior1st_WdShing', 'Exterior2nd_AsbShng', 'Exterior2nd_AsphShn', 'Exterior2nd_Brk Cmn', 'Exterior2nd_BrkFace', 'Exterior2nd_CBlock', 'Exterior2nd_CmentBd', 'Exterior2nd_HdBoard', 'Exterior2nd_ImStucc', 'Exterior2nd_MetalSd', 'Exterior2nd_Other', 'Exterior2nd_Plywood', 'Exterior2nd_Stone', 'Exterior2nd_Stucco', 'Exterior2nd_VinylSd', 'Exterior2nd_Wd Sdng', 'Exterior2nd_Wd Shng', 'ExterQual_Ex', 'ExterQual_Fa', 'ExterQual_Gd', 'ExterQual_TA', 'ExterCond_Ex', 'ExterCond_Fa', 'ExterCond_Gd', 'ExterCond_Po', 'ExterCond_TA', 'Foundation_BrkTil', 'Foundation_CBlock', 'Foundation_PConc', 'Foundation_Slab', 'Foundation_Stone', 'Foundation_Wood', 'BsmtQual_Ex', 'BsmtQual_Fa', 'BsmtQual_Gd', 'BsmtQual_TA', 'BsmtQual_nan', 'BsmtCond_Fa', 'BsmtCond_Gd', 'BsmtCond_Po', 'BsmtCond_TA', 'BsmtCond_nan', 'BsmtExposure_Av', 'BsmtExposure_Gd', 'BsmtExposure_Mn', 'BsmtExposure_No', 'BsmtExposure_nan', 'BsmtFinType1_ALQ', 'BsmtFinType1_BLQ', 'BsmtFinType1_GLQ', 'BsmtFinType1_LwQ', 'BsmtFinType1_Rec', 'BsmtFinType1_Unf', 'BsmtFinType1_nan', 'BsmtFinType2_ALQ', 'BsmtFinType2_BLQ', 'BsmtFinType2_GLQ', 'BsmtFinType2_LwQ', 'BsmtFinType2_Rec', 'BsmtFinType2_Unf', 'BsmtFinType2_nan', 'Heating_Floor', 'Heating_GasA', 'Heating_GasW', 'Heating_Grav', 'Heating_OthW', 'Heating_Wall', 'HeatingQC_Ex', 'HeatingQC_Fa', 'HeatingQC_Gd', 'HeatingQC_Po', 'HeatingQC_TA', 'CentralAir_N', 'CentralAir_Y', 'Electrical_FuseA', 'Electrical_FuseF', 'Electrical_FuseP', 'Electrical_Mix', 'Electrical_SBrkr', 'Electrical_nan', 'KitchenQual_Ex', 'KitchenQual_Fa', 'KitchenQual_Gd', 'KitchenQual_TA', 'Functional_Maj1', 'Functional_Maj2', 'Functional_Min1', 'Functional_Min2', 'Functional_Mod', 'Functional_Sev', 'Functional_Typ', 'GarageType_2Types', 'GarageType_Attchd', 'GarageType_Basment', 'GarageType_BuiltIn', 'GarageType_CarPort', 'GarageType_Detchd', 'GarageType_nan', 'GarageFinish_Fin', 'GarageFinish_RFn', 'GarageFinish_Unf', 'GarageFinish_nan', 'GarageQual_Ex', 'GarageQual_Fa', 'GarageQual_Gd', 'GarageQual_Po', 'GarageQual_TA', 'GarageQual_nan', 'GarageCond_Ex', 'GarageCond_Fa', 'GarageCond_Gd', 'GarageCond_Po', 'GarageCond_TA', 'GarageCond_nan', 'PavedDrive_N', 'PavedDrive_P', 'PavedDrive_Y', 'SaleType_COD', 'SaleType_CWD', 'SaleType_Con', 'SaleType_ConLD', 'SaleType_ConLI', 'SaleType_ConLw', 'SaleType_New', 'SaleType_Oth', 'SaleType_WD', 'SaleCondition_Abnorml', 'SaleCondition_AdjLand', 'SaleCondition_Alloca', 'SaleCondition_Family', 'SaleCondition_Normal', 'SaleCondition_Partial'] not in index"

In [None]:

# -------------------------------
# 3. Remove Outliers: Fashion Backpacks & IQR Filtering
# -------------------------------
# df_train = df_train[(df_train["Weight"] >= 0.75) | (df_train["SalePrice"] > df_train["SalePrice"].median())]

# Q1 = df_train["SalePrice"].quantile(0.25)
# Q3 = df_train["SalePrice"].quantile(0.75)
# IQR = Q3 - Q1
# lower_bound = Q1 - 1.5 * IQR  # Adjust if needed (1.5 → 2.0 for stricter filtering)
# upper_bound = Q3 + 1.5 * IQR
# df_train = df_train[(df_train["SalePrice"] >= lower_bound) & (df_train["SalePrice"] <= upper_bound)]


In [None]:

# -------------------------------
# 4. Define Features & Target Variable
# -------------------------------
X_train = df_train.drop(columns=["SalePrice"])  # ✅ Removes 'Price' to prevent feature mismatch
y_train = df_train["SalePrice"].astype(float)
X_test = df_test.copy()  # ✅ Ensures same columns

# # Standardization
# scaler = StandardScaler()
# X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
# X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])


In [None]:

# -------------------------------
# 5. Feature Engineering
# -------------------------------
# skewed_features = ["Weight", "Compartments"]
# for col in skewed_features:
#     X_train[col] = X_train[col].clip(lower=0)  # ✅ Replace negative values with 0
#     X_train[col].fillna(0, inplace=True)  # ✅ Replace NaNs with 0
#     X_train[col + "_log"] = np.log1p(X_train[col])  # ✅ Apply log1p safely

#     X_test[col] = X_test[col].clip(lower=0)  # ✅ Replace negative values with 0
#     X_test[col].fillna(0, inplace=True)  # ✅ Replace NaNs with 0
#     X_test[col + "_log"] = np.log1p(X_test[col])  # ✅ Apply log1p safely

# X_train["Weight_Compartments"] = X_train["Weight"] * X_train["Compartments"]
# X_test["Weight_Compartments"] = X_test["Weight"] * X_test["Compartments"]


In [None]:
# Fix objects in X_test
for col in X_test.columns:
    if X_test[col].dtype == "object":
        X_test[col] = X_test[col].astype(float)


In [None]:
# check if data is prepared for cat boost
print(X_train.dtypes.value_counts())
cat_features = X_train.select_dtypes(include=['object']).columns
print(cat_features)

In [None]:

# -------------------------------
# 6. Train Models with Hardcoded Best Parameters
# -------------------------------

## 🔹 XGBoost Best Parameters
# xgb_best = xgb.XGBRegressor(
#     objective="reg:squarederror",
#     subsample=0.8,
#     n_estimators=1000,
#     max_depth=4,
#     learning_rate=0.01,
#     colsample_bytree=0.6,
#     tree_method="gpu_hist",
#     random_state=seed
# )
# xgb_best.fit(X_train, y_train)

# ## 🔹 LightGBM Best Parameters
# use_gpu = lgb.__version__ >= "3.2.0"
# lgb_best = lgb.LGBMRegressor(
#     num_leaves=50,
#     n_estimators=500,
#     learning_rate=0.01,
#     subsample=0.8,
#     device="gpu" if use_gpu else "cpu",
#     random_state=seed
# )
# lgb_best.fit(X_train, y_train)

## 🔹 CatBoost (Default Manually Set)
cat_model = CatBoostRegressor(iterations=1000, 
                              learning_rate=0.05, 
                              depth=6, random_state=seed, 
                              verbose=False)
cat_model.fit(X_train, y_train)

# -------------------------------
# 7. Use Stacking for Final Predictions
# -------------------------------
# stacked_model = StackingRegressor(
#     estimators=[
#         # ("xgb", xgb_best),
#         # ("lgb", lgb_best),
#         ("cat", cat_model),
#     ],
#     # final_estimator=Ridge()
# )
# stacked_model.fit(X_train, y_train)

# -------------------------------
# 8. Evaluate RMSE for Each Model
# -------------------------------
def evaluate_model(model, name):
    """Calculate and print RMSE for a given model."""
    y_pred = model.predict(X_train)
    rmse = np.sqrt(mean_squared_error(y_train, y_pred))
    print(f"📊 {name} RMSE: {rmse:.4f}")
    return rmse

# xgb_rmse = evaluate_model(xgb_best, "XGBoost")
# lgb_rmse = evaluate_model(lgb_best, "LightGBM")
cat_rmse = evaluate_model(cat_model, "CatBoost")
# stacked_rmse = evaluate_model(stacked_model, "Stacked Model")

# Print all RMSEs
# print("\n📈 RMSE Comparison:")
# print(f"🔹 XGBoost RMSE: {xgb_rmse:.4f}")
# print(f"🔹 LightGBM RMSE: {lgb_rmse:.4f}")
print(f"🔹 CatBoost RMSE: {cat_rmse:.4f}")
# print(f"🔥 Stacked Model RMSE: {stacked_rmse:.4f}")

# -------------------------------
# 9. Make Predictions & Save Submission
# -------------------------------
# y_pred = stacked_model.predict(X_test)
y_pred = cat_model.predict(X_test)

submission = pd.DataFrame({"Id": test_ids, "SalePrice": y_pred})



submission.to_csv("submission.csv", index=False)

print("\n✅ Predictions saved to 'submission.csv' with correct test ids.")
# -------------------------------
# 14. Print Best Hyperparameters
# -------------------------------
print("\n🔍 Best Hyperparameters (Hardcoded):")
# print("📌 XGBoost Best Params: {'subsample': 0.8, 'n_estimators': 1000, 'max_depth': 4, 'learning_rate': 0.01, 'colsample_bytree': 0.6}")
# print("📌 LightGBM Best Params: {'num_leaves': 50, 'n_estimators': 500, 'learning_rate': 0.01}")
print("📌 CatBoost Params: Default (Manually Set)")
end_time = time.time()  # End timer
total_time = end_time - start_time
print(f"\n⏳ Total Runtime: {total_time:.2f} seconds")