In [1]:
import pandas as pd
import xgboost as xgb

# ======================
# 1) Load datasets
# ======================
# Read the pre-cleaned training frame and the raw test frame from disk.
train = pd.read_csv("../data/train_cleaned.csv")
test = pd.read_csv("../data/test.csv")

# Split the training frame: every column except the target is a feature,
# and "SalePrice" is the regression target.
X = train.drop(columns=["SalePrice"])
y = train["SalePrice"]

# ======================
# 2) Prepare test dataset
# ======================
# Fill numeric missing values with median (computed per column, vectorised).
# NOTE(review): the medians come from the test frame itself; confirm this is
# consistent with however train_cleaned.csv was imputed.
num_cols = test.select_dtypes(include=["number"]).columns
test[num_cols] = test[num_cols].fillna(test[num_cols].median())

# Fill categorical missing values with an explicit "Missing" level.
# NOTE(review): verify train_cleaned.csv uses the same placeholder; a different
# label there would make "Missing" an unseen category at prediction time.
cat_cols = test.select_dtypes(include=["object"]).columns
test[cat_cols] = test[cat_cols].fillna("Missing")

# One-hot encoding
# The training frame defines the feature space. drop_first=True removes one
# baseline level per categorical column, chosen from the levels present in X.
X_encoded = pd.get_dummies(X, drop_first=True)

# The test frame is encoded WITHOUT drop_first. Dropping the first level
# independently on each frame is unsafe: when the test data lacks the training
# baseline level, a *different* level gets dropped there, and after alignment
# those rows would silently be encoded as the training baseline category.
# Keeping every test indicator and letting the reindex below discard the ones
# the model never saw reproduces the training encoding exactly.
test_encoded = pd.get_dummies(test)

# Align columns: keep exactly the training columns, in training order.
# Columns that exist only in the test frame are discarded; training columns
# absent from the test frame are created and zero-filled.
test_encoded = test_encoded.reindex(columns=X_encoded.columns).fillna(0)

# Convert to DMatrix
dtrain = xgb.DMatrix(X_encoded, label=y)
dtest = xgb.DMatrix(test_encoded)

# ======================
# 3) Train final XGBoost model
# ======================
# Booster configuration: squared-error regression reported as RMSE, with a
# small learning rate, shallow trees, and 80% row/column subsampling.
params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "eta": 0.05,
    "max_depth": 4,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "seed": 42,
}

# Fit on the full training matrix for a fixed number of rounds; 313 is noted
# by the author as the best iteration found previously.
final_model = xgb.train(params, dtrain, num_boost_round=313)

# ======================
# 4) Predict on test set
# ======================
# Score the aligned test matrix with the trained booster.
predictions = final_model.predict(dtest)

# ======================
# 5) Save submission
# ======================
# Two-column frame: the test "Id" values paired with the predicted price.
submission = test[["Id"]].assign(SalePrice=predictions)

# Write without the pandas index so the file holds only Id and SalePrice.
submission.to_csv("../data/submission_xgboost.csv", index=False)
print("✅ Saved submission file to ../data/submission_xgboost.csv")


✅ Saved submission file to ../data/submission_xgboost.csv
