# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb

# ======================
# 1) Load cleaned dataset
# ======================
df = pd.read_csv("../data/train_cleaned.csv")

# Separate the regression target from the remaining feature columns.
y = df["SalePrice"]
X = df.drop(columns=["SalePrice"])

# Hold out 20% of the rows for validation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Training set:", X_train.shape, "Validation set:", X_val.shape)

# ======================
# 2) Train XGBoost model
# ======================
# `early_stopping_rounds` is configured on the estimator rather than passed
# to `fit()`: the `fit()` keyword was deprecated in XGBoost 1.6 and removed
# in 2.0, where it raises a TypeError. Requires xgboost >= 1.6.
xgb_model = xgb.XGBRegressor(
    n_estimators=1000,          # upper bound on the number of trees
    learning_rate=0.05,         # step size
    max_depth=4,                # depth of each tree
    subsample=0.8,              # fraction of samples used per tree
    colsample_bytree=0.8,       # fraction of features used per tree
    early_stopping_rounds=50,   # stop if no improvement on eval_set
    random_state=42,
    n_jobs=-1
)

# NOTE: the validation split doubles as the early-stopping monitor here, so
# any score later computed on (X_val, y_val) is somewhat optimistic.
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=100                 # log the eval metric every 100 rounds
)

# ======================
# 3) Evaluate model
# ======================
# Score the fitted model on the held-out validation rows and report the
# root-mean-squared error (square root of the MSE).
y_pred = xgb_model.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print("XGBoost RMSE:", rmse)

# ======================
# 4) Save model (optional)
# ======================
# Persist the trained booster next to the source code; the .json extension
# selects XGBoost's JSON model format.
xgb_model.save_model(fname="../src/xgb_model.json")
print("✅ XGBoost model saved to ../src/xgb_model.json")
