In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor

In [2]:
#Load data
train = pd.read_csv("/kaggle/input/home-data-for-ml-course/train.csv")
test = pd.read_csv("/kaggle/input/home-data-for-ml-course/test.csv")

y = train["SalePrice"]
X = train.drop(["SalePrice", "Id"], axis=1)
X_test = test.drop(["Id"], axis=1)

In [3]:
# -----------------------------
# FEATURE ENGINEERING (add here)
# -----------------------------
# Total square footage
train["TotalSF"] = train["TotalBsmtSF"] + train["1stFlrSF"] + train["2ndFlrSF"]
test["TotalSF"]  = test["TotalBsmtSF"] + test["1stFlrSF"] + test["2ndFlrSF"]

# Total bathrooms (full + half as 0.5)
train["TotalBath"] = (
    train["FullBath"] + 0.5*train["HalfBath"] +
    train["BsmtFullBath"] + 0.5*train["BsmtHalfBath"]
)
test["TotalBath"] = (
    test["FullBath"] + 0.5*test["HalfBath"] +
    test["BsmtFullBath"] + 0.5*test["BsmtHalfBath"]
)

# House age, years since remodel, garage age
train["HouseAge"] = train["YrSold"] - train["YearBuilt"]
test["HouseAge"]  = test["YrSold"] - test["YearBuilt"]

train["RemodAge"] = train["YrSold"] - train["YearRemodAdd"]
test["RemodAge"]  = test["YrSold"] - test["YearRemodAdd"]

train["GarageYrBlt"].fillna(train["YearBuilt"], inplace=True)
test["GarageYrBlt"].fillna(test["YearBuilt"], inplace=True)
train["GarageAge"] = train["YrSold"] - train["GarageYrBlt"]
test["GarageAge"]  = test["YrSold"] - test["GarageYrBlt"]

# -----------------------------
# TARGET + DESIGN MATRIX
# (for the Learn Users competition, DO NOT log-transform y)
# -----------------------------
y = train["SalePrice"]
X = train.drop(["SalePrice", "Id"], axis=1)
X_test = test.drop(["Id"], axis=1)

# -----------------------------
# PREPROCESSING (fill + encode)
# -----------------------------
for col in X.columns:
    if X[col].dtype == "object":
        X[col].fillna(X[col].mode()[0], inplace=True)
        X_test[col].fillna(X[col].mode()[0], inplace=True)
    else:
        X[col].fillna(X[col].median(), inplace=True)
        X_test[col].fillna(X[col].median(), inplace=True)

from sklearn.preprocessing import LabelEncoder
for col in X.select_dtypes(include="object").columns:
    lbl = LabelEncoder()
    lbl.fit(list(X[col]) + list(X_test[col]))
    X[col] = lbl.transform(X[col])
    X_test[col] = lbl.transform(X_test[col])


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train["GarageYrBlt"].fillna(train["YearBuilt"], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test["GarageYrBlt"].fillna(test["YearBuilt"], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on

In [4]:

# Train XGBoost with CV
model = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)


In [5]:
from xgboost import XGBRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import numpy as np

model = XGBRegressor(
    n_estimators=3000,
    learning_rate=0.03,
    max_depth=4,
    min_child_weight=1.0,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.0,
    reg_lambda=1.0,
    random_state=42,
    n_jobs=-1
)

# CV (raw SalePrice target for the Learn Users competition)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = []
for tr, va in kf.split(X):
    Xtr, Xva = X.iloc[tr], X.iloc[va]
    ytr, yva = y.iloc[tr], y.iloc[va]
    model.fit(
        Xtr, ytr,
        eval_set=[(Xva, yva)],
        early_stopping_rounds=200,
        verbose=False
    )
    preds = model.predict(Xva)
    rmse = np.sqrt(mean_squared_error(yva, preds))
    scores.append(rmse)

print("CV RMSE:", np.mean(scores))

# Train on full data and predict
model.fit(X, y, verbose=False)
preds_test = model.predict(X_test)

# sanity checks (like you did)
assert len(preds_test) == len(X_test)
assert (preds_test > 0).all()

submission = pd.DataFrame({"Id": test["Id"], "SalePrice": preds_test})
submission.to_csv("submission.csv", index=False)
print("submission.csv saved")




CV RMSE: 26877.280010964936
submission.csv saved
