In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# ======================
# 1) Load cleaned train and test datasets
# ======================
train = pd.read_csv("../data/train_cleaned.csv")
test = pd.read_csv("../data/test.csv")

# ======================
# 2) Prepare test dataset (same cleaning as train)
# ======================
# Fill numeric missing values with median
num_cols = test.select_dtypes(include=["number"]).columns
for col in num_cols:
    test[col] = test[col].fillna(test[col].median())

# Fill categorical missing values with "Missing"
cat_cols = test.select_dtypes(include=["object"]).columns
for col in cat_cols:
    test[col] = test[col].fillna("Missing")

# One-hot encoding
train_encoded = pd.get_dummies(train, drop_first=True)
test_encoded = pd.get_dummies(test, drop_first=True)

# Align columns between train and test
train_encoded, test_encoded = train_encoded.align(test_encoded, join="left", axis=1)
test_encoded = test_encoded.fillna(0)

# ======================
# 3) Train model on full training set
# ======================
X = train_encoded.drop("SalePrice", axis=1)
y = train_encoded["SalePrice"]

rf = RandomForestRegressor(n_estimators=200, max_depth=20, random_state=42)
rf.fit(X, y)

# ======================
# 4) Predict on test set
# ======================
predictions = rf.predict(test_encoded)

# ======================
# 5) Create submission file
# ======================
submission = pd.DataFrame({
    "Id": test["Id"],
    "SalePrice": predictions
})

submission.to_csv("../data/submission.csv", index=False)
print("Saved submission file to ../data/submission.csv")


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- SalePrice
