# House Prices — ChatGPT Workflow (Colab)
This notebook recreates the full assignment pipeline:
- Kaggle auth (optional) and dataset download
- Preprocessing (imputation, scaling, one-hot encoding)
- Baseline **RandomForest** (CV)
- **XGBoost** (CV)
- Save submissions + experiment log
- (Optional) Submit to Kaggle from Colab

> Tip: If you already have `data/train.csv` and `data/test.csv`, you can **skip** the Kaggle cells and run from “Load data & common setup”.


In [None]:
# (Optional) Install tools
!pip -q install kaggle xgboost
print("Installed kaggle & xgboost (if not already).")

## Kaggle credentials (optional)

In [None]:
# Run this to set Kaggle API creds (or skip if data already present).
# The key is read with getpass so it is never echoed into the notebook output.
import json, os, pathlib, getpass

# Resolve the config dir from the current user's home instead of hard-coding /root
# (on Colab, where the notebook runs as root, this is the same /root/.kaggle as before).
cfg_dir = pathlib.Path.home() / ".kaggle"
cfg_path = cfg_dir / "kaggle.json"

u = input("Kaggle username (leave blank to skip): ").strip()
k = getpass.getpass("Kaggle API key (hidden): ").strip() if u else ""
if u and k:
    cfg_dir.mkdir(parents=True, exist_ok=True)
    # Create the file owner-only from the start so the key is never briefly readable by others.
    fd = os.open(cfg_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, "w") as f:
        json.dump({"username": u, "key": k}, f)
    # The mode given to os.open only applies to newly created files; tighten a pre-existing one too.
    os.chmod(cfg_path, 0o600)
    print(f"✅ Saved {cfg_path}")
elif u:
    # A username without a key would produce an unusable kaggle.json, so write nothing.
    print("Empty API key — nothing saved.")
else:
    print("Skipping Kaggle setup.")

## Download House Prices data (requires joining the competition on Kaggle)

In [None]:
# If you see 403, open the comp page in a browser & click **Join/Accept Rules** with the SAME account as your token.
import subprocess
import zipfile
from pathlib import Path

COMPETITION = "house-prices-advanced-regression-techniques"
data_dir = Path("data")
data_dir.mkdir(parents=True, exist_ok=True)


def run_kaggle(*args):
    """Run `kaggle <args>`, echo what it printed, and raise CalledProcessError on a non-zero exit.

    The exit status is checked explicitly so that a failed command (e.g. a 403) actually
    reaches the `except` branch below instead of being silently ignored.
    """
    done = subprocess.run(["kaggle", *args], capture_output=True, text=True)
    for stream in (done.stdout, done.stderr):
        if stream.strip():
            print(stream.strip())
    done.check_returncode()


try:
    # list files (works only if joined)
    print("Listing competition files…")
    run_kaggle("competitions", "files", "-c", COMPETITION)
    # download + unzip
    print("Downloading…")
    run_kaggle("competitions", "download", "-c", COMPETITION, "-p", str(data_dir), "-q")
    # Extract with the standard library so no external `unzip` binary is required;
    # existing files are overwritten.
    with zipfile.ZipFile(data_dir / f"{COMPETITION}.zip") as archive:
        archive.extractall(data_dir)
    print("Data files:", sorted(p.name for p in data_dir.iterdir()))
except (subprocess.CalledProcessError, FileNotFoundError, zipfile.BadZipFile) as e:
    # FileNotFoundError covers both a missing `kaggle` executable and a missing zip archive.
    print(f"Download failed: {e}")
    print("If this failed with 403, join the competition then re-run this cell. Otherwise, you can upload train/test manually in the next cell.")

## (Fallback) Upload `train.csv` and `test.csv` manually

In [None]:
import os
import pathlib
import shutil

# Fallback for when the Kaggle download was skipped or failed: ask for the two CSVs
# via the Colab upload widget and file them under data/ with their canonical names.
DATA = pathlib.Path("data"); DATA.mkdir(exist_ok=True)
need_train = not (DATA/'train.csv').exists()
need_test  = not (DATA/'test.csv').exists()
if need_train or need_test:
    # Import google.colab only when an upload is really needed, so this cell still runs
    # outside Colab whenever both files are already in place.
    try:
        from google.colab import files
    except ImportError as exc:
        raise ImportError(
            "google.colab is unavailable (not running in Colab?); "
            "copy train.csv and test.csv into data/ manually."
        ) from exc
    print("Upload train.csv and test.csv now…")
    up = files.upload()
    # Each uploaded file is moved by its bare name, i.e. it is expected in the working directory.
    for name in up:
        low = name.lower()
        if not low.endswith(".csv"):
            continue
        if low.startswith("train"): shutil.move(name, DATA/"train.csv")
        elif low.startswith("test"): shutil.move(name, DATA/"test.csv")
    # Say so right away if the upload did not supply both files, rather than failing later on read.
    still_missing = [f for f in ("train.csv", "test.csv") if not (DATA/f).exists()]
    if still_missing:
        print("⚠️ Still missing after upload:", still_missing)
print("Data files:", os.listdir("data"))

## Load data & common setup

In [None]:
import numpy as np, pandas as pd
from pathlib import Path
from sklearn.model_selection import KFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import make_scorer, mean_squared_log_error

# Load the competition data and build the preprocessing shared by every model below.
DATA = Path("data")
train = pd.read_csv(DATA/"train.csv")
test  = pd.read_csv(DATA/"test.csv")

TARGET = "SalePrice"
ID_COL = "Id"
y = train[TARGET].values
# Id is a row identifier, not a property of the house: keep it out of the features so the
# models cannot split on it. `test` keeps its Id column for the submission files; the
# ColumnTransformer below selects columns by name, so that extra column is simply unused.
X = train.drop(columns=[TARGET, ID_COL], errors="ignore")

# Numeric columns are imputed + scaled; everything else is treated as categorical.
num_cols = X.select_dtypes(include=np.number).columns.tolist()
cat_cols = [c for c in X.columns if c not in num_cols]

# OneHotEncoder: handle both old/new sklearn
# (newer releases take `sparse_output`, older ones only know `sparse`).
try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=True)

preprocess = ColumnTransformer([
    # with_mean=False: scale without centering
    ("num", Pipeline([("imputer", SimpleImputer(strategy="median")),
                      ("scaler", StandardScaler(with_mean=False))]), num_cols),
    # handle_unknown="ignore": categories unseen during fit do not raise at predict time
    ("cat", Pipeline([("imputer", SimpleImputer(strategy="most_frequent")),
                      ("ohe", ohe)]), cat_cols),
])

def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error between targets and predictions, as a float.

    Predictions below zero are raised to 0 before scoring (presumably because the
    log-based metric is not defined for negative values).
    """
    clipped_pred = np.maximum(0, y_pred)
    msle = mean_squared_log_error(y_true, clipped_pred)
    return float(np.sqrt(msle))

# Shuffled 5-fold splitter with a fixed seed, so repeated runs use the same folds.
cv = KFold(shuffle=True, n_splits=5, random_state=2025)
# Lower RMSLE is better, hence greater_is_better=False: the scorer yields negated values
# and the cells that use it flip the sign back.
rmsle_scorer = make_scorer(rmsle, greater_is_better=False)
print("Rows:", train.shape, "| Test rows:", test.shape)

## Baseline: RandomForest (CV → submission)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate
from pathlib import Path

# Baseline model: preprocessing + a 600-tree random forest with a fixed seed.
rf = RandomForestRegressor(n_estimators=600, random_state=2025, n_jobs=-1)
pipe_rf = Pipeline([("prep", preprocess), ("model", rf)])

# A single cross_validate call scores both metrics on the same fitted folds, instead of
# refitting the whole forest once per metric with two separate cross_val_score runs.
cv_rf = cross_validate(
    pipe_rf, X, y, cv=cv, n_jobs=-1,
    scoring={"rmse": "neg_root_mean_squared_error", "rmsle": rmsle_scorer},
)
# Both scorers report negated errors; flip the sign to get plain error values.
scores_rmse = -cv_rf["test_rmse"]
scores_rmsle = -cv_rf["test_rmsle"]
print(f"RF CV RMSE : {scores_rmse.mean():.2f} ± {scores_rmse.std():.2f}")
print(f"RF CV RMSLE: {scores_rmsle.mean():.5f} ± {scores_rmsle.std():.5f}")

# Refit on all training rows, then predict the test set for the submission.
pipe_rf.fit(X, y)
preds_rf = pipe_rf.predict(test)

OUT = Path("outputs"); OUT.mkdir(exist_ok=True)
# test["Id"] (not .get): a submission without ids is unusable, so fail loudly if the column is absent.
pd.DataFrame({"Id": test["Id"], "SalePrice": preds_rf}).to_csv(OUT/"submission.csv", index=False)
# This (re)starts the experiment log with the baseline row.
pd.DataFrame([{"model":"rf_baseline","cv_rmse":round(float(scores_rmse.mean()),2),"cv_rmsle":round(float(scores_rmsle.mean()),5),"note":"5-fold"}]).to_csv(OUT/"experiment_log.csv", index=False)
print("Wrote:", list(OUT.iterdir()))

## XGBoost baseline (CV → submission)

In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import cross_validate

# Gradient-boosted trees on the same preprocessing and the same CV folds as the RF baseline.
xgb = XGBRegressor(
    n_estimators=1200, learning_rate=0.04, max_depth=4,
    subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0,
    random_state=2025, tree_method="hist"
)
pipe_xgb = Pipeline([("prep", preprocess), ("model", xgb)])

# Score RMSE and RMSLE from one set of fold fits (both scorers return negated errors).
cv_xgb = cross_validate(
    pipe_xgb, X, y, cv=cv, n_jobs=-1,
    scoring={"rmse": "neg_root_mean_squared_error", "rmsle": rmsle_scorer},
)
scores_xgb_rmse = -cv_xgb["test_rmse"]
scores_xgb = -cv_xgb["test_rmsle"]
print(f"XGB CV RMSE : {scores_xgb_rmse.mean():.2f} ± {scores_xgb_rmse.std():.2f}")
print(f"XGB CV RMSLE: {scores_xgb.mean():.5f} ± {scores_xgb.std():.5f}")

pipe_xgb.fit(X, y)
# Clip at 0 so the submitted values match what the CV metric evaluated (rmsle clips the same way).
preds_xgb = np.maximum(0, pipe_xgb.predict(test))

# Define the output directory here too, so this cell does not depend on the RF cell having run.
OUT = Path("outputs"); OUT.mkdir(exist_ok=True)
# test["Id"] (not .get): a submission without ids is unusable, so fail loudly if the column is absent.
pd.DataFrame({"Id": test["Id"], "SalePrice": preds_xgb}).to_csv(OUT/"submission_xgb.csv", index=False)

# Record this run in the experiment log. Any earlier "xgb_baseline" row is replaced,
# so re-running the cell does not pile up duplicates.
log_path = OUT/"experiment_log.csv"
xgb_entry = {
    "model": "xgb_baseline",
    "cv_rmse": round(float(scores_xgb_rmse.mean()), 2),
    "cv_rmsle": round(float(scores_xgb.mean()), 5),
    "note": "5-fold",
}
earlier_entries = pd.read_csv(log_path).to_dict("records") if log_path.exists() else []
log_entries = [row for row in earlier_entries if row.get("model") != "xgb_baseline"] + [xgb_entry]
pd.DataFrame(log_entries).to_csv(log_path, index=False)
print("Wrote:", list(OUT.iterdir()))

## (Optional) Submit to Kaggle

In [None]:
# Submit a file to Kaggle (requires that you joined the competition and set up kaggle.json).
# As written this uploads outputs/submission.csv, the file produced by the RandomForest cell,
# with the message "colab run". Change the filename to outputs/submission_xgb.csv to submit
# the XGBoost predictions instead if those scored better.
!kaggle competitions submit -c house-prices-advanced-regression-techniques -f outputs/submission.csv -m "colab run"
# View recent submissions (only the first 10 lines of the CLI listing are shown):
!kaggle competitions submissions -c house-prices-advanced-regression-techniques | head -n 10

## Feature importance (from RF)

In [None]:
import numpy as np, matplotlib.pyplot as plt
from pathlib import Path
Path("figures").mkdir(exist_ok=True)

# Pull the fitted forest and the fitted encoder back out of the trained RF pipeline.
model = pipe_rf.named_steps["model"]
# Bound to its own name so the unfitted `ohe` created in the setup cell is not overwritten.
fitted_ohe = pipe_rf.named_steps["prep"].named_transformers_["cat"].named_steps["ohe"]
# Output column order of the ColumnTransformer: numeric columns first, then the one-hot columns.
feat_names = np.array(list(num_cols) + list(fitted_ohe.get_feature_names_out(cat_cols)))
imps = model.feature_importances_
# Guard against mislabelled bars: the reconstructed names must line up one-to-one with the
# importances (they would not if, say, a preprocessing step dropped a column).
if len(feat_names) != len(imps):
    raise ValueError(
        f"Feature-name count ({len(feat_names)}) does not match importance count ({len(imps)})."
    )

# argsort is ascending, so the last 25 indices are the 25 largest importances.
top = imps.argsort()[-25:]
fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(feat_names[top], imps[top])
ax.set_title("Top Feature Importances (RF)")
ax.set_xlabel("Importance")
fig.tight_layout()
fig.savefig("figures/feature_importance.png", dpi=200)
print("Saved figures/feature_importance.png")

## Artifacts summary

In [None]:
import os


def list_artifacts(folder):
    """Return the sorted entry names in `folder`, or an empty list if it does not exist.

    Both artifact directories go through this guard, so the summary still prints when
    an earlier cell was skipped and one of the directories was never created.
    """
    return sorted(os.listdir(folder)) if os.path.isdir(folder) else []


print("Outputs:", list_artifacts("outputs"))
print("Figures:", list_artifacts("figures"))
print("Done.")