In [None]:
# === Import required libraries ===
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
import lightgbm as lgb

# === Function to try multiple file paths ===
def try_read(paths):
    """Load the first existing CSV among ``paths``.

    Candidates are checked in the order given using ``os.path.exists``. The
    first one that exists is announced on stdout and parsed with
    ``pd.read_csv``; later candidates are not inspected.

    Args:
        paths: Iterable of candidate file paths.

    Returns:
        The parsed DataFrame, or ``None`` when no candidate exists.
    """
    first_hit = next(
        (candidate for candidate in paths if os.path.exists(candidate)),
        None,
    )
    if first_hit is None:
        return None
    print("Loading:", first_hit)
    return pd.read_csv(first_hit)

# === Candidate locations for train.csv and test.csv ===
# Each list is probed in order: the machine-specific Kaggle input folder first,
# then a few project-relative folders shared by both files.
_RELATIVE_DIRS = (".", "./input", "./data")

train_paths = [
    "C:/Users/alanm/.kaggle/input/train-data/train.csv",
    *(f"{folder}/train.csv" for folder in _RELATIVE_DIRS),
]

test_paths = [
    "C:/Users/alanm/.kaggle/input/test-data/test.csv",
    *(f"{folder}/test.csv" for folder in _RELATIVE_DIRS),
]

# === Load the data ===
# Training data is mandatory; the test set is optional and only needed for
# producing predictions. Both lookups run (train first) before the check.
train, test = try_read(train_paths), try_read(test_paths)

if train is None:
    raise FileNotFoundError("train.csv not found; check your train_paths list.")

# === Target variable ===
TARGET = "log_price"

# Keep the id columns aside. DataFrame.get returns None when a frame has no
# "id" column, so no explicit membership test is needed.
train_ids = train.get("id")
test_ids = None if test is None else test.get("id")

# === Feature preprocessing ===
# errors="ignore" turns dropping "id" into a no-op for frames that lack it.
X = train.drop(columns=[TARGET, "id"], errors="ignore")
y = train[TARGET].values

X_test = None if test is None else test.drop(columns=["id"], errors="ignore")

# === Stack train and test features so both receive the same encoding ===
if X_test is None:
    # No test set: work on an independent copy of the training features.
    combined = X.copy().reset_index(drop=True)
else:
    # Training rows come first; the fresh 0..n-1 index lets them be sliced
    # back out by position later.
    combined = pd.concat([X, X_test], axis=0, ignore_index=True)

# === Encode categorical variables ===
# Integer-encode every non-numeric column. Testing for "not numeric" rather
# than comparing against "object"/"category" also catches pandas' dedicated
# string dtypes (the default for text columns in pandas 3), which would
# otherwise reach LightGBM un-encoded. Bool columns count as numeric and are
# left untouched.
for col in combined.columns:
    if not pd.api.types.is_numeric_dtype(combined[col]):
        # Casting to str gives factorize uniform keys; codes are assigned in
        # order of first appearance across the stacked train + test rows.
        combined[col] = pd.factorize(combined[col].astype(str))[0]

# === Split back into train / test and fill missing values ===
# The first len(X) rows of `combined` are the training rows; everything after
# them belongs to the test set. -999 is the missing-value sentinel for both.
n_train_rows = len(X)
X = combined.iloc[:n_train_rows].reset_index(drop=True).fillna(-999)
if X_test is not None:
    X_test = combined.iloc[n_train_rows:].reset_index(drop=True).fillna(-999)

# === K-Fold cross-validation ===
NFOLDS = 5
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=42)
oof = np.zeros(len(X))  # out-of-fold predictions, one slot per training row
preds = np.zeros(len(X_test)) if X_test is not None else None  # fold-averaged test predictions
best_its = []  # iteration count kept for each fold

# Hyper-parameters shared by the fold models and the final refit, so the two
# cannot drift apart.
LGBM_PARAMS = dict(
    learning_rate=0.05,
    num_leaves=31,
    subsample=0.8,
    # LightGBM only performs row subsampling when subsample_freq > 0; with the
    # default of 0 the subsample=0.8 setting above is silently ignored.
    subsample_freq=1,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
)

# === Training with LightGBM ===
for fold, (tr_idx, val_idx) in enumerate(kf.split(X, y), 1):
    X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
    y_tr, y_val = y[tr_idx], y[val_idx]

    # n_estimators is only an upper bound here; early stopping on the held-out
    # fold decides how many trees are actually used.
    model = LGBMRegressor(n_estimators=10000, **LGBM_PARAMS)
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        eval_metric="rmse",
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=0)  # period=0: no periodic evaluation logging
        ]
    )

    # Fall back to the full tree budget when no best iteration was recorded.
    best_it = model.best_iteration_ or model.n_estimators
    best_its.append(best_it)

    oof[val_idx] = model.predict(X_val, num_iteration=best_it)
    if X_test is not None:
        # Each fold contributes 1/NFOLDS of the final test prediction.
        preds += model.predict(X_test, num_iteration=best_it) / NFOLDS

# === Cross-validation result ===
cv_rmse_log = np.sqrt(mean_squared_error(y, oof))
print(f"CV RMSE (log_price): {cv_rmse_log:.6f}")

# === Final model trained on all data ===
# No validation split remains for early stopping, so the tree count is fixed
# to the (truncated) mean of the per-fold best iterations.
final_best_it = int(np.mean(best_its))
print(f"Average best iteration across folds: {final_best_it}")

# NOTE: `preds` (the fold average) and `final_model` are independent; nothing
# in this block predicts with `final_model`.
final_model = LGBMRegressor(n_estimators=final_best_it, **LGBM_PARAMS)
final_model.fit(X, y)

# === Write the submission file when test predictions and ids are available ===
if X_test is None or test_ids is None:
    print("No test set found; skipping submission.")
else:
    # Predictions are written on the log scale, under the target's column name.
    submission = pd.DataFrame({"id": test_ids, "log_price": preds})
    submission.to_csv("submission.csv", index=False)
    print("Wrote submission.csv with log_price predictions.")