In [None]:
# === Import required libraries ===
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
import lightgbm as lgb

# === Function to try multiple file paths and load the CSV ===
def try_read(paths):
    """Load the first CSV that exists among *paths*.

    The candidates are checked in the order given. The first one found on
    disk is announced on stdout and parsed with ``pd.read_csv``.

    Args:
        paths: Iterable of candidate file paths.

    Returns:
        The parsed DataFrame, or ``None`` when no candidate exists.
    """
    existing = next(
        (candidate for candidate in paths if os.path.exists(candidate)),
        None,
    )
    if existing is None:
        return None
    print("Loading:", existing)
    return pd.read_csv(existing)

# === Candidate locations for train.csv and test.csv ===
# Each list is probed front to back by try_read; the first existing file wins.
train_paths = [
    "C:/Users/alanm/Downloads/Predicting-Airbnb-List-Prices/train-data/train.csv",  # machine-specific absolute path
    "./train.csv",
    "./input/train.csv",
    "./data/train.csv",
]

test_paths = [
    "C:/Users/alanm/Downloads/Predicting-Airbnb-List-Prices/test-data/test.csv",  # machine-specific absolute path
    "./test.csv",
    "./input/test.csv",
    "./data/test.csv",
]

# === Read both files ===
# Both reads happen before the check below, so a "Loading:" line for the test
# file can still be printed even when the training file is missing.
train = try_read(train_paths)
test = try_read(test_paths)  # stays None when no test file exists (optional)

# Training data is mandatory: abort right away when no candidate path matched.
if train is None:
    raise FileNotFoundError("train.csv not found; check your train_paths list.")

# === Make sure a raw 'price' column exists ===
# The model is fit on price itself, so a log-scale column is exponentiated.
# When 'log_price' is present it always (re)defines 'price'.
if "log_price" in train.columns:
    train["price"] = np.exp(train["log_price"])
elif "price" not in train.columns:
    # Neither form of the target is available, so nothing can be trained.
    raise ValueError("No 'price' or 'log_price' column found in train data.")

# === Name of the target column ===
TARGET = "price"

# Row identifiers, kept aside so they can label the submission later.
# Either one is None when the corresponding 'id' column (or file) is absent.
train_ids = train["id"] if "id" in train.columns else None
test_ids = None
if test is not None and "id" in test.columns:
    test_ids = test["id"]

# === Feature matrix X and target vector y ===
# errors="ignore" lets the same drop list work whether or not 'log_price'
# and 'id' are actually present in the frame.
X = train.drop(columns=[TARGET, "log_price", "id"], errors="ignore")
y = train[TARGET].values  # target as a numpy array

# Test features: same treatment, minus the target columns it does not have.
X_test = None if test is None else test.drop(columns=["id"], errors="ignore")

# === Combine train and test features for consistent categorical encoding ===
# Encoding the stacked frame guarantees that a given category value receives
# the same integer code in both train and test.
n_train = len(X)  # remembered so the stacked frame can be split back apart
if X_test is not None:
    combined = pd.concat([X, X_test], axis=0, ignore_index=True)
else:
    combined = X.copy().reset_index(drop=True)

# === Encode text-like / categorical columns as integers ===
# Besides plain "object" and "category" columns, this also catches columns
# that use pandas' dedicated string dtype (e.g. with the
# `future.infer_string` option enabled). Such a dtype does not compare equal
# to "object", so a bare `dtype == "object"` check would leave those columns
# as text.
for col in combined.columns:
    col_dtype = combined[col].dtype
    needs_encoding = (
        pd.api.types.is_object_dtype(col_dtype)
        or pd.api.types.is_string_dtype(col_dtype)
        or col_dtype.name == "category"
    )
    if needs_encoding:
        # Values are stringified first, then mapped to integer codes in
        # order of first appearance.
        combined[col] = pd.factorize(combined[col].astype(str))[0]

# === Split the stacked frame back into train and test parts ===
X = combined.iloc[:n_train].reset_index(drop=True)
if X_test is not None:
    X_test = combined.iloc[n_train:].reset_index(drop=True)

# === Fill remaining missing values with a placeholder (-999) ===
X = X.fillna(-999)
if X_test is not None:
    X_test = X_test.fillna(-999)

# === Set up K-Fold cross-validation ===
NFOLDS = 5  # number of folds
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=42)

# Hyperparameters shared by every fold model and by the final model. Keeping
# them in one place prevents the two configurations from drifting apart.
LGB_PARAMS = {
    "learning_rate": 0.05,     # step size shrinkage
    "num_leaves": 31,          # max number of leaves in one tree
    "subsample": 0.8,          # fraction of rows sampled for bagging
    # LightGBM only performs bagging when the bagging frequency is > 0; with
    # the default of 0 the `subsample` value above is silently ignored.
    "subsample_freq": 1,
    "colsample_bytree": 0.8,   # fraction of features sampled per tree
    "random_state": 42,
    "n_jobs": -1,              # use all CPU cores
}

# Out-of-fold predictions for the training rows, and the fold-averaged test
# predictions (None when no test set was loaded).
oof = np.zeros(len(X))
preds = np.zeros(len(X_test)) if X_test is not None else None

best_its = []  # best iteration found by early stopping in each fold

# === Training with LightGBM using K-Fold cross-validation ===
for fold, (tr_idx, val_idx) in enumerate(kf.split(X, y), 1):
    # Split data into training and validation sets for this fold
    X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
    y_tr, y_val = y[tr_idx], y[val_idx]

    # n_estimators is only an upper bound here; early stopping decides the
    # actual number of boosting rounds.
    model = LGBMRegressor(n_estimators=10000, **LGB_PARAMS)

    # Train with early stopping on the held-out fold to limit overfitting
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        eval_metric="rmse",  # Root Mean Squared Error as metric
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),  # stop after 100 rounds without improvement
            lgb.log_evaluation(period=0),  # no periodic logging during training
        ],
    )

    # Fall back to the full round count if early stopping never triggered
    # and no best iteration was recorded.
    best_it = model.best_iteration_ or model.n_estimators
    best_its.append(best_it)

    # Each training row is predicted exactly once, by the model that did not
    # see it during fitting.
    oof[val_idx] = model.predict(X_val, num_iteration=best_it)

    # Accumulate this fold's share of the averaged test prediction
    if X_test is not None:
        preds += model.predict(X_test, num_iteration=best_it) / NFOLDS

# === Calculate and print cross-validation RMSE ===
cv_rmse = np.sqrt(mean_squared_error(y, oof))
print(f"CV RMSE (price): {cv_rmse:.6f}")

# === Train final model on entire training data using average best iteration ===
final_best_it = int(np.mean(best_its))
print(f"Average best iteration across folds: {final_best_it}")

# NOTE: final_model is fitted here but is not used for the submission below,
# which is built from the fold-averaged `preds`.
final_model = LGBMRegressor(n_estimators=final_best_it, **LGB_PARAMS)
final_model.fit(X, y)

# === Save submission file if test data is available ===
if X_test is not None and test_ids is not None:
    submission = pd.DataFrame({
        "id": test_ids,
        "price": preds,  # predictions of actual price (fold average)
    })
    submission.to_csv("submission.csv", index=False)
    print("Wrote submission.csv with price predictions.")
else:
    print("No test set found; skipping submission.")

FileNotFoundError: train.csv not found; check your train_paths list.