In [1]:
# --- Mount Google Drive (Colab) ---
from google.colab import drive
print("Mounting Google Drive...")
try:
    drive.mount('/content/drive')
    print("‚úÖ Google Drive mounted successfully.")
except Exception as e:
    print(f"‚ö†Ô∏è Could not mount Google Drive. Error: {e}")

%cd /content/drive/MyDrive/Colab\ Notebooks/

# -------------------------
# Imports
# -------------------------
import os, time
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error

# --- Import the shared data split function ---
from data_processing import load_and_split_data

# -------------------------
# Config
# -------------------------
config = {
    "CITY": "toronto",
    "SEED": 42,
    "VAL_SIZE": 0.05,
    "DRIVE_SAVE_PATH": "/content/drive/MyDrive/Colab_Notebooks/Airbnb_Project",
}

# =========================================================
# 1Ô∏è‚É£ Load the consistent split using load_and_split_data
# =========================================================
train_df, val_df, neighborhood_log_means, train_ids_set, val_ids_set = load_and_split_data(config)

# =========================================================
# 2Ô∏è‚É£ Build target as log deviation from neighborhood mean
# =========================================================
def attach_neigh_log_mean(df, mapping):
    gmean = float(np.mean(list(mapping.values())))
    out = df.copy()
    out["neigh_log_mean"] = out["neighbourhood_cleansed"].map(mapping).fillna(gmean)
    return out

train_df = attach_neigh_log_mean(train_df, neighborhood_log_means)
val_df = attach_neigh_log_mean(val_df, neighborhood_log_means)

train_df["target_dev"] = np.log1p(train_df["price"]) - train_df["neigh_log_mean"]
val_df["target_dev"]   = np.log1p(val_df["price"])   - val_df["neigh_log_mean"]

# =========================================================
# 3Ô∏è‚É£ Feature Engineering for RF
# =========================================================
TEXT_COL = "amenities"
CAT_COLS = ["property_type", "room_type", "neighbourhood_cleansed"]
NUM_COLS = [
    "accommodates", "review_scores_rating", "review_scores_cleanliness",
    "review_scores_checkin", "review_scores_communication",
    "review_scores_location", "review_scores_value",
    "bedrooms", "beds", "bathrooms"
]

def amenities_tokenizer(x):
    if pd.isna(x):
        return []
    return [a.strip().lower() for a in str(x).split(",")]

preprocessor = ColumnTransformer(
    transformers=[
        ("amenities", HashingVectorizer(
            tokenizer=amenities_tokenizer,
            n_features=256,
            alternate_sign=False,
            binary=True
        ), TEXT_COL),
        ("categorical", OneHotEncoder(handle_unknown="ignore", sparse_output=True), CAT_COLS),
        ("numeric", "passthrough", NUM_COLS),
    ]
)

X_train = preprocessor.fit_transform(train_df)
X_val   = preprocessor.transform(val_df)

y_train = train_df["target_dev"].values
y_val   = val_df["target_dev"].values

# =========================================================
# 4Ô∏è‚É£ Train RF with early stopping on Val MAPE
# =========================================================
print("\nüöÄ Training Random Forest with early stopping on validation MAPE...")

patience = 3
best_mape = float("inf")
patience_counter = 0
chunk_size = 20
max_estimators = 300
best_model = None

for n_trees in tqdm(range(chunk_size, max_estimators + 1, chunk_size), desc="Training Progress", colour="blue", ncols=90):
    rf_partial = RandomForestRegressor(
        n_estimators=n_trees,
        max_depth=20,
        min_samples_split=10,
        min_samples_leaf=5,
        max_features="sqrt",
        bootstrap=True,
        random_state=config["SEED"],
        n_jobs=-1,
        warm_start=True,
    )
    rf_partial.fit(X_train, y_train)

    # Evaluate on validation
    preds_val_dev = rf_partial.predict(X_val)
    preds_val_price = np.expm1(preds_val_dev + val_df["neigh_log_mean"].values)
    val_mape = mean_absolute_percentage_error(val_df["price"], preds_val_price) * 100
    tqdm.write(f"Trees: {n_trees:>3} | Val MAPE: {val_mape:.2f}%")

    if val_mape < best_mape:
        best_mape = val_mape
        best_model = rf_partial
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"‚èπÔ∏è Early stopping at {n_trees} trees (Best MAPE: {best_mape:.2f}%).")
            break

print(f"\n‚úÖ Best Validation MAPE: {best_mape:.2f}%")

# =========================================================
# 5Ô∏è‚É£ Build Full Dataset for Predictions
# =========================================================
full_df = pd.concat([train_df, val_df], axis=0, ignore_index=True)

def split_label(listing_id):
    if listing_id in train_ids_set: return "train"
    elif listing_id in val_ids_set: return "val"
    return "unknown"

full_df["split"] = full_df["id"].apply(split_label)
full_df = attach_neigh_log_mean(full_df, neighborhood_log_means)

X_full = preprocessor.transform(full_df)
preds_dev_full = best_model.predict(X_full)
full_df["predicted_price"] = np.expm1(preds_dev_full + full_df["neigh_log_mean"].values)

Mounting Google Drive...
Mounted at /content/drive
‚úÖ Google Drive mounted successfully.
/content/drive/MyDrive/Colab Notebooks
Data split: 82,065 train records, 4,327 validation records.





üöÄ Training Random Forest with early stopping on validation MAPE...


Training Progress:   0%|                                           | 0/15 [00:00<?, ?it/s]

Trees:  20 | Val MAPE: 32.20%
Trees:  40 | Val MAPE: 31.67%
Trees:  60 | Val MAPE: 31.57%
Trees:  80 | Val MAPE: 31.46%
Trees: 100 | Val MAPE: 31.23%
Trees: 120 | Val MAPE: 31.23%
Trees: 140 | Val MAPE: 31.19%
Trees: 160 | Val MAPE: 31.18%
Trees: 180 | Val MAPE: 31.08%
Trees: 200 | Val MAPE: 31.09%
Trees: 220 | Val MAPE: 31.13%
Trees: 240 | Val MAPE: 31.16%
‚èπÔ∏è Early stopping at 240 trees (Best MAPE: 31.08%).

‚úÖ Best Validation MAPE: 31.08%


In [3]:
# 6Ô∏è‚É£ Artifact 1: Save Predictions DataFrame
# =========================================================
required_cols = [
    "id", "name", "latitude", "longitude", "neighbourhood_cleansed",
    "month", "split", "price", "predicted_price"
]
predictions_df = full_df[required_cols].copy()

os.makedirs(config["DRIVE_SAVE_PATH"], exist_ok=True)
pred_path = os.path.join(config["DRIVE_SAVE_PATH"], f"{config['CITY']}_rf_model_predictions.parquet")
predictions_df.to_parquet(pred_path, index=False)
print(f"\nüì¶ Saved {config['CITY']}_rf_model_predictions.parquet to:\n{pred_path}")

# =========================================================
# 7Ô∏è‚É£ Artifact 2: Feature Importances
# =========================================================
cat_encoder = preprocessor.named_transformers_["categorical"]
cat_feature_names = list(cat_encoder.get_feature_names_out(CAT_COLS))
n_hash = preprocessor.named_transformers_["amenities"].n_features
amenity_feature_names = [f"amenity_{i}" for i in range(n_hash)]
num_feature_names = NUM_COLS
feature_names = amenity_feature_names + cat_feature_names + num_feature_names

importances = best_model.feature_importances_
importance_df = pd.DataFrame({"feature": feature_names, "importance": importances}).sort_values("importance", ascending=False)

imp_path = os.path.join(config["DRIVE_SAVE_PATH"], f"{config['CITY']}_rf_feature_importances.csv")
importance_df.to_csv(imp_path, index=False)
print(f"üìä Saved {config['CITY']}_rf_feature_importances.csv to:\n{imp_path}")

print("\n‚úÖ All done! Toronto Random Forest baseline successfully trained.")


üì¶ Saved toronto_rf_model_predictions.parquet to:
/content/drive/MyDrive/Colab_Notebooks/Airbnb_Project/toronto_rf_model_predictions.parquet
üìä Saved toronto_rf_feature_importances.csv to:
/content/drive/MyDrive/Colab_Notebooks/Airbnb_Project/toronto_rf_feature_importances.csv

‚úÖ All done! Toronto Random Forest baseline successfully trained.
