In [37]:
# Reload the autoreload extension and use mode 2, so edited modules are
# re-imported automatically before each cell is executed.
%reload_ext autoreload
%autoreload 2

## Setup environment

In [38]:
import sys

# Make the repository root importable BEFORE importing project modules;
# otherwise the `mlops` import below is attempted on a fresh kernel before
# the path has been extended.
sys.path.append("..")

import mlflow
import pandas as pd
from dotenv import load_dotenv

from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import train_test_split

from mlops.operators.preprocessor import DataPreprocessor

# Show floats with three decimals in DataFrame output.
pd.set_option("display.precision", 3)

# Load environment variables from the project config file; the later cells
# read MLFLOW_TRACKING_URL via os.getenv.
load_dotenv("../config.env")

True

## Load dataset

In [39]:
# Read the raw listings CSV and run it through the project preprocessor.
df = pd.read_csv("../assets/AB_NYC_2019.csv")

data_preprocessor = DataPreprocessor(df)
data_preprocessor.preprocess_airbnb_data()
processed_data = data_preprocessor.get_preprocessed_data()

# Rescale the target by dividing it by its largest observed value.
price_max = processed_data["price"].max()
processed_data["price"] /= price_max

processed_data.head()

Unnamed: 0,latitude,longitude,price,number_of_reviews,availability_365,hosts_multiple_apts,neighbourhood_group_Bronx,neighbourhood_group_Brooklyn,neighbourhood_group_Manhattan,neighbourhood_group_Queens,...,neighbourhood_small districts in Queens,neighbourhood_small districts in Staten Island,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,review_recency_Last month,review_recency_Last quarter,review_recency_Last year,review_recency_No reviews,review_recency_Over a year ago
0,0.357,0.512,0.015,0.014,1,1,0,1,0,0,...,0,0,0,1,0,0,0,1,0,0
1,0.614,0.49,0.022,0.072,1,1,0,0,1,0,...,0,0,1,0,0,0,1,0,0,0
2,0.748,0.569,0.015,0.0,1,0,0,0,1,0,...,0,0,0,1,0,0,0,0,1,0
3,0.448,0.536,0.009,0.429,1,0,0,1,0,0,...,0,0,1,0,0,1,0,0,0,0
4,0.723,0.565,0.008,0.014,0,0,0,0,1,0,...,0,0,1,0,0,0,0,1,0,0


## Train-test split

In [40]:
# Split configuration; both values are reused by the experiment cells below
# (logged as run parameters and used to seed the models and searches).
test_size = 0.3
random_state = 0

split_options = {"test_size": test_size, "random_state": random_state}
data_train, data_test = train_test_split(processed_data, **split_options)

## Drop outliers

In [41]:
# Drop outliers to improve model performance.
# The cut-off is the upper fence Q3 + 1.5 * IQR, computed on the training
# prices only and then applied to both the training and the test split.
q1 = data_train["price"].quantile(0.25)
q3 = data_train["price"].quantile(0.75)
iqr = q3 - q1
statistical_max = q3 + 1.5 * iqr

data_train = data_train[data_train["price"] < statistical_max]
data_test = data_test[data_test["price"] < statistical_max]

# Separate the target (1-D array) from the feature columns.
X_train = data_train.drop(columns=["price"])
y_train = data_train[["price"]].to_numpy().ravel()

X_test = data_test.drop(columns=["price"])
y_test = data_test[["price"]].to_numpy().ravel()

## Base: Bagging Regressor

In [42]:
# Base experiment: Bagging Regressor
#
# Runs a randomized hyperparameter search over a BaggingRegressor, evaluates
# the best estimator on the train and test splits, and logs parameters and
# metrics to the MLflow experiment "baseline".

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint
from datetime import datetime
import os

# The tracking server address comes from the environment (see config.env).
mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URL"))
mlflow.set_experiment("baseline")

# scipy's uniform(loc, scale) samples from [loc, loc + scale], so the two
# fractions below are drawn from [0.1, 1.0].
param_distributions = {
    "n_estimators": randint(10, 100),
    "max_samples": uniform(0.1, 0.9),
    "max_features": uniform(0.1, 0.9),
    "bootstrap": [True, False],
    "bootstrap_features": [True, False],
}

base_model = BaggingRegressor(random_state=random_state)

random_search = RandomizedSearchCV(
    base_model,
    param_distributions=param_distributions,
    n_iter=10,
    cv=5,
    random_state=random_state,
    scoring="r2",
    n_jobs=-1,
)

print("Starting base experiment...")
with mlflow.start_run(run_name=f"baseline_{datetime.now().strftime('%Y%m%d_%H%M%S')}"):
    t1 = datetime.now()
    random_search.fit(X_train, y_train)
    t2 = datetime.now()

    best_model = random_search.best_estimator_
    y_pred = best_model.predict(X_test)
    y_train_pred = best_model.predict(X_train)

    train_r2_score = best_model.score(X_train, y_train)
    test_r2_score = best_model.score(X_test, y_test)

    train_mse = mean_squared_error(y_train, y_train_pred)
    test_mse = mean_squared_error(y_test, y_pred)

    mlflow.log_params(
        {
            "model": "Bagging Regressor",
            "remove_outliers": True,
            "test_size": test_size,
            "random_state": random_state,
            **random_search.best_params_,
        }
    )

    mlflow.log_metrics(
        {
            "train_mse": train_mse,
            "test_mse": test_mse,
            "train_r2_score": train_r2_score,
            "test_r2_score": test_r2_score,
            # total_seconds() covers the whole duration; timedelta.microseconds
            # is only the sub-second component and drops whole seconds.
            "fitting_time_ms": (t2 - t1).total_seconds() * 1000,
            "best_cv_score": random_search.best_score_,
        }
    )

Starting base experiment...


2025/04/03 16:30:32 INFO mlflow.sklearn.utils: Logging the 5 best runs, 5 runs will be omitted.


🏃 View run baseline_20250403_162953 at: http://localhost:5005/#/experiments/1/runs/f608078b14b54459bd01ea0ef347aa53
🧪 View experiment at: http://localhost:5005/#/experiments/1


In [43]:
# Experiment 1: XGBoost with hyperparameter tuning
#
# Runs a randomized search over an XGBRegressor, evaluates the best estimator
# on the train and test splits, and logs parameters and metrics to the MLflow
# experiment "xgboost_tuned".
# Relies on `datetime` and `mean_squared_error` imported in the baseline cell.
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

print("Starting XGBoost experiment...")
mlflow.set_experiment("xgboost_tuned")

# scipy's uniform(loc, scale) samples from [loc, loc + scale]:
# learning_rate in [0.01, 0.31], subsample / colsample_bytree in [0.6, 1.0].
param_distributions = {
    "n_estimators": randint(100, 500),
    "max_depth": randint(3, 10),
    "learning_rate": uniform(0.01, 0.3),
    "subsample": uniform(0.6, 0.4),
    "colsample_bytree": uniform(0.6, 0.4),
    "min_child_weight": randint(1, 7),
}

model = xgb.XGBRegressor(objective="reg:squarederror", random_state=random_state)

random_search = RandomizedSearchCV(
    model,
    param_distributions=param_distributions,
    n_iter=10,
    cv=5,
    random_state=random_state,
    scoring="r2",
    n_jobs=-1,
)

with mlflow.start_run(run_name=f"xgboost_{datetime.now().strftime('%Y%m%d_%H%M%S')}"):
    # Enable autologging
    mlflow.xgboost.autolog()

    t1 = datetime.now()
    random_search.fit(X_train, y_train)
    t2 = datetime.now()

    best_model = random_search.best_estimator_
    y_pred = best_model.predict(X_test)
    y_train_pred = best_model.predict(X_train)

    train_r2_score = best_model.score(X_train, y_train)
    test_r2_score = best_model.score(X_test, y_test)

    train_mse = mean_squared_error(y_train, y_train_pred)
    test_mse = mean_squared_error(y_test, y_pred)

    mlflow.log_metrics(
        {
            "train_mse": train_mse,
            "test_mse": test_mse,
            "train_r2_score": train_r2_score,
            "test_r2_score": test_r2_score,
            # total_seconds() covers the whole duration; timedelta.microseconds
            # is only the sub-second component and drops whole seconds.
            "fitting_time_ms": (t2 - t1).total_seconds() * 1000,
            "best_cv_score": random_search.best_score_,
        }
    )

    mlflow.log_params(
        {
            "model": "XGBoost Regressor",
            "remove_outliers": True,
            "test_size": test_size,
            "random_state": random_state,
            **random_search.best_params_,
        }
    )

print("XGBoost experiment completed!")



Starting XGBoost experiment...


2025/04/03 16:30:43 INFO mlflow.sklearn.utils: Logging the 5 best runs, 5 runs will be omitted.


🏃 View run xgboost_20250403_163034 at: http://localhost:5005/#/experiments/2/runs/f087cb3ac02242618197692d5d6a3c98
🧪 View experiment at: http://localhost:5005/#/experiments/2
XGBoost experiment completed!


In [None]:
# Experiment 2: Random Forest with hyperparameter tuning
#
# Runs a randomized search over a RandomForestRegressor, evaluates the best
# estimator on the train and test splits, and logs parameters and metrics to
# the MLflow experiment "random_forest_tuned".
# Relies on `datetime` and `mean_squared_error` imported in the baseline cell.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

print("Starting Random Forest experiment...")
mlflow.set_experiment("random_forest_tuned")

param_distributions = {
    "n_estimators": randint(100, 500),
    "max_depth": randint(3, 20),
    "min_samples_split": randint(2, 10),
    "min_samples_leaf": randint(1, 5),
    "max_features": ["sqrt", "log2", None],
    "bootstrap": [True, False],
}

model = RandomForestRegressor(random_state=random_state)

random_search = RandomizedSearchCV(
    model,
    param_distributions=param_distributions,
    n_iter=10,
    cv=5,
    random_state=random_state,
    scoring="r2",
    n_jobs=-1,
)

with mlflow.start_run(
    run_name=f"random_forest_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
):
    # Enable autologging
    mlflow.sklearn.autolog()

    t1 = datetime.now()
    random_search.fit(X_train, y_train)
    t2 = datetime.now()

    best_model = random_search.best_estimator_
    y_pred = best_model.predict(X_test)
    y_train_pred = best_model.predict(X_train)

    train_r2_score = best_model.score(X_train, y_train)
    test_r2_score = best_model.score(X_test, y_test)

    train_mse = mean_squared_error(y_train, y_train_pred)
    test_mse = mean_squared_error(y_test, y_pred)

    mlflow.log_metrics(
        {
            "train_mse": train_mse,
            "test_mse": test_mse,
            "train_r2_score": train_r2_score,
            "test_r2_score": test_r2_score,
            # total_seconds() covers the whole duration; timedelta.microseconds
            # is only the sub-second component and drops whole seconds.
            "fitting_time_ms": (t2 - t1).total_seconds() * 1000,
            "best_cv_score": random_search.best_score_,
        }
    )

    mlflow.log_params(
        {
            "model": "Random Forest Regressor",
            "remove_outliers": True,
            "test_size": test_size,
            "random_state": random_state,
            **random_search.best_params_,
        }
    )

print("Random Forest experiment completed!")

Starting Random Forest experiment...


2025/04/03 16:32:04 INFO mlflow.sklearn.utils: Logging the 5 best runs, 5 runs will be omitted.


🏃 View run random_forest_20250403_163044 at: http://localhost:5005/#/experiments/3/runs/9a188c6db7ca4ba2980bbae6da161a9e
🧪 View experiment at: http://localhost:5005/#/experiments/3
Random Forest experiment completed!


In [None]:
# Experiment 3: LightGBM with hyperparameter tuning
#
# Runs a randomized search over an LGBMRegressor, evaluates the best estimator
# on the train and test splits, and logs parameters and metrics to the MLflow
# experiment "lightgbm_tuned".
# Relies on `datetime` and `mean_squared_error` imported in the baseline cell.
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

print("Starting LightGBM experiment...")
mlflow.set_experiment("lightgbm_tuned")

# Define parameter space for random search.
# scipy's uniform(loc, scale) samples from [loc, loc + scale].
param_distributions = {
    "n_estimators": randint(100, 500),
    "learning_rate": uniform(0.01, 0.3),
    "num_leaves": randint(20, 100),
    "max_depth": randint(3, 10),
    "min_child_samples": randint(10, 50),
    "subsample": uniform(0.6, 0.4),
    "colsample_bytree": uniform(0.6, 0.4),
    "reg_alpha": uniform(0, 1),  # L1 regularization
    "reg_lambda": uniform(0, 1),  # L2 regularization
}

# Create base model to tune
model = lgb.LGBMRegressor(
    objective="regression",
    random_state=random_state,
    # LightGBM only applies `subsample` (bagging_fraction) when the bagging
    # frequency is non-zero; with the default of 0 the sampled `subsample`
    # values would have no effect on training.
    subsample_freq=1,
    verbose=-1,  # Suppress training output
)

# Random search with cross-validation
random_search = RandomizedSearchCV(
    model,
    param_distributions=param_distributions,
    n_iter=10,
    cv=5,
    random_state=random_state,
    scoring="r2",
    n_jobs=-1,
)

with mlflow.start_run(run_name=f"lightgbm_{datetime.now().strftime('%Y%m%d_%H%M%S')}"):
    # Enable autologging
    mlflow.lightgbm.autolog()

    t1 = datetime.now()
    random_search.fit(X_train, y_train)
    t2 = datetime.now()

    best_model = random_search.best_estimator_
    y_pred = best_model.predict(X_test)
    y_train_pred = best_model.predict(X_train)

    train_r2_score = best_model.score(X_train, y_train)
    test_r2_score = best_model.score(X_test, y_test)

    train_mse = mean_squared_error(y_train, y_train_pred)
    test_mse = mean_squared_error(y_test, y_pred)

    mlflow.log_metrics(
        {
            "train_mse": train_mse,
            "test_mse": test_mse,
            "train_r2_score": train_r2_score,
            "test_r2_score": test_r2_score,
            # total_seconds() covers the whole duration; timedelta.microseconds
            # is only the sub-second component and drops whole seconds.
            "fitting_time_ms": (t2 - t1).total_seconds() * 1000,
            "best_cv_score": random_search.best_score_,
        }
    )

    mlflow.log_params(
        {
            "model": "LightGBM Regressor",
            "remove_outliers": True,
            "test_size": test_size,
            "random_state": random_state,
            **random_search.best_params_,
        }
    )

print("LightGBM experiment completed!")

Starting LightGBM experiment...


2025/04/03 16:32:30 INFO mlflow.sklearn.utils: Logging the 5 best runs, 5 runs will be omitted.


🏃 View run lightgbm_20250403_163206 at: http://localhost:5005/#/experiments/4/runs/4533777704004dd49ccf7a786a9ddfe3
🧪 View experiment at: http://localhost:5005/#/experiments/4
LightGBM experiment completed!
