In [1]:
mlflow --version

mlflow, version 2.13.0


In [51]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error
import mlflow
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
import os
import pickle
import click

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ParameterGrid
import numpy as np
from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient

In [3]:
def dump_pickle(obj, filename: str):
    with open(filename, "wb") as f_out:
        return pickle.dump(obj, f_out)

In [5]:
def read_dataframe(filename: str):
    df = pd.read_parquet(filename)

    df['duration'] = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)
    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [6]:
def preprocess(df: pd.DataFrame, dv: DictVectorizer, fit_dv: bool = False):
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']
    categorical = ['PU_DO']
    numerical = ['trip_distance']
    dicts = df[categorical + numerical].to_dict(orient='records')
    if fit_dv:
        X = dv.fit_transform(dicts)
    else:
        X = dv.transform(dicts)
    return X, dv

In [13]:
def run_data_prep(raw_data_path: str, dest_path: str, dataset: str = "green"):
    # Load parquet files
    df_train = read_dataframe(
        os.path.join(raw_data_path, f"{dataset}_tripdata_2023-01.parquet")
    )
    df_val = read_dataframe(
        os.path.join(raw_data_path, f"{dataset}_tripdata_2023-02.parquet")
    )
    df_test = read_dataframe(
        os.path.join(raw_data_path, f"{dataset}_tripdata_2023-03.parquet")
    )

    # Extract the target
    target = 'duration'
    y_train = df_train[target].values
    y_val = df_val[target].values
    y_test = df_test[target].values

    # Fit the DictVectorizer and preprocess data
    dv = DictVectorizer()
    X_train, dv = preprocess(df_train, dv, fit_dv=True)
    X_val, _ = preprocess(df_val, dv, fit_dv=False)
    X_test, _ = preprocess(df_test, dv, fit_dv=False)

    # Create dest_path folder unless it already exists
    os.makedirs(dest_path, exist_ok=True)

    # Save DictVectorizer and datasets
    dump_pickle(dv, os.path.join(dest_path, "dv.pkl"))
    dump_pickle((X_train, y_train), os.path.join(dest_path, "train.pkl"))
    dump_pickle((X_val, y_val), os.path.join(dest_path, "val.pkl"))
    dump_pickle((X_test, y_test), os.path.join(dest_path, "test.pkl"))

In [17]:
run_data_prep('','/data2')

In [18]:
def load_pickle(filename: str):
    with open(filename, "rb") as f_in:
        return pickle.load(f_in)

In [30]:
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("nyc-taxi-experiment2023")

2024/05/30 02:07:37 INFO mlflow.tracking.fluent: Experiment with name 'nyc-taxi-experiment2023' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///c:/Users/ASUS/Documents/GitHub/mlops-zoomcamp/week2/homework/mlruns/1', creation_time=1717024057620, experiment_id='1', last_update_time=1717024057620, lifecycle_stage='active', name='nyc-taxi-experiment2023', tags={}>

In [41]:

def run_train(data_path: str):

   # Define the hyperparameter grid
    param_grid = {
        "max_depth": [5, 10],
        "n_estimators": [50, 100],
        "min_samples_split": [2, 5, 10],
        "random_state": [0]
    }

    # Create a parameter grid
    grid = ParameterGrid(param_grid)
    
    best_rmse = float("inf")
    best_params = None
    
    for params in grid:
        with mlflow.start_run():
            mlflow.set_tag("model", "RandomForestRegressor")
            mlflow.log_params(params)

            X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
            X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))

            rf = RandomForestRegressor(**params)
            rf.fit(X_train, y_train)
            y_pred = rf.predict(X_val)

            rmse = mean_squared_error(y_val, y_pred, squared=False)
            mlflow.log_metric("rmse", rmse)

            if rmse < best_rmse:
                best_rmse = rmse
                best_params = params

            # Optionally log the model artifact
            model_path = "mlruns/models/random.bin"
            with open(model_path, "wb") as f:
                pickle.dump(rf, f)
            mlflow.log_artifact(local_path=model_path, artifact_path="models_pickle")

    print(f"Best RMSE: {best_rmse} with parameters: {best_params}")
    return best_rmse, best_params

In [42]:
run_train('/data2')



Best RMSE: 5.429788415830813 with parameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 50, 'random_state': 0}




(5.429788415830813,
 {'max_depth': 10,
  'min_samples_split': 10,
  'n_estimators': 50,
  'random_state': 0})

In [81]:
def run_optimization(data_path: str, num_trials: int):

    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))

    def objective(params):

      with mlflow.start_run():
        mlflow.set_tag("model", "RandomForest")
        mlflow.log_params(params)
        rf = RandomForestRegressor(**params)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)
        rmse = mean_squared_error(y_val, y_pred, squared=False)
        mlflow.log_metric("rmse", rmse)

        return {'loss': rmse, 'status': STATUS_OK}

    search_space = {
        'max_depth': scope.int(hp.quniform('max_depth', 1, 20, 1)),
        'n_estimators': scope.int(hp.quniform('n_estimators', 10, 50, 1)),
        'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
        'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 4, 1)),
        'random_state': 42
    }

    rstate = np.random.default_rng(42)  # for reproducible results
    fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=num_trials,
        trials=Trials(),
        rstate=rstate
    )

In [82]:
run_optimization('/data2',2)

  0%|          | 0/2 [00:00<?, ?trial/s, best loss=?]




 50%|█████     | 1/2 [00:33<00:33, 33.34s/trial, best loss: 5.370086069268862]





100%|██████████| 2/2 [00:42<00:00, 21.03s/trial, best loss: 5.370086069268862]





In [67]:
# Define constants
HPO_EXPERIMENT_NAME = "random-forest-hyperopt"
EXPERIMENT_NAME = "random-forest-best-models"
RF_PARAMS = ['max_depth', 'n_estimators', 'min_samples_split', 'min_samples_leaf', 'random_state']
MODEL_NAME = "random-forest"

# Set tracking URI and experiment
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment(EXPERIMENT_NAME)
mlflow.set_experiment(HPO_EXPERIMENT_NAME)
mlflow.sklearn.autolog()



In [68]:
# Set tracking URI
mlflow.set_tracking_uri("http://127.0.0.1:5000")

# Ensure the experiments exist
client = MlflowClient()

def get_or_create_experiment(name):
    experiment = client.get_experiment_by_name(name)
    if experiment is None:
        print(f"Experiment '{name}' not found. Creating it.")
        experiment_id = mlflow.create_experiment(name)
        experiment = client.get_experiment(experiment_id)
    return experiment

hpo_experiment = get_or_create_experiment(HPO_EXPERIMENT_NAME)
main_experiment = get_or_create_experiment(EXPERIMENT_NAME)

# Enable autologging
mlflow.sklearn.autolog()



In [69]:
def load_pickle(filename):
    with open(filename, "rb") as f_in:
        return pickle.load(f_in)

In [70]:
def train_and_log_model(data_path: str, params: dict):
    with mlflow.start_run():
        mlflow.set_tag("model", "RandomForestRegressor")
        mlflow.log_params(params)

        X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
        X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))
        X_test, y_test = load_pickle(os.path.join(data_path, "test.pkl"))

        rf = RandomForestRegressor(**params)
        rf.fit(X_train, y_train)
        y_pred_val = rf.predict(X_val)
        y_pred_test = rf.predict(X_test)

        val_rmse = mean_squared_error(y_val, y_pred_val, squared=False)
        test_rmse = mean_squared_error(y_test, y_pred_test, squared=False)

        mlflow.log_metric("val_rmse", val_rmse)
        mlflow.log_metric("test_rmse", test_rmse)

        # Save model artifact
        model_path = "random_forest_model.pkl"
        with open(model_path, "wb") as f:
            pickle.dump(rf, f)
        mlflow.log_artifact(local_path=model_path, artifact_path="models_pickle")

In [None]:
train_and_log_model(data_path=data_path, params=run.data.params)

In [118]:
def run_register_model(data_path: str, top_n: int):
    client = MlflowClient()

    # Retrieve the top_n model runs from HPO experiment
    hpo_experiment = client.get_experiment_by_name(HPO_EXPERIMENT_NAME)
    if hpo_experiment is None:
        raise ValueError(f"Experiment '{HPO_EXPERIMENT_NAME}' not found.")
    
    runs = client.search_runs(
        experiment_ids=hpo_experiment.experiment_id,
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=top_n,
        order_by=["metrics.rmse ASC"]
    )

    # Train and log models from HPO experiment
    for run in runs:
        print(run)
        params = run.data.params
        print(params)
        # Convert parameter values from strings to appropriate types
        params = {key: eval(value) if value.isdigit() else value for key, value in params.items() if key in RF_PARAMS}
        train_and_log_model(data_path=data_path, params=params)

    # Select the model with the lowest test RMSE from the main experiment
    main_experiment = client.get_experiment_by_name(HPO_EXPERIMENT_NAME)
    if main_experiment is None:
        raise ValueError(f"Experiment '{HPO_EXPERIMENT_NAME}' not found.")

    all_runs = client.search_runs(
        experiment_ids=main_experiment.experiment_id,
        run_view_type=ViewType.ACTIVE_ONLY
    )

    print(all_runs)

    if not all_runs:
        raise ValueError(f"No runs found in experiment '{HPO_EXPERIMENT_NAME}'.")

    best_run = client.search_runs(
        experiment_ids=main_experiment.experiment_id,
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=1,
        order_by=["metrics.test_rmse ASC"]
    )[0]

    best_run_id = best_run.info.run_id

    # Register the best model
    model_uri = f"runs:/{best_run_id}/models_pickle/random_forest_model.pkl"

    print(model_uri)
    mlflow.register_model(model_uri=model_uri, name=MODEL_NAME)

In [119]:
data_path = "/data2"
run_register_model(data_path, top_n=5)

<Run: data=<RunData: metrics={'mean_squared_error_X_val': 5.370086069268862,
 'rmse': 5.370086069268862,
 'training_mean_absolute_error': 3.341844396061385,
 'training_mean_squared_error': 26.75293577873919,
 'training_r2_score': 0.6714525000717566,
 'training_root_mean_squared_error': 5.172324021050807,
 'training_score': 0.6714525000717566}, params={'bootstrap': 'True',
 'ccp_alpha': '0.0',
 'criterion': 'squared_error',
 'max_depth': '13',
 'max_features': '1.0',
 'max_leaf_nodes': 'None',
 'max_samples': 'None',
 'min_impurity_decrease': '0.0',
 'min_samples_leaf': '4',
 'min_samples_split': '7',
 'min_weight_fraction_leaf': '0.0',
 'monotonic_cst': 'None',
 'n_estimators': '43',
 'n_jobs': 'None',
 'oob_score': 'False',
 'random_state': '42',
 'verbose': '0',
 'warm_start': 'False'}, tags={'estimator_class': 'sklearn.ensemble._forest.RandomForestRegressor',
 'estimator_name': 'RandomForestRegressor',
 'mlflow.log-model.history': '[{"run_id": "046a7431175442d88ad5693cbf17ff5b", '
 



<Run: data=<RunData: metrics={'mean_squared_error_X_val': 6.336044899923993,
 'rmse': 6.336044899923993,
 'training_mean_absolute_error': 4.219143142306118,
 'training_mean_squared_error': 38.96874159510181,
 'training_r2_score': 0.5214326108989062,
 'training_root_mean_squared_error': 6.242494821391669,
 'training_score': 0.5214326108989062}, params={'bootstrap': 'True',
 'ccp_alpha': '0.0',
 'criterion': 'squared_error',
 'max_depth': '2',
 'max_features': '1.0',
 'max_leaf_nodes': 'None',
 'max_samples': 'None',
 'min_impurity_decrease': '0.0',
 'min_samples_leaf': '4',
 'min_samples_split': '7',
 'min_weight_fraction_leaf': '0.0',
 'monotonic_cst': 'None',
 'n_estimators': '22',
 'n_jobs': 'None',
 'oob_score': 'False',
 'random_state': '42',
 'verbose': '0',
 'warm_start': 'False'}, tags={'estimator_class': 'sklearn.ensemble._forest.RandomForestRegressor',
 'estimator_name': 'RandomForestRegressor',
 'mlflow.log-model.history': '[{"run_id": "abe2e1dbcbf94f79ba3f4334b0f707bd", '
  



<Run: data=<RunData: metrics={'mean_squared_error-2_X_test': 5.599280662415257,
 'mean_squared_error_X_val': 5.370086069268862,
 'test_rmse': 5.599280662415257,
 'training_mean_absolute_error': 3.341844396061385,
 'training_mean_squared_error': 26.75293577873919,
 'training_r2_score': 0.6714525000717566,
 'training_root_mean_squared_error': 5.172324021050807,
 'training_score': 0.6714525000717566,
 'val_rmse': 5.370086069268862}, params={'bootstrap': 'True',
 'ccp_alpha': '0.0',
 'criterion': 'squared_error',
 'max_depth': '13',
 'max_features': '1.0',
 'max_leaf_nodes': 'None',
 'max_samples': 'None',
 'min_impurity_decrease': '0.0',
 'min_samples_leaf': '4',
 'min_samples_split': '7',
 'min_weight_fraction_leaf': '0.0',
 'monotonic_cst': 'None',
 'n_estimators': '43',
 'n_jobs': 'None',
 'oob_score': 'False',
 'random_state': '42',
 'verbose': '0',
 'warm_start': 'False'}, tags={'estimator_class': 'sklearn.ensemble._forest.RandomForestRegressor',
 'estimator_name': 'RandomForestRegre



<Run: data=<RunData: metrics={'mean_squared_error-2_X_test': 6.606223294805459,
 'mean_squared_error_X_val': 6.336044899923993,
 'test_rmse': 6.606223294805459,
 'training_mean_absolute_error': 4.219143142306118,
 'training_mean_squared_error': 38.96874159510181,
 'training_r2_score': 0.5214326108989062,
 'training_root_mean_squared_error': 6.242494821391669,
 'training_score': 0.5214326108989062,
 'val_rmse': 6.336044899923993}, params={'bootstrap': 'True',
 'ccp_alpha': '0.0',
 'criterion': 'squared_error',
 'max_depth': '2',
 'max_features': '1.0',
 'max_leaf_nodes': 'None',
 'max_samples': 'None',
 'min_impurity_decrease': '0.0',
 'min_samples_leaf': '4',
 'min_samples_split': '7',
 'min_weight_fraction_leaf': '0.0',
 'monotonic_cst': 'None',
 'n_estimators': '22',
 'n_jobs': 'None',
 'oob_score': 'False',
 'random_state': '42',
 'verbose': '0',
 'warm_start': 'False'}, tags={'estimator_class': 'sklearn.ensemble._forest.RandomForestRegressor',
 'estimator_name': 'RandomForestRegres



<Run: data=<RunData: metrics={'mean_squared_error-2_X_test': 5.599280662415257,
 'mean_squared_error_X_val': 5.370086069268862,
 'test_rmse': 5.599280662415257,
 'training_mean_absolute_error': 3.341844396061385,
 'training_mean_squared_error': 26.75293577873919,
 'training_r2_score': 0.6714525000717566,
 'training_root_mean_squared_error': 5.172324021050807,
 'training_score': 0.6714525000717566,
 'val_rmse': 5.370086069268862}, params={'bootstrap': 'True',
 'ccp_alpha': '0.0',
 'criterion': 'squared_error',
 'max_depth': '13',
 'max_features': '1.0',
 'max_leaf_nodes': 'None',
 'max_samples': 'None',
 'min_impurity_decrease': '0.0',
 'min_samples_leaf': '4',
 'min_samples_split': '7',
 'min_weight_fraction_leaf': '0.0',
 'monotonic_cst': 'None',
 'n_estimators': '43',
 'n_jobs': 'None',
 'oob_score': 'False',
 'random_state': '42',
 'verbose': '0',
 'warm_start': 'False'}, tags={'estimator_class': 'sklearn.ensemble._forest.RandomForestRegressor',
 'estimator_name': 'RandomForestRegre

Registered model 'random-forest' already exists. Creating a new version of this model...
2024/05/30 23:00:29 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: random-forest, version 2


[<Run: data=<RunData: metrics={'mean_squared_error-2_X_test': 5.599280662415257,
 'mean_squared_error_X_val': 5.370086069268862,
 'test_rmse': 5.599280662415257,
 'training_mean_absolute_error': 3.341844396061385,
 'training_mean_squared_error': 26.75293577873919,
 'training_r2_score': 0.6714525000717566,
 'training_root_mean_squared_error': 5.172324021050807,
 'training_score': 0.6714525000717566,
 'val_rmse': 5.370086069268862}, params={'bootstrap': 'True',
 'ccp_alpha': '0.0',
 'criterion': 'squared_error',
 'max_depth': '13',
 'max_features': '1.0',
 'max_leaf_nodes': 'None',
 'max_samples': 'None',
 'min_impurity_decrease': '0.0',
 'min_samples_leaf': '4',
 'min_samples_split': '7',
 'min_weight_fraction_leaf': '0.0',
 'monotonic_cst': 'None',
 'n_estimators': '43',
 'n_jobs': 'None',
 'oob_score': 'False',
 'random_state': '42',
 'verbose': '0',
 'warm_start': 'False'}, tags={'estimator_class': 'sklearn.ensemble._forest.RandomForestRegressor',
 'estimator_name': 'RandomForestRegr

Created version '2' of model 'random-forest'.
