In [1]:
from mlflow.tracking import MlflowClient

MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"

In [2]:
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

client.search_experiments()

[<Experiment: artifact_location=('file:///c:/Users/CPE '
  'KMUTT/Documents/git/CPE393-MLOps/MLExperimenttracking-cpe393/mlruns/2'), creation_time=1743001819999, experiment_id='2', last_update_time=1743001819999, lifecycle_stage='active', name='my-new-experiment', tags={}>,
 <Experiment: artifact_location=('file:///c:/Users/CPE '
  'KMUTT/Documents/git/CPE393-MLOps/MLExperimenttracking-cpe393/mlruns/1'), creation_time=1742459591224, experiment_id='1', last_update_time=1742459591224, lifecycle_stage='active', name='mlops_nyc_taxi', tags={}>]

In [3]:
client.create_experiment(name="my-new-experiment")

'2'

In [3]:
from mlflow.entities import ViewType

runs = client.search_runs(
    experiment_ids='1',
    filter_string="metrics.rmse < 100",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse ASC"]
)

In [4]:
for run in runs:
    print(f"run id: {run.info.run_id}, rmse: {run.data.metrics['rmse']:.4f}")

run id: b12bc85f0f834ac5b449d9f6cf5b6ed1, rmse: 25.6204
run id: 6f6ef3e0d34c48b9a6ea2fcf98e352b0, rmse: 25.9395
run id: 22d9a6eef0fa468c859020629e0ace2c, rmse: 25.9395
run id: 68418b15027b46acbdcf6311e572749d, rmse: 27.1123
run id: 495ff13a14c84f4b82ceff41ae3ad286, rmse: 27.1290


In [5]:
import mlflow

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [38]:
run_id = "495ff13a14c84f4b82ceff41ae3ad286"  #insert your run id
model_uri = f"runs:/{run_id}/model"
mlflow.register_model(model_uri=model_uri, name="nyc-taxi-regressor")

Registered model 'nyc-taxi-regressor' already exists. Creating a new version of this model...
Created version '6' of model 'nyc-taxi-regressor'.


<ModelVersion: aliases=[], creation_timestamp=1743095160027, current_stage='None', description=None, last_updated_timestamp=1743095160027, name='nyc-taxi-regressor', run_id='495ff13a14c84f4b82ceff41ae3ad286', run_link=None, source=('file:///c:/Users/CPE '
 'KMUTT/Documents/git/CPE393-MLOps/MLExperimenttracking-cpe393/mlruns/1/495ff13a14c84f4b82ceff41ae3ad286/artifacts/model'), status='READY', status_message=None, tags={}, user_id=None, version=6>

In [6]:
#check model version and stages
model_name = "nyc-taxi-regressor"
latest_versions = client.get_latest_versions(name=model_name)

for version in latest_versions:
    print(f"version: {version.version}, stage: {version.current_stage}")

version: 4, stage: None
version: 5, stage: Production
version: 6, stage: Staging


  latest_versions = client.get_latest_versions(name=model_name)


In [39]:
model_version = 6
new_stage = "Staging"

client.transition_model_version_stage(
  name=model_name,
  version=model_version,
  stage=new_stage,
  archive_existing_versions=False
)

  client.transition_model_version_stage(


<ModelVersion: aliases=[], creation_timestamp=1743095160027, current_stage='Staging', description=None, last_updated_timestamp=1743095168640, name='nyc-taxi-regressor', run_id='495ff13a14c84f4b82ceff41ae3ad286', run_link=None, source=('file:///c:/Users/CPE '
 'KMUTT/Documents/git/CPE393-MLOps/MLExperimenttracking-cpe393/mlruns/1/495ff13a14c84f4b82ceff41ae3ad286/artifacts/model'), status='READY', status_message=None, tags={}, user_id=None, version=6>

Comparing versions and selecting the new "Production" model
In the last section, we will retrieve models registered in the model registry and compare their performance on an unseen test set. The idea is to simulate the scenario in which a deployment engineer has to interact with the model registry to decide whether to update the model version that is in production or not.

These are the steps:

Load the test dataset, which corresponds to the NYC Green Taxi data from the month of March 2021.
Download the DictVectorizer that was fitted using the training data and saved to MLflow as an artifact, and load it with pickle.
Preprocess the test set using the DictVectorizer so we can properly feed the regressors.
Make predictions on the test set using the model versions that are currently in the "Staging" and "Production" stages, and compare their performance.
Based on the results, update the "Production" model version accordingly.


In [12]:
import pandas as pd
import pickle
from sklearn.metrics import mean_squared_error
import mlflow.pyfunc

# Load the test dataset
test_data = pd.read_csv("green_tripdata_2021-03.csv.gz", compression='gzip')  # Replace with the actual path to your test dataset

# Preprocess the test dataset
with open("dict_vectorizer.pkl", "rb") as f:
  dv = pickle.load(f)

# Retrieve the Production and Staging model versions
production_version = next((v for v in latest_versions if v.current_stage == "Production"), None)
staging_version = next((v for v in latest_versions if v.current_stage == "Staging"), None)

# Prepare the test dataset for prediction
X_test = dv.transform(test_data.drop(columns=["total_amount"]).to_dict(orient="records"))
y_test = test_data["total_amount"]

if production_version and staging_version:
  # Load the Production model
  production_model = mlflow.pyfunc.load_model(model_uri=f"models:/{model_name}/{production_version.version}")
  y_pred_production = production_model.predict(X_test)
  production_rmse = mean_squared_error(y_test, y_pred_production)

  # Load the Staging model
  staging_model = mlflow.pyfunc.load_model(model_uri=f"models:/{model_name}/{staging_version.version}")
  y_pred_staging = staging_model.predict(X_test)
  staging_rmse = mean_squared_error(y_test, y_pred_staging)

  # Compare the RMSE values
  print(f"Production Model RMSE (version {production_version.version}): {production_rmse:.4f}")
  print(f"Staging Model RMSE (version {staging_version.version}): {staging_rmse:.4f}")
  
  # Update the model
  if staging_rmse < production_rmse:
    print("The Staging model performs better and will be promoted to Production.")
    client.transition_model_version_stage(
      name=model_name,
      version=staging_version.version,
      stage="Production",
      archive_existing_versions=True
    )
  else:
    print("The Production model performs better or is comparable. No promotion needed.")
else:
  print("Either the Production or Staging model version is missing.")

  test_data = pd.read_csv("green_tripdata_2021-03.csv.gz", compression='gzip')  # Replace with the actual path to your test dataset


Production Model RMSE (version 5): 143.7272
Staging Model RMSE (version 6): 147.8844
The Production model performs better or is comparable. No promotion needed.
