In [71]:
from mlflow.tracking import MlflowClient

In [40]:
## Tracking URI pointing at the SQLite database file mlflow.db
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"

## Client bound to that tracking URI; the later cells call it for all registry/tracking queries
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

In [72]:
## Listing all experiments
# search_experiments() is used here because list_experiments() is deprecated
all_experiments = client.search_experiments()
print(all_experiments)

[<Experiment: artifact_location='/Users/luisvaras/code/zoomcamp/mlops-zoomcamp/03-training/experiment_tracking/mlruns/2', creation_time=1735696146213, experiment_id='2', last_update_time=1735696146213, lifecycle_stage='active', name='my-cool-experiment', tags={}>, <Experiment: artifact_location='/Users/luisvaras/code/zoomcamp/mlops-zoomcamp/03-training/experiment_tracking/mlruns/1', creation_time=1735159898745, experiment_id='1', last_update_time=1735159898745, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>, <Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1735154853504, experiment_id='0', last_update_time=1735154853504, lifecycle_stage='active', name='Default', tags={}>]


In [74]:
## Creating a new experiment (safe to re-run)
# create_experiment() fails when the name is already taken, so a second run of
# this cell would raise. Look the experiment up first and only create it when
# it is missing; either way the cell evaluates to the experiment id.
experiment_name = "my-cool-experiment-2"
existing_experiment = client.get_experiment_by_name(experiment_name)
if existing_experiment is None:
    experiment_id = client.create_experiment(name=experiment_name)
else:
    experiment_id = existing_experiment.experiment_id
experiment_id

'3'

In [75]:
from mlflow.entities import ViewType

## Up to 10 active runs of experiment "1" with rmse < 7, lowest rmse first
runs = client.search_runs(
    experiment_ids=["1"],  # the API documents a list of experiment ids
    filter_string="metrics.rmse < 7",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=10,
    order_by=["metrics.rmse ASC"],
)


In [76]:
# Display the raw Run objects returned by the search (verbose repr)
runs

[<Run: data=<RunData: metrics={'rmse': 6.328857455927513}, params={'learning_rate': '0.5007497542674484',
  'max_depth': '69',
  'min_child_weight': '1.593971756791148',
  'objective': 'reg:linear',
  'reg_alpha': '0.05449929909264596',
  'reg_lambda': '0.34077893742416615',
  'seed': '42'}, tags={'mlflow.log-model.history': '[{"run_id": "b4e5d303ed394f9ebceb9db15ca3a4d2", '
                              '"artifact_path": "models_mlflow", '
                              '"utc_time_created": "2024-12-26 '
                              '01:14:27.968959", "model_uuid": '
                              '"49146fd018924ce582f76a819ef4d93b", "flavors": '
                              '{"python_function": {"loader_module": '
                              '"mlflow.xgboost", "python_version": "3.9.17", '
                              '"data": "model.xgb", "env": {"conda": '
                              '"conda.yaml", "virtualenv": "python_env.yaml"}}, '
                              '"xgboost": 

In [77]:
# One summary line per matching run: its id and its rmse rounded to 4 decimals
for run in runs:
    rmse_value = run.data.metrics['rmse']
    run_identifier = run.info.run_id
    print(f"run id: {run_identifier}, rmse: {rmse_value:.4f}")

run id: b4e5d303ed394f9ebceb9db15ca3a4d2, rmse: 6.3289
run id: be8a424e727a4c5e8ae0a5568a8137fa, rmse: 6.3289
run id: c8a84f63d95c43ce99e751622d99183e, rmse: 6.3289
run id: b2f14614a00547af906c762e72e0d820, rmse: 6.3866
run id: 920ebe547f8248869e746999ce802127, rmse: 6.4606


## Promoting Models


In [47]:
import mlflow

# Point the module-level mlflow API at the same tracking URI the client was built with
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [48]:
## Registering a model
# Run whose logged model will be registered
run_id = "c8a84f63d95c43ce99e751622d99183e"
# NOTE(review): this assumes the run stored its model under the artifact path
# "model" — confirm against the run's artifacts before registering.
model_uri = f"runs:/{run_id}/model"


In [49]:
# Register the model at model_uri under the name "nyc-taxi-regressor".
# NOTE(review): the recorded output shows a new version being created when the
# name already exists, so every re-run of this cell adds another version.
mlflow.register_model(model_uri=model_uri, name="nyc-taxi-regressor")

Registered model 'nyc-taxi-regressor' already exists. Creating a new version of this model...
Created version '5' of model 'nyc-taxi-regressor'.


<ModelVersion: aliases=[], creation_timestamp=1735705169234, current_stage='None', description=None, last_updated_timestamp=1735705169234, name='nyc-taxi-regressor', run_id='c8a84f63d95c43ce99e751622d99183e', run_link=None, source='/Users/luisvaras/code/zoomcamp/mlops-zoomcamp/03-training/experiment_tracking/mlruns/1/c8a84f63d95c43ce99e751622d99183e/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=5>

In [79]:
## Show the latest version registered under each stage of the model
model_name = "nyc-taxi-regressor"
latest_versions = client.get_latest_versions(name=model_name)

for version in latest_versions:
    summary = f" version: {version.version}, stage: {version.current_stage}"
    print(summary)

 version: 1, stage: Archived
 version: 2, stage: Production
 version: 5, stage: None
 version: 4, stage: Staging


  latest_versions = client.get_latest_versions(name=model_name)


In [78]:
## Transitioning the model #4 to the staging stage
# model_version and new_stage are reused by the description-update cell below.
model_version = 4
new_stage = "Staging"
# archive_existing_versions=False: do not ask for versions already in the target stage to be archived.
# NOTE(review): the recorded output shows a warning for this call — presumably
# stage transitions are deprecated; confirm against the installed MLflow version.
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=new_stage,
    archive_existing_versions=False
)

  client.transition_model_version_stage(


<ModelVersion: aliases=[], creation_timestamp=1735697491402, current_stage='Staging', description='This model version 4 was transitioned to Staging on 2025-01-01', last_updated_timestamp=1737331316229, name='nyc-taxi-regressor', run_id='c8a84f63d95c43ce99e751622d99183e', run_link=None, source='/Users/luisvaras/code/zoomcamp/mlops-zoomcamp/03-training/experiment_tracking/mlruns/1/c8a84f63d95c43ce99e751622d99183e/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=4>

In [52]:
from datetime import datetime

# Date on which this cell is executed (not necessarily when the transition itself happened)
date = datetime.today().date()

# Record the transition in the model version's free-text description
client.update_model_version(
    name=model_name,
    version=model_version,
    description=f"This model version {model_version} was transitioned to {new_stage} on {date}"
)

<ModelVersion: aliases=[], creation_timestamp=1735697491402, current_stage='Staging', description='This model version 4 was transitioned to Staging on 2025-01-01', last_updated_timestamp=1735705181457, name='nyc-taxi-regressor', run_id='c8a84f63d95c43ce99e751622d99183e', run_link=None, source='/Users/luisvaras/code/zoomcamp/mlops-zoomcamp/03-training/experiment_tracking/mlruns/1/c8a84f63d95c43ce99e751622d99183e/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=4>

## Comparing versions and selecting the new "Production" model

In [53]:
from sklearn.metrics import mean_squared_error, root_mean_squared_error
import pandas as pd

In [54]:
def read_dataframe(filename):
    """Load a taxi-trips parquet file and prepare it for modelling.

    Adds a ``duration`` column (trip length in minutes), keeps only trips
    lasting between 1 and 60 minutes inclusive, and casts the pickup and
    dropoff location ids to strings.

    Args:
        filename: Path (or anything ``pd.read_parquet`` accepts) of the file.

    Returns:
        A new, independent DataFrame containing the filtered trips.
    """
    df = pd.read_parquet(filename)

    df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])
    df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])

    # Vectorised timedelta -> minutes conversion instead of a per-row lambda.
    trip_time = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the unfiltered frame, so the
    # column assignment below does not write into a slice (which would emit
    # SettingWithCopyWarning).
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df


def preprocess(df, dv):
    """Turn a trips frame into the features produced by ``dv``.

    Adds a combined pickup/dropoff column ``PU_DO`` to ``df`` (the frame is
    modified in place), converts the ``PU_DO`` and ``trip_distance`` columns
    into one dict per row and passes those dicts to ``dv.transform``.

    Args:
        df: Frame with string ``PULocationID`` / ``DOLocationID`` columns and
            a ``trip_distance`` column.
        dv: Object exposing a ``transform`` method that accepts the row dicts.

    Returns:
        Whatever ``dv.transform`` returns for the row dicts.
    """
    pickup, dropoff = df['PULocationID'], df['DOLocationID']
    df['PU_DO'] = pickup + '_' + dropoff

    # Column order matters: it fixes the key order of every row dict.
    feature_columns = ['PU_DO', 'trip_distance']
    row_dicts = df[feature_columns].to_dict(orient='records')
    return dv.transform(row_dicts)


def test_model(name, stage, X_test, y_test):
    """Score the registered model ``name`` at ``stage`` on a test set.

    Loads the model through the ``models:/<name>/<stage>`` URI, predicts on
    ``X_test`` and compares the predictions with ``y_test``.

    Args:
        name: Registered model name.
        stage: Stage segment of the model URI (e.g. "Staging").
        X_test: Features passed to the model's ``predict``.
        y_test: True target values.

    Returns:
        A dict with a single key ``"rmse"`` holding the root mean squared error.
    """
    registry_uri = f"models:/{name}/{stage}"
    loaded_model = mlflow.pyfunc.load_model(registry_uri)
    predictions = loaded_model.predict(X_test)
    score = root_mean_squared_error(y_test, predictions)
    return {"rmse": score}

In [55]:
# Load the 2021-03 trips file; the cells below derive X_test and y_test from it
df = read_dataframe('./data/green_tripdata_2021-03.parquet')

In [57]:
## Download the run's "preprocessor" artifact directory into the current directory '.'
# NOTE: this rebinds run_id, which the model-registration cell set to a different run.
run_id = "b4e5d303ed394f9ebceb9db15ca3a4d2"
# Alternative run previously tried here: "c8a84f63d95c43ce99e751622d99183e"
client.download_artifacts(run_id=run_id, path="preprocessor", dst_path='.')

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

'/Users/luisvaras/code/zoomcamp/mlops-zoomcamp/03-training/experiment_tracking/preprocessor'

In [58]:
import pickle

# Load the pickled preprocessor from the downloaded artifact directory.
# NOTE: pickle.load can execute code embedded in the file — only load
# artifacts that come from a tracking store you trust.
with open("preprocessor/preprocessor.b", "rb") as f_in:
    dv = pickle.load(f_in)

In [59]:
# Build the feature matrix; note that preprocess() also adds a PU_DO column to df
X_test = preprocess(df, dv)

In [60]:
# True trip durations taken from the same frame X_test was built from
target = 'duration'
y_test = df[target].values


In [69]:
# Time and score the model currently resolved by the "Staging" stage
%time test_model(name=model_name, stage="Staging", X_test=X_test, y_test=y_test)

  latest = client.get_latest_versions(name, None if stage is None else [stage])


CPU times: user 5.16 s, sys: 90 ms, total: 5.25 s
Wall time: 614 ms


{'rmse': 6.279579187330177}

In [68]:
# Time and score the model currently resolved by the "Production" stage
%time test_model(name=model_name, stage="Production", X_test=X_test, y_test=y_test)

  latest = client.get_latest_versions(name, None if stage is None else [stage])


CPU times: user 5.42 s, sys: 73.9 ms, total: 5.49 s
Wall time: 588 ms


{'rmse': 6.279579187330177}

In [70]:
# Move version 2 of the model to the "Production" stage.
# archive_existing_versions=True, unlike the Staging transition above, which passed False.
# NOTE(review): presumably this archives versions already in Production — confirm in the MLflow client docs.
client.transition_model_version_stage(
    name=model_name,
    version=2,
    stage="Production",
    archive_existing_versions=True
)

  client.transition_model_version_stage(


<ModelVersion: aliases=[], creation_timestamp=1735690585204, current_stage='Production', description='', last_updated_timestamp=1735705790294, name='nyc-taxi-regressor', run_id='be8a424e727a4c5e8ae0a5568a8137fa', run_link='', source='/Users/luisvaras/code/zoomcamp/mlops-zoomcamp/03-training/experiment_tracking/mlruns/1/be8a424e727a4c5e8ae0a5568a8137fa/artifacts/models_mlflow', status='READY', status_message=None, tags={'model': 'xgboost'}, user_id=None, version=2>