In [1]:
from mlflow.tracking import MlflowClient
import subprocess


MLFLOW_TRACKING_URI = "sqlite:///data/mlflow.db"
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)
subprocess.Popen(["mlflow", "ui", "--backend-store-uri", MLFLOW_TRACKING_URI]) 

<Popen: returncode: None args: ['mlflow', 'ui', '--backend-store-uri', 'sqli...>

[2023-05-29 04:19:04 +0000] [1290] [INFO] Starting gunicorn 20.1.0
[2023-05-29 04:19:04 +0000] [1290] [INFO] Listening at: http://127.0.0.1:5000 (1290)
[2023-05-29 04:19:04 +0000] [1290] [INFO] Using worker: sync
[2023-05-29 04:19:04 +0000] [1291] [INFO] Booting worker with pid: 1291
[2023-05-29 04:19:04 +0000] [1292] [INFO] Booting worker with pid: 1292
[2023-05-29 04:19:04 +0000] [1293] [INFO] Booting worker with pid: 1293
[2023-05-29 04:19:04 +0000] [1294] [INFO] Booting worker with pid: 1294


In [3]:
client.search_experiments()

[<Experiment: artifact_location='/workspaces/codespaces-blank/mlruns/2', creation_time=1685207778960, experiment_id='2', last_update_time=1685207778960, lifecycle_stage='active', name='new experiment', tags={}>,
 <Experiment: artifact_location='/workspaces/codespaces-blank/mlruns/1', creation_time=1684734875730, experiment_id='1', last_update_time=1684734875730, lifecycle_stage='active', name='duration-prediction-experiment', tags={}>,
 <Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1684731928772, experiment_id='0', last_update_time=1684731928772, lifecycle_stage='active', name='Default', tags={}>]

In [None]:
client.create_experiment(name="new experiment")

In [17]:
from mlflow.entities import ViewType

runs = client.search_runs(
    experiment_ids='1',
    filter_string='metrics.rmse < 6.3',
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse ASC"]
)

In [28]:
for run in runs:
    print(f"run id: {run.info.run_id}, rmse: {run.data.metrics['rmse']:.4f}")

run id: d95a1ba956ec425baf0d2076fb9eafa1, rmse: 6.2902
run id: 77ee85eda95340f3a6fea26bff864a02, rmse: 6.2902
run id: 65a23aafb705477d9527824f2bf8999d, rmse: 6.2902
run id: 3dd2b6c782644652b41364c7b7039408, rmse: 6.2928


In [2]:
import mlflow

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [29]:
run_id = '77ee85eda95340f3a6fea26bff864a02'
model_uri = f"runs:/{run_id}/model"
mlflow.register_model(model_uri=model_uri, name='green data duration prediction')

Registered model 'green data duration prediction' already exists. Creating a new version of this model...
2023/05/28 06:00:02 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: green data duration prediction, version 3
Created version '3' of model 'green data duration prediction'.


<ModelVersion: aliases=[], creation_timestamp=1685253602169, current_stage='None', description=None, last_updated_timestamp=1685253602169, name='green data duration prediction', run_id='77ee85eda95340f3a6fea26bff864a02', run_link=None, source='/workspaces/codespaces-blank/mlruns/1/77ee85eda95340f3a6fea26bff864a02/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=3>

## Transition model from one stage to the other

In [3]:
client.search_registered_models()

[<RegisteredModel: aliases={}, creation_timestamp=1685250681213, description='', last_updated_timestamp=1685255074697, latest_versions=[<ModelVersion: aliases=[], creation_timestamp=1685253602169, current_stage='Staging', description='The model version 3 was transitioned to Staging on 2023-05-28', last_updated_timestamp=1685255110892, name='green data duration prediction', run_id='77ee85eda95340f3a6fea26bff864a02', run_link=None, source='/workspaces/codespaces-blank/mlruns/1/77ee85eda95340f3a6fea26bff864a02/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=3>], name='green data duration prediction', tags={}>]

In [38]:
model_name = 'green data duration prediction'
latest_versions = client.get_latest_versions(name=model_name)
for version in latest_versions:
    print(f"{version.version}, stage: {version.current_stage}")

1, stage: Staging
3, stage: None


In [41]:
model_version = 3
new_stage = 'Staging'
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=new_stage,
    archive_existing_versions=False
)

<ModelVersion: aliases=[], creation_timestamp=1685253602169, current_stage='Staging', description=None, last_updated_timestamp=1685255074697, name='green data duration prediction', run_id='77ee85eda95340f3a6fea26bff864a02', run_link=None, source='/workspaces/codespaces-blank/mlruns/1/77ee85eda95340f3a6fea26bff864a02/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=3>

## Update model version

In [44]:
from datetime import datetime

date = datetime.today().date()

client.update_model_version(
    name=model_name,
    version=model_version,
    description=f"The model version {model_version} was transitioned to {new_stage} on {date}"

)

<ModelVersion: aliases=[], creation_timestamp=1685253602169, current_stage='Staging', description='The model version 3 was transitioned to Staging on 2023-05-28', last_updated_timestamp=1685255110892, name='green data duration prediction', run_id='77ee85eda95340f3a6fea26bff864a02', run_link=None, source='/workspaces/codespaces-blank/mlruns/1/77ee85eda95340f3a6fea26bff864a02/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=3>

# Predictions with the model

In [4]:
import pandas as pd
from sklearn.metrics import mean_squared_error


In [5]:
def read_dataframe(filename):
    df = pd.read_parquet(filename)
    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)
    df = df[(df.duration >=1) & (df.duration <= 60)]
    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype('str')
    return df


def preprocess(df, dv):
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']
    categorical = ['PU_DO']
    numerical = ['trip_distance'] 

    dicts = df[categorical + numerical].to_dict(orient='records')
    return dv.transform(dicts)


def test_model(stage, X_test, y_test, name='green data duration prediction'):
    model = mlflow.pyfunc.load_model(f"models:/{name}/{stage}")
    y_pred = model.predict(X_test)
    return {"rmse": mean_squared_error(y_true=y_test, y_pred=y_pred, squared=False)}

### Download data for march

In [6]:
%%capture
!wget -nc https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-03.parquet -P data


In [7]:
df_test = read_dataframe('data/green_tripdata_2021-03.parquet')


In [8]:
client.download_artifacts(run_id='d95a1ba956ec425baf0d2076fb9eafa1', 
                          path='preprocessor',
                          dst_path='.')

  client.download_artifacts(run_id='d95a1ba956ec425baf0d2076fb9eafa1',


'/workspaces/codespaces-blank/preprocessor'

In [9]:
import pickle
with open('preprocessor/preprocessor.b', 'rb') as f_in:
    dv = pickle.load(f_in)

In [10]:
X_test = preprocess(df_test, dv)

In [11]:
target = 'duration'
y_test = df_test[target].values


In [13]:
%time test_model(stage="Staging", X_test=X_test, y_test=y_test)

 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


CPU times: user 13.4 s, sys: 50 ms, total: 13.4 s
Wall time: 4.55 s


{'rmse': 6.243007626483976}