# Model Registry 

In [26]:
from mlflow.tracking import MlflowClient

# SQLite-backed tracking URI shared by the client below and by the
# module-level mlflow API configured in a later cell.
ML_FLOW_TRACKING_URI = "sqlite:///mlflow.db"


# Client used by the following cells to query experiments, runs and the model registry.
client = MlflowClient(tracking_uri=ML_FLOW_TRACKING_URI)

In [35]:
# Display every experiment stored behind the tracking URI.
# NOTE(review): newer MLflow releases expose this as `search_experiments()` — confirm against the installed version.
client.list_experiments()

[<Experiment: artifact_location='./mlruns/0', experiment_id='0', lifecycle_stage='active', name='Default', tags={}>,
 <Experiment: artifact_location='./mlruns/1', experiment_id='1', lifecycle_stage='active', name='nyctaxi_experiments', tags={}>,
 <Experiment: artifact_location='./mlruns/2', experiment_id='2', lifecycle_stage='active', name='my_cool_experiment', tags={}>]

In [34]:
# Create the experiment only when it is missing so the cell can be re-run:
# calling create_experiment with a name that already exists raises an error.
# The cell still evaluates to the experiment id, as it did before.
experiment_name = "my_cool_experiment"
existing_experiment = client.get_experiment_by_name(experiment_name)
experiment_id = (
    existing_experiment.experiment_id
    if existing_experiment is not None
    else client.create_experiment(name=experiment_name)
)
experiment_id

'2'

In [43]:
from mlflow.entities import ViewType

# Search experiment '1' for active runs whose rmse is below 6.8, sorted by
# ascending rmse (best first) and capped at five results.
runs = client.search_runs(
    experiment_ids='1',
    filter_string="metrics.rmse < 6.8",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=['metric.rmse ASC'] # multiple criteria possible
)

In [115]:
# Report the id and the rmse (four decimals) of each run returned by the search.
for run in runs:
    run_rmse = run.data.metrics['rmse']
    print(f"run id {run.info.run_id}, rmse : {run_rmse:.4f} ")

run id 630b430d1ed04b73803aa5c5e05e67c1, rmse : 6.3040 


In [116]:
import mlflow
# Point the module-level mlflow API at the same tracking URI the client was built with.
mlflow.set_tracking_uri(ML_FLOW_TRACKING_URI)

In [129]:
# Run whose logged model is registered; this is the run id printed by the search cell above.
# It is also reused later to download the preprocessor artifact.
run_id ="630b430d1ed04b73803aa5c5e05e67c1"


# `runs:/` URI addressing the "model" artifact path of that run.
model_uri = f"runs:/{run_id}/model"

# Register under "nyc-taxi-xgboost". Per the recorded output, when the name already
# exists a new version is created, so every execution of this cell adds a version.
mlflow.register_model(model_uri=model_uri,name="nyc-taxi-xgboost")

Registered model 'nyc-taxi-xgboost' already exists. Creating a new version of this model...
2022/06/28 13:01:07 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: nyc-taxi-xgboost, version 4
Created version '4' of model 'nyc-taxi-xgboost'.


<ModelVersion: creation_timestamp=1656439267128, current_stage='None', description=None, last_updated_timestamp=1656439267128, name='nyc-taxi-xgboost', run_id='630b430d1ed04b73803aa5c5e05e67c1', run_link=None, source='./mlruns/1/630b430d1ed04b73803aa5c5e05e67c1/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=4>

In [130]:
# Display the registered models together with their latest versions.
# NOTE(review): newer MLflow releases expose this as `search_registered_models()` — confirm against the installed version.
client.list_registered_models()

[<RegisteredModel: creation_timestamp=1656436067650, description=None, last_updated_timestamp=1656439267128, latest_versions=[<ModelVersion: creation_timestamp=1656439267128, current_stage='None', description=None, last_updated_timestamp=1656439267128, name='nyc-taxi-xgboost', run_id='630b430d1ed04b73803aa5c5e05e67c1', run_link=None, source='./mlruns/1/630b430d1ed04b73803aa5c5e05e67c1/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=4>,
  <ModelVersion: creation_timestamp=1656437326565, current_stage='Production', description='The model version 2 was transactioned to Production on 2022-06-28', last_updated_timestamp=1656439246472, name='nyc-taxi-xgboost', run_id='630b430d1ed04b73803aa5c5e05e67c1', run_link=None, source='./mlruns/1/630b430d1ed04b73803aa5c5e05e67c1/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=2>], name='nyc-taxi-xgboost', tags={}>]

In [131]:
# Show the version number and stage of each entry returned by get_latest_versions.
model_name="nyc-taxi-xgboost"
latest = client.get_latest_versions(name=model_name)

for version in latest:
    number, stage = version.version, version.current_stage
    print(f"version {number}, stage : {stage}")

version 4, stage : None
version 2, stage : Production


In [132]:
import datetime

# Version and target stage of the transition; the description cell that
# follows reads both of these names as well.
model_version = 2
new_stage = "Production"

# Day of the transition, read by the description cell that follows.
# Built through the `datetime` module so the name `date` only ever refers to
# a date value, instead of importing the `date` class and then overwriting it
# with one of its own instances.
date = datetime.date.today()

client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=new_stage,
    # Leave versions that already sit in the target stage where they are.
    archive_existing_versions=False,
)


<ModelVersion: creation_timestamp=1656437326565, current_stage='Production', description='The model version 2 was transactioned to Production on 2022-06-28', last_updated_timestamp=1656439271461, name='nyc-taxi-xgboost', run_id='630b430d1ed04b73803aa5c5e05e67c1', run_link=None, source='./mlruns/1/630b430d1ed04b73803aa5c5e05e67c1/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=2>

In [133]:
# Annotate the model version with the stage it was moved to and the day it happened.
# Uses `model_version`, `new_stage` and `date` from the previous cell.
# NOTE(review): "transactioned" in the message presumably should read "transitioned";
# the text is left unchanged here.
client.update_model_version(
    name=model_name,
    version = model_version,
    description= f"The model version {model_version} was transactioned to {new_stage} on {date}"
)

<ModelVersion: creation_timestamp=1656437326565, current_stage='Production', description='The model version 2 was transactioned to Production on 2022-06-28', last_updated_timestamp=1656439274247, name='nyc-taxi-xgboost', run_id='630b430d1ed04b73803aa5c5e05e67c1', run_link=None, source='./mlruns/1/630b430d1ed04b73803aa5c5e05e67c1/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=2>

# Comparing versions and selecting the new "Production" model


In the last section, we will retrieve models registered in the model registry and compare their performance on an unseen test set. The idea is to simulate the scenario in which a deployment engineer has to interact with the model registry to decide whether to update the model version that is in production or not.

Steps to be performed:
1. Load the dataset which corresponds to the NYC Green Taxi Data up to March.
2. Download the `DictVectorizer` that was fitted on the training data and saved to MLflow as an artifact, then load it with pickle.
3. Preprocess the dataset using DictVectorizer so we can properly feed the regressors.
4. Make predictions on the test dataset using the model versions that are currently in the 'Staging' and 'Production' stages.
5. Based on the results, update the production model.

*Note: the model registry doesn't actually deploy the model to production when you transition a model to the "Production" stage; it just assigns a label to that model version. You should complement the registry with some CI/CD code that does the actual deployment.*

In [134]:
from sklearn.metrics import mean_squared_error
import pandas as pd
import mlflow


class Model:
    """Pairs a registered model (name + stage) with the trip data used to score it.

    Parameters
    ----------
    filename : str
        Path to a ``.csv`` or ``.parquet`` file with the columns
        ``lpep_pickup_datetime``, ``lpep_dropoff_datetime``, ``PULocationID``,
        ``DOLocationID`` and ``trip_distance``.
    name : str
        Registered model name used to build the ``models:/`` URI.
    stage : str
        Registry stage used to build the ``models:/`` URI.

    The file is read and filtered once, at construction time, and the result
    is stored in ``self.df``.
    """

    def __init__(self,filename,name,stage):
        self._filename = filename
        self.name = name
        self.stage = stage
        self.df = self.read_dataframe(self._filename)

    # PROPERTIES
    @property
    def filename(self):
        """Path the instance was created with, or the last value assigned."""
        return self._filename

    # SETTERS
    @filename.setter
    def filename(self,value):
        # Only the stored path changes; ``self.df`` is NOT reloaded.
        # Call ``read_dataframe`` explicitly to refresh the data.
        self._filename = value

    def read_dataframe(self,filename):
        """Load trips from ``filename`` and keep those lasting 1 to 60 minutes.

        Adds a ``duration`` column (minutes, float) and converts the pickup and
        drop-off location ids to strings.

        Raises
        ------
        ValueError
            If ``filename`` ends with neither ``.csv`` nor ``.parquet``.
        """
        path = str(filename)
        if path.endswith('.csv'):
            df = pd.read_csv(path)
            # CSV carries timestamps as text; parse them before subtracting.
            df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])
            df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])
        elif path.endswith('.parquet'):
            df = pd.read_parquet(path)
        else:
            raise ValueError(
                f"Unsupported file type for {path!r}: expected a .csv or .parquet file"
            )

        elapsed = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
        df['duration'] = elapsed.dt.total_seconds() / 60

        # Both bounds are inclusive. Copy so the column assignment below writes
        # to an independent frame rather than to a slice of the unfiltered one.
        df = df[df['duration'].between(1, 60)].copy()

        categorical = ['PULocationID', 'DOLocationID']
        df[categorical] = df[categorical].astype(str)
        return df

    def preprocess(self, dv):
        """Turn ``self.df`` into model features with the fitted vectorizer ``dv``.

        Adds (or overwrites) the ``PU_DO`` column on ``self.df`` and returns
        ``dv.transform`` applied to the ``PU_DO`` / ``trip_distance`` records.
        """
        self.df['PU_DO'] = self.df['PULocationID'] + '_' + self.df['DOLocationID']
        categorical = ['PU_DO']
        numerical = ['trip_distance']
        records = self.df[categorical + numerical].to_dict(orient='records')
        return dv.transform(records)

    def test_model(self,X_test, y_test):
        """Load ``models:/<name>/<stage>`` and return ``{"rmse": ...}`` on the given data."""
        model = mlflow.pyfunc.load_model(f"models:/{self.name}/{self.stage}")
        y_pred = model.predict(X_test)
        # Take the square root explicitly instead of passing ``squared=False``,
        # which newer scikit-learn releases no longer accept.
        return {"rmse": float(mean_squared_error(y_test, y_pred) ** 0.5)}

In [135]:
# Wrap the "Production" stage of the registered model together with the
# March 2021 trip file; the file is read and filtered when the object is created.
model_name = "nyc-taxi-xgboost"
production_model = Model("./data/2021/green_tripdata_2021-03.parquet",model_name,"Production")

In [136]:
# Download the "preprocessor" artifact path of the run into the current directory.
# Relies on `run_id` assigned in the model-registration cell.
# NOTE(review): newer MLflow releases provide `mlflow.artifacts.download_artifacts` for this — confirm against the installed version.
client.download_artifacts(run_id=run_id, path='preprocessor', dst_path='.')

'/home/elegant00/Dropbox/Ml_OpsZoomcampRepo/Week_2/preprocessor'

In [137]:
import pickle

# Load the pickled vectorizer from the artifact folder downloaded above.
# NOTE: unpickling can execute arbitrary code, so only load files that come from a trusted tracking store.
with open("preprocessor/preprocessor.b","rb" ) as f_in:
    dv =pickle.load(f_in)

In [138]:
# Alias for the frame held by production_model (same object, no copy is made).
df = production_model.df

# Preview the first rows.
df.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,duration
0,2,2021-03-01 00:05:42,2021-03-01 00:14:03,N,1.0,83,129,1.0,1.56,7.5,...,0.5,0.0,0.0,,0.3,8.8,1.0,1.0,0.0,8.35
1,2,2021-03-01 00:21:03,2021-03-01 00:26:17,N,1.0,243,235,1.0,0.96,6.0,...,0.5,0.0,0.0,,0.3,7.3,2.0,1.0,0.0,5.233333
2,2,2021-03-01 00:02:06,2021-03-01 00:22:26,N,1.0,75,242,1.0,9.93,28.0,...,0.5,2.0,0.0,,0.3,31.3,1.0,1.0,0.0,20.333333
3,2,2021-03-01 00:24:03,2021-03-01 00:31:43,N,1.0,242,208,1.0,2.57,9.5,...,0.5,0.0,0.0,,0.3,10.8,2.0,1.0,0.0,7.666667
4,1,2021-03-01 00:11:10,2021-03-01 00:14:46,N,1.0,41,151,1.0,0.8,5.0,...,0.5,1.85,0.0,,0.3,8.15,1.0,1.0,0.0,3.6


In [139]:
# Feature matrix built with the fitted vectorizer. preprocess() also adds a PU_DO
# column to production_model.df, which is the same object as `df`.
X_test = production_model.preprocess(dv) 
target = "duration"
# Observed trip durations (minutes) used as ground truth.
y_test = df[target].values

In [144]:
# Time one evaluation of the Production model; the cell's value is the {"rmse": ...} dict.
%time production_model.test_model(X_test=X_test,y_test=y_test)

CPU times: user 17.4 s, sys: 205 ms, total: 17.6 s
Wall time: 2.56 s


{'rmse': 6.250963441118867}

Transition model :

In [146]:
# Move version 2 to "Production"; archive_existing_versions=True asks the registry
# to archive the versions that currently hold that stage.
client.transition_model_version_stage(
    name=model_name,
    version=2,
    stage="Production",
    archive_existing_versions=True
)

<ModelVersion: creation_timestamp=1656437326565, current_stage='Production', description='The model version 2 was transactioned to Production on 2022-06-28', last_updated_timestamp=1656439508449, name='nyc-taxi-xgboost', run_id='630b430d1ed04b73803aa5c5e05e67c1', run_link=None, source='./mlruns/1/630b430d1ed04b73803aa5c5e05e67c1/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=2>

*Data lineage* is the process of understanding, recording, and visualizing data as it flows from data sources to consumption.

Model management in MLflow

The model registry component is a centralized model store, a set of APIs, and a UI for collaboratively managing the full lifecycle of an MLflow Model.

It provides:
- Model Lineage
- Model version control
- Stage transitions
- Annotations