# Model Registry 

In [23]:
from mlflow.tracking import MlflowClient

# Tracking/registry backend: a SQLite database URI pointing at `mlflow.db`.
ML_FLOW_TRACKING_URI = "sqlite:///mlflow.db"

# Low-level client used by the cells below to query experiments, runs and registered models.
client = MlflowClient(tracking_uri=ML_FLOW_TRACKING_URI)

In [24]:
# Show every experiment known to the tracking store.
# NOTE(review): `list_experiments` is not available on MLflow 2.x clients, where
# `search_experiments()` replaces it — confirm the installed MLflow version before re-running.
client.list_experiments()

[<Experiment: artifact_location='./mlruns/0', experiment_id='0', lifecycle_stage='active', name='Default', tags={}>,
 <Experiment: artifact_location='./mlruns/1', experiment_id='1', lifecycle_stage='active', name='nyctaxi_experiments', tags={}>,
 <Experiment: artifact_location='./mlruns/2', experiment_id='2', lifecycle_stage='active', name='my_new_experiment', tags={}>]

In [None]:
# Create the experiment only when it is missing: `create_experiment` raises if the name
# is already taken, and the listing above already contains "my_new_experiment", so an
# unguarded call breaks a re-run of this notebook.
if client.get_experiment_by_name("my_new_experiment") is None:
    client.create_experiment(name="my_new_experiment")

In [27]:
from mlflow.entities import ViewType

# Fetch at most five active runs of experiment '1', ordered by ascending RMSE
# (lowest error first); the empty filter string applies no extra condition.
runs = client.search_runs(
    experiment_ids='1',
    filter_string="",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=['metric.rmse ASC'],
)

In [28]:
# Display the raw Run objects returned by the search (metrics, params and tags).
runs

[<Run: data=<RunData: metrics={'rmse': 6.303975653496241}, params={'learning_rate': '0.09585355369315604',
  'max_depth': '30',
  'min_child_weight': '1.060597050922164',
  'objective': 'reg:linear',
  'reg_alpha': '0.018060244040060163',
  'reg_lambda': '0.011658731377413597',
  'seed': '42'}, tags={'mlflow.log-model.history': '[{"run_id": "9342aa2b9fd4498ab25bbfaadc6d599b", '
                              '"artifact_path": "models_mlflow", '
                              '"utc_time_created": "2022-06-24 '
                              '22:00:38.875492", "flavors": {"python_function": '
                              '{"loader_module": "mlflow.xgboost", '
                              '"python_version": "3.9.7", "data": "model.xgb", '
                              '"env": "conda.yaml"}, "xgboost": {"xgb_version": '
                              '"1.6.1", "data": "model.xgb", "model_class": '
                              '"xgboost.core.Booster", "code": null}}, '
                      

In [47]:
# List the id of each run returned by the search, one per line.
for run in runs:
    print(run.info.run_id)

9342aa2b9fd4498ab25bbfaadc6d599b
939a3ea7078f4a3d8cf7829cf12fc414
5c780dd850974d81b019cb8afd900368
be22fc89078a44468fb27185f51e1ca2
9f568855233540ab8b90ad42f78a9171


In [48]:
# Print each run id next to its logged `rmse` metric, rounded to four decimals.
for run in runs:
    print(f"run id {run.info.run_id}, rmse : {run.data.metrics['rmse']:.4f} ")

run id 9342aa2b9fd4498ab25bbfaadc6d599b, rmse : 6.3040 
run id 939a3ea7078f4a3d8cf7829cf12fc414, rmse : 6.3040 
run id 5c780dd850974d81b019cb8afd900368, rmse : 6.3040 
run id be22fc89078a44468fb27185f51e1ca2, rmse : 6.3040 
run id 9f568855233540ab8b90ad42f78a9171, rmse : 6.3040 


In [31]:
import mlflow
# Point the module-level mlflow API (used below by `mlflow.register_model`) at the same
# backend as `client`.
mlflow.set_tracking_uri(ML_FLOW_TRACKING_URI)

In [40]:
# Run whose logged model is registered; the id is hard-coded from the run listing above.
run_id ="5c780dd850974d81b019cb8afd900368"


# `runs:/<run_id>/<artifact_path>` URI addressing the `model` artifact path of that run.
model_uri = f"runs:/{run_id}/model"

# Register the artifact under "nyc-taxi-xgboost"; when the name already exists a new
# version is created (see the log output below).
mlflow.register_model(model_uri=model_uri,name="nyc-taxi-xgboost")

Registered model 'nyc-taxi-xgboost' already exists. Creating a new version of this model...
2022/06/24 17:01:26 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: nyc-taxi-xgboost, version 2
Created version '2' of model 'nyc-taxi-xgboost'.


<ModelVersion: creation_timestamp=1656108086070, current_stage='None', description=None, last_updated_timestamp=1656108086070, name='nyc-taxi-xgboost', run_id='5c780dd850974d81b019cb8afd900368', run_link=None, source='./mlruns/1/5c780dd850974d81b019cb8afd900368/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=2>

In [41]:
# Show all registered models together with their latest versions.
# NOTE(review): `list_registered_models` is not available on MLflow 2.x clients, where
# `search_registered_models()` replaces it — confirm the installed MLflow version.
client.list_registered_models()

[<RegisteredModel: creation_timestamp=1656108059249, description=None, last_updated_timestamp=1656108086070, latest_versions=[<ModelVersion: creation_timestamp=1656108086070, current_stage='None', description=None, last_updated_timestamp=1656108086070, name='nyc-taxi-xgboost', run_id='5c780dd850974d81b019cb8afd900368', run_link=None, source='./mlruns/1/5c780dd850974d81b019cb8afd900368/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=2>], name='nyc-taxi-xgboost', tags={}>]

In [42]:
# Name of the registered model inspected and promoted in the next cells.
model_name="nyc-taxi-xgboost"
# Latest model version(s) of that model as reported by the registry.
latest = client.get_latest_versions(name=model_name)

# Report the version number and current stage of each returned version.
for version in latest:
    print(f"version {version.version}, stage : {version.current_stage}")

version 2, stage : None


In [43]:


from datetime import date

# NOTE(review): this rebinds `date` from the imported `datetime.date` class to today's
# date value, shadowing the import for the rest of the session; nothing in this cell
# reads the value afterwards.
date = date.today()
new_stage = "Staging"

# Move version 2 of `model_name` to `new_stage`. The version number is hard-coded, and
# archiving of other versions in the target stage is explicitly turned off.
client.transition_model_version_stage(
    name=model_name,
    version=2,
    stage= new_stage,
    archive_existing_versions=False   
)


<ModelVersion: creation_timestamp=1656108086070, current_stage='Staging', description=None, last_updated_timestamp=1656108090432, name='nyc-taxi-xgboost', run_id='5c780dd850974d81b019cb8afd900368', run_link=None, source='./mlruns/1/5c780dd850974d81b019cb8afd900368/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=2>

# Comparing versions and selecting the new "Production" model


In the last section, we will retrieve models registered in the model registry and compare their performance on an unseen test set. The idea is to simulate the scenario in which a deployment engineer has to interact with the model registry to decide whether to update the model version that is in production or not.

Steps to be performed:
1. Load the test dataset, which corresponds to the NYC Green Taxi data from March 2021.
2. Download the `DictVectorizer` that was fitted on the training data and saved to MLflow as an artifact, then load it with pickle.
3. Preprocess the test set with the `DictVectorizer` so that it can be fed to the regressors.
4. Make predictions on the test set using the model versions that are currently in the "Staging" and "Production" stages of the model registry.
5. Based on the results, update the "Production" model version accordingly.

*Note: the model registry doesn't actually deploy the model to production when you transition a model version to the "Production" stage; it just assigns a label to that model version. You should complement the registry with some CI/CD code that performs the actual deployment.*

In [44]:
from sklearn.metrics import mean_squared_error
import pandas as pd
import mlflow


class Model:
    """A registered MLflow model paired with the trip file used to evaluate it.

    Parameters
    ----------
    filename : str or path-like
        Location of the trip data; ``.csv`` and ``.parquet`` files are supported.
    name : str
        Name of the model in the MLflow model registry.
    stage : str
        Registry stage whose model version ``test_model`` loads.
    """

    def __init__(self,filename,name,stage):
        self._filename = filename
        self.name = name
        self.stage = stage
        # Filled in by read_dataframe() (or through the `dataframe` setter).
        self._df = None

    # PROPERTIES
    @property
    def filename(self):
        """Path of the trip data file."""
        return self._filename

    @property
    def dataframe(self):
        """The prepared DataFrame, or ``None`` until ``read_dataframe`` has run."""
        return self._df

    # SETTERS
    @filename.setter
    def filename(self,value):
        self._filename = value

    @dataframe.setter
    def dataframe(self,value):
        self._df = value

    def read_dataframe(self):
        """Load the trip file, derive ``duration`` and store the result in ``self.dataframe``.

        ``duration`` is the drop-off minus pick-up time in minutes. Only trips lasting
        between 1 and 60 minutes (inclusive) are kept, and the two location-id columns
        are converted to strings. Nothing is returned.

        Raises
        ------
        ValueError
            If the file name ends in neither ``.csv`` nor ``.parquet``.
        """
        path = str(self.filename)
        if path.endswith('.csv'):
            df = pd.read_csv(path)
            # CSV stores timestamps as text, so parse them explicitly.
            df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])
            df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])
        elif path.endswith('.parquet'):
            df = pd.read_parquet(path)
        else:
            # Without this branch `df` would be unbound and fail with an obscure error below.
            raise ValueError(
                f"Unsupported file type for {path!r}; expected a .csv or .parquet file"
            )

        trip_time = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
        df['duration'] = trip_time.dt.total_seconds() / 60

        # .copy() gives an independent frame, so the dtype cast below does not write
        # into a slice of the unfiltered data (SettingWithCopyWarning).
        df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

        categorical = ['PULocationID', 'DOLocationID']
        df[categorical] = df[categorical].astype(str)
        self._df = df

    def preprocess(self, df, dv):
        """Turn ``df`` into a feature matrix using the already-fitted vectorizer ``dv``.

        A ``PU_DO`` column (pick-up id, underscore, drop-off id) is added to ``df``
        in place; the location-id columns must therefore already be strings. The
        ``PU_DO`` and ``trip_distance`` columns are converted to a list of per-row
        dicts and passed to ``dv.transform``, whose result is returned.
        """
        df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']
        categorical = ['PU_DO']
        numerical = ['trip_distance']
        records = df[categorical + numerical].to_dict(orient='records')
        return dv.transform(records)

    def test_model(self,X_test, y_test):
        """Score the registry model ``models:/<name>/<stage>`` on the given test data.

        Returns
        -------
        dict
            ``{"rmse": <root mean squared error of the predictions>}``.
        """
        model = mlflow.pyfunc.load_model(f"models:/{self.name}/{self.stage}")
        y_pred = model.predict(X_test)
        # The `squared=False` keyword is deprecated/removed in recent scikit-learn
        # releases; taking the square root of the MSE works on every version and is
        # the same value for a single-output target.
        rmse = mean_squared_error(y_test, y_pred) ** 0.5
        return {"rmse": rmse}

In [45]:
# NOTE(review): `model_name` is rebound here to "nyc-taxi-regressor", whereas the model
# registered and promoted earlier in this notebook is "nyc-taxi-xgboost" (version 2, stage
# "Staging"). Confirm that a "nyc-taxi-regressor" model with a Production version exists
# in the registry before calling `production_model.test_model`.
model_name = "nyc-taxi-regressor"
# Evaluation wrapper for the Production-stage model, reading the 2021-03 green taxi trip file.
production_model = Model("./data/2021/green_tripdata_2021-03.parquet",model_name,"Production")

In [46]:
# Download the `preprocessor` artifact path of the run into the current directory.
# NOTE(review): `run_id` is the value assigned in the model-registration cell, so this
# cell depends on that cell having been executed first.
client.download_artifacts(run_id=run_id, path='preprocessor', dst_path='.')

'/home/elegant00/Dropbox/Ml_OpsZoomcampRepo/Week_2/preprocessor'