In [9]:
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType

In [10]:
# Location of the MLflow tracking store used throughout this notebook.
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"

# Client bound explicitly to that store.
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

# Scratch calls kept for reference:
# mlflow.search_experiments()
# client.list_artifacts(run_id='eff1613c78114ce39c6b0b622b9d294e')

In [52]:
# client.create_experiment(name="test_modelling_duration")

In [53]:
# Fetch at most five active runs of experiment "1" whose validation RMSE is
# below 8, sorted so the lowest RMSE comes first.
runs = client.search_runs(
    experiment_ids=["1"],  # documented as a list of experiment IDs, not a bare string
    filter_string="metrics.val_rmse<8",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.val_rmse ASC"],
)

In [54]:
# One line per candidate run: its id and validation RMSE to four decimals.
for run in runs:
    run_summary = f"run id: {run.info.run_id}, rmse: {run.data.metrics['val_rmse']:.4f}"
    print(run_summary)

run id: 770a01b98a8640e3b863b60032d60b26, rmse: 6.6003
run id: 471ceb10785b43ee8c734f96a150d4cb, rmse: 6.6143
run id: 619a82cc69064080a2afef57c731362a, rmse: 7.2926
run id: eff1613c78114ce39c6b0b622b9d294e, rmse: 7.2926
run id: 6c8e7d591b6f495c819c8147cc49c41e, rmse: 7.2926


In [11]:
import mlflow

# `client` was given the tracking URI explicitly; the module-level `mlflow.*`
# calls used below (register_model, search_registered_models, spark.load_model)
# need it configured separately.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [7]:
# Run whose logged "model" artifact is registered below.
run_id = "619a82cc69064080a2afef57c731362a"
model_uri = f"runs:/{run_id}/model"

# NOTE(review): every execution of this cell appears to register another
# version under the same name — confirm before re-running.
mlflow.register_model(
    name='modelling_cab_duration',
    model_uri=model_uri,
)

In [57]:
# Display the artifact URI that was passed to register_model.
model_uri

'runs:/619a82cc69064080a2afef57c731362a/model'

In [58]:
# Show the registered models in the configured store, including their latest versions.
mlflow.search_registered_models()

[<RegisteredModel: aliases={}, creation_timestamp=1693741910998, description=('The taxi trip duration predictor, trained on NYC dataset from Aug 2022 to '
  'Dec2022\n'
  'Link to the data: '
  'https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page'), last_updated_timestamp=1693753514687, latest_versions=[<ModelVersion: aliases=[], creation_timestamp=1693741911029, current_stage='Production', description='', last_updated_timestamp=1693749638731, name='modelling_cab_duration', run_id='eff1613c78114ce39c6b0b622b9d294e', run_link='', source='/home/abhishek-wsl/codes/MLops_project/final_codes/mlruns/1/eff1613c78114ce39c6b0b622b9d294e/artifacts/lr_spark_final', status='READY', status_message=None, tags={'model': 'linear regression'}, user_id=None, version=1>,
  <ModelVersion: aliases=[], creation_timestamp=1693747998495, current_stage='Staging', description='The model version 3 was transitioned to Staging on 2023-09-03 ', last_updated_timestamp=1693750213864, name='modelling_cab_dura

In [59]:
# Registered-model name reused by the registry calls further down.
model_name = "modelling_cab_duration"

# Latest registered version per stage, printed as "version / stage" pairs.
latest_versions = client.get_latest_versions(name=model_name)
for version in latest_versions:
    version_line = f"version: {version.version}, stage: {version.current_stage}"
    print(version_line)

version: 1, stage: Production
version: 3, stage: Staging
version: 4, stage: None


In [60]:
# Full ModelVersion records (source path, run id, description, ...) for the versions listed above.
latest_versions

[<ModelVersion: aliases=[], creation_timestamp=1693741911029, current_stage='Production', description='', last_updated_timestamp=1693749638731, name='modelling_cab_duration', run_id='eff1613c78114ce39c6b0b622b9d294e', run_link='', source='/home/abhishek-wsl/codes/MLops_project/final_codes/mlruns/1/eff1613c78114ce39c6b0b622b9d294e/artifacts/lr_spark_final', status='READY', status_message=None, tags={'model': 'linear regression'}, user_id=None, version=1>,
 <ModelVersion: aliases=[], creation_timestamp=1693747998495, current_stage='Staging', description='The model version 3 was transitioned to Staging on 2023-09-03 ', last_updated_timestamp=1693750213864, name='modelling_cab_duration', run_id='770a01b98a8640e3b863b60032d60b26', run_link=None, source='/home/abhishek-wsl/codes/MLops_project/final_codes/mlruns/1/770a01b98a8640e3b863b60032d60b26/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=3>,
 <ModelVersion: aliases=[], creation_timestamp=16937535146

In [61]:
# Version to move and the stage it moves to; both names are read again when
# the version description is written.
model_version, new_stage = 4, 'Staging'

# archive_existing_versions=False: do not ask the registry to archive the
# version(s) already sitting in the target stage.
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=new_stage,
    archive_existing_versions=False,
)

<ModelVersion: aliases=[], creation_timestamp=1693753514687, current_stage='Staging', description=None, last_updated_timestamp=1693753562949, name='modelling_cab_duration', run_id='619a82cc69064080a2afef57c731362a', run_link=None, source='/home/abhishek-wsl/codes/MLops_project/final_codes/mlruns/1/619a82cc69064080a2afef57c731362a/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=4>

In [62]:
from datetime import datetime

# Record the stage change and today's date (local clock) in the version's description.
date = datetime.today().date()
stage_note = f"The model version {model_version} was transitioned to {new_stage} on {date} "

client.update_model_version(
    name=model_name,
    version=model_version,
    description=stage_note,
)

<ModelVersion: aliases=[], creation_timestamp=1693753514687, current_stage='Staging', description='The model version 4 was transitioned to Staging on 2023-09-03 ', last_updated_timestamp=1693753564894, name='modelling_cab_duration', run_id='619a82cc69064080a2afef57c731362a', run_link=None, source='/home/abhishek-wsl/codes/MLops_project/final_codes/mlruns/1/619a82cc69064080a2afef57c731362a/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=4>

In [12]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import  StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor, LinearRegression
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.evaluation import RegressionEvaluator
import seaborn as sns
import matplotlib.pyplot as plt

from utils import read_process_df

from pyspark_utils.data_prep import prepare_data
#from pyspark_utils.train import train_model
from pyspark_utils.evaluate import evaluate_model

In [2]:
# Obtain (or reuse) the Spark session used to score the test data.
spark = (
    SparkSession.builder
    .appName('test_staged_model')
    .getOrCreate()
)
spark

your 131072x1 screen size is bogus. expect trouble
23/09/03 20:41:25 WARN Utils: Your hostname, Bhaiyu resolves to a loopback address: 127.0.1.1; using 172.17.120.207 instead (on interface eth0)
23/09/03 20:41:25 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/09/03 20:41:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/09/03 20:41:27 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Raise the Spark log threshold to ERROR to cut down on WARN-level noise in cell output.
spark.sparkContext.setLogLevel("ERROR")

In [13]:
# Test data file, named once so the location is easy to find and retarget.
TEST_DATA_PATH = '/home/abhishek-wsl/codes/MLops_project/data/test_data/green_tripdata_2023-01.parquet'
df_test_processed = read_process_df(TEST_DATA_PATH, spark)

# Alternative kept for reference: download the preprocessors and load them from disk.
# client.download_artifacts(run_id=run_id, path = 'preprocessors', dst_path='./artifacts/')
# si = PipelineModel.load('/home/abhishek-wsl/codes/MLops_project/final_codes/artifacts/preprocessors/encoderindexer_PipelineModel')

# Preprocessing stages logged as artifacts of the same run as the model.
si = mlflow.spark.load_model(f'runs:/{run_id}/indexer')
ohe = mlflow.spark.load_model(f'runs:/{run_id}/encoder')

categorical_cols = ['VendorID','pu_hour','pu_weekday','PU_DO']
label_col = 'duration'
# prepare_data returns three values; only the first is used here.
encoded_df_test, _, _ = prepare_data(df_test_processed,categorical_cols,si,ohe,is_test=True)

# Keep the URI and the loaded model under separate names so neither name
# changes type part-way through the cell.
# NOTE(review): this scores the run's 'lr_spark_final' artifact, while the
# registry output above lists '.../artifacts/model' as the source of version 4
# — confirm both point at the same model before promoting.
lr_model_uri = f'runs:/{run_id}/lr_spark_final'
trained_lr_model = mlflow.spark.load_model(lr_model_uri)
_ = evaluate_model(trained_lr_model,encoded_df_test,label_col,metric='rmse')

2023/09/03 20:44:25 INFO mlflow.spark: 'runs:/619a82cc69064080a2afef57c731362a/indexer' resolved as '/home/abhishek-wsl/codes/MLops_project/final_codes/mlruns/1/619a82cc69064080a2afef57c731362a/artifacts/indexer'
2023/09/03 20:44:25 INFO mlflow.spark: URI 'runs:/619a82cc69064080a2afef57c731362a/indexer/sparkml' does not point to the current DFS.
2023/09/03 20:44:25 INFO mlflow.spark: File 'runs:/619a82cc69064080a2afef57c731362a/indexer/sparkml' not found on DFS. Will attempt to upload the file.


67613 6


2023/09/03 20:44:27 INFO mlflow.spark: 'runs:/619a82cc69064080a2afef57c731362a/encoder' resolved as '/home/abhishek-wsl/codes/MLops_project/final_codes/mlruns/1/619a82cc69064080a2afef57c731362a/artifacts/encoder'
2023/09/03 20:44:27 INFO mlflow.spark: URI 'runs:/619a82cc69064080a2afef57c731362a/encoder/sparkml' does not point to the current DFS.
2023/09/03 20:44:27 INFO mlflow.spark: File 'runs:/619a82cc69064080a2afef57c731362a/encoder/sparkml' not found on DFS. Will attempt to upload the file.
2023/09/03 20:44:29 INFO mlflow.spark: 'runs:/619a82cc69064080a2afef57c731362a/lr_spark_final' resolved as '/home/abhishek-wsl/codes/MLops_project/final_codes/mlruns/1/619a82cc69064080a2afef57c731362a/artifacts/lr_spark_final'
2023/09/03 20:44:29 INFO mlflow.spark: URI 'runs:/619a82cc69064080a2afef57c731362a/lr_spark_final/sparkml' does not point to the current DFS.
2023/09/03 20:44:29 INFO mlflow.spark: File 'runs:/619a82cc69064080a2afef57c731362a/lr_spark_final/sparkml' not found on DFS. Will 

rmse  : 6.23828615094559


- Test multiple models and compare their run time, metrics, etc.
    - Then promote the most suitable model to Production.

In [16]:
# Promote the version that was staged and evaluated above. `model_version` is
# reused rather than repeating the literal so the two cells cannot drift apart.
# archive_existing_versions=True asks the registry to archive whatever is
# currently in Production.
# NOTE(review): the version's description still reads "transitioned to Staging"
# after this call (see the cell output) — update it if the description should
# track the current stage.
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage='Production',
    archive_existing_versions=True,
)

<ModelVersion: aliases=[], creation_timestamp=1693753514687, current_stage='Production', description='The model version 4 was transitioned to Staging on 2023-09-03 ', last_updated_timestamp=1693754554436, name='modelling_cab_duration', run_id='619a82cc69064080a2afef57c731362a', run_link=None, source='/home/abhishek-wsl/codes/MLops_project/final_codes/mlruns/1/619a82cc69064080a2afef57c731362a/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=4>

In [None]:
# from datetime import datetime

# date = datetime.today().date()
# client.update_model_version(
#     name = 'modelling_cab_duration',
#     version = 4,
#     description = f"The model version {model_version} was transitioned to {new_stage} on {date} "
# )