## Import libraries

In [62]:
import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import GradientBoostingRegressor

import pickle

In [2]:
!python -V

Python 3.9.12


#### Import MLflow

In [3]:
import mlflow

In [4]:
# We need to set tracking URI to point the library to the SQLite backend database for MLFlow
mlflow.set_tracking_uri("sqlite:///mlflow.db")

In [5]:
# We also need to set the experiment. MLflow will create one if it doesn't exist
mlflow.set_experiment("nyc-taxi-experiment")

<Experiment: artifact_location='./mlruns/2', experiment_id='2', lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

### Data and Modeling

In [6]:
# read Jan 2021 FHV data 
df = pd.read_parquet("data/fhv_tripdata_2021-01.parquet")

In [7]:
print("Number of Records in Jan 2021 FHV data:", len(df))

Number of Records in Jan 2021 FHV data: 1154112


In [8]:
# let's check out the columns in the df
df.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1154112 entries, 0 to 1154111
Data columns (total 7 columns):
 #   Column                  Non-Null Count    Dtype         
---  ------                  --------------    -----         
 0   dispatching_base_num    1154112 non-null  object        
 1   pickup_datetime         1154112 non-null  datetime64[ns]
 2   dropOff_datetime        1154112 non-null  datetime64[ns]
 3   PUlocationID            195845 non-null   float64       
 4   DOlocationID            991892 non-null   float64       
 5   SR_Flag                 0 non-null        float64       
 6   Affiliated_base_number  1153227 non-null  object        
dtypes: datetime64[ns](2), float64(3), object(2)
memory usage: 61.6+ MB


In [10]:
# let's calculate duration of each trip
df["duration"] = df["dropOff_datetime"] - df["pickup_datetime"]
df.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009,0 days 00:17:00
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009,0 days 00:17:00
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013,0 days 01:50:00
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037,0 days 00:08:17
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037,0 days 00:15:13


In [11]:
# convert duration into minutes
df["duration"] = df["duration"].apply(lambda x: x.total_seconds() / 60)
df.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009,17.0
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009,17.0
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013,110.0
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037,8.283333
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037,15.216667


In [12]:
print("Average duration in Jan 2021 FHV:", round(df["duration"].mean(),2))

Average duration in Jan 2021 FHV: 19.17


In [13]:
print(f"Fractions of missing values of the pickup location ID: {round(df.PUlocationID.isnull().mean()*100, 2)}%")

Fractions of missing values of the pickup location ID: 83.03%


In [14]:
df["PUlocationID"].fillna("-1", inplace=True)
df["DOlocationID"].fillna("-1", inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1154112 entries, 0 to 1154111
Data columns (total 8 columns):
 #   Column                  Non-Null Count    Dtype         
---  ------                  --------------    -----         
 0   dispatching_base_num    1154112 non-null  object        
 1   pickup_datetime         1154112 non-null  datetime64[ns]
 2   dropOff_datetime        1154112 non-null  datetime64[ns]
 3   PUlocationID            1154112 non-null  object        
 4   DOlocationID            1154112 non-null  object        
 5   SR_Flag                 0 non-null        float64       
 6   Affiliated_base_number  1153227 non-null  object        
 7   duration                1154112 non-null  float64       
dtypes: datetime64[ns](2), float64(2), object(4)
memory usage: 70.4+ MB


In [15]:
df["duration"].describe()

count    1.154112e+06
mean     1.916722e+01
std      3.986922e+02
min      1.666667e-02
25%      7.766667e+00
50%      1.340000e+01
75%      2.228333e+01
max      4.233710e+05
Name: duration, dtype: float64

In [16]:
df["duration"].describe(percentiles=[0.25, 0.90, 0.95, 0.98, 0.99])

count    1.154112e+06
mean     1.916722e+01
std      3.986922e+02
min      1.666667e-02
25%      7.766667e+00
50%      1.340000e+01
90%      3.563333e+01
95%      4.725000e+01
98%      6.613333e+01
99%      9.030000e+01
max      4.233710e+05
Name: duration, dtype: float64

Maximum trip is 42,337 minutes, equivalent to 294 days and minimum is 0.017 minutes. Let's filter down the data to be more than 4 mins and less than 95 mins

In [17]:
print("Total % records for filtered data: {}%".format(round(((df["duration"] >= 4) & (df["duration"] <= 95)).mean()*100, 2)))

Total % records for filtered data: 90.44%


In [18]:
df["duration"].describe(percentiles=[0.95, 0.98, 0.99])

count    1.154112e+06
mean     1.916722e+01
std      3.986922e+02
min      1.666667e-02
50%      1.340000e+01
95%      4.725000e+01
98%      6.613333e+01
99%      9.030000e+01
max      4.233710e+05
Name: duration, dtype: float64

### Pulling input_data together

In [19]:
def read_data(file:str):
    df = pd.read_parquet(file)
    print("Number of Records in data:", len(df))
    
    # get duration
    df["duration"] = df["dropOff_datetime"] - df["pickup_datetime"]
    # convert duration to minutes
    df["duration"] = df["duration"].apply(lambda x: x.total_seconds() / 60)
    print("Average duration in data:", round(df["duration"].mean(),2))
    print(f"Fractions of missing values of the pickup location ID: {round(sum(df.PUlocationID.isnull())/len(df.PUlocationID)*100, 2)}%")

    # fill na
    df["PUlocationID"].fillna("-1", inplace=True)
    df["DOlocationID"].fillna("-1", inplace=True)

    df[['PUlocationID', 'DOlocationID']] = df[['PUlocationID', 'DOlocationID']].astype(str)
    df['PU_DO'] = df['PUlocationID'] + '_' + df['DOlocationID']

    df = df[(df["duration"] >= 4) & (df["duration"] <= 95)]

    return df

In [21]:
df_train = read_data("data/fhv_tripdata_2021-01.parquet")

Number of Records in data: 1154112
Average duration in data: 19.17
Fractions of missing values of the pickup location ID: 83.03%


### One-hot encoding

In [22]:
# lets extrain X_train data in dictionary form
X_train_dict = df_train[["PUlocationID", "DOlocationID"]].to_dict(orient="records")

In [23]:
# initialize vectorizer 
dv = DictVectorizer()

# one-hot encoding
X_train = dv.fit_transform(X_train_dict)

In [24]:
print("Dimensionality of X_train:", X_train.shape)

Dimensionality of X_train: (1043765, 525)


In [25]:
# target variable
y_train = df_train["duration"]

### Fit the model!

In [27]:
df_val = read_data("./data/fhv_tripdata_2021-02.parquet")

# lets extrain X_train data in dictionary form
X_val_dict = df_val[["PU_DO"]].to_dict(orient="records")

# one-hot encoding
X_val = dv.transform(X_val_dict)

# y_val
y_val = df_val["duration"]

Number of Records in data: 1037692
Average duration in data: 20.71
Fractions of missing values of the pickup location ID: 85.26%


In [55]:
# start logging with mlflow
with mlflow.start_run():
    # set tag for mlflow
    mlflow.set_tag("developer", "Bengsoon")
    mlflow.set_tag("model", "linear_regression")

    # set data path param
    mlflow.log_param("train-data-path", "./data/fhv_tripdata_2021-01.parquet")
    mlflow.log_param("valid-data-path", "./data/fhv_tripdata_2021-02.parquet")

    # set hyper parameter for Lasso
    alpha = 0.01
    mlflow.log_param("alpha", alpha)
    lr = Lasso(alpha = alpha)
    
    # train the model
    lr.fit(X_train, y_train)

    # get the y_pred from X_train
    y_pred = lr.predict(X_val)

    # get RMSE and record on mlflow
    rmse = round(mean_squared_error(y_val, y_pred, squared=False),2)
    print("RMSE for training data:", rmse)
    mlflow.log_metric("rmse", rmse)
    

RMSE for training data: 14.51


### Hyperparameter Tuning with XGBoost and hyperopt

In [28]:
import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

In [29]:
# create an objective function that trains the xgboost model with a set of hyperparameters (from hyperopt) and then validated against our validation data. 
## for each set of hyperparameters and the model's corresponding performance score, we record them in mlflow 
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )
        y_pred = booster.predict(valid)
        rmse = mean_squared_error(y_val, y_pred, squared=False)
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [30]:
# create search space for our XGBoost hyperparameters
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'reg:linear',
    'seed': 42
}

In [43]:
# run the hyperopt optimization
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

[0]	validation-rmse:20.79160                          
[1]	validation-rmse:18.41274                          
[2]	validation-rmse:17.00143                          
[3]	validation-rmse:15.83355                          
[4]	validation-rmse:15.25739                          
[5]	validation-rmse:14.79529                          
[6]	validation-rmse:14.61999                          
[7]	validation-rmse:14.51805                          
[8]	validation-rmse:14.46920                          
[9]	validation-rmse:14.44403                          
[10]	validation-rmse:14.42998                         
[11]	validation-rmse:14.43086                         
[12]	validation-rmse:14.43444                         
[13]	validation-rmse:14.44042                         
[14]	validation-rmse:14.44777                         
[15]	validation-rmse:14.45262                         
[16]	validation-rmse:14.45624                         
[17]	validation-rmse:14.46003                         
[18]	valid

### Model Selection

We have selected run `09923bbad64045ca837a1656254ce756` in MLflow experiment as our model

In [31]:
# Hyperparameter for run 09923bbad64045ca837a1656254ce756

params = {
    'max_depth': 4,
    'learning_rate': 0.14493221791716185,
    'reg_alpha': 0.012153110171030913,
    'reg_lambda': 0.017881159785939696,
    'min_child_weight': 0.674864917045824,
    'objective': 'reg:linear',
    'seed': 42
}

#### `Autolog`

In [35]:
# Train the model with the selected params and autolog 
mlflow.xgboost.autolog(disable=False)

with mlflow.start_run():
    booster = xgb.train(
                params=params,
                dtrain=train,
                num_boost_round=1000,
                evals=[(valid, 'validation')],
                early_stopping_rounds=50
            )

[0]	validation-rmse:21.51928
[1]	validation-rmse:19.75349
[2]	validation-rmse:18.37375
[3]	validation-rmse:17.31223
[4]	validation-rmse:16.47958
[5]	validation-rmse:15.88425
[6]	validation-rmse:15.44593
[7]	validation-rmse:15.18472
[8]	validation-rmse:14.91837
[9]	validation-rmse:14.78518
[10]	validation-rmse:14.64478
[11]	validation-rmse:14.57900
[12]	validation-rmse:14.53318
[13]	validation-rmse:14.49797
[14]	validation-rmse:14.47204
[15]	validation-rmse:14.45054
[16]	validation-rmse:14.44113
[17]	validation-rmse:14.43616
[18]	validation-rmse:14.43254
[19]	validation-rmse:14.43052
[20]	validation-rmse:14.42980
[21]	validation-rmse:14.42992
[22]	validation-rmse:14.43050
[23]	validation-rmse:14.43129
[24]	validation-rmse:14.43218
[25]	validation-rmse:14.43268
[26]	validation-rmse:14.43350
[27]	validation-rmse:14.43428
[28]	validation-rmse:14.43500
[29]	validation-rmse:14.43576
[30]	validation-rmse:14.44011
[31]	validation-rmse:14.44078
[32]	validation-rmse:14.44218
[33]	validation-rmse



##### Using Saved `Autolog` MLflow Model

In [39]:
logged_model = 'runs:/01d97a61959f42ba964175e922ee9573/model'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

loaded_model



mlflow.pyfunc.loaded_model:
  artifact_path: model
  flavor: mlflow.xgboost
  run_id: 01d97a61959f42ba964175e922ee9573

In [44]:
loaded_model.predict(X_val)

array([20.216328, 20.216328, 20.216328, ..., 20.216328, 20.216328,
       20.216328], dtype=float32)

In [45]:
# Load model as XGBmodel
xgb_model = mlflow.xgboost.load_model(logged_model)
xgb_model



<xgboost.core.Booster at 0x7ffac359c070>

In [48]:
xgb_model.predict(valid)

array([20.216328, 20.216328, 20.216328, ..., 20.216328, 20.216328,
       20.216328], dtype=float32)

### Logging Models with MLflow


In [52]:
# Adapting from our previous XGBoost model, we will save the artifact into MLflow and log the parameters manually
# We will also log the DictVectorizer preprocessor as an artifact

# Turn off autolog
mlflow.xgboost.autolog(disable=True)


with mlflow.start_run():
    # Hyperparameter for run 09923bbad64045ca837a1656254ce756
    params = {
        'max_depth': 4,
        'learning_rate': 0.14493221791716185,
        'reg_alpha': 0.012153110171030913,
        'reg_lambda': 0.017881159785939696,
        'min_child_weight': 0.674864917045824,
        'objective': 'reg:linear',
        'seed': 42
    }

    mlflow.log_params(params)
    

    booster = xgb.train(
                params=params,
                dtrain=train,
                num_boost_round=1000,
                evals=[(valid, 'validation')],
                early_stopping_rounds=50
            )

    # get the y_pred from X_train
    y_pred = booster.predict(valid)

    # get RMSE and record on mlflow
    rmse = round(mean_squared_error(y_val, y_pred, squared=False),2)
    print("RMSE for training data:", rmse)
    mlflow.log_metric("rmse", rmse)

    # log xgboost model to mlflow
    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")

    # log the preprocessor DictVectorizer
    with open("models/preprocessor.bin", "wb") as f_out:
        pickle.dump(dv, f_out)
    
    mlflow.log_artifact("models/preprocessor.bin", artifact_path="preprocessor")

[0]	validation-rmse:21.51928
[1]	validation-rmse:19.75349
[2]	validation-rmse:18.37375
[3]	validation-rmse:17.31223
[4]	validation-rmse:16.47958
[5]	validation-rmse:15.88425
[6]	validation-rmse:15.44593
[7]	validation-rmse:15.18472
[8]	validation-rmse:14.91837
[9]	validation-rmse:14.78518
[10]	validation-rmse:14.64478
[11]	validation-rmse:14.57900
[12]	validation-rmse:14.53318
[13]	validation-rmse:14.49797
[14]	validation-rmse:14.47204
[15]	validation-rmse:14.45054
[16]	validation-rmse:14.44113
[17]	validation-rmse:14.43616
[18]	validation-rmse:14.43254
[19]	validation-rmse:14.43052
[20]	validation-rmse:14.42980
[21]	validation-rmse:14.42992
[22]	validation-rmse:14.43050
[23]	validation-rmse:14.43129
[24]	validation-rmse:14.43218
[25]	validation-rmse:14.43268
[26]	validation-rmse:14.43350
[27]	validation-rmse:14.43428
[28]	validation-rmse:14.43500
[29]	validation-rmse:14.43576
[30]	validation-rmse:14.44011
[31]	validation-rmse:14.44078
[32]	validation-rmse:14.44218
[33]	validation-rmse

#### Prediction

In [53]:
logged_model = 'runs:/237dc915805441e8bfe958044ede7b18/models_mlflow'

# Load model as a xgboost.
xgb_model = mlflow.xgboost.load_model(logged_model)

# Prediction
y_valid = xgb_model.predict(valid) # using DMatrix-typed validation data

y_valid



array([20.216328, 20.216328, 20.216328, ..., 20.216328, 20.216328,
       20.216328], dtype=float32)

## Alternate Models

Let's retrain the model using different architectures so that we can have different versions. We will use `mlflow.*.autolog()` for the logging of our models

#### `Lasso`

In [60]:
# serialize Preprocessor
with open("models/preprocessor.bin", "wb") as f_out:
    pickle.dump(dv, f_out)

In [68]:
# start logging with mlflow
with mlflow.start_run():
    
    mlflow.sklearn.autolog(log_post_training_metrics=False, 
                           serialization_format=mlflow.sklearn.SERIALIZATION_FORMAT_PICKLE)

    # set tag for mlflow
    mlflow.set_tag("developer", "Bengsoon")
    mlflow.set_tag("model", "lasso")

    # set data path param
    mlflow.log_param("train-data-path", "./data/fhv_tripdata_2021-01.parquet")
    mlflow.log_param("valid-data-path", "./data/fhv_tripdata_2021-02.parquet")

    # set hyper parameter for Lasso
    alpha = 0.01
    mlflow.log_param("alpha", alpha)
    lr = Lasso(alpha = alpha)
    
    # train the model
    lr.fit(X_train, y_train)

    # get the y_pred from X_train
    y_pred = lr.predict(X_val)

    # get RMSE and record on mlflow
    rmse = round(mean_squared_error(y_val, y_pred, squared=False),2)
    print("RMSE for training data:", rmse)
    mlflow.log_metric("rmse", rmse)
    
    # log preprocessor as artifact
    mlflow.log_artifact("models/preprocessor.bin", artifact_path="preprocessor")

       



RMSE for training data: 14.51


#### `GradientBoostingRegressor`

In [69]:
# start logging with mlflow
with mlflow.start_run():
    
    mlflow.sklearn.autolog(log_post_training_metrics=False, 
                           serialization_format=mlflow.sklearn.SERIALIZATION_FORMAT_PICKLE)

    # set tag for mlflow
    mlflow.set_tag("developer", "Bengsoon")
    mlflow.set_tag("model", "gradient_boosting_regressor")

    # set data path param
    mlflow.log_param("train-data-path", "./data/fhv_tripdata_2021-01.parquet")
    mlflow.log_param("valid-data-path", "./data/fhv_tripdata_2021-02.parquet")

    # set GradientBoostingRegressor's params
    params = {"learning_rate": 0.1, 
              "n_estimators": 100,
              "min_samples_split": 2,
              "max_depth": 3
              }
    gbr = GradientBoostingRegressor(**params)
    mlflow.log_params(params)

    # train the model
    gbr.fit(X_train, y_train)

    # get the y_pred from X_val
    y_pred = gbr.predict(X_val)

    # get RMSE and record on mlflow
    rmse = round(mean_squared_error(y_val, y_pred, squared=False),2)
    print("RMSE for training data:", rmse)
    mlflow.log_metric("rmse", rmse)
    
    # log preprocessor as artifact
    mlflow.log_artifact("models/preprocessor.bin", artifact_path="preprocessor")




RMSE for training data: 14.45


#### `ElasticNet`

In [70]:
# start logging with mlflow
with mlflow.start_run():
    
    mlflow.sklearn.autolog(log_post_training_metrics=False, 
                           serialization_format=mlflow.sklearn.SERIALIZATION_FORMAT_PICKLE)

    # set tag for mlflow
    mlflow.set_tag("developer", "Bengsoon")
    mlflow.set_tag("model", "elasticnet")

    # set data path param
    mlflow.log_param("train-data-path", "./data/fhv_tripdata_2021-01.parquet")
    mlflow.log_param("valid-data-path", "./data/fhv_tripdata_2021-02.parquet")

    # set GradientBoostingRegressor's params
    alpha = 1.0
    enr = ElasticNet(alpha=alpha)
    mlflow.log_param("alpha", alpha)

    # train the model
    enr.fit(X_train, y_train)

    # get the y_pred from X_val
    y_pred = enr.predict(X_val)

    # get RMSE and record on mlflow
    rmse = round(mean_squared_error(y_val, y_pred, squared=False),2)
    print("RMSE for training data:", rmse)
    mlflow.log_metric("rmse", rmse)
    
    # log preprocessor as artifact
    mlflow.log_artifact("models/preprocessor.bin", artifact_path="preprocessor")



RMSE for training data: 14.47


## Using MLflow Client

In [71]:
from mlflow.tracking import MlflowClient

In [72]:
# instantiate the client

MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

In [79]:
# we can create experiments
client.create_experiment(name="test")

'4'

In [95]:
# list experiments
client.list_experiments()

[<Experiment: artifact_location='./mlruns/2', experiment_id='2', lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>,
 <Experiment: artifact_location='./mlruns/4', experiment_id='4', lifecycle_stage='active', name='test', tags={}>]

### Get the runs in `nyc-taxi-experiment`

In [134]:
from mlflow.entities import ViewType

runs = client.search_runs(
    experiment_ids= "2",
    filter_string="",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse ASC"]
)

In [136]:
runs[:2]

[<Run: data=<RunData: metrics={'rmse': 14.434830071276503}, params={'learning_rate': '0.4265314430554626',
  'max_depth': '90',
  'min_child_weight': '19.781075119943388',
  'objective': 'reg:linear',
  'reg_alpha': '0.07355580679915082',
  'reg_lambda': '0.023586442422092634',
  'seed': '42'}, tags={'mlflow.source.name': '/home/bengsoon/anaconda3/envs/exp-tracking-env/lib/python3.9/site-packages/ipykernel_launcher.py',
  'mlflow.source.type': 'LOCAL',
  'mlflow.user': 'bengsoon',
  'model': 'xgboost'}>, info=<RunInfo: artifact_uri='./mlruns/2/572b0ed57e3748ac965af939c28f3d0e/artifacts', end_time=1653561903802, experiment_id='2', lifecycle_stage='active', run_id='572b0ed57e3748ac965af939c28f3d0e', run_uuid='572b0ed57e3748ac965af939c28f3d0e', start_time=1653561788679, status='FINISHED', user_id='bengsoon'>>,
 <Run: data=<RunData: metrics={'rmse': 14.436014044219164}, params={'learning_rate': '0.3903501367212693',
  'max_depth': '87',
  'min_child_weight': '18.341681221213463',
  'object

In [182]:
# the runs have a ton of information, but we can choose to only view the run_id and rmse
for run in runs:
    print(f"run id: {run.info.run_id}, rmse: {run.data.metrics['rmse']}")

run id: 572b0ed57e3748ac965af939c28f3d0e, rmse: 14.434830071276503
run id: be2c1e01147c422fa40f9aca18025447, rmse: 14.436014044219164
run id: ebb70531b3d348808e9c13bd801f335a, rmse: 14.436321257721108
run id: 494b84cae6e84edb8e3caa51603c2318, rmse: 14.436641761102681
run id: 1e835cd7d1544c01ac90901218a8757e, rmse: 14.44200357693116


In [185]:
runs[0].info.run_id


'572b0ed57e3748ac965af939c28f3d0e'

#### Register a new model

We will try to register the run `572b0ed57e3748ac965af939c28f3d0e` in the Model Registry.

In [166]:
# import and set up mlflow
import mlflow

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [169]:
# set up variables
RUN_ID = "572b0ed57e3748ac965af939c28f3d0e"
MODEL_URI = f"runs:/{RUN_ID}/model"

But first let's ensure that the run has not been registered:

In [162]:
assert client.search_model_versions(f"run_id = '{RUN_ID}'") == [], "Run has been registered!"

In [170]:
mlflow.register_model(model_uri = MODEL_URI, name="nyctaxi_tripduration_regressor")

Registered model 'nyctaxi_tripduration_regressor' already exists. Creating a new version of this model...
2022/06/01 14:51:46 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: nyctaxi_tripduration_regressor, version 5
Created version '5' of model 'nyctaxi_tripduration_regressor'.


<ModelVersion: creation_timestamp=1654066306963, current_stage='None', description=None, last_updated_timestamp=1654066306963, name='nyctaxi_tripduration_regressor', run_id='572b0ed57e3748ac965af939c28f3d0e', run_link=None, source='./mlruns/2/572b0ed57e3748ac965af939c28f3d0e/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=5>

#### Transition Model

In [172]:
# get the latest versions
model_name = "nyctaxi_tripduration_regressor"
latest_versions = client.get_latest_versions(name=model_name)

for version in latest_versions:
    print(f"Version: {version.version}, Stage: {version.current_stage}")

Version: 1, Stage: Production
Version: 4, Stage: Staging
Version: 5, Stage: None


In [174]:
# transition version 5 to "Staging"
stage = "Staging"
version = 5

client.transition_model_version_stage(
    name = model_name,
    version = version,
    stage = stage,
    archive_existing_versions = False
)

<ModelVersion: creation_timestamp=1654066306963, current_stage='Staging', description=None, last_updated_timestamp=1654072597722, name='nyctaxi_tripduration_regressor', run_id='572b0ed57e3748ac965af939c28f3d0e', run_link=None, source='./mlruns/2/572b0ed57e3748ac965af939c28f3d0e/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=5>

In [179]:
from datetime import datetime
today = datetime.today().date()

# Change description of Version 5
client.update_model_version(
    name=model_name,
    version = version,
    description = f"Model version {version} transitioned to {stage} on {today}"
)

<ModelVersion: creation_timestamp=1654066306963, current_stage='Staging', description='Model version 5 transitioned to Staging on 2022-06-01', last_updated_timestamp=1654072851285, name='nyctaxi_tripduration_regressor', run_id='572b0ed57e3748ac965af939c28f3d0e', run_link=None, source='./mlruns/2/572b0ed57e3748ac965af939c28f3d0e/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=5>

#### Promote Version 2 to `Production`