In [1]:
import pandas as pd

In [2]:
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

In [3]:
import mlflow

In [4]:
# Point MLflow at a SQLite tracking store (the mlruns.db file named in the URI).
mlflow.set_tracking_uri("sqlite:///mlruns.db")
# Make 'nyc-experiment-tracker' the active experiment for the runs started below.
# As the last expression of the cell, the returned Experiment is shown as its output.
mlflow.set_experiment('nyc-experiment-tracker')

<Experiment: artifact_location='/home/rohit/mlops-zoomcamp/02-Experiment-Tracking/mlruns/1', creation_time=1726390207628, experiment_id='1', last_update_time=1726390207628, lifecycle_stage='active', name='nyc-experiment-tracker', tags={}>

In [5]:
def read_data(filename):
    """Load a trip-record parquet file and prepare it for modelling.

    Adds a ``duration`` column holding the trip length in minutes (computed
    from ``lpep_dropoff_datetime`` minus ``lpep_pickup_datetime``), keeps only
    trips longer than 1 minute and at most 60 minutes, and casts the
    ``PULocationID`` and ``DOLocationID`` columns to strings.

    Parameters
    ----------
    filename
        Anything ``pandas.read_parquet`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        The filtered rows as an independent frame (not a slice of the frame
        that was read), so callers can add columns to it safely.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion; no Python call per row.
    trip_time = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the frame that was read, so the
    # assignment below (and any column the caller adds later) writes to a real
    # frame rather than a slice, which pandas flags with SettingWithCopyWarning.
    in_range = (df['duration'] > 1) & (df['duration'] <= 60)
    df = df[in_range].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [6]:
TRAIN_PATH = '../data/green_tripdata_2021-01.parquet'
VAL_PATH = '../data/green_tripdata_2021-02.parquet'

# The combined pickup/drop-off pair is used as the single categorical feature
# instead of the two separate location ID columns.
categorical = ['PU_DO']
numerical = ['trip_distance']
target = 'duration'


def add_pickup_dropoff_pair(df):
    """Return a new frame with a ``PU_DO`` column: '<PULocationID>_<DOLocationID>'.

    Uses ``assign`` so the input frame is left untouched and re-running the
    cell is idempotent.
    """
    return df.assign(PU_DO=df['PULocationID'] + '_' + df['DOLocationID'])


def to_feature_dicts(df):
    """Convert the feature columns to the list-of-dicts form passed to DictVectorizer."""
    return df[categorical + numerical].to_dict(orient='records')


train_df = add_pickup_dropoff_pair(read_data(TRAIN_PATH))
val_df = add_pickup_dropoff_pair(read_data(VAL_PATH))

dv = DictVectorizer()

# The vectorizer is fitted on the training data only; the validation data is
# transformed with that already-fitted vectorizer.
train_dicts = to_feature_dicts(train_df)
train_x = dv.fit_transform(train_dicts)

val_dicts = to_feature_dicts(val_df)
val_x = dv.transform(val_dicts)

train_y = train_df[target].values
val_y = val_df[target].values

In [7]:
# Data files that fed this run, recorded as MLflow params for provenance.
data_path_params = {
    'train-data-path': '../data/green_tripdata_2021-01.parquet',
    'val-data-path': '../data/green_tripdata_2021-02.parquet',
}

# Baseline: a plain linear regression tracked as one MLflow run.
with mlflow.start_run():
    mlflow.set_tag('developer', 'rohit')

    for param_name, param_value in data_path_params.items():
        mlflow.log_param(param_name, param_value)

    lr = LinearRegression()
    lr.fit(train_x, train_y)

    # Score on the validation data and log the error as the run's metric.
    pred_y = lr.predict(val_x)
    rmse = root_mean_squared_error(val_y, pred_y)
    mlflow.log_metric('rmse', rmse)


In [7]:
import xgboost as xgb

In [8]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [9]:
# Wrap the vectorized features and their targets in xgb.DMatrix objects;
# these are what the xgb.train() calls in the following cells are given.
train = xgb.DMatrix(train_x, label=train_y)
valid = xgb.DMatrix(val_x, label=val_y)

In [10]:
def objective(params):
    """Hyperopt objective: train one XGBoost model and report its validation RMSE.

    Every call is tracked as its own MLflow run, tagged ``model=xgboost``, with
    ``params`` logged as run parameters and the score logged as the ``rmse``
    metric. Training uses the module-level ``train`` / ``valid`` DMatrix
    objects and the score is computed against ``val_y``.

    Parameters
    ----------
    params : dict
        XGBoost training parameters for this trial.

    Returns
    -------
    dict
        ``{'loss': <validation rmse>, 'status': STATUS_OK}``.
    """
    training_options = dict(
        params=params,
        dtrain=train,
        num_boost_round=1000,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50,
    )

    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)

        model = xgb.train(**training_options)

        predictions = model.predict(valid)
        validation_rmse = root_mean_squared_error(val_y, predictions)
        mlflow.log_metric("rmse", validation_rmse)

    return {'loss': validation_rmse, 'status': STATUS_OK}

In [11]:
# Hyperparameter search space sampled by hyperopt for each trial.
# hp.loguniform(label, low, high) draws exp(uniform(low, high)), so the bounds
# below are exponents: e.g. learning_rate ranges over [exp(-3), exp(0)].
search_space = {
    # quniform yields floats; scope.int casts the sampled depth to an integer.
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    # 'reg:linear' is a deprecated alias; 'reg:squarederror' is its current name.
    'objective': 'reg:squarederror',
    'seed': 42
}

# Keep a handle on the Trials object so per-trial results can be inspected
# after the search instead of being discarded.
trials = Trials()

best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials
)

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]




[0]	validation-rmse:11.36302                          
[1]	validation-rmse:10.63586                          
  0%|          | 0/50 [00:07<?, ?trial/s, best loss=?]


KeyboardInterrupt: 

Select the parameters of the best model (the run with the lowest validation RMSE) from the MLflow UI.
Retrain with those parameters and let MLflow log the parameters, metrics, and model with a single `autolog` call. MLflow provides autologging for most major ML libraries; see the MLflow documentation for the full list of supported integrations.

In [13]:
# Hyperparameters of the best run (lowest validation RMSE) picked from MLflow.
params = {
    'learning_rate': 0.2781661032472618,
    'max_depth': 15,
    'min_child_weight': 1.3495667465841719,
    # 'reg:linear' is a deprecated alias; 'reg:squarederror' is its current name.
    'objective': 'reg:squarederror',
    'reg_alpha': 0.3018615566476437,
    'reg_lambda': 0.003037809823464441,
    'seed': 42,
}

# Autologging must be switched on before training so that the xgb.train call
# below is captured in an automatically created MLflow run.
mlflow.xgboost.autolog()

booster = xgb.train(
    params=params,
    dtrain=train,
    num_boost_round=1000,
    evals=[(valid, 'validation')],
    early_stopping_rounds=50
)

2024/09/21 00:15:32 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '11124326ac454c1d89c0190372c597c3', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


[0]	validation-rmse:10.08482
[1]	validation-rmse:8.73553
[2]	validation-rmse:7.90295
[3]	validation-rmse:7.40928
[4]	validation-rmse:7.11052
[5]	validation-rmse:6.93097
[6]	validation-rmse:6.82239
[7]	validation-rmse:6.75396
[8]	validation-rmse:6.71184
[9]	validation-rmse:6.68311
[10]	validation-rmse:6.66319
[11]	validation-rmse:6.64437
[12]	validation-rmse:6.63435
[13]	validation-rmse:6.62721
[14]	validation-rmse:6.62242
[15]	validation-rmse:6.61884
[16]	validation-rmse:6.61660
[17]	validation-rmse:6.61537
[18]	validation-rmse:6.61386
[19]	validation-rmse:6.61212
[20]	validation-rmse:6.60940
[21]	validation-rmse:6.60816
[22]	validation-rmse:6.60628
[23]	validation-rmse:6.60415
[24]	validation-rmse:6.60242
[25]	validation-rmse:6.60019
[26]	validation-rmse:6.59829
[27]	validation-rmse:6.59657
[28]	validation-rmse:6.59384
[29]	validation-rmse:6.59276
[30]	validation-rmse:6.58977
[31]	validation-rmse:6.58721
[32]	validation-rmse:6.58463
[33]	validation-rmse:6.58223
[34]	validation-rmse:6.



Logging the models

In [12]:
# Switch XGBoost autologging off again so that subsequent runs are logged
# only through explicit mlflow.log_* calls.
mlflow.xgboost.autolog(disable=True)

In [17]:
# Hyperparameters of the best run (lowest validation RMSE) picked from MLflow.
params = {
    'learning_rate': 0.2781661032472618,
    'max_depth': 15,
    'min_child_weight': 1.3495667465841719,
    # 'reg:linear' is a deprecated alias; 'reg:squarederror' is its current name.
    'objective': 'reg:squarederror',
    'reg_alpha': 0.3018615566476437,
    'reg_lambda': 0.003037809823464441,
    'seed': 42,
}

preprocessor_path = Path('models/preprocessor.b')

with mlflow.start_run():
    mlflow.log_params(params)

    booster = xgb.train(
        params=params,
        dtrain=train,
        num_boost_round=1000,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50
    )

    pred_y = booster.predict(valid)
    # scikit-learn metrics take (y_true, y_pred); keep that order for
    # consistency with the other RMSE calls in this notebook.
    rmse = root_mean_squared_error(val_y, pred_y)
    mlflow.log_metric('rmse', rmse)

    # Persist the fitted DictVectorizer so the exact same feature encoding can
    # be applied when this model is used later. The directory is created first
    # because open() does not create missing parent directories.
    preprocessor_path.parent.mkdir(parents=True, exist_ok=True)
    with preprocessor_path.open('wb') as f_out:
        pickle.dump(dv, f_out)

    # Attach both the preprocessor and the trained booster to the run.
    mlflow.log_artifact(str(preprocessor_path), artifact_path='preprocessor')
    mlflow.xgboost.log_model(booster, artifact_path='models_mlflow')



[0]	validation-rmse:10.08482
[1]	validation-rmse:8.73553
[2]	validation-rmse:7.90295
[3]	validation-rmse:7.40928
[4]	validation-rmse:7.11052
[5]	validation-rmse:6.93097
[6]	validation-rmse:6.82239
[7]	validation-rmse:6.75396
[8]	validation-rmse:6.71184
[9]	validation-rmse:6.68311
[10]	validation-rmse:6.66319
[11]	validation-rmse:6.64437
[12]	validation-rmse:6.63435
[13]	validation-rmse:6.62721
[14]	validation-rmse:6.62242
[15]	validation-rmse:6.61884
[16]	validation-rmse:6.61660
[17]	validation-rmse:6.61537
[18]	validation-rmse:6.61386
[19]	validation-rmse:6.61212
[20]	validation-rmse:6.60940
[21]	validation-rmse:6.60816
[22]	validation-rmse:6.60628
[23]	validation-rmse:6.60415
[24]	validation-rmse:6.60242
[25]	validation-rmse:6.60019
[26]	validation-rmse:6.59829
[27]	validation-rmse:6.59657
[28]	validation-rmse:6.59384
[29]	validation-rmse:6.59276
[30]	validation-rmse:6.58977
[31]	validation-rmse:6.58721
[32]	validation-rmse:6.58463
[33]	validation-rmse:6.58223
[34]	validation-rmse:6.

