In [3]:
import os
import pickle

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error

In [1]:
import mlflow

# Use the local SQLite file as the tracking backend, then select the experiment
# that the runs in this notebook are recorded under (MLflow creates it when it
# does not exist yet).
mlflow.set_tracking_uri(uri='sqlite:///mlflow.db')
mlflow.set_experiment(experiment_name="nyc-taxi-experiment")

2022/09/04 22:19:22 INFO mlflow.tracking.fluent: Experiment with name 'nyc-taxi-experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='./mlruns/1', experiment_id='1', lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [4]:
def read_preprocess(file_name) -> pd.DataFrame:
    """Load a trip parquet file and prepare it for modelling.

    Steps performed:
      1. Parse ``lpep_pickup_datetime`` and ``lpep_dropoff_datetime``.
      2. Add ``duration`` (drop-off minus pick-up) expressed in minutes.
      3. Keep only trips whose duration lies in the closed range [1, 60] minutes.
      4. Cast ``PULocationID`` / ``DOLocationID`` to ``str`` and add the
         combined ``PU_DO`` key, formatted ``"<PULocationID>_<DOLocationID>"``.

    Parameters
    ----------
    file_name
        Location of the parquet file, passed straight to ``pd.read_parquet``.

    Returns
    -------
    pd.DataFrame
        The filtered frame with the extra ``duration`` and ``PU_DO`` columns.
    """
    df = pd.read_parquet(file_name)

    # parse dates
    df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])
    df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])

    # trip duration in minutes, computed vectorised instead of a per-row lambda
    trip_time = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the full frame, so the column
    # assignments below write to an independent frame and do not trigger
    # SettingWithCopyWarning.
    in_range = (df['duration'] >= 1) & (df['duration'] <= 60)
    df = df.loc[in_range].copy()

    cat = ['PULocationID', 'DOLocationID']
    df[cat] = df[cat].astype(str)

    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']

    return df

In [5]:
# The January 2021 file becomes the training frame, February 2021 the validation frame.
df_train, df_val = (
    read_preprocess(parquet_path)
    for parquet_path in (
        "data/green_jan_2021.parquet",
        "data/green_feb_2021.parquet",
    )
)

# row counts of both frames after preprocessing
len(df_train), len(df_val)

(73908, 61921)

In [6]:
# Feature columns: the combined pick-up/drop-off key is the only categorical
# feature, trip distance the only numeric one.
# (alternative categoricals: ['PULocationID', 'DOLocationID'])
cat = ['PU_DO']
num = ['trip_distance']

# Each row becomes one {column: value} record for the vectorizer.
train_dict = df_train[cat + num].to_dict(orient='records')
val_dict = df_val[cat + num].to_dict(orient='records')

# Fit the vectorizer on the training records only; the validation records are
# transformed with that already-fitted vectorizer.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

In [7]:
# Regression target: the `duration` column of each frame, as plain arrays.
target = 'duration'
y_train, y_val = (frame[target].values for frame in (df_train, df_val))

In [8]:
# Baseline: plain linear regression on the vectorised features.
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

# Validation RMSE. The square root is taken explicitly because the
# `squared=False` flag of mean_squared_error is deprecated in newer
# scikit-learn releases; this form works on old and new versions alike.
mean_squared_error(y_val, y_pred) ** 0.5

7.758715199477344

In [22]:
# Track one Lasso fit as an MLflow run: tag, dataset paths, alpha,
# validation RMSE and the pickled estimator.
with mlflow.start_run():

    # create tag
    mlflow.set_tag("developer", "benedict")

    # log dataset used
    mlflow.log_param("train-data-path", "data/green_jan_2021.parquet")
    mlflow.log_param("valid-data-path", "data/green_feb_2021.parquet")

    alpha = 0.1
    mlflow.log_param("alpha", alpha)

    lr_lasso = Lasso(alpha=alpha)
    lr_lasso.fit(X_train, y_train)

    y_pred = lr_lasso.predict(X_val)

    # explicit square root instead of the deprecated `squared=False` flag
    rmse = mean_squared_error(y_val, y_pred) ** 0.5

    mlflow.log_metric("rmse", rmse)

    # save model: write the estimator fitted in this run before logging it, so
    # the artifact is this run's model and the cell does not depend on a file
    # left on disk by some earlier session
    model_path = "models/lin_reg.bin"
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    with open(model_path, "wb") as f_out:
        pickle.dump(lr_lasso, f_out)

    mlflow.log_artifact(local_path=model_path, artifact_path="file_path")

In [64]:
# Ridge regression with a small L2 penalty, scored on the validation split.
lr_ridge = Ridge(alpha=0.001)
lr_ridge.fit(X_train, y_train)

y_pred = lr_ridge.predict(X_val)

# Validation RMSE; explicit square root instead of the deprecated
# `squared=False` flag of mean_squared_error.
mean_squared_error(y_val, y_pred) ** 0.5

11.342565604854506

## xgboost hyperparameter tracking

In [13]:
import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
# fmin - function used to minimize
# tpe - algorithm used
# hp - define search space
# Trials - keep track of info from each run

from hyperopt.pyll import scope # define range of type int

In [14]:
# Wrap the feature matrices and their targets in XGBoost's DMatrix container.
train = xgb.DMatrix(data=X_train, label=y_train)
valid = xgb.DMatrix(data=X_val, label=y_val)

In [15]:
def objective(params):
    """Hyperopt objective: train one XGBoost booster and report its validation RMSE.

    Every call opens its own MLflow run, tags it ``model=xgboost`` and logs
    ``params`` together with the resulting ``rmse`` metric. Training reads the
    notebook-level ``train`` / ``valid`` DMatrix objects and scores against
    ``y_val``.

    Parameters
    ----------
    params : dict
        Training parameters forwarded unchanged to ``xgb.train``.

    Returns
    -------
    dict
        ``{'loss': <validation RMSE>, 'status': STATUS_OK}``.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,  # upper bound on boosting iterations
            evals=[(valid, 'validation')],
            early_stopping_rounds=50,  # stop after 50 rounds without improvement
        )
        y_pred = booster.predict(valid)
        # explicit square root instead of the deprecated `squared=False` flag
        rmse = mean_squared_error(y_val, y_pred) ** 0.5
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [None]:
# Search space handed to hyperopt. `objective` and `seed` are fixed values,
# everything else is sampled.
search_space = dict(
    # integer tree depth, drawn from 4..100 in steps of 1
    max_depth=scope.int(hp.quniform('max_depth', 4, 100, 1)),
    # log-uniform between exp(-3) and exp(0), i.e. roughly [0.05, 1]
    learning_rate=hp.loguniform('learning_rate', -3, 0),
    reg_alpha=hp.loguniform('reg_alpha', -5, -1),
    reg_lambda=hp.loguniform('reg_lambda', -6, -1),
    min_child_weight=hp.loguniform('min_child_weight', -1, 3),
    objective='reg:squarederror',
    seed=42,
)

# Minimise `objective` with the TPE algorithm over 50 evaluations.
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials(),
)

## autolog

In [17]:
# Turn on MLflow's XGBoost autologging, so the training call below is recorded
# without explicit log_* calls.
mlflow.xgboost.autolog()

# Fixed parameter set for this run.
params = {
    "learning_rate": 0.07278314034839226,
    "max_depth": 73,
    "min_child_weight": 5.004368122639275,
    # "reg:linear" is the deprecated alias of this objective; the current name
    # is used here, matching the hyperopt search space.
    "objective": "reg:squarederror",
    "reg_alpha": 0.0632094630338191,
    "reg_lambda": 0.0030308745502548133,
    "seed": 42,
}


# Left as the cell's last expression so the returned Booster is displayed.
xgb.train(
    params=params,
    dtrain=train,
    num_boost_round=100,
    evals=[(valid, "validation")],
    early_stopping_rounds=50,
)

2022/09/04 22:47:16 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'c32371d4f1c844cd8bc24735e88ba30d', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


[0]	validation-rmse:19.89248
[1]	validation-rmse:18.69612
[2]	validation-rmse:17.59588
[3]	validation-rmse:16.58601
[4]	validation-rmse:15.65949
[5]	validation-rmse:14.81043
[6]	validation-rmse:14.03377
[7]	validation-rmse:13.32330
[8]	validation-rmse:12.67540
[9]	validation-rmse:12.08520
[10]	validation-rmse:11.54795
[11]	validation-rmse:11.05983
[12]	validation-rmse:10.61690
[13]	validation-rmse:10.21670
[14]	validation-rmse:9.85482
[15]	validation-rmse:9.52734
[16]	validation-rmse:9.23261
[17]	validation-rmse:8.96679
[18]	validation-rmse:8.72826
[19]	validation-rmse:8.51350
[20]	validation-rmse:8.32006
[21]	validation-rmse:8.14808
[22]	validation-rmse:7.99355
[23]	validation-rmse:7.85426
[24]	validation-rmse:7.72919
[25]	validation-rmse:7.61786
[26]	validation-rmse:7.51845
[27]	validation-rmse:7.42976
[28]	validation-rmse:7.35036
[29]	validation-rmse:7.27912
[30]	validation-rmse:7.21489
[31]	validation-rmse:7.15766
[32]	validation-rmse:7.10565
[33]	validation-rmse:7.05918
[34]	valid



<xgboost.core.Booster at 0x7f6f4ba804f0>

## model logging

In [24]:
# Autologging is switched off: this run logs its parameters, metric,
# preprocessor and model explicitly.
mlflow.xgboost.autolog(disable=True)

with mlflow.start_run():
    params = {
        "learning_rate": 0.07278314034839226,
        "max_depth": 73,
        "min_child_weight": 5.004368122639275,
        # "reg:linear" is the deprecated alias of this objective; the current
        # name is used here, matching the hyperopt search space.
        "objective": "reg:squarederror",
        "reg_alpha": 0.0632094630338191,
        "reg_lambda": 0.0030308745502548133,
        "seed": 42,
    }

    mlflow.log_params(params)

    booster = xgb.train(
        params=params,
        dtrain=train,
        num_boost_round=100,
        evals=[(valid, "validation")],
        early_stopping_rounds=50,
    )

    y_pred = booster.predict(valid)
    # explicit square root instead of the deprecated `squared=False` flag
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    mlflow.log_metric("rmse", rmse)

    # Persist the fitted DictVectorizer next to the model so the same feature
    # encoding can be restored later; create the target directory if needed.
    preprocessor_path = "models/preprocessor.b"
    os.makedirs(os.path.dirname(preprocessor_path), exist_ok=True)
    with open(preprocessor_path, "wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact(preprocessor_path, artifact_path="preprocessor")
    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")

[0]	validation-rmse:19.89248
[1]	validation-rmse:18.69612
[2]	validation-rmse:17.59588
[3]	validation-rmse:16.58601
[4]	validation-rmse:15.65949
[5]	validation-rmse:14.81043
[6]	validation-rmse:14.03377
[7]	validation-rmse:13.32330
[8]	validation-rmse:12.67540
[9]	validation-rmse:12.08520
[10]	validation-rmse:11.54795
[11]	validation-rmse:11.05983
[12]	validation-rmse:10.61690
[13]	validation-rmse:10.21670
[14]	validation-rmse:9.85482
[15]	validation-rmse:9.52734
[16]	validation-rmse:9.23261
[17]	validation-rmse:8.96679
[18]	validation-rmse:8.72826
[19]	validation-rmse:8.51350
[20]	validation-rmse:8.32006
[21]	validation-rmse:8.14808
[22]	validation-rmse:7.99355
[23]	validation-rmse:7.85426
[24]	validation-rmse:7.72919
[25]	validation-rmse:7.61786
[26]	validation-rmse:7.51845
[27]	validation-rmse:7.42976
[28]	validation-rmse:7.35036
[29]	validation-rmse:7.27912
[30]	validation-rmse:7.21489
[31]	validation-rmse:7.15766
[32]	validation-rmse:7.10565
[33]	validation-rmse:7.05918
[34]	valid

## load model

In [25]:
import mlflow
# URI of the model artifact stored under `models_mlflow` in a specific run.
logged_model = 'runs:/272a28f2395e4d75ae627a85fdd8cb90/models_mlflow'

# Load it through the generic pyfunc interface (gives a PyFuncModel).
loaded_model = mlflow.pyfunc.load_model(model_uri=logged_model)



In [26]:
# Show the loaded pyfunc wrapper (its repr lists artifact path, flavor and run id).
loaded_model

mlflow.pyfunc.loaded_model:
  artifact_path: models_mlflow
  flavor: mlflow.xgboost
  run_id: 272a28f2395e4d75ae627a85fdd8cb90

In [27]:
# Load the same artifact through the XGBoost flavor instead of pyfunc.
xgboost_model = mlflow.xgboost.load_model(model_uri=logged_model)



In [28]:
# Predict with the reloaded model on `valid`, the validation DMatrix built earlier in the notebook.
y_pred = xgboost_model.predict(valid)

In [29]:
# Preview only the first ten predictions instead of dumping the whole array.
y_pred[:10]

array([14.753914,  6.77946 , 15.48373 , 25.64809 ,  9.230557, 17.196167,
       12.552075,  8.765083,  9.357581, 20.37567 ], dtype=float32)