In [None]:
# Print the version of the `python` executable found on the shell PATH.
!python -V

In [19]:
import pandas as pd

In [20]:
import pickle

In [21]:
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
import click

In [23]:
import os
import pickle
import click
import pandas as pd

from sklearn.feature_extraction import DictVectorizer

def dump_pickle(obj, filename: str):
    """Pickle ``obj`` into the file located at ``filename``.

    The file is opened in binary write mode, so any existing content is
    replaced. The function returns ``None``.
    """
    with open(filename, mode="wb") as sink:
        pickle.dump(obj, sink)


def read_dataframe(filename: str):
    """Load a trip-record parquet file and prepare it for modelling.

    Adds a ``duration`` column (drop-off minus pick-up, in minutes), keeps only
    the trips lasting between 1 and 60 minutes inclusive, and casts the
    ``PULocationID`` / ``DOLocationID`` columns to strings.

    Args:
        filename: Path of the parquet file. It must provide the
            ``lpep_pickup_datetime``, ``lpep_dropoff_datetime``,
            ``PULocationID`` and ``DOLocationID`` columns.

    Returns:
        A ``pandas.DataFrame`` holding the filtered rows.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion (no per-row Python call).
    elapsed = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = elapsed.dt.total_seconds() / 60

    # Take an explicit copy of the filtered rows so the column assignment below
    # writes to an independent frame instead of a slice of the original
    # (which is what raises SettingWithCopyWarning).
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df


def preprocess(df: pd.DataFrame, dv: DictVectorizer, fit_dv: bool = False):
    """Turn the model features of ``df`` into a matrix using ``dv``.

    A combined pick-up/drop-off column ``PU_DO`` is written onto ``df`` itself,
    so the caller's frame gains that column. The ``PU_DO`` and
    ``trip_distance`` columns are then converted to a list of per-row dicts
    and passed to the vectoriser.

    Args:
        df: Frame with string-valued ``PULocationID`` / ``DOLocationID``
            columns and a ``trip_distance`` column.
        dv: Vectoriser exposing ``fit_transform`` and ``transform``.
        fit_dv: When true, ``dv.fit_transform`` is used; otherwise
            ``dv.transform`` is used.

    Returns:
        The pair ``(X, dv)``: the vectorised features and the same ``dv``.
    """
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']
    feature_columns = ['PU_DO', 'trip_distance']
    records = df[feature_columns].to_dict(orient='records')
    encode = dv.fit_transform if fit_dv else dv.transform
    return encode(records), dv

def run_data_prep(raw_data_path: str, dest_path: str, dataset: str = "green"):
    """Build and persist the train / validation / test feature sets.

    Reads the 2023-01 (train), 2023-02 (validation) and 2023-03 (test) parquet
    files of ``dataset`` from ``raw_data_path``, fits a ``DictVectorizer`` on
    the training frame, vectorises all three frames with it, and pickles the
    vectoriser plus one ``(X, y)`` tuple per split into ``dest_path``.

    Args:
        raw_data_path: Directory containing the raw parquet files.
        dest_path: Output directory; created if it does not exist.
        dataset: File-name prefix of the trip data (default ``"green"``).
    """
    # Stage 1: read every split before doing any other work.
    sources = {
        "train": f"{dataset}_tripdata_2023-01.parquet",
        "val": f"{dataset}_tripdata_2023-02.parquet",
        "test": f"{dataset}_tripdata_2023-03.parquet",
    }
    frames = {
        split: read_dataframe(os.path.join(raw_data_path, file_name))
        for split, file_name in sources.items()
    }

    # Stage 2: pull out the regression target of each split.
    target = 'duration'
    targets = {split: frame[target].values for split, frame in frames.items()}

    # Stage 3: fit the vectoriser on train only, then reuse it as-is.
    dv = DictVectorizer()
    matrices = {}
    matrices["train"], dv = preprocess(frames["train"], dv, fit_dv=True)
    for split in ("val", "test"):
        matrices[split], _ = preprocess(frames[split], dv, fit_dv=False)

    # Stage 4: write everything out (the folder is created on demand).
    os.makedirs(dest_path, exist_ok=True)
    outputs = [
        ("dv.pkl", dv),
        ("train.pkl", (matrices["train"], targets["train"])),
        ("val.pkl", (matrices["val"], targets["val"])),
        ("test.pkl", (matrices["test"], targets["test"])),
    ]
    for file_name, payload in outputs:
        dump_pickle(payload, os.path.join(dest_path, file_name))

In [22]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso

from sklearn.metrics import mean_squared_error

In [23]:
import mlflow

# The tracking-store URI embeds database credentials, so it is read from the
# environment instead of being committed with the notebook. Example:
#   export MLFLOW_TRACKING_URI="postgresql+psycopg2://<user>:<password>@localhost:5433/mlflow"
tracking_uri = os.environ.get("MLFLOW_TRACKING_URI")
if not tracking_uri:
    raise RuntimeError(
        "MLFLOW_TRACKING_URI is not set; export the tracking-store URI "
        "before running this cell."
    )

mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment("nyc-taxi-experiment")

<Experiment: artifact_location='/home/batya/notebooks/MLops-zoomcamp/02-exp-tracking-mlflow/mlruns/1', creation_time=1716636737730, experiment_id='1', last_update_time=1716636737730, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [15]:
# Load the 2023-01 trip file and derive the trip duration in minutes.
df = pd.read_parquet('./data/green_tripdata_2023-01.parquet')

# Vectorised timedelta -> minutes conversion (no per-row Python call).
df['duration'] = (
    df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
).dt.total_seconds() / 60

# Keep trips of 1 to 60 minutes. The explicit copy makes the filtered frame
# independent of the original, so the column assignment below does not raise
# SettingWithCopyWarning.
df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']

# Location IDs are cast to strings so they are treated as categories.
df[categorical] = df[categorical].astype(str)

In [16]:
# Baseline: vectorise the location IDs and trip distance, then fit a linear
# regression on the same frame it is evaluated on.
train_dicts = df[categorical + numerical].to_dict(orient='records')

dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

target = 'duration'
y_train = df[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

# Training RMSE. The `squared=False` shortcut of mean_squared_error is
# deprecated in scikit-learn, so the square root is taken explicitly.
mean_squared_error(y_train, y_pred) ** 0.5

7.0603515317164085

In [32]:
def read_dataframe(filename):
    """Load a trip-record file (CSV or parquet) and prepare it for modelling.

    Adds a ``duration`` column (drop-off minus pick-up, in minutes), keeps only
    the trips lasting between 1 and 60 minutes inclusive, and casts the
    ``PULocationID`` / ``DOLocationID`` columns to strings.

    Args:
        filename: Path ending in ``.csv`` or ``.parquet``. The file must
            provide the ``lpep_pickup_datetime``, ``lpep_dropoff_datetime``,
            ``PULocationID`` and ``DOLocationID`` columns.

    Returns:
        A ``pandas.DataFrame`` holding the filtered rows.

    Raises:
        ValueError: If ``filename`` has neither a ``.csv`` nor a ``.parquet``
            extension.
    """
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)

        # CSV carries timestamps as text; parse them before subtracting.
        df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])
        df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)
    else:
        # Without this branch `df` would be unbound below.
        raise ValueError(
            f"Unsupported file type (expected .csv or .parquet): {filename}"
        )

    # Vectorised timedelta -> minutes conversion (no per-row Python call).
    elapsed = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = elapsed.dt.total_seconds() / 60

    # Copy the filtered rows so the assignment below targets an independent
    # frame rather than a slice of the original (SettingWithCopyWarning).
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    return df

In [33]:
# The 2023-01 file is the training split, the 2023-02 file the validation split.
df_train, df_val = map(
    read_dataframe,
    (
        './data/green_tripdata_2023-01.parquet',
        './data/green_tripdata_2023-02.parquet',
    ),
)

In [34]:
# Use only the pick-up and drop-off location IDs as features. The vectoriser
# is fitted on the training split and reused unchanged for validation.
categorical = ['PULocationID', 'DOLocationID']
train_dicts = df_train[categorical].to_dict(orient='records')
val_dicts = df_val[categorical].to_dict(orient='records')

dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

# Validation RMSE. The `squared=False` shortcut of mean_squared_error is
# deprecated in scikit-learn, so the square root is taken explicitly.
mean_squared_error(y_val, y_pred) ** 0.5

7.356162221489667

In [35]:
import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope 

In [37]:
# Wrap the vectorised features and their targets in XGBoost DMatrix objects.
categorical = ['PULocationID', 'DOLocationID']
train = xgb.DMatrix(data=X_train, label=y_train)
val = xgb.DMatrix(data=X_val, label=y_val)

In [43]:
def objective(params):
    """Train one XGBoost model with ``params`` and report its validation RMSE.

    Each call opens its own MLflow run, tags it with ``model=xgboost``, and
    logs ``params`` and the resulting ``rmse`` metric. The function reads the
    notebook-level ``train`` / ``val`` DMatrix objects and the ``y_val`` array.

    Args:
        params: Dict of XGBoost training parameters.

    Returns:
        ``{'loss': <validation RMSE>, 'status': STATUS_OK}``.
    """
    with mlflow.start_run():
        mlflow.set_tag('model', 'xgboost')
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=100,
            evals=[(val, "validation")],
            early_stopping_rounds=40,
        )
        y_pred = booster.predict(val)
        # `squared=False` is deprecated in scikit-learn; take the root explicitly.
        rmse = mean_squared_error(y_val, y_pred) ** 0.5
        mlflow.log_metric('rmse', rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [44]:
# Hyperopt search space for the XGBoost regressor. The log-uniform bounds are
# given as exponents of e, as hp.loguniform expects.
search_space = {
    'max_depth': scope.int(hp.uniform('max_depth', 4, 100)),
    'learning_rate': hp.loguniform('learning_rate', -4, -1),
    'reg_alpha': hp.loguniform("reg_alpha", -5, -1),
    'reg_lambda': hp.loguniform("reg_lambda", -6, -1),
    'min_child_weight': hp.loguniform("min_child_weight", -1, 3),
    # 'reg:linear' is the deprecated name of the squared-error objective.
    'objective': 'reg:squarederror',
    'seed': 42,
}

# Run the TPE search; every evaluation is logged as an MLflow run by `objective`.
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=5,
    trials=Trials(),
)

  0%|                                                                                                         | 0/5 [00:00<?, ?trial/s, best loss=?]




[0]	validation-rmse:9.23764                                                                                                                         
[1]	validation-rmse:9.15926                                                                                                                         
[2]	validation-rmse:9.08749                                                                                                                         
[3]	validation-rmse:9.01929                                                                                                                         
[4]	validation-rmse:8.95674                                                                                                                         
[5]	validation-rmse:8.89795                                                                                                                         
[6]	validation-rmse:8.84365                                                                               




[0]	validation-rmse:8.37216                                                                                                                         
[1]	validation-rmse:7.82206                                                                                                                         
[2]	validation-rmse:7.49803                                                                                                                         
[3]	validation-rmse:7.16746                                                                                                                         
[4]	validation-rmse:7.02504                                                                                                                         
[5]	validation-rmse:6.86779                                                                                                                         
[6]	validation-rmse:6.78071                                                                               




[0]	validation-rmse:8.14697                                                                                                                         
[1]	validation-rmse:7.47231                                                                                                                         
[2]	validation-rmse:7.00737                                                                                                                         
[3]	validation-rmse:6.77985                                                                                                                         
[4]	validation-rmse:6.60290                                                                                                                         
[5]	validation-rmse:6.51226                                                                                                                         
[6]	validation-rmse:6.45000                                                                               




[0]	validation-rmse:9.00172                                                                                                                         
[1]	validation-rmse:8.71692                                                                                                                         
[2]	validation-rmse:8.46534                                                                                                                         
[3]	validation-rmse:8.24607                                                                                                                         
[4]	validation-rmse:8.04300                                                                                                                         
[5]	validation-rmse:7.86466                                                                                                                         
[6]	validation-rmse:7.70402                                                                               




[0]	validation-rmse:8.68352                                                                                                                         
[1]	validation-rmse:8.18902                                                                                                                         
[2]	validation-rmse:7.79140                                                                                                                         
[3]	validation-rmse:7.46293                                                                                                                         
[4]	validation-rmse:7.21700                                                                                                                         
[5]	validation-rmse:7.02987                                                                                                                         
[6]	validation-rmse:6.88707                                                                               

In [51]:
# Switch off XGBoost autologging; the next cell logs params, metric and model by hand.
mlflow.xgboost.autolog(disable=True)

In [55]:
# Train one model with hand-picked parameters and log it, together with the
# fitted vectoriser, to a single MLflow run.
with mlflow.start_run():
    best_params = {
        'learning_rate': 0.28,
        'max_depth': 67,
        'min_child_weight': 7.5,
        # 'reg:linear' is the deprecated name of the squared-error objective.
        'objective': 'reg:squarederror',
        'reg_alpha': 0.3,
        'reg_lambda': 0.005,
        'seed': 42,
    }

    mlflow.log_params(best_params)

    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=70,
        evals=[(val, "validation")],
        early_stopping_rounds=10,
    )

    y_pred = booster.predict(val)
    # `squared=False` is deprecated in scikit-learn; take the root explicitly.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    mlflow.log_metric('rmse', rmse)

    # Make sure the output folder exists before writing the preprocessor;
    # otherwise open() fails on a fresh checkout.
    os.makedirs('models', exist_ok=True)
    with open('models/preprocessor.b', "wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact("models/preprocessor.b", artifact_path='preprocessor')
    mlflow.xgboost.log_model(booster, artifact_path='models_mlflow')




[0]	validation-rmse:8.14629
[1]	validation-rmse:7.47151
[2]	validation-rmse:7.00664
[3]	validation-rmse:6.77922
[4]	validation-rmse:6.60240
[5]	validation-rmse:6.51184
[6]	validation-rmse:6.44964
[7]	validation-rmse:6.38660
[8]	validation-rmse:6.35728
[9]	validation-rmse:6.32127
[10]	validation-rmse:6.29715
[11]	validation-rmse:6.26021
[12]	validation-rmse:6.24874
[13]	validation-rmse:6.23884
[14]	validation-rmse:6.23163
[15]	validation-rmse:6.21801
[16]	validation-rmse:6.20572
[17]	validation-rmse:6.19701
[18]	validation-rmse:6.19082
[19]	validation-rmse:6.18344
[20]	validation-rmse:6.17780
[21]	validation-rmse:6.16191
[22]	validation-rmse:6.15929
[23]	validation-rmse:6.15287
[24]	validation-rmse:6.14836
[25]	validation-rmse:6.14387
[26]	validation-rmse:6.14003
[27]	validation-rmse:6.13601
[28]	validation-rmse:6.13033
[29]	validation-rmse:6.11391
[30]	validation-rmse:6.10794
[31]	validation-rmse:6.10709
[32]	validation-rmse:6.10328
[33]	validation-rmse:6.10001
[34]	validation-rmse:6.0



In [56]:
# Load a previously logged model through MLflow's generic pyfunc interface.
# NOTE(review): the training cell above logs the booster under artifact_path
# 'models_mlflow', while this URI points at 'model' of one specific run;
# confirm this is the intended run/artifact.
logged_model = 'runs:/82de4876966f4a4f9c9e8c3431b4699b/model'
loaded_model = mlflow.pyfunc.load_model(model_uri=logged_model)
loaded_model




mlflow.pyfunc.loaded_model:
  artifact_path: model
  flavor: mlflow.xgboost
  run_id: 82de4876966f4a4f9c9e8c3431b4699b

In [57]:
# Load the same logged model again, this time as a native XGBoost object.
xgb_model = mlflow.xgboost.load_model(model_uri=logged_model)

In [58]:
# Display the loaded object to confirm what type the xgboost flavor returned.
xgb_model

<xgboost.core.Booster at 0x7f0ea55168b0>

In [59]:
# Predict on the validation DMatrix with the reloaded model.
y_pred = xgb_model.predict(val)

In [60]:
# Validation RMSE of the reloaded model. `squared=False` is deprecated in
# scikit-learn, so the square root is taken explicitly.
rmse = mean_squared_error(y_val, y_pred) ** 0.5

In [61]:
# Show the validation RMSE computed for the reloaded model.
print(rmse)

5.979650556649659
