In [1]:
import pandas as pd
import os
import pickle
import sys
import importlib
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from pickle import dump
import optuna

from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error, r2_score



  from .autonotebook import tqdm as notebook_tqdm


In [2]:

package_name = "mlflow"
try:
    importlib.import_module(package_name)
    print(f"{package_name} is already installed.")
except ModuleNotFoundError:
    print(f"{package_name} not found. Installing...")
    %pip install {package_name}

mlflow is already installed.


In [3]:
def read_dataframe(filename):
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)

        df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
        df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)

    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df

In [4]:
df_train = read_dataframe('/home/bcthakreda/mlops_zoomcamp/Machine-Learning-ZoomCamp/week1/data/yellow_tripdata_2021-01.parquet')
df_val = read_dataframe('/home/bcthakreda/mlops_zoomcamp/Machine-Learning-ZoomCamp/week1/data/yellow_tripdata_2021-02.parquet')

In [5]:
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
df_val['PU_DO'] = df_val['PULocationID'] + '_' + df_val['DOLocationID']

In [6]:
categorical = ['PU_DO'] #'PULocationID', 'DOLocationID']
numerical = ['trip_distance']
train_dicts = df_train[categorical + numerical].to_dict(orient='records')
val_dicts = df_val[categorical + numerical].to_dict(orient='records')

In [7]:

dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [8]:
def get_size_mb(variable):
    size_in_bytes = sys.getsizeof(variable)
    size_in_mb = size_in_bytes / (1024 * 1024)
    return size_in_mb

In [9]:
size_my_list = get_size_mb(train_dicts)
size_my_dict = get_size_mb(X_train)

In [10]:
print(f"Size of 'my_list': {size_my_list:.2f} MB")
print(f"Size of 'my_dict': {size_my_dict:.2f} MB")

Size of 'my_list': 11.47 MB
Size of 'my_dict': 0.00 MB


In [11]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [12]:
import mlflow
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("nyc-taxi-optuna-different-model")

2023/04/24 03:45:43 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2023/04/24 03:45:43 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
2023/04/24 03:45:43 INFO mlflow.tracking.fluent: Experiment with name 'nyc-taxi-optuna-different-model' does not exist. Creating a new experiment.


<Experiment: artifact_location='/home/bcthakreda/mlops_zoomcamp/Machine-Learning-ZoomCamp/week2/mlruns/7', creation_time=1682307943575, experiment_id='7', last_update_time=1682307943575, lifecycle_stage='active', name='nyc-taxi-optuna-different-model', tags={}>

In [13]:

models_dir = "models"
if not os.path.exists(models_dir):
    os.makedirs(models_dir)


In [20]:

with mlflow.start_run(description="Running second Random Forest Regressor with specified.Hyperparameters and model dump. Added r square as metric as well .Adding model as pickle. Also saving the model using mlflow"):

    mlflow.set_tag("developer", "bt")
    mlflow.set_tag("model", "Random Forest Regressor")
    mlflow.set_tag("date", "2023-04-23")

    mlflow.log_param("train data", "/home/bcthakreda/mlops_zoomcamp/Machine-Learning-ZoomCamp/week1/data/yellow_tripdata_2021-01.parquet")
    mlflow.log_param("test data", "/home/bcthakreda/mlops_zoomcamp/Machine-Learning-ZoomCamp/week1/data/yellow_tripdata_2021-02.parquet")

    n_estimators = 25
    max_depth = 15
    min_samples_split = 5
    min_samples_leaf = 5

    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("max_depth", max_depth)
    mlflow.log_param("min_samples_split", min_samples_split)
    mlflow.log_param("min_samples_leaf", min_samples_leaf)


    randomForestRegressor = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )
    randomForestRegressor.fit(X_train, y_train)

    y_pred = randomForestRegressor.predict(X_val)

    rmse = mean_squared_error(y_val, y_pred, squared=False)
    rSquared = r2_score(y_val,y_pred)
    
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric('r2_score',rSquared)

    model_path = os.path.join(models_dir,'randomForestRegressor.bin')

    with open(model_path,"wb") as f:
        pickle.dump(randomForestRegressor,f)

    mlflow.log_artifact(local_path=model_path,artifact_path="models_pickle_rf")

    mlflow.sklearn.log_model(randomForestRegressor, "modelByMlflow", registered_model_name="model_mlflow_rf")


2023/04/15 20:32:33 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2023/04/15 20:32:33 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
Successfully registered model 'model_mlflow_rf'.
2023/04/15 20:32:33 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: model_mlflow_rf, version 1
Created version '1' of model 'model_mlflow_rf'.
