In [1]:
import mlflow
import dagshub

# Initialize the DagsHub integration for this repository with MLflow enabled.
dagshub.init(url="https://dagshub.com/arturotowers/nyc-taxi-time-prediction.git", mlflow=True)
# Capture the tracking URI that is active after initialization so later cells can reuse it.
MLFLOW_TRACKING_URI = mlflow.get_tracking_uri()

print(MLFLOW_TRACKING_URI)


# Re-apply the captured URI explicitly.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

# Both experiments are selected in turn; the recorded output shows the Experiment
# returned by the second call.
# NOTE(review): presumably only the last call stays active, so later GradientBoosting
# runs would also be logged under "RandomForest_Experiment" — confirm.
mlflow.set_experiment("GradientBoosting_Experiment")
mlflow.set_experiment("RandomForest_Experiment")

https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow


<Experiment: artifact_location='mlflow-artifacts:/28559c146958489da93a281d752509c1', creation_time=1726877398258, experiment_id='2', last_update_time=1726877398258, lifecycle_stage='active', name='RandomForest_Experiment', tags={}>

In [2]:
import os
import requests

# Create the data directory if it doesn't exist
os.makedirs('../data', exist_ok=True)

# Source URLs of the January and February 2024 green-taxi trip files
url1 = 'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-01.parquet'
url2 = 'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-02.parquet'

# Local destinations for the downloaded files
file_path1 = '../data/green_tripdata_2024-01.parquet'
file_path2 = '../data/green_tripdata_2024-02.parquet'


def download_file(url, file_path, timeout=60):
    """Download ``url`` and save the response body to ``file_path``.

    Args:
        url: Address of the file to fetch.
        file_path: Local path the response body is written to.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so an
            error page is never written to disk as if it were a parquet file.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    with open(file_path, 'wb') as f:
        f.write(response.content)


# Download both files with the same checked code path
for url, file_path in ((url1, file_path1), (url2, file_path2)):
    download_file(url, file_path)

print('Files downloaded successfully!')

Files downloaded successfully!


In [3]:
import pickle
import pandas as pd
from sklearn.metrics import  root_mean_squared_error
from sklearn.feature_extraction import  DictVectorizer
from sklearn.linear_model import Lasso, Ridge, LinearRegression

In [4]:
def read_dataframe(filename):
    """Load a green-taxi trip parquet file and prepare it for modelling.

    Adds a ``duration`` column (drop-off minus pick-up, in minutes), keeps only
    trips lasting between 1 and 60 minutes inclusive, and converts the
    pick-up / drop-off location ids to strings.

    Args:
        filename: Path of the parquet file, passed to ``pd.read_parquet``.

    Returns:
        A new DataFrame containing the filtered trips.
    """
    df = pd.read_parquet(filename)

    # Vectorised duration in minutes (replaces a row-by-row apply over timedeltas).
    trip_time = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = trip_time.dt.total_seconds() / 60

    # .copy() makes the filtered frame independent, so the column assignment
    # below does not write into a slice of the unfiltered frame
    # (avoids pandas' SettingWithCopyWarning).
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [5]:
# January 2024 is the training set, February 2024 the validation set.
train_path = '../data/green_tripdata_2024-01.parquet'
val_path = '../data/green_tripdata_2024-02.parquet'
df_train, df_val = map(read_dataframe, (train_path, val_path))

In [6]:
# Build the combined pick-up/drop-off feature on both frames.
for frame in (df_train, df_val):
    frame['PU_DO'] = frame['PULocationID'] + '_' + frame['DOLocationID']

In [7]:
# Feature columns: the combined pick-up/drop-off id plus the trip distance.
categorical = ['PU_DO']
numerical = ['trip_distance']
feature_columns = categorical + numerical

# Turn each row into a feature dict; the vectorizer is fitted on the training
# rows only and then reused unchanged for the validation rows.
dv = DictVectorizer()
train_dicts = df_train[feature_columns].to_dict(orient='records')
val_dicts = df_val[feature_columns].to_dict(orient='records')

X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [8]:
# Target arrays for training and validation.
target = 'duration'
y_train, y_val = (frame[target].values for frame in (df_train, df_val))

In [9]:
from dagshub import get_repo_bucket_client
# Client for the repository's storage bucket
s3 = get_repo_bucket_client("arturotowers/nyc-taxi-time-prediction")

# (local file, remote key) pairs, uploaded in this order
uploads = (
    ("../data/green_tripdata_2024-01.parquet", "train_data.parquet"),
    ("../data/green_tripdata_2024-02.parquet", "eval_data.parquet"),
)

for local_path, remote_key in uploads:
    s3.upload_file(
        Bucket="nyc-taxi-time-prediction",  # name of the repo
        Filename=local_path,  # local path of file to upload
        Key=remote_key,  # remote path where to upload the file
    )

In [None]:
import pathlib
import pickle

import mlflow.sklearn
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
from hyperopt.pyll.base import scope
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# Objective function for the RandomForest hyper-parameter search
def objective_rf(params):
    """Train and score one RandomForest candidate inside a nested MLflow run.

    Args:
        params: Keyword arguments forwarded to ``RandomForestRegressor``.

    Returns:
        A dict with the validation RMSE under ``'loss'`` and ``STATUS_OK``
        under ``'status'``.
    """
    with mlflow.start_run(nested=True):

        # Tag the run with its model family
        mlflow.set_tag("model_family", "RandomForest")

        # Record the candidate hyper-parameters
        mlflow.log_params(params)

        # Train the model
        model = RandomForestRegressor(**params)
        model.fit(X_train, y_train)

        # Predict on the validation set
        y_pred = model.predict(X_val)

        # RMSE via the dedicated scorer; the `squared=False` flag of
        # mean_squared_error is deprecated and removed in newer scikit-learn.
        rmse = root_mean_squared_error(y_val, y_pred)

        # Record the performance metric
        mlflow.log_metric("rmse", rmse)

        # Record the trained model
        mlflow.sklearn.log_model(model, artifact_path="model")

    return {'loss': rmse, 'status': STATUS_OK}

In [None]:
# Search space for the RandomForest hyper-parameters
search_space_rf = {
    'n_estimators': scope.int(hp.quniform('n_estimators', 100, 500, 1)),
    'max_depth': scope.int(hp.quniform('max_depth', 5, 50, 1)),
    'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
    'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 4, 1)),
    'max_features': hp.choice('max_features', ['sqrt', 'log2', None]),
    'bootstrap': hp.choice('bootstrap', [True, False])
}

# Run the hyper-parameter search
with mlflow.start_run(run_name="RandomForest Hyper-parameter Optimization", nested=True):
    best_point_rf = fmin(
        fn=objective_rf,
        space=search_space_rf,
        algo=tpe.suggest,
        max_evals=10,
        trials=Trials()
    )

    # fmin reports hp.choice parameters as the *index* of the chosen option,
    # not the option itself. Casting that index (bool(0) is False although
    # option 0 is True; max_features 0/1/2 instead of 'sqrt'/'log2'/None)
    # trains a different model than the one the search selected. space_eval
    # maps the point back onto the search space, which also applies scope.int.
    best_params_rf = space_eval(search_space_rf, best_point_rf)

    # Log best params
    mlflow.log_params(best_params_rf)

    # Log tags
    mlflow.set_tags({
        "project": "NYC Taxi Time Prediction Project",
        "optimizer_engine": "hyper-opt",
        "model_family": "RandomForest",
        "feature_set_version": 1
    })

    # Refit a model with the best parameters and score it on the validation set
    best_model_rf = RandomForestRegressor(**best_params_rf)
    best_model_rf.fit(X_train, y_train)

    y_pred_rf = best_model_rf.predict(X_val)
    # Dedicated RMSE scorer; `squared=False` is deprecated/removed in newer scikit-learn.
    rmse_rf = root_mean_squared_error(y_val, y_pred_rf)
    mlflow.log_metric("rmse", rmse_rf)

    # Persist the fitted DictVectorizer and attach it to the run
    pathlib.Path("models").mkdir(exist_ok=True)
    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

In [None]:
# Declare the model with a fixed set of hyper-parameters.
# NOTE(review): max_features=2 and bootstrap=False look like they may be raw
# hp.choice indices copied from a search result (index 2 would mean None) —
# confirm these are the intended values.
best_model_rf = RandomForestRegressor(
    bootstrap=False,
    max_depth=15,  # int
    max_features=2,  # int
    min_samples_leaf=3,  # int
    min_samples_split=4,  # int
    n_estimators=389  # int
)

# Fit the model
best_model_rf.fit(X_train, y_train)

# Predict and compute the RMSE with the dedicated scorer
# (`squared=False` is deprecated/removed in newer scikit-learn).
y_pred_rf = best_model_rf.predict(X_val)
rmse_rf = root_mean_squared_error(y_val, y_pred_rf)

# Save the preprocessor
pathlib.Path("models").mkdir(exist_ok=True)
with open("models/preprocessor.b", "wb") as f_out:
    pickle.dump(dv, f_out)

# Log inside an explicit run so it is closed when the block exits; logging
# with no active run would start one implicitly and leave it open, making
# every later run in the notebook a child of it.
with mlflow.start_run(run_name="RandomForest fixed parameters", nested=True):
    mlflow.log_metric("rmse", rmse_rf)
    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Objective function for the GradientBoosting hyper-parameter search
def objective_gb(params):
    """Train and score one GradientBoosting candidate inside a nested MLflow run.

    Args:
        params: Keyword arguments forwarded to ``GradientBoostingRegressor``.

    Returns:
        A dict with the validation RMSE under ``'loss'`` and ``STATUS_OK``
        under ``'status'``.
    """
    with mlflow.start_run(nested=True):

        # Tag the run with its model family
        mlflow.set_tag("model_family", "GradientBoosting")

        # Record the candidate hyper-parameters
        mlflow.log_params(params)

        # Train the model
        model = GradientBoostingRegressor(**params)
        model.fit(X_train, y_train)

        # Predict on the validation set
        y_pred = model.predict(X_val)

        # RMSE via the dedicated scorer; the `squared=False` flag of
        # mean_squared_error is deprecated and removed in newer scikit-learn.
        rmse = root_mean_squared_error(y_val, y_pred)

        # Record the performance metric
        mlflow.log_metric("rmse", rmse)

        # Record the trained model
        mlflow.sklearn.log_model(model, artifact_path="model")

    return {'loss': rmse, 'status': STATUS_OK}

In [None]:
# Search space for the GradientBoosting hyper-parameters
search_space_gb = {
    'n_estimators': scope.int(hp.quniform('n_estimators', 100, 500, 1)),
    'max_depth': scope.int(hp.quniform('max_depth', 3, 30, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
    'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
    'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 4, 1))
}

# The setup cell selects "RandomForest_Experiment" last, so without this call
# the GradientBoosting runs would be filed under the RandomForest experiment.
mlflow.set_experiment("GradientBoosting_Experiment")

# Run the hyper-parameter search
with mlflow.start_run(run_name="GradientBoosting Hyper-parameter Optimization", nested=True):
    best_point_gb = fmin(
        fn=objective_gb,
        space=search_space_gb,
        algo=tpe.suggest,
        max_evals=10,
        trials=Trials()
    )

    # Map the raw point returned by fmin back onto the search space; this
    # applies the scope.int conversions declared above.
    best_params_gb = space_eval(search_space_gb, best_point_gb)

    # Record the best parameters
    mlflow.log_params(best_params_gb)

    # Run tags
    mlflow.set_tags({
        "project": "NYC Taxi Time Prediction Project",
        "optimizer_engine": "hyper-opt",
        "model_family": "GradientBoosting",
        "feature_set_version": 1
    })

    # Refit a model with the best parameters and score it on the validation set
    best_model_gb = GradientBoostingRegressor(**best_params_gb)
    best_model_gb.fit(X_train, y_train)

    y_pred_gb = best_model_gb.predict(X_val)
    # Dedicated RMSE scorer; `squared=False` is deprecated/removed in newer scikit-learn.
    rmse_gb = root_mean_squared_error(y_val, y_pred_gb)
    mlflow.log_metric("rmse", rmse_gb)

    # Persist the fitted DictVectorizer and attach it to the run
    pathlib.Path("models").mkdir(exist_ok=True)
    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

In [None]:
# Report both validation scores, one per line.
print(
    f"RMSE GradientBoostingRegressor: {rmse_gb}",
    f"RMSE RandomForestRegressor: {rmse_rf}",
    sep="\n",
)

In [None]:
# GradientBoosting wins only on a strictly lower RMSE; ties go to RandomForest.
gb_wins = rmse_gb < rmse_rf
best_rmse = rmse_gb if gb_wins else rmse_rf
best_model_name = "GradientBoostingRegressor" if gb_wins else "RandomForestRegressor"

print(f"Mejor modelo: {best_model_name} con RMSE: {best_rmse}")

In [29]:
# Register the best model in the model registry under "nyc-taxi-model".
# NOTE(review): the run id is hard-coded rather than derived from the RMSE
# comparison above — confirm it refers to the intended run.
best_run_id = "153069cf7f9840d996517f0976673cd5"
# URI of the "model" artifact logged by that run.
model_uri=f"runs:/{best_run_id}/model"
# `result` carries the new model version; its `.version` is read in the next cell.
result = mlflow.register_model(model_uri, "nyc-taxi-model")

Registered model 'nyc-taxi-model' already exists. Creating a new version of this model...
2024/09/20 22:10:15 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: nyc-taxi-model, version 7
Created version '7' of model 'nyc-taxi-model'.


In [31]:
from datetime import datetime
from mlflow import MlflowClient

# Registry client bound to the notebook's tracking server.
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

# Describe the registered model as a whole.
client.update_registered_model(name="nyc-taxi-model", description="Best model of HW 5")

new_alias = "challenger"
date = datetime.today()
model_version = result.version

# Point the alias at the version registered in the previous cell.
client.set_registered_model_alias(name="nyc-taxi-model", alias=new_alias, version=model_version)

# Record on the version itself when it received the alias; the call is the
# cell's last expression so the updated ModelVersion is displayed.
version_note = f"The model version {model_version} was transitioned to {new_alias} on {date}"
client.update_model_version(name="nyc-taxi-model", version=model_version, description=version_note)

<ModelVersion: aliases=['challenger'], creation_timestamp=1726891815085, current_stage='None', description=('The model version 7 was transitioned to challenger on 2024-09-20 '
 '22:10:44.628925'), last_updated_timestamp=1726891844339, name='nyc-taxi-model', run_id='153069cf7f9840d996517f0976673cd5', run_link='', source='mlflow-artifacts:/28559c146958489da93a281d752509c1/153069cf7f9840d996517f0976673cd5/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='7'>

In [10]:
# --fail makes curl exit with an error on HTTP 4xx/5xx instead of saving the
# server's error page as if it were the parquet file.
!curl --fail -o ../data/green_tripdata_2024-03.parquet https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-03.parquet

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
 64 1340k   64  861k    0     0  1666k      0 --:--:-- --:--:-- --:--:-- 1669k
100 1340k  100 1340k    0     0  2461k      0 --:--:-- --:--:-- --:--:-- 2468k


In [11]:
# Load the March 2024 file as the test set, cleaned by the same read_dataframe
# helper used for the training and validation files.
df_test = read_dataframe('../data/green_tripdata_2024-03.parquet')

In [12]:
# Same feature definition as for training: combined pick-up/drop-off id plus distance.
categorical, numerical = ['PU_DO'], ['trip_distance']
df_test['PU_DO'] = df_test['PULocationID'] + '_' + df_test['DOLocationID']

# Target values for the test set.
y_test = df_test['duration'].values

# Encode with the already-fitted vectorizer (transform only, no refit).
test_dicts = df_test[categorical + numerical].to_dict(orient='records')
X_test = dv.transform(test_dicts)

In [33]:
import mlflow.pyfunc

In [34]:
# Load version 5 of the registered "nyc-taxi-model" as the champion.
# NOTE(review): the version is pinned by number rather than resolved through a
# registry alias — confirm that 5 is the intended champion.
champion_model = mlflow.pyfunc.load_model(model_uri="models:/nyc-taxi-model/5")

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

In [39]:
# Disabled alternative: load the challenger through its registry alias.
# NOTE(review): MLflow alias URIs are normally written "models:/<name>@<alias>";
# the "/@" form below looks malformed — confirm before re-enabling.
#challenger_model = mlflow.pyfunc.load_model(model_uri="models:/nyc-taxi-model/@challenger")
# Load the challenger straight from the "model" artifact of its training run.
run_id = "153069cf7f9840d996517f0976673cd5"
model_uri = f"runs:/{run_id}/model"
challenger_model = mlflow.pyfunc.load_model(model_uri=model_uri)

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

MlflowException: The following failures occurred while downloading one or more artifacts from https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/api/2.0/mlflow-artifacts/artifacts/28559c146958489da93a281d752509c1/153069cf7f9840d996517f0976673cd5/artifacts:
##### File model #####
API request to https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/api/2.0/mlflow-artifacts/artifacts/28559c146958489da93a281d752509c1/153069cf7f9840d996517f0976673cd5/artifacts/model failed with exception HTTPSConnectionPool(host='dagshub.com', port=443): Max retries exceeded with url: /arturotowers/nyc-taxi-time-prediction.mlflow/api/2.0/mlflow-artifacts/artifacts/28559c146958489da93a281d752509c1/153069cf7f9840d996517f0976673cd5/artifacts/model (Caused by ResponseError('too many 500 error responses'))

In [36]:
from mlflow import MlflowClient
# Fetch the registered model's metadata from the current tracking server and
# print it; the recorded output lists its aliases and latest versions.
client = MlflowClient(tracking_uri=mlflow.get_tracking_uri())
model = client.get_registered_model("nyc-taxi-model")
print(model)

<RegisteredModel: aliases={'challenger': '7', 'champion': '1'}, creation_timestamp=1726882812649, description='Best model of HW 5', last_updated_timestamp=1726891843976, latest_versions=[<ModelVersion: aliases=[], creation_timestamp=1726891815085, current_stage='None', description=('The model version 7 was transitioned to challenger on 2024-09-20 '
 '22:10:44.628925'), last_updated_timestamp=1726891844339, name='nyc-taxi-model', run_id='153069cf7f9840d996517f0976673cd5', run_link='', source='mlflow-artifacts:/28559c146958489da93a281d752509c1/153069cf7f9840d996517f0976673cd5/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='7'>,
 <ModelVersion: aliases=[], creation_timestamp=1726882813057, current_stage='Staging', description=('The model version 1 was transitioned to challenger on 2024-09-20 '
 '22:08:43.005513'), last_updated_timestamp=1726891722700, name='nyc-taxi-model', run_id='153069cf7f9840d996517f0976673cd5', run_link='', source='mlflow-artifacts:

In [37]:
# Print the loaded champion's summary; the recorded output shows its artifact
# path, flavor and source run id.
print(champion_model)

mlflow.pyfunc.loaded_model:
  artifact_path: model
  flavor: mlflow.xgboost
  run_id: 43fb6f3393fc4a28b53ce039e0f19222



# No se pudo cargar el modelo challenger; si se hubiera cargado, se habría ejecutado el siguiente código

In [None]:
# Score the challenger on the test set.
# root_mean_squared_error already returns the RMSE; the previous
# sqrt(mean_squared_error(..., squared=False)) used an un-imported `sqrt` and
# would have taken the square root twice.
y_pred_challenger = challenger_model.predict(X_test)
rmse_challenger = root_mean_squared_error(y_test, y_pred_challenger)

In [None]:
# Score the champion on the test set.
# root_mean_squared_error already returns the RMSE; the previous
# sqrt(mean_squared_error(..., squared=False)) used an un-imported `sqrt` and
# would have taken the square root twice.
y_pred_champion = champion_model.predict(X_test)
rmse_champion = root_mean_squared_error(y_test, y_pred_champion)

Se habría elegido el modelo con el menor RMSE sobre el conjunto de prueba.