In [1]:
import mlflow
import dagshub

# Inicializar MLflow y Dagshub
dagshub.init(url="https://dagshub.com/arturotowers/nyc-taxi-time-prediction.git", mlflow=True)
MLFLOW_TRACKING_URI = mlflow.get_tracking_uri()

print(MLFLOW_TRACKING_URI)


mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

mlflow.set_experiment("GradientBoosting_Experiment")
mlflow.set_experiment("RandomForest_Experiment")

https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow


2024/09/20 18:09:57 INFO mlflow.tracking.fluent: Experiment with name 'GradientBoosting_Experiment' does not exist. Creating a new experiment.
2024/09/20 18:09:57 INFO mlflow.tracking.fluent: Experiment with name 'RandomForest_Experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/28559c146958489da93a281d752509c1', creation_time=1726877398258, experiment_id='2', last_update_time=1726877398258, lifecycle_stage='active', name='RandomForest_Experiment', tags={}>

In [2]:
import os
import requests

# Create the directory if it doesn't exist
os.makedirs('../data', exist_ok=True)

# Download files using requests
url1 = 'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-01.parquet'
url2 = 'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-02.parquet'

# Define the file paths
file_path1 = '../data/green_tripdata_2024-01.parquet'
file_path2 = '../data/green_tripdata_2024-02.parquet'

# Download and save the first file
response = requests.get(url1)
with open(file_path1, 'wb') as f:
    f.write(response.content)

# Download and save the second file
response = requests.get(url2)
with open(file_path2, 'wb') as f:
    f.write(response.content)

print('Files downloaded successfully!')

Files downloaded successfully!


In [3]:
import pickle
import pandas as pd
from sklearn.metrics import  root_mean_squared_error
from sklearn.feature_extraction import  DictVectorizer
from sklearn.linear_model import Lasso, Ridge, LinearRegression

In [4]:
def read_dataframe(filename):

    df = pd.read_parquet(filename)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [5]:
df_train = read_dataframe('../data/green_tripdata_2024-01.parquet')
df_val = read_dataframe('../data/green_tripdata_2024-02.parquet')

In [6]:
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
df_val['PU_DO'] = df_val['PULocationID'] + '_' + df_val['DOLocationID']

In [7]:
categorical = ['PU_DO']  #'PULocationID', 'DOLocationID']
numerical = ['trip_distance']
dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [8]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [9]:
from dagshub import get_repo_bucket_client
# Get a boto3.client object
s3 = get_repo_bucket_client("arturotowers/nyc-taxi-time-prediction")

# Upload file
s3.upload_file(
    Bucket="nyc-taxi-time-prediction",  # name of the repo
    Filename="../data/green_tripdata_2024-01.parquet",  # local path of file to upload
    Key="train_data.parquet",  # remote path where to upload the file
)
# Upload file
s3.upload_file(
    Bucket="nyc-taxi-time-prediction",  # name of the repo
    Filename="../data/green_tripdata_2024-02.parquet",  # local path of file to upload
    Key="eval_data.parquet",  # remote path where to upload the file
)

In [10]:
from sklearn.ensemble import RandomForestRegressor
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll.base import scope
from sklearn.metrics import mean_squared_error
import mlflow.sklearn
import pickle
import pathlib

In [15]:
# Definir la función objetivo
def objective_rf(params):
    with mlflow.start_run(nested=True):
        
        # Etiquetar modelo
        mlflow.set_tag("model_family", "RandomForest")
        
        # Registrar los parámetros
        mlflow.log_params(params)
        
        # Entrenar modelo
        model = RandomForestRegressor(**params)
        model.fit(X_train, y_train)
        
        # Predecir sobre el conjunto de validación
        y_pred = model.predict(X_val)
        
        # Calcular la métrica
        rmse = mean_squared_error(y_val, y_pred, squared=False)
        
        # Registrar la métrica de rendimiento
        mlflow.log_metric("rmse", rmse)
        
        # Registrar el modelo entrenado
        mlflow.sklearn.log_model(model, artifact_path="model")

    return {'loss': rmse, 'status': STATUS_OK}

In [17]:
search_space_rf = {
    'n_estimators': scope.int(hp.quniform('n_estimators', 100, 500, 1)),
    'max_depth': scope.int(hp.quniform('max_depth', 5, 50, 1)),
    'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
    'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 4, 1)),
    'max_features': hp.choice('max_features', ['sqrt', 'log2', None]),
    'bootstrap': hp.choice('bootstrap', [True, False])
}

# Realizar la búsqueda de hiperparámetros
with mlflow.start_run(run_name="RandomForest Hyper-parameter Optimization", nested=True):
    best_params_rf = fmin(
        fn=objective_rf,
        space=search_space_rf,
        algo=tpe.suggest,
        max_evals=10,
        trials=Trials()
    )
    
    # Log best params
    best_params_rf['n_estimators'] = int(best_params_rf['n_estimators'])
    best_params_rf['max_depth'] = int(best_params_rf['max_depth'])
    best_params_rf['min_samples_split'] = int(best_params_rf['min_samples_split'])
    best_params_rf['min_samples_leaf'] = int(best_params_rf['min_samples_leaf'])
    best_params_rf['bootstrap'] = bool(best_params_rf['bootstrap'])
    mlflow.log_params(best_params_rf)

    # Log tags
    mlflow.set_tags({
        "project": "NYC Taxi Time Prediction Project",
        "optimizer_engine": "hyper-opt",
        "model_family": "RandomForest",
        "feature_set_version": 1
    })

    # Log a fit model instance
    best_model_rf = RandomForestRegressor(**best_params_rf)
    best_model_rf.fit(X_train, y_train)
    
    y_pred_rf = best_model_rf.predict(X_val)
    rmse_rf = mean_squared_error(y_val, y_pred_rf, squared=False)
    mlflow.log_metric("rmse", rmse_rf)

    pathlib.Path("models").mkdir(exist_ok=True)
    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

  0%|                                                                           | 0/10 [00:00<?, ?trial/s, best loss=?]



2024/09/20 18:49:38 INFO mlflow.tracking._tracking_service.client: 🏃 View run adaptable-boar-173 at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2/runs/e4e4109086194c7187b0bf2d3ec21d11.

2024/09/20 18:49:38 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2.



 10%|█████                                             | 1/10 [01:30<13:38, 90.97s/trial, best loss: 5.441429687630555]



2024/09/20 18:50:35 INFO mlflow.tracking._tracking_service.client: 🏃 View run sassy-dolphin-172 at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2/runs/4dbb07117f594ecaa5747d2a59d6e5db.

2024/09/20 18:50:35 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2.



 20%|██████████                                        | 2/10 [02:28<09:29, 71.18s/trial, best loss: 5.441429687630555]



2024/09/20 18:50:44 INFO mlflow.tracking._tracking_service.client: 🏃 View run unequaled-doe-609 at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2/runs/b9bfa3b8dd0a4e949a05988d35a2ca77.

2024/09/20 18:50:44 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2.



 30%|███████████████                                   | 3/10 [02:37<05:00, 42.90s/trial, best loss: 5.441429687630555]



2024/09/20 18:52:07 INFO mlflow.tracking._tracking_service.client: 🏃 View run nebulous-hare-283 at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2/runs/39804af0464f4f79baeba095f3b428f3.

2024/09/20 18:52:07 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2.



 40%|████████████████████                              | 4/10 [04:00<05:51, 58.54s/trial, best loss: 5.359177418213355]



2024/09/20 18:52:17 INFO mlflow.tracking._tracking_service.client: 🏃 View run welcoming-perch-44 at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2/runs/90b9d4c2f5564c29a312337122eb6bf2.

2024/09/20 18:52:17 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2.



 50%|█████████████████████████                         | 5/10 [04:10<03:25, 41.14s/trial, best loss: 5.359177418213355]



2024/09/20 18:52:26 INFO mlflow.tracking._tracking_service.client: 🏃 View run bemused-foal-862 at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2/runs/fd8bc339c5c849f29b7c2f140ebe7716.

2024/09/20 18:52:26 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2.



 60%|██████████████████████████████                    | 6/10 [04:19<02:01, 30.34s/trial, best loss: 5.359177418213355]



2024/09/20 18:56:29 INFO mlflow.tracking._tracking_service.client: 🏃 View run bemused-robin-12 at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2/runs/6b4755b277fd4c7ebc6aebec5f954fb3.

2024/09/20 18:56:29 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2.



 70%|███████████████████████████████████               | 7/10 [08:21<04:58, 99.62s/trial, best loss: 5.359177418213355]



2024/09/20 18:56:51 INFO mlflow.tracking._tracking_service.client: 🏃 View run funny-grouse-151 at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2/runs/faaabfb6d7ed42f8a9eee733c67322a9.

2024/09/20 18:56:51 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2.



 80%|████████████████████████████████████████          | 8/10 [08:44<02:29, 74.96s/trial, best loss: 5.359177418213355]



2024/09/20 18:58:17 INFO mlflow.tracking._tracking_service.client: 🏃 View run defiant-shrike-42 at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2/runs/f18587c337db4028886cd7a2ba271f1d.

2024/09/20 18:58:17 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2.



 90%|█████████████████████████████████████████████     | 9/10 [10:10<01:18, 78.50s/trial, best loss: 5.359177418213355]



2024/09/20 18:59:28 INFO mlflow.tracking._tracking_service.client: 🏃 View run serious-skunk-590 at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2/runs/6f8b532909e24b2b955bea28fc8f6047.

2024/09/20 18:59:28 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2.



100%|█████████████████████████████████████████████████| 10/10 [11:21<00:00, 68.13s/trial, best loss: 5.359177418213355]


2024/09/20 18:59:29 INFO mlflow.tracking._tracking_service.client: 🏃 View run RandomForest Hyper-parameter Optimization at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2/runs/9694bfd5ba534c348b3dfc7f31280446.
2024/09/20 18:59:29 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2.


InvalidParameterError: The 'max_depth' parameter of RandomForestRegressor must be an int in the range [1, inf) or None. Got np.float64(15.0) instead.

In [20]:
# Declarar el modelo con los parámetros específicos
best_model_rf = RandomForestRegressor(
    bootstrap=False,
    max_depth=15,  # int
    max_features=2,  # int
    min_samples_leaf=3,  # int
    min_samples_split=4,  # int
    n_estimators=389  # int
)

# Ajustar el modelo
best_model_rf.fit(X_train, y_train)

# Predecir y calcular RMSE
y_pred_rf = best_model_rf.predict(X_val)
rmse_rf = mean_squared_error(y_val, y_pred_rf, squared=False)
mlflow.log_metric("rmse", rmse_rf)

# Guardar el preprocesador
pathlib.Path("models").mkdir(exist_ok=True)
with open("models/preprocessor.b", "wb") as f_out:
    pickle.dump(dv, f_out)

mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")



In [22]:
from sklearn.ensemble import GradientBoostingRegressor

# Definir la función objetivo para GradientBoosting
def objective_gb(params):
    with mlflow.start_run(nested=True):
        
        # Etiquetar modelo
        mlflow.set_tag("model_family", "GradientBoosting")
        
        # Registrar los parámetros
        mlflow.log_params(params)
        
        # Entrenar modelo
        model = GradientBoostingRegressor(**params)
        model.fit(X_train, y_train)
        
        # Predecir sobre el conjunto de validación
        y_pred = model.predict(X_val)
        
        # Calcular la métrica
        rmse = mean_squared_error(y_val, y_pred, squared=False)
        
        # Registrar la métrica de rendimiento
        mlflow.log_metric("rmse", rmse)
        
        # Registrar el modelo entrenado
        mlflow.sklearn.log_model(model, artifact_path="model")

    return {'loss': rmse, 'status': STATUS_OK}

In [23]:
# Definir el espacio de búsqueda para GradientBoosting
search_space_gb = {
    'n_estimators': scope.int(hp.quniform('n_estimators', 100, 500, 1)),
    'max_depth': scope.int(hp.quniform('max_depth', 3, 30, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
    'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
    'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 4, 1))
}

# Realizar la búsqueda de hiperparámetros
with mlflow.start_run(run_name="GradientBoosting Hyper-parameter Optimization", nested=True):
    best_params_gb = fmin(
        fn=objective_gb,
        space=search_space_gb,
        algo=tpe.suggest,
        max_evals=10,
        trials=Trials()
    )

    best_params_gb['n_estimators'] = int(best_params_gb['n_estimators'])
    best_params_gb['max_depth'] = int(best_params_gb['max_depth'])
    best_params_gb['min_samples_split'] = int(best_params_gb['min_samples_split'])
    best_params_gb['min_samples_leaf'] = int(best_params_gb['min_samples_leaf'])
    # Registrar los mejores parámetros
    mlflow.log_params(best_params_gb)

    # Etiquetas del modelo
    mlflow.set_tags({
        "project": "NYC Taxi Time Prediction Project",
        "optimizer_engine": "hyper-opt",
        "model_family": "GradientBoosting",
        "feature_set_version": 1
    })

    # Entrenar y registrar el mejor modelo
    best_model_gb = GradientBoostingRegressor(**best_params_gb)
    best_model_gb.fit(X_train, y_train)
    
    y_pred_gb = best_model_gb.predict(X_val)
    rmse_gb = mean_squared_error(y_val, y_pred_gb, squared=False)
    mlflow.log_metric("rmse", rmse_gb)

    pathlib.Path("models").mkdir(exist_ok=True)
    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

  0%|                                                                           | 0/10 [00:00<?, ?trial/s, best loss=?]



2024/09/20 19:11:51 INFO mlflow.tracking._tracking_service.client: 🏃 View run efficient-swan-157 at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2/runs/f82e0fe867af454394ba65b5d7c92097.

2024/09/20 19:11:51 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2.



 10%|█████                                             | 1/10 [00:25<03:53, 25.95s/trial, best loss: 5.150663351132724]



2024/09/20 19:12:07 INFO mlflow.tracking._tracking_service.client: 🏃 View run mercurial-snake-523 at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2/runs/f8bb66b0873044f5a948fd4fc1131fbd.

2024/09/20 19:12:07 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2.



 20%|██████████                                        | 2/10 [00:41<02:39, 19.94s/trial, best loss: 5.150663351132724]



2024/09/20 19:12:46 INFO mlflow.tracking._tracking_service.client: 🏃 View run bemused-sheep-663 at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2/runs/4905c06360854e1aa6ce03c8a6af0752.

2024/09/20 19:12:46 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2.



 30%|███████████████                                   | 3/10 [01:21<03:22, 28.87s/trial, best loss: 5.150663351132724]



2024/09/20 19:13:08 INFO mlflow.tracking._tracking_service.client: 🏃 View run puzzled-smelt-784 at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2/runs/025a662a104c449aae934e9fb466cd94.

2024/09/20 19:13:08 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2.



 40%|████████████████████                              | 4/10 [01:42<02:35, 25.94s/trial, best loss: 5.150663351132724]



2024/09/20 19:14:20 INFO mlflow.tracking._tracking_service.client: 🏃 View run melodic-shoat-582 at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2/runs/305fb9ab8cdc4358a9d318e77b56b6b2.

2024/09/20 19:14:20 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2.



 50%|█████████████████████████▌                         | 5/10 [02:55<03:33, 42.68s/trial, best loss: 5.13993890221879]



2024/09/20 19:14:50 INFO mlflow.tracking._tracking_service.client: 🏃 View run amazing-crane-355 at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2/runs/4f27e6207b0c455ea360b0580558eacb.

2024/09/20 19:14:50 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2.



 60%|██████████████████████████████▌                    | 6/10 [03:24<02:33, 38.36s/trial, best loss: 5.13993890221879]



2024/09/20 19:15:18 INFO mlflow.tracking._tracking_service.client: 🏃 View run capricious-shark-563 at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2/runs/71e7e3b9e70c4022bfa7e770bf649864.

2024/09/20 19:15:18 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2.



 70%|███████████████████████████████████▋               | 7/10 [03:52<01:44, 34.87s/trial, best loss: 5.13993890221879]



2024/09/20 19:16:14 INFO mlflow.tracking._tracking_service.client: 🏃 View run treasured-smelt-217 at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2/runs/15a3f04b2d734912a7cdf0abe6a8ca96.

2024/09/20 19:16:14 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2.



 80%|████████████████████████████████████████▊          | 8/10 [04:48<01:23, 41.69s/trial, best loss: 5.13993890221879]



2024/09/20 19:16:47 INFO mlflow.tracking._tracking_service.client: 🏃 View run salty-flea-901 at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2/runs/cc3750a9abf04f9584b3cd9738b0f29a.

2024/09/20 19:16:47 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2.



 90%|█████████████████████████████████████████████▉     | 9/10 [05:22<00:39, 39.01s/trial, best loss: 5.13993890221879]



2024/09/20 19:18:07 INFO mlflow.tracking._tracking_service.client: 🏃 View run thundering-rat-712 at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2/runs/f19b7c749f334ed19f247e5a4ef0b68a.

2024/09/20 19:18:07 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2.



100%|██████████████████████████████████████████████████| 10/10 [06:41<00:00, 40.19s/trial, best loss: 5.13993890221879]


2024/09/20 19:19:08 INFO mlflow.tracking._tracking_service.client: 🏃 View run GradientBoosting Hyper-parameter Optimization at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2/runs/153069cf7f9840d996517f0976673cd5.
2024/09/20 19:19:08 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/arturotowers/nyc-taxi-time-prediction.mlflow/#/experiments/2.


In [24]:
print(f"RMSE GradientBoostingRegressor: {rmse_gb}")
print(f"RMSE RandomForestRegressor: {rmse_rf}")

RMSE GradientBoostingRegressor: 5.134557911510872
RMSE RandomForestRegressor: 9.098692327125566


In [26]:
if rmse_gb < rmse_rf:
    best_rmse = rmse_gb
    best_model_name = "GradientBoostingRegressor"
else:
    best_rmse = rmse_rf
    best_model_name = "RandomForestRegressor"

print(f"Mejor modelo: {best_model_name} con RMSE: {best_rmse}")

Mejor modelo: GradientBoostingRegressor con RMSE: 5.134557911510872


In [32]:
# Registrar el mejor modelo en el model registry
best_run_id = "153069cf7f9840d996517f0976673cd5"
model_uri=f"runs:/{best_run_id}/model"
result = mlflow.register_model(model_uri, "nyc-taxi-model")

Registered model 'nyc-taxi-model' already exists. Creating a new version of this model...
2024/09/20 19:47:34 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: nyc-taxi-model, version 4
Created version '4' of model 'nyc-taxi-model'.


In [33]:
from datetime import datetime
from mlflow import MlflowClient

client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)
client.update_registered_model(
    name="nyc-taxi-model",
    description="Best model of HW 5",
)

new_alias = "challenger"
date = datetime.today()
model_version = "1"

client.set_registered_model_alias(
    name="nyc-taxi-model",
    alias=new_alias,
    version=model_version
)

client.update_model_version(
    name="nyc-taxi-model",
    version=model_version,
    description=f"The model version {model_version} was transitioned to {new_alias} on {date}",
)

<ModelVersion: aliases=['challenger'], creation_timestamp=1726882813057, current_stage='Staging', description=('The model version 1 was transitioned to challenger on 2024-09-20 '
 '19:48:23.687389'), last_updated_timestamp=1726883304450, name='nyc-taxi-model', run_id='153069cf7f9840d996517f0976673cd5', run_link='', source='mlflow-artifacts:/28559c146958489da93a281d752509c1/153069cf7f9840d996517f0976673cd5/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='1'>

In [34]:
!curl -o ../data/green_tripdata_2024-03.parquet https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-03.parquet

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1340k  100 1340k    0     0  2471k      0 --:--:-- --:--:-- --:--:-- 2486k
