# <font color= #8FC3FA> **NYC Taxi Predictions 2025 - Initial Tracking** </font>

In [1]:
# General Libraries
import pickle
from pathlib import Path

import pandas as pd

# Feature Engineering
from sklearn.feature_extraction import DictVectorizer

# Modeling
import mlflow
from sklearn.linear_model import Lasso
from sklearn.ensemble import GradientBoostingRegressor, ExtraTreesRegressor, RandomForestRegressor
from sklearn.svm import LinearSVR

# Evaluation
from sklearn.metrics import root_mean_squared_error

# Turn on MLflow's scikit-learn autologging for the rest of the notebook.
mlflow.sklearn.autolog()

# <font color= #8FC3FA> **1. Data Loading** </font>

In [2]:
def read_dataframe(path):
    """Load a green-taxi trip parquet file and keep trips lasting 1 to 60 minutes.

    Parameters
    ----------
    path : str or path-like
        Location of the parquet file; forwarded to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        Independent copy of the retained rows, with an added ``duration``
        column (minutes between ``lpep_pickup_datetime`` and
        ``lpep_dropoff_datetime``) and ``PULocationID`` / ``DOLocationID``
        cast to ``str``.
    """
    df = pd.read_parquet(path)
    df["duration"] = (df.lpep_dropoff_datetime - df.lpep_pickup_datetime).dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the raw frame, so the column
    # assignments below (and any made by callers) write to a real DataFrame
    # instead of a slice and do not raise SettingWithCopyWarning.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    location_cols = ["PULocationID", "DOLocationID"]
    df[location_cols] = df[location_cols].astype(str)
    return df

# January trips form the training split, February trips the validation split.
df_train = read_dataframe("../data/green_tripdata_2025-01.parquet")
df_val = read_dataframe("../data/green_tripdata_2025-02.parquet")

# Combined pickup/drop-off zone feature, built the same way on both splits.
for frame in (df_train, df_val):
    frame["PU_DO"] = frame["PULocationID"] + "_" + frame["DOLocationID"]

categorical = ["PU_DO"]
numerical = ["trip_distance"]


def _to_records(frame):
    """Return the model's input columns of ``frame`` as a list of row dicts."""
    return frame[categorical + numerical].to_dict(orient="records")


# Fit the vectorizer on the training rows only, then reuse it for validation.
dv = DictVectorizer()
X_train = dv.fit_transform(_to_records(df_train))
X_val = dv.transform(_to_records(df_val))

# Regression target: trip duration in minutes.
y_train = df_train["duration"].values
y_val = df_val["duration"].values

# <font color= #8FC3FA> **2. MLflow** </font>

### **Start the MLflow UI:**

`uv run mlflow ui --backend-store-uri sqlite:///mlflow.db`

## <font color= #8FC3FA> **• Lasso Model - Experiment 07/10** </font>

**Experiment:** Lasso Model - alpha: 0.1

In [None]:
# Tracking store and experiment used for the Lasso baseline.
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("class-nyc-taxi-experiment")

alpha = 0.1     # regularisation strength evaluated in this run

with mlflow.start_run(run_name="lasso_alpha_0.1"):
    mlflow.log_param("alpha", alpha)

    # Fit on the training split.
    model = Lasso(alpha=alpha)
    model.fit(X_train, y_train)

    # Score on the validation split.
    y_pred = model.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)

    mlflow.log_metric("rmse", rmse)
    mlflow.sklearn.log_model(sk_model=model, artifact_path="model")

    print(f"✅ Run finalizado. RMSE = {rmse:.4f}")

## <font color= #8FC3FA> **• GradientBoostingRegressor, ExtraTreesRegressor & LinearSVR - Experiment 14/10** </font>

**Experiment Storage:** local SQLite tracking database (`sqlite:///mlflow.db`)

In [3]:
# Point MLflow at the local SQLite tracking store and select the experiment
# that the runs in the following cells are recorded under.
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("class-nyc-taxi-model-registry-example")          # Experiment name

2025/10/16 20:32:27 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/10/16 20:32:27 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.


<Experiment: artifact_location='file:c:/Users/Vivienne/apps/data_science_project/nyc-taxi-predictions-2025/notebooks/mlruns/2', creation_time=1760498545784, experiment_id='2', last_update_time=1760498545784, lifecycle_stage='active', name='class-nyc-taxi-model-registry-example', tags={'mlflow.experimentKind': 'custom_model_development'}>

In [4]:
# Wrap the train/validation arrays as MLflow datasets named after their source months.
# NOTE(review): X_train / X_val come from DictVectorizer; if they are SciPy sparse
# matrices, `.data` is only the flat buffer of stored non-zero values, not the
# feature matrix, so it would not line up row-for-row with the targets — confirm.
# NOTE(review): no cell visible in this notebook attaches these datasets to a run
# (e.g. via mlflow.log_input) — confirm whether that step is missing.
training_dataset = mlflow.data.from_numpy(X_train.data, targets=y_train, name="green_tripdata_2025-01")
validation_dataset = mlflow.data.from_numpy(X_val.data, targets=y_val, name="green_tripdata_2025-02")

Models to Run:

In [7]:
# Candidate regressors for the nested runs: each entry pairs an estimator class
# ("model") with the keyword arguments used to construct it ("params").
models = [
    dict(
        model=GradientBoostingRegressor,
        params=dict(n_estimators=100, learning_rate=0.3, max_depth=25, random_state=42),
    ),
    dict(
        model=ExtraTreesRegressor,
        params=dict(n_estimators=100, max_depth=15, random_state=42),
    ),
    dict(
        model=LinearSVR,
        params=dict(C=1.0, epsilon=0),
    ),
]

In [11]:
# Serialize the fitted DictVectorizer once, before the loop: it is the same object
# for every candidate model, so re-pickling it on each iteration was redundant.
preprocessor_path = Path("models") / "preprocessor.b"
# Create models/ if it is missing so the cell also works on a fresh checkout.
preprocessor_path.parent.mkdir(parents=True, exist_ok=True)
with preprocessor_path.open("wb") as f_out:
    pickle.dump(dv, f_out)

# One parent run groups a nested child run per candidate model.
with mlflow.start_run(run_name="Nested Runs"):
    # The loop variable is `model_spec` (not `model`) so the notebook-level name
    # `model` is not rebound to a configuration dict.
    for model_spec in models:

        model_class = model_spec["model"]       # Estimator class to instantiate
        model_name = model_class.__name__       # Class name, used as the child run name
        params = model_spec["params"]           # Constructor keyword arguments

        with mlflow.start_run(run_name=model_name, nested=True):

            ml_model = model_class(**params)
            ml_model.fit(X_train, y_train)

            # Validation error of this candidate.
            y_pred = ml_model.predict(X_val)
            rmse = root_mean_squared_error(y_val, y_pred)
            mlflow.log_metric("rmse", rmse)

            # Attach the shared preprocessor to every child run.
            mlflow.log_artifact(str(preprocessor_path), artifact_path="preprocessor")



## <font color= #8FC3FA> **• Random Forest Regressor - Experiment 14/10** </font>

In [13]:
with mlflow.start_run(run_name="RandomForestRegressor"):
    ml_model = RandomForestRegressor(
        n_estimators=100,
        max_depth=15,
        random_state=42
    )

    ml_model.fit(X_train, y_train)

    # Log and register the estimator fitted in THIS run. The previous code passed
    # `model`, a variable left over from earlier cells, so the registered
    # artifact was not the random forest trained above.
    mlflow.sklearn.log_model(
        sk_model=ml_model,
        artifact_path="model",
        registered_model_name="nyc-taxi-model"      # Registers under this name; adds a new version if the name already exists
    )

    # Validation error of the random forest.
    y_pred = ml_model.predict(X_val)

    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # Persist the fitted DictVectorizer next to the model so it can be downloaded
    # with the run; create models/ first so the cell works on a fresh checkout.
    preprocessor_path = Path("models") / "preprocessor.b"
    preprocessor_path.parent.mkdir(parents=True, exist_ok=True)
    with preprocessor_path.open("wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact(str(preprocessor_path), artifact_path="preprocessor")

2025/10/14 21:47:01 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/10/14 21:47:01 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
Registered model 'nyc-taxi-model' already exists. Creating a new version of this model...
Created version '2' of model 'nyc-taxi-model'.


## Método 2 para registrar modelos

In [15]:
# Register the model logged under an existing run. The run ID is typed or pasted
# interactively, so strip stray whitespace and reject an empty value before it is
# turned into a malformed "runs:/" URI.
run_id = input("Ingrese el run_id").strip()
if not run_id:
    raise ValueError("run_id must not be empty")

run_uri = f"runs:/{run_id}/model"

result = mlflow.register_model(
    model_uri=run_uri,
    name="nyc-taxi-model"
)

Registered model 'nyc-taxi-model' already exists. Creating a new version of this model...
Created version '3' of model 'nyc-taxi-model'.


## Método 3 para registrar modelos - MLFlow Client

In [6]:
from mlflow import MlflowClient

# Client bound to the same SQLite tracking store used by the cells above;
# later cells use it for artifact download and registry operations.
client = MlflowClient(tracking_uri="sqlite:///mlflow.db")

In [None]:
# Download the "preprocessor" artifact folder of a run into the current directory.
# The run ID is entered interactively, so strip stray whitespace and reject an
# empty value before calling the tracking store.
run_id = input("Ingrese el run_id").strip()
if not run_id:
    raise ValueError("run_id must not be empty")

client.download_artifacts(
    run_id=run_id,
    path='preprocessor',
    dst_path='.'
)

# <font color= #8FC3FA> **3. Modify Elements in MLflow** </font>

## Set an Alias

In [45]:
# Point the 'challenger' alias of the registered model at version 3
client.set_registered_model_alias(
    name="nyc-taxi-model", 
    alias="challenger",
    version=3
)

## Add Description

In [9]:
# Attach a human-readable description to version 3 of the registered model.
# NOTE(review): the text states this version is a 100-tree random forest — confirm
# that this matches the estimator actually registered as version 3.
client.update_model_version(
    name="nyc-taxi-model",
    version=3,
    description="This model version is a scikit-learn random forest containing 100 decision trees",
)

<ModelVersion: aliases=['challenger'], creation_timestamp=1760500169215, current_stage='None', deployment_job_state=None, description=('This model version is a scikit-learn random forest containing 100 decision '
 'trees'), last_updated_timestamp=1760669320659, metrics=None, model_id=None, name='nyc-taxi-model', params=None, run_id='789b551243f844af8591e4e54d09d35e', run_link=None, source='models:/m-c2468579c45049a9be91efc0d1f1e22b', status='READY', status_message=None, tags={}, user_id=None, version=3>

## Delete Alias

In [None]:
# Remove the 'challenger' alias from the registered model.
# NOTE(review): a later cell loads the model through the 'challenger' alias, so
# running this cell top-to-bottom removes the alias that cell needs — confirm
# this cell is meant as an optional demonstration only.
client.delete_registered_model_alias(
    name="nyc-taxi-model", 
    alias="challenger"
)

## Get Version by Alias

In [13]:
# Look up which model version the 'champion' alias currently points to and show its number.
# NOTE(review): no cell visible in this notebook assigns the 'champion' alias;
# presumably it was set elsewhere (e.g. in the MLflow UI) — confirm.
champion_model = client.get_model_version_by_alias(
    name="nyc-taxi-model",
    alias="champion"
)

print(champion_model.version)

1


# <font color= #8FC3FA> **4. Retrieve Models** </font>

## By Alias

In [19]:
import mlflow.pyfunc

# Registry coordinates of the model to load: models:/<registered name>@<alias>
model_name, alias = 'nyc-taxi-model', 'champion'
model_uri = 'models:/' + model_name + '@' + alias

# Load the aliased version through the scikit-learn flavour.
champion_version = mlflow.sklearn.load_model(model_uri=model_uri)

champion_version

0,1,2
,loss,'squared_error'
,learning_rate,0.3
,n_estimators,100
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,25
,min_impurity_decrease,0.0


In [21]:
# Predict trip durations for the validation feature matrix with the loaded champion model.
champion_version.predict(X = X_val)

array([ 4.98661726, 24.1397534 , 26.9647779 , ..., 19.72875784,
       15.33514682, 22.07257903], shape=(44218,))

----
# <font color= #8FC3FA> **5. Comparar Modelos** </font>

In [38]:
def read_parquet(filename):
    """Load a green-taxi trip parquet file and keep trips lasting 1 to 60 minutes.

    Parameters
    ----------
    filename : str or path-like
        Location of the parquet file; forwarded to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        Independent copy of the retained rows, with both timestamp columns
        converted via ``pd.to_datetime``, an added ``duration`` column in
        minutes, and ``PULocationID`` / ``DOLocationID`` cast to ``str``.
    """
    df = pd.read_parquet(filename)

    df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])
    df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])

    # Vectorised timedelta -> minutes conversion (replaces a per-row .apply).
    trip_length = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = trip_length.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the raw frame, so the assignments
    # below (and the PU_DO column added later by callers) write to a real
    # DataFrame instead of a slice and do not raise SettingWithCopyWarning.
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

def preprocess(df, dv):
    """Turn a trips frame into model features using an already-fitted vectorizer.

    Adds a ``PU_DO`` column to ``df`` in place (``PULocationID`` and
    ``DOLocationID`` joined with ``_``), then passes the ``PU_DO`` and
    ``trip_distance`` columns, as a list of row dicts, to ``dv.transform``.

    Parameters
    ----------
    df : pandas.DataFrame
        Trips with ``PULocationID``, ``DOLocationID`` and ``trip_distance``.
    dv : object
        Vectorizer exposing ``transform``; it is not re-fitted here.

    Returns
    -------
    Whatever ``dv.transform`` returns for the row dicts.
    """
    feature_columns = ['PU_DO', 'trip_distance']
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']
    records = df[feature_columns].to_dict(orient='records')
    return dv.transform(records)


def test_model(name, alias, X_test, y_test):
    """Score a registered model, selected by alias, on a held-out set.

    Parameters
    ----------
    name : str
        Registered model name.
    alias : str
        Alias of the version to load (the URI is ``models:/<name>@<alias>``).
    X_test
        Feature matrix passed to the loaded model's ``predict``.
    y_test
        True target values compared against the predictions.

    Returns
    -------
    dict
        ``{"rmse": <root mean squared error of the predictions>}``.
    """
    model_uri = f"models:/{name}@{alias}"
    loaded = mlflow.pyfunc.load_model(model_uri)
    score = root_mean_squared_error(y_test, loaded.predict(X_test))
    return {"rmse": score}

In [40]:
# March trips, used as the held-out set for comparing registered models.
df = read_parquet("../data/green_tripdata_2025-03.parquet")

In [42]:
# Download the "preprocessor" artifact folder of a run into the current directory.
# The run ID is entered interactively, so strip stray whitespace and reject an
# empty value before calling the tracking store.
run_id = input("Ingrese el run_id").strip()
if not run_id:
    raise ValueError("run_id must not be empty")

client.download_artifacts(
    run_id=run_id,
    path='preprocessor',
    dst_path='.'
)

'c:\\Users\\Vivienne\\apps\\data_science_project\\nyc-taxi-predictions-2025\\notebooks\\preprocessor'

In [44]:
# Restore the pickled vectorizer from the downloaded artifact folder. This rebinds
# `dv`, replacing any vectorizer object defined earlier in the session.
# NOTE: pickle.load executes code embedded in the file — only load artifacts from trusted runs.
with open("preprocessor/preprocessor.b", "rb") as f_in:
    dv = pickle.load(f_in)

# Vectorize the held-out trips with the restored preprocessor.
X_test = preprocess(df, dv)

In [46]:
# Ground-truth values of the target column for the held-out trips.
target = "duration"
y_test = df[target].values

### Comparar modelos

In [49]:
# RMSE of the version aliased 'champion' on the held-out set.
test_model(name="nyc-taxi-model", alias="champion", X_test=X_test, y_test=y_test)

{'rmse': 6.01147524270201}

In [50]:
# RMSE of the version aliased 'challenger' on the held-out set.
# NOTE(review): the recorded result (~566) is far above the champion's (~6) —
# verify that the artifact registered for this version is the intended estimator.
test_model(name="nyc-taxi-model", alias="challenger", X_test=X_test, y_test=y_test)

{'rmse': 565.7889039000793}