# Experiment Name:

In [64]:
# Registry of fitted pipelines, keyed by a short model label; filled in by later cells.
models = dict()
# Name used for both the MLflow experiment and the registered model.
exp_name = "Duration Prediction Model v4"

In [103]:
import fastparquet
import pyarrow
import pandas as pd
import numpy as np

In [105]:
# Load the raw trip records from the local parquet file using the pyarrow engine.
df = pd.read_parquet(
    path="green_trip_data.parquet",
    engine="pyarrow",
)

In [106]:
# Preview the first five raw rows.
df.head(n=5)

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,cbd_congestion_fee
0,2,2025-01-01 00:03:01,2025-01-01 00:17:12,N,1.0,75,235,1.0,5.93,24.7,...,0.5,6.8,0.0,,1.0,34.0,1.0,1.0,0.0,0.0
1,2,2025-01-01 00:19:59,2025-01-01 00:25:52,N,1.0,166,75,1.0,1.32,8.6,...,0.5,0.0,0.0,,1.0,11.1,2.0,1.0,0.0,0.0
2,2,2025-01-01 00:05:29,2025-01-01 00:07:21,N,5.0,171,73,1.0,0.41,25.55,...,0.0,0.0,0.0,,1.0,26.55,2.0,2.0,0.0,0.0
3,2,2025-01-01 00:52:24,2025-01-01 01:07:52,N,1.0,74,223,1.0,4.12,21.2,...,0.5,6.13,6.94,,1.0,36.77,1.0,1.0,0.0,0.0
4,2,2025-01-01 00:25:05,2025-01-01 01:01:10,N,1.0,66,158,1.0,4.71,33.8,...,0.5,7.81,0.0,,1.0,46.86,1.0,1.0,2.75,0.0


In [68]:
# Count missing values per column.
df.isnull().sum()

VendorID                     0
lpep_pickup_datetime         0
lpep_dropoff_datetime        0
store_and_fwd_flag        1836
RatecodeID                1836
PULocationID                 0
DOLocationID                 0
passenger_count           1836
trip_distance                0
fare_amount                  0
extra                        0
mta_tax                      0
tip_amount                   0
tolls_amount                 0
ehail_fee                48326
improvement_surcharge        0
total_amount                 0
payment_type              1836
trip_type                 1843
congestion_surcharge      1836
cbd_congestion_fee        1836
dtype: int64

In [69]:
# Print the column dtypes and non-null counts of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48326 entries, 0 to 48325
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   VendorID               48326 non-null  int32         
 1   lpep_pickup_datetime   48326 non-null  datetime64[us]
 2   lpep_dropoff_datetime  48326 non-null  datetime64[us]
 3   store_and_fwd_flag     46490 non-null  object        
 4   RatecodeID             46490 non-null  float64       
 5   PULocationID           48326 non-null  int32         
 6   DOLocationID           48326 non-null  int32         
 7   passenger_count        46490 non-null  float64       
 8   trip_distance          48326 non-null  float64       
 9   fare_amount            48326 non-null  float64       
 10  extra                  48326 non-null  float64       
 11  mta_tax                48326 non-null  float64       
 12  tip_amount             48326 non-null  float64       
 13  t

In [70]:
# Important Feature Information

# Target: Duration
# Model: Regression Model
# Independent features: trip_distance, total_amount, PULocationID, DOLocationID, pickup_time -> (which has to be converted into time_sin and time_cos)

In [71]:
# Trip length as a timedelta, stored as its string form (e.g. '0 days 00:14:11')
# so that the string parser defined in the next cell can convert it to minutes.
trip_length = df["lpep_dropoff_datetime"] - df["lpep_pickup_datetime"]
df["duration"] = trip_length.astype("str")
df.loc[0, "duration"]

'0 days 00:14:11'

In [72]:
def getMins(x):
    """Convert a pandas timedelta string into whole minutes.

    Parameters
    ----------
    x : str
        Text of the form '<days> days [+]HH:MM:SS[.ffffff]', e.g.
        '0 days 00:14:11' or '-1 days +23:59:00'.

    Returns
    -------
    int
        Total duration in minutes, rounded down. The day component is
        included, so durations of 24 hours or more (and negative durations,
        which pandas writes as negative days plus a positive clock time) are
        no longer collapsed onto the clock part alone.
    """
    parts = x.split()
    days = int(parts[0])
    # Seconds never add a whole minute (they are always < 60), so they are not
    # converted; this also tolerates fractional seconds such as '11.500000'.
    hours, minutes, _seconds = parts[2].split(":")
    return days * 24 * 60 + int(hours) * 60 + int(minutes)

In [73]:
# Replace each duration string with its minute count.
df["duration"] = df["duration"].apply(getMins)

In [74]:
# Pickup time of day expressed as fractional hours (hour + minute / 60).
pickup_clock = df["lpep_pickup_datetime"].dt
df["hour"] = pickup_clock.hour
df["minute"] = pickup_clock.minute
df["time_numeric"] = df["hour"] + df["minute"] / 60

In [75]:
# Map the fractional hour onto a 24-hour circle and store its sine and cosine.
day_angle = 2 * np.pi * df["time_numeric"] / 24
df["time_sin"] = np.sin(day_angle)
df["time_cos"] = np.cos(day_angle)

In [76]:
# Feature groups used by the preprocessing transformers below.
numerical_cols = ["time_sin", "time_cos", "trip_distance", "total_amount"]
nominal_cols = ["PULocationID", "DOLocationID"]

# Keep only the model features plus the target, numerical columns first.
df = df[[*numerical_cols, *nominal_cols, "duration"]]

In [77]:
# Preview the reduced feature frame.
df.head(n=5)

Unnamed: 0,time_sin,time_cos,trip_distance,total_amount,PULocationID,DOLocationID,duration
0,0.01309,0.999914,5.93,34.0,75,235,14
1,0.082808,0.996566,1.32,11.1,166,75,5
2,0.021815,0.999762,0.41,26.55,171,73,1
3,0.224951,0.97437,4.12,36.77,74,223,15
4,0.108867,0.994056,4.71,46.86,66,158,36


In [78]:
# Separate the target column from the feature matrix.
y = df["duration"]
X = df.drop(columns=["duration"])

In [79]:
# models -> Xgboost, random_forest, linear_regression, knn

In [80]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error

## LinearRegression Model

In [81]:
# Hold out 20% of the rows for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [82]:
# Linear-model preprocessing: one-hot encode the location IDs (categories not
# seen during fit are ignored) and standardize the numeric columns.
preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=True), nominal_cols),
        ("num", StandardScaler(), numerical_cols),
    ]
)

In [83]:
# Preprocessing followed by ordinary least-squares regression.
LR_model = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("model", LinearRegression()),
    ]
)

In [84]:
# Fit the linear-regression pipeline on the training split.
LR_model.fit(X=X_train, y=y_train)

0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [85]:
# Report the pipeline's .score() on both splits.
for label, features, target in (
    ("Training Score:", X_train, y_train),
    ("Testing Score:", X_test, y_test),
):
    print(label, LR_model.score(features, target))

Training Score: 0.025521616744684672
Testing Score: 0.006826232383397746


In [86]:
# Keep the fitted pipeline for the MLflow logging loop.
models.update(LR_model=LR_model)

## RandomForestRegressor

In [87]:
# Tree-model preprocessing: numeric columns pass through unscaled, location IDs
# are one-hot encoded (categories not seen during fit are ignored).
preprocess = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numerical_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=True), nominal_cols),
    ]
)

In [88]:
# Preprocessing followed by a random forest with fixed hyperparameters.
RFR_model = Pipeline(
    steps=[
        ("preprocess", preprocess),
        (
            "model",
            RandomForestRegressor(
                n_estimators=200,       # trees in the forest
                max_depth=15,           # depth cap per tree
                min_samples_split=5,    # samples required to split a node
                min_samples_leaf=2,     # samples required at a leaf
                max_features="sqrt",    # features considered per split
                bootstrap=True,         # sample rows with replacement
                n_jobs=-1,              # use all available cores
                random_state=42,        # fixed seed for repeatable fits
            ),
        ),
    ]
)

In [89]:
# Fit the random-forest pipeline on the training split.
RFR_model.fit(X=X_train, y=y_train)

0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,15
,min_samples_split,5
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [90]:
# Report the pipeline's .score() on both splits.
for label, features, target in (
    ("Training Score:", X_train, y_train),
    ("Testing Score:", X_test, y_test),
):
    print(label, RFR_model.score(features, target))

Training Score: 0.09110979835222954
Testing Score: 0.018996356511148837


In [91]:
# Keep the fitted pipeline for the MLflow logging loop.
models.update(RFR_model=RFR_model)

## XGBoostRegressor

In [92]:
from sklearn.base import clone

# Preprocessing followed by a gradient-boosted tree regressor.
# Pipeline keeps its steps by reference, and `preprocess` is already a step of
# RFR_model. Cloning gives this pipeline its own unfitted transformer with the
# same settings, so refitting one pipeline cannot change the encoder state the
# other was trained with.
XGBR_model = Pipeline([
    ("preprocess", clone(preprocess)),
    ("model", XGBRegressor(
            n_estimators=300,           # Number of boosting rounds
            learning_rate=0.05,         # Step size shrinkage
            max_depth=6,                # Controls tree complexity
            subsample=0.8,              # Fraction of samples used per tree
            colsample_bytree=0.7,       # Fraction of features used per tree
            min_child_weight=3,         # Minimum sum of instance weight in a child
            gamma=0,                    # Minimum loss reduction to make a split
            reg_alpha=0.1,              # L1 regularization
            reg_lambda=1,               # L2 regularization
            objective='reg:squarederror', # Standard regression loss
            n_jobs=-1,                  # Parallelize across all cores
            random_state=42             # Fixed seed for repeatable fits
    ))
])

In [93]:
# Fit the XGBoost pipeline on the training split.
XGBR_model.fit(X=X_train, y=y_train)

0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.7
,device,
,early_stopping_rounds,
,enable_categorical,False


In [94]:
# Report the pipeline's .score() on both splits.
for label, features, target in (
    ("Training Score:", X_train, y_train),
    ("Testing Score:", X_test, y_test),
):
    print(label, XGBR_model.score(features, target))

Training Score: 0.2860821485519409
Testing Score: 0.024784088134765625


In [95]:
# Keep the fitted pipeline for the MLflow logging loop.
models.update(XGBR_model=XGBR_model)

## Exp-Tracking and Adding Models to registry

In [96]:
import mlflow

In [97]:
import mlflow
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score as r2_fn, mean_squared_error

mlflow.set_tracking_uri("http://127.0.0.1:5000/")
mlflow.set_experiment(exp_name)

# Use a real held-out row as the logged input example. The signature below is
# inferred from X_test, so taking the example from the same frame keeps its
# column names, column order and dtypes aligned with that signature (a
# hand-typed example with integer literals does not match the float columns).
input_example = X_test.head(1)

# One MLflow run per fitted pipeline: parameters, test metrics, the model
# itself (stored under the same name as the run) and the source dataset.
for mod, fitted_pipeline in models.items():
    with mlflow.start_run(run_name=mod):

        # Logging Params
        mlflow.log_params(fitted_pipeline.get_params())

        # Logging Metrics (all computed on the held-out test split)
        pred_vals = fitted_pipeline.predict(X_test)
        r2 = r2_fn(y_test, pred_vals)
        mse = mean_squared_error(y_test, pred_vals)
        rmse = np.sqrt(mse)

        mlflow.log_metrics({
            "r2_score": r2,
            "MSE": mse,
            "RMSE": rmse
        })

        # Infer model signature from the test inputs and their predictions
        signature = mlflow.models.infer_signature(X_test, pred_vals)

        # Logging Model. It is not registered here; registration and alias
        # assignment happen in the champion/challenger cell further down.
        mlflow.sklearn.log_model(
            fitted_pipeline,
            name=mod,
            signature=signature,
            input_example=input_example,
        )

        # Logging Artifact (single file)
        mlflow.log_artifact("green_trip_data.parquet", artifact_path="dataset")




🏃 View run LR_model at: http://127.0.0.1:5000/#/experiments/139958987303239491/runs/72a56d69e4fc474eb08be9ef4b8e6aa7
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/139958987303239491




🏃 View run RFR_model at: http://127.0.0.1:5000/#/experiments/139958987303239491/runs/ef65d8d326594a0496e8a9049d1ea336
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/139958987303239491




🏃 View run XGBR_model at: http://127.0.0.1:5000/#/experiments/139958987303239491/runs/d8e0ccb1cba748eb8379b94d276ff4b5
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/139958987303239491


## Setting alias according to the metrics

In [98]:
from mlflow import MlflowClient

In [102]:
from mlflow import MlflowClient
from mlflow.exceptions import MlflowException
import mlflow

client = MlflowClient()

experiment = client.get_experiment_by_name(exp_name)
if experiment is None:
    raise RuntimeError(f"MLflow experiment {exp_name!r} was not found; log the runs first.")
exp_id = experiment.experiment_id

# Lowest test RMSE wins.
best_run = client.search_runs(
    experiment_ids=[exp_id],
    order_by=["metrics.RMSE ASC"],
    max_results=1
)
if not best_run:
    raise RuntimeError(f"Experiment {exp_name!r} has no runs to promote.")
best_run_id = best_run[0].info.run_id

# The logging loop stores each model under the same name as its run, so the
# run name doubles as the artifact path.
mod_name = best_run[0].data.tags.get("mlflow.runName")
model_uri = f"runs:/{best_run_id}/{mod_name}"

# Only the alias lookup is allowed to fail quietly (no registered model or no
# champion yet). Registration errors are left to surface instead of being
# swallowed and retried.
try:
    champ_model = client.get_model_version_by_alias(exp_name, "champion")
except MlflowException:
    champ_model = None

if champ_model is None:
    # No champion exists -> register and promote
    registered_model = mlflow.register_model(model_uri, name=exp_name)
    client.set_registered_model_alias(name=exp_name, alias="champion", version=registered_model.version)

elif champ_model.run_id == best_run_id:
    # Re-running this cell must not register a duplicate version of the run
    # that already holds the champion alias.
    print(f"Run {best_run_id} is already the champion (version {champ_model.version}); nothing to do.")

else:
    champ_run_id = champ_model.run_id
    champ_metric = client.get_run(champ_run_id).data.metrics["RMSE"]
    new_metric = client.get_run(best_run_id).data.metrics["RMSE"]

    # Register new model version
    new_reg_mod = mlflow.register_model(model_uri, name=exp_name)

    if new_metric < champ_metric:
        # Lower RMSE is better: promote the new version and keep the previous
        # champion around as the challenger.
        client.set_registered_model_alias(name=exp_name, alias="challenger", version=champ_model.version)
        client.set_registered_model_alias(name=exp_name, alias="champion", version=new_reg_mod.version)
    else:
        # The new version does not beat the champion; track it as challenger.
        client.set_registered_model_alias(name=exp_name, alias="challenger", version=new_reg_mod.version)


Registered model 'Duration Prediction Model v4' already exists. Creating a new version of this model...
2025/09/07 09:52:43 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Duration Prediction Model v4, version 2
Created version '2' of model 'Duration Prediction Model v4'.


## Loading the "champion" version

In [100]:
# Resolve the registered model through its "champion" alias and load it as a
# generic pyfunc model.
model_uri = f"models:/{exp_name}@champion"
mod = mlflow.pyfunc.load_model(model_uri=model_uri)

Downloading artifacts: 100%|████████████████████████████████████| 7/7 [00:00<00:00, 11.50it/s]


In [101]:
# Predict on the held-out features with the model loaded from the champion alias.
mod.predict(X_test)

array([24.693474, 16.096872,  9.891144, ...,  9.669532,  8.485616,
       17.818956], dtype=float32)