In [1]:
import mlflow

In [15]:
# send all runs, params and artifacts to the locally running MLflow tracking server
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000/")

In [16]:
# read the URI back from mlflow to confirm which tracking server is active
current_uri = mlflow.get_tracking_uri()
print(f"tracking URI: '{current_uri}'")

tracking URI: 'http://127.0.0.1:5000/'


In [6]:
# load the dataset and run the experiment
import os
from pathlib import Path

import pandas as pd

# Directory that holds the trip parquet file. Set the TRIP_DATA_DIR environment
# variable to run this notebook on another machine; when it is unset the
# original local checkout path is used, so existing behaviour is unchanged.
DATA_DIR = Path(
    os.environ.get(
        "TRIP_DATA_DIR",
        "/Users/avikumart/Documents/GitHub/MLOps-Project/Data",
    )
)

df = pd.read_parquet(DATA_DIR / "yellow_tripdata_2025-01.parquet")
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,cbd_congestion_fee
0,1,2025-01-01 00:18:38,2025-01-01 00:26:59,1.0,1.6,1.0,N,229,237,1,10.0,3.5,0.5,3.0,0.0,1.0,18.0,2.5,0.0,0.0
1,1,2025-01-01 00:32:40,2025-01-01 00:35:13,1.0,0.5,1.0,N,236,237,1,5.1,3.5,0.5,2.02,0.0,1.0,12.12,2.5,0.0,0.0
2,1,2025-01-01 00:44:04,2025-01-01 00:46:01,1.0,0.6,1.0,N,141,141,1,5.1,3.5,0.5,2.0,0.0,1.0,12.1,2.5,0.0,0.0
3,2,2025-01-01 00:14:27,2025-01-01 00:20:01,3.0,0.52,1.0,N,244,244,2,7.2,1.0,0.5,0.0,0.0,1.0,9.7,0.0,0.0,0.0
4,2,2025-01-01 00:21:34,2025-01-01 00:25:06,3.0,0.66,1.0,N,244,116,2,5.8,1.0,0.5,0.0,0.0,1.0,8.3,0.0,0.0,0.0


In [7]:
# trip duration in minutes: drop-off time minus pick-up time, seconds / 60
elapsed = df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]
df["duration"] = elapsed.dt.total_seconds() / 60

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# separate the input variables from the target variable
feature_columns = ["PULocationID", "DOLocationID", "trip_distance"]
X = df[feature_columns]
y = df["duration"]

# hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# standardise the features: the scaler is fitted on the training rows only
# and the same fitted transform is then applied to the test rows
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [17]:
from sklearn.linear_model import LinearRegression

# Select the experiment that the runs below are recorded under.
# set_experiment() creates "experiment-1" when it does not exist yet and
# reuses it otherwise, so this cell can be re-executed. A separate
# create_experiment() call is not needed and raises an MlflowException as
# soon as the experiment already exists on the tracking server.
mlflow.set_experiment("experiment-1")

with mlflow.start_run():
    # training the model on the scaled training features
    model = LinearRegression()
    model.fit(X_train_scaled, y_train)

    # evaluating the model: score() is the R^2 on the held-out test split
    score = model.score(X_test_scaled, y_test)
    print(f"Model score: {score}")

    # logging the fitted model, its metric and a descriptive param to the run
    mlflow.sklearn.log_model(model, "model")
    mlflow.log_metric("score", score)
    mlflow.log_param("model_type", "LinearRegression")
    print("artifact uri:", mlflow.get_artifact_uri())

Model score: 0.0038047647869704493




artifact uri: mlflow-artifacts:/1/8e0c293b860f455bac1dc0b38eabf89c/artifacts
🏃 View run useful-grouse-809 at: http://127.0.0.1:5000/#/experiments/1/runs/8e0c293b860f455bac1dc0b38eabf89c
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


In [19]:
from mlflow.tracking import MlflowClient

# client bound explicitly to the local tracking server
client = MlflowClient(tracking_uri="http://127.0.0.1:5000")

In [22]:
# get the run ID of the last run
# Resolve the experiment by name instead of hard-coding its numeric ID: the ID
# is assigned by the tracking server and differs between servers/backends.
experiment = client.get_experiment_by_name("experiment-1")
if experiment is None:
    raise RuntimeError("Experiment 'experiment-1' was not found on the tracking server")

# newest run first; only one result is requested
last_run = client.search_runs(
    experiment_ids=[experiment.experiment_id],
    order_by=["start_time desc"],
    max_results=1,
)
if not last_run:
    # fail with a clear message instead of an IndexError on last_run[0]
    raise RuntimeError("Experiment 'experiment-1' has no runs yet")

last_run_id = last_run[0].info.run_id
print(f"Last run ID: {last_run_id}")

Last run ID: 8e0c293b860f455bac1dc0b38eabf89c


In [23]:
# register the model logged by the last run under a registry name
model_name = "yellow_trip_duration_model"
model_uri = f"runs:/{last_run_id}/model"  # "model" is the artifact path used when logging
model_version = mlflow.register_model(model_uri=model_uri, name=model_name)
print(f"Model version: {model_version.version}")

Successfully registered model 'yellow_trip_duration_model'.
2025/05/17 19:03:38 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: yellow_trip_duration_model, version 1


Model version: 1


Created version '1' of model 'yellow_trip_duration_model'.


In [28]:
# another run of experiment 1 (named "run-3"), this time with explicit
# hyper-parameters; the model is registered from inside the run
with mlflow.start_run(run_name="run-3") as run:
    # train the model and record the hyper-parameters it was built with
    params = {"fit_intercept": True, "n_jobs": 3}
    model = LinearRegression(**params)
    mlflow.log_params(params)
    model.fit(X_train_scaled, y_train)

    # evaluate the model on the held-out test split
    score = model.score(X_test_scaled, y_test)
    mlflow.log_metric("score", score)
    print(f"Model score: {score}")

    # log the model
    mlflow.sklearn.log_model(model, "model")
    mlflow.log_param("model_type", "LinearRegression")
    print("artifact uri:", mlflow.get_artifact_uri())

    # register the model that was just logged by this run
    model_name = "yellow_trip_duration_model"
    model_uri = f"runs:/{run.info.run_id}/model"
    model_version = mlflow.register_model(model_uri, model_name)
    print(f"Model version: {model_version.version}")

Model score: 0.0038047647869704493


Registered model 'yellow_trip_duration_model' already exists. Creating a new version of this model...
2025/05/17 19:13:40 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: yellow_trip_duration_model, version 3


artifact uri: mlflow-artifacts:/1/d1adf38dd2124d89ac5e1b6c9471567f/artifacts
Model version: 3
🏃 View run run-3 at: http://127.0.0.1:5000/#/experiments/1/runs/d1adf38dd2124d89ac5e1b6c9471567f
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


Created version '3' of model 'yellow_trip_duration_model'.
