In [18]:
# Print the version of the `python` executable found on the shell PATH.
# NOTE(review): this is not guaranteed to be the kernel's own interpreter — confirm.
!python -V

Python 3.10.10


In [3]:
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
import pickle

In [5]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [6]:
def read_dataframe(filename):
    """Load a trip-record parquet file and prepare it for modelling.

    Adds a ``duration`` column (drop-off minus pick-up time, in minutes),
    keeps only trips lasting between 1 and 60 minutes inclusive, and casts
    the pickup / drop-off location IDs to strings.

    Parameters
    ----------
    filename : str or path-like
        Passed straight to ``pd.read_parquet`` (local path or URL). The file
        must contain ``lpep_pickup_datetime``, ``lpep_dropoff_datetime``,
        ``PULocationID`` and ``DOLocationID`` columns.

    Returns
    -------
    pandas.DataFrame
        The filtered rows as an independent frame, with the extra
        ``duration`` column and string-typed location IDs.
    """
    df = pd.read_parquet(filename)

    trip_time = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the loaded frame. Without it the
    # column assignment below writes into a slice, which pandas reports as
    # SettingWithCopyWarning (chained assignment).
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [7]:
# Training set: green-taxi trip records for January 2022.
df_train = read_dataframe(
    filename='https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2022-01.parquet'
)
# Validation set: the following month, February 2022.
df_val = read_dataframe(
    filename='https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2022-02.parquet'
)

In [8]:
# Row counts of the training and validation frames after filtering.
df_train.shape[0], df_val.shape[0]

(59603, 66097)

In [2]:
import mlflow

# Send all tracking calls to the local MLflow server, then select the
# experiment that subsequent runs will be recorded under.
mlflow.set_tracking_uri(uri="http://localhost:5000")
mlflow.set_experiment(experiment_name="nyc-taxi-experiment")

2023/06/18 08:22:10 INFO mlflow.tracking.fluent: Experiment with name 'nyc-taxi-experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='s3://mlflow-models-alexey/1', creation_time=1687076530565, experiment_id='1', last_update_time=1687076530565, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [14]:
from sklearn.pipeline import make_pipeline


In [17]:
with mlflow.start_run() as run:
    # Feature columns fed to the pipeline as one dict per trip. The location
    # IDs were cast to strings by read_dataframe; trip_distance stays numeric.
    categorical = ['PULocationID', 'DOLocationID']
    numerical = ['trip_distance']
    features = categorical + numerical

    mlflow.log_params({
        'categorical': categorical,
        'numerical': numerical,
    })

    target = 'duration'
    y_train = df_train[target].values
    y_val = df_val[target].values

    model_params = {
        'fit_intercept': True
    }

    mlflow.log_params(model_params)

    # Vectorizer and regressor are bundled in one pipeline so the logged model
    # accepts raw feature dicts at prediction time.
    pipeline = make_pipeline(
        DictVectorizer(),
        LinearRegression(**model_params)
    )

    ## train

    train_dicts = df_train[features].to_dict(orient='records')
    pipeline.fit(train_dicts, y_train)

    ## validate

    val_dicts = df_val[features].to_dict(orient='records')
    y_pred = pipeline.predict(val_dicts)

    ## evaluate

    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6.
    # Taking the square root of the MSE yields the RMSE on every version.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    print(rmse)

    mlflow.log_metric('rmse', rmse)

    mlflow.sklearn.log_model(pipeline, 'model')

    # Read the ID from the handle bound by the `with` statement rather than
    # looking the active run up again through global state.
    print(f'run ID: {run.info.run_id}')

8.19383255249626
run ID: 7c373fc9626549ed91cebb714b07e60a


## Loading the model

In [1]:
import pandas as pd

In [2]:
def read_dataframe(filename):
    """Load a trip-record parquet file and prepare it for scoring.

    Adds a ``duration`` column (drop-off minus pick-up time, in minutes),
    keeps only trips lasting between 1 and 60 minutes inclusive, and casts
    the pickup / drop-off location IDs to strings.

    Parameters
    ----------
    filename : str or path-like
        Passed straight to ``pd.read_parquet`` (local path or URL). The file
        must contain ``lpep_pickup_datetime``, ``lpep_dropoff_datetime``,
        ``PULocationID`` and ``DOLocationID`` columns.

    Returns
    -------
    pandas.DataFrame
        The filtered rows as an independent frame, with the extra
        ``duration`` column and string-typed location IDs.
    """
    df = pd.read_parquet(filename)

    trip_time = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the loaded frame. Without it the
    # column assignment below writes into a slice, which pandas reports as
    # SettingWithCopyWarning (chained assignment).
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [4]:
# Data to score: green-taxi trip records for March 2022.
df = read_dataframe(
    filename='https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2022-03.parquet'
)

In [6]:
import mlflow
# Point the MLflow client at the local tracking server before any model URIs are loaded.
mlflow.set_tracking_uri("http://localhost:5000")


In [8]:
# URI of the artifact logged as 'model' under run 7c373fc9626549ed91cebb714b07e60a.
logged_model = 'runs:/7c373fc9626549ed91cebb714b07e60a/model'

# Load the logged model through MLflow's generic pyfunc loader, using the run URI above.
loaded_model = mlflow.pyfunc.load_model(logged_model)

 - mlflow (current: 2.4.1, required: mlflow==2.4)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


In [14]:
# Feature columns, matching the ones used when the model was trained.
numerical = ['trip_distance']
categorical = ['PULocationID', 'DOLocationID']

# One plain dict per trip: the input format passed to the loaded model's predict().
records = df.loc[:, categorical + numerical].to_dict(orient='records')

In [15]:
# Spot-check one record to confirm the dict layout (string location IDs, numeric distance).
records[5]

{'PULocationID': '74', 'DOLocationID': '74', 'trip_distance': 0.9}

In [17]:
# Score every record with the model that was loaded via the run URI.
loaded_model.predict(records)

array([10.49036301,  8.79592734,  8.3216776 , ..., 14.41299145,
       12.09075749,  9.58627186])

In [19]:
# Load via a 'models:/<name>/<stage>' URI instead of a run URI.
# NOTE(review): presumably resolved through the MLflow model registry, which
# would need a registered 'duration_prediction' model in that stage — confirm.
model = mlflow.pyfunc.load_model('models:/duration_prediction/staging')

 - mlflow (current: 2.4.1, required: mlflow==2.4)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


In [22]:
# Load the same run's model artifacts directly from their S3 location.
# This rebinds `model`, replacing the object loaded in the previous cell.
model = mlflow.pyfunc.load_model('s3://mlflow-models-alexey/1/7c373fc9626549ed91cebb714b07e60a/artifacts/model')

 - mlflow (current: 2.4.1, required: mlflow==2.4)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


In [23]:
# Score the same records with the model most recently bound to `model`.
model.predict(records)

array([10.49036301,  8.79592734,  8.3216776 , ..., 14.41299145,
       12.09075749,  9.58627186])