In [2]:
# Record the interpreter version used for this run.
!python -V

Python 3.12.2


In [19]:
import pickle

import pandas as pd
import numpy as np

In [18]:
# Which dataset to score: taxi colour plus the calendar month of the trips.
taxi_type = 'yellow'
year = 2023
month = 3

# Pickle file read by load_model(); it is unpacked there as a (dv, model) pair.
model_path = 'model.bin'

# Source parquet (remote) and destination parquet (local); both carry the
# same zero-padded YYYY-MM stamp derived from year/month above.
input_file = f'https://d37ci6vzurychx.cloudfront.net/trip-data/{taxi_type}_tripdata_{year:04d}-{month:02d}.parquet'
output_file = f'output/{taxi_type}/{year:04d}-{month:02d}.parquet'


In [23]:


def read_data(filename, year, month):
    """Load a trip parquet file and prepare it for scoring.

    Steps performed on the loaded frame:
      * add ``duration``: drop-off minus pick-up time, in minutes;
      * keep only rows whose duration is between 1 and 60 minutes inclusive;
      * fill missing pick-up/drop-off location IDs with -1 and convert both
        columns to integer-valued strings;
      * add ``ride_id``: ``'YYYY/MM_'`` followed by the row's index label
        (the labels are those of the frame as loaded, not renumbered after
        filtering).

    Args:
        filename: Path or URL handed to ``pd.read_parquet``.
        year: Integer year used in the ``ride_id`` prefix (zero-padded to 4).
        month: Integer month used in the ``ride_id`` prefix (zero-padded to 2).

    Returns:
        The filtered DataFrame with the extra ``duration`` and ``ride_id``
        columns.
    """
    location_cols = ['PULocationID', 'DOLocationID']
    trips = pd.read_parquet(filename)

    # Trip length as a timedelta first, then converted to minutes.
    elapsed = trips.tpep_dropoff_datetime - trips.tpep_pickup_datetime
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # Copy after filtering so the column assignments below do not write
    # into a view of the unfiltered frame.
    in_range = (trips.duration >= 1) & (trips.duration <= 60)
    trips = trips[in_range].copy()

    trips[location_cols] = (
        trips[location_cols].fillna(-1).astype('int').astype('str')
    )

    id_prefix = f'{year:04d}/{month:02d}_'
    trips['ride_id'] = id_prefix + trips.index.astype('str')
    return trips

def load_model(model_path):
    """Load the pickled ``(dv, model)`` pair stored at ``model_path``.

    NOTE: ``pickle.load`` can execute arbitrary code, so only point this at
    model files from a trusted source.

    Args:
        model_path: Path of the pickle file to open in binary mode.

    Returns:
        A ``(dv, model)`` tuple: the two objects unpacked from the pickle.
    """
    with open(model_path, 'rb') as handle:
        artifacts = pickle.load(handle)
    # The file is expected to hold exactly two objects; unpacking enforces it.
    dv, model = artifacts
    return dv, model


def prepare_dictionaries(df):
    """Turn the location columns of ``df`` into a list of per-row dicts.

    Args:
        df: DataFrame containing ``PULocationID`` and ``DOLocationID``.

    Returns:
        A list with one ``{'PULocationID': ..., 'DOLocationID': ...}`` dict
        per row, in row order.
    """
    location_features = df[['PULocationID', 'DOLocationID']]
    return location_features.to_dict(orient='records')

def get_std(predictions):
    """Print and return the standard deviation of ``predictions``.

    Args:
        predictions: Array-like of predicted values, passed to ``np.std``.

    Returns:
        The standard deviation computed by ``np.std`` with its default
        settings (population standard deviation, ``ddof=0``).
    """
    # Compute from the argument. Reading a global ``y_pred`` here raises
    # NameError on a fresh kernel, or silently uses stale values left over
    # from an earlier cell.
    std_dev = np.std(predictions)
    print(f"Standard Deviation of the predicted duration: {std_dev}")
    return std_dev

def apply_model(model_path, input_file, output_file, year, month):
    """Score one month of trips and save the predictions as parquet.

    Reads the trips with ``read_data``, converts them to feature dicts with
    ``prepare_dictionaries``, transforms and predicts with the ``(dv, model)``
    pair from ``load_model``, reports the spread of the predictions through
    ``get_std``, and writes a two-column frame (``ride_id``,
    ``predicted_duration``) to ``output_file``.

    Args:
        model_path: Path of the pickled ``(dv, model)`` pair.
        input_file: Path or URL of the input parquet file.
        output_file: Destination of the result parquet file. When this is a
            local path, its parent directory is created if missing.
        year: Year of the data, used to build ``ride_id``.
        month: Month of the data, used to build ``ride_id``.

    Returns:
        None. The result is written to ``output_file``.
    """
    # Local import so this function does not depend on the notebook's import cell.
    import os

    print(f'reading the data from {input_file}...')
    df = read_data(input_file, year, month)
    dicts = prepare_dictionaries(df)

    print(f'loading the model from {model_path}...')
    dv, model = load_model(model_path)
    X_val = dv.transform(dicts)

    print('applying the model...')
    y_pred = model.predict(X_val)

    get_std(y_pred)

    df_result = pd.DataFrame({
        'ride_id': df['ride_id'],
        'predicted_duration': y_pred
    })

    # to_parquet does not create missing directories, so writing to a nested
    # path such as output/<taxi_type>/... fails on a clean checkout. Create
    # the parent directory first, but only for plain local paths: URLs
    # (e.g. s3://...) and file-like objects are left untouched.
    target = os.fspath(output_file) if isinstance(output_file, os.PathLike) else output_file
    if isinstance(target, str) and '://' not in target:
        parent_dir = os.path.dirname(target)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    print(f'saving the result to {output_file}...')
    df_result.to_parquet(
        output_file,
        engine='pyarrow',
        compression=None,
        index=False
    )

In [None]:
# Score the month configured above and write the predictions to output_file.
apply_model(model_path=model_path, input_file=input_file, output_file=output_file, year=year, month=month)

## Converting to script

```
jupyter nbconvert --to script starter.ipynb
```


## Creating virtual environment

In [34]:
# Show the installed versions of the packages used for scoring, so the same versions can be pinned below.
!pip freeze | grep -E 'scikit-learn|numpy|pandas|pyarrow'

numpy==1.26.4
pandas==2.2.2
pyarrow==14.0.1
scikit-learn==1.5.0


In [35]:
! pipenv install scikit-learn==1.5.0 pandas==2.2.2 numpy==1.26.4 pyarrow==14.0.1 argparse

[1;32mInstalling scikit-[0m[1;33mlearn[0m[1;32m==[0m[1;36m1.5[0m[1;32m.[0m[1;36m0[0m[1;33m...[0m
[?25lResolving scikit-[33mlearn[0m==[1;36m1.5[0m.[1;36m0[0m[33m...[0m
[2K✔ Installation Succeeded
[2K[32m⠋[0m Installing scikit-learn...
[1A[2K[1;32mInstalling [0m[1;33mpandas[0m[1;32m==[0m[1;36m2.2[0m[1;32m.[0m[1;36m2[0m[1;33m...[0m
[?25lResolving [33mpandas[0m==[1;36m2.2[0m.[1;36m2[0m[33m...[0m
[2K✔ Installation Succeeded
[2K[32m⠋[0m Installing pandas...
[1A[2K[1;32mInstalling [0m[1;33mnumpy[0m[1;32m==[0m[1;36m1.26[0m[1;32m.[0m[1;36m4[0m[1;33m...[0m
[?25lResolving [33mnumpy[0m==[1;36m1.26[0m.[1;36m4[0m[33m...[0m
[2K✔ Installation Succeeded
[2K[32m⠋[0m Installing numpy...
[1A[2K[1;32mInstalling [0m[1;33mpyarrow[0m[1;32m==[0m[1;36m14.0[0m[1;32m.[0m[1;36m1[0m[1;33m...[0m
[?25lResolving [33mpyarrow[0m==[1;36m14.0[0m.[1;36m1[0m[33m...[0m
[2K✔ Installation Succeeded
[2K[32m⠋[0m Inst

In [40]:
# Run the converted script inside the pipenv environment for April 2023 (-y year, -m month).
! pipenv run python starter.py -y=2023 -m=4

reading the data from https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-04.parquet ...
loading the model from model.bin ...
applying the model...
Mean of the predicted duration: 14.292282936862449
saving the result to output/yellow/2023-04.parquet ...
