In [12]:
!python -V

Python 3.12.0


In [21]:
import pickle
import pandas as pd
import numpy as np
import requests
from io import BytesIO

In [22]:
with open('model.bin', 'rb') as f_in:
    dv, model = pickle.load(f_in)

In [23]:
categorical = ['PULocationID', 'DOLocationID']

def read_data(url):
    # Send a GET request to the URL
    response = requests.get(url)
    response.raise_for_status()  # Raises an HTTPError for bad responses

    # Read the content of the response in memory
    file_content = BytesIO(response.content)

    # Read the Parquet file into a DataFrame
    df = pd.read_parquet(file_content)
    
    # Calculate duration in minutes
    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df['duration'] = df['duration'].dt.total_seconds() / 60

    # Filter rows where duration is between 1 and 60 minutes
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

    # Fill NA values and convert data types for categorical columns
    df[categorical] = df[categorical].fillna(-1).astype(int).astype(str)
    
    return df


In [15]:
categorical = ['PULocationID', 'DOLocationID']

def read_data(filename):
    df = pd.read_parquet(filename)
    
    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df['duration'] = df.duration.dt.total_seconds() / 60

    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    df[categorical] = df[categorical].fillna(-1).astype('int').astype('str')
    
    return df

In [18]:
df = read_data('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet')

In [19]:
dicts = df[categorical].to_dict(orient='records')
X_val = dv.transform(dicts)
y_pred = model.predict(X_val)

In [24]:
np.std(y_pred)

6.247488852238703

In [26]:
year = 2023
month = 3

In [27]:
df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str')

In [28]:
df

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,duration,ride_id
0,2,2023-03-01 00:06:43,2023-03-01 00:16:43,1.0,0.00,1.0,N,238,42,2,...,1.0,0.5,0.00,0.0,1.0,11.10,0.0,0.00,10.000000,2023/03_0
1,2,2023-03-01 00:08:25,2023-03-01 00:39:30,2.0,12.40,1.0,N,138,231,1,...,6.0,0.5,12.54,0.0,1.0,76.49,2.5,1.25,31.083333,2023/03_1
2,1,2023-03-01 00:15:04,2023-03-01 00:29:26,0.0,3.30,1.0,N,140,186,1,...,3.5,0.5,4.65,0.0,1.0,28.05,2.5,0.00,14.366667,2023/03_2
3,1,2023-03-01 00:49:37,2023-03-01 01:01:05,1.0,2.90,1.0,N,140,43,1,...,3.5,0.5,4.10,0.0,1.0,24.70,2.5,0.00,11.466667,2023/03_3
4,2,2023-03-01 00:08:04,2023-03-01 00:11:06,1.0,1.23,1.0,N,79,137,1,...,1.0,0.5,2.44,0.0,1.0,14.64,2.5,0.00,3.033333,2023/03_4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3403761,2,2023-03-31 23:24:25,2023-03-31 23:40:54,,3.16,,,163,75,0,...,0.0,0.5,4.23,0.0,1.0,20.36,,,16.483333,2023/03_3403761
3403762,2,2023-03-31 23:24:50,2023-04-01 00:04:12,,6.89,,,125,198,0,...,0.0,0.5,8.98,0.0,1.0,53.90,,,39.366667,2023/03_3403762
3403763,2,2023-03-31 23:26:31,2023-03-31 23:49:39,,4.01,,,50,224,0,...,0.0,0.5,0.00,0.0,1.0,28.02,,,23.133333,2023/03_3403763
3403764,2,2023-03-31 23:07:51,2023-03-31 23:15:56,,1.31,,,113,158,0,...,0.0,0.5,3.50,0.0,1.0,16.01,,,8.083333,2023/03_3403764


In [35]:
df_result = df[['ride_id']]
df_result['prediction'] = y_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_result['prediction'] = y_pred


In [36]:
output_file = "./taxi_2023_03.parquet"
df_result.to_parquet(
    output_file,
    engine='pyarrow',
    compression=None,
    index=False
)

In [37]:
! ls -lh

total 163936
-rw-r--r--@ 1 aburtsev  staff    21K Jun 15 13:27 MLOps Zoomcamp Starter.ipynb
-rw-r--r--@ 1 aburtsev  staff    17K Jun 13 22:38 model.bin
-rw-r--r--@ 1 aburtsev  staff    65M Jun 15 13:28 taxi_2023_03.parquet


In [None]:
jupyter nbconvert --to script starter.ipynb