In [1]:
# https://github.com/DataTalksClub/mlops-zoomcamp/blob/main/cohorts/2023/04-deployment/homework/starter.ipynb
# Print the installed scikit-learn version.
# NOTE(review): model.bin presumably holds scikit-learn objects pickled under a
# specific version - confirm it matches the version reported here.
! pip freeze | grep scikit-learn

scikit-learn==1.2.2


In [2]:
import pickle
import pandas as pd

In [3]:
# Prerequisite: download the model file into the working directory (run once in a shell):
# wget https://github.com/DataTalksClub/mlops-zoomcamp/raw/main/cohorts/2023/04-deployment/homework/model.bin

In [4]:
# Unpickle the two objects stored in model.bin: `dv` (used below via .transform)
# and `model` (used below via .predict).
# NOTE: pickle.load can execute arbitrary code while deserializing - only load
# a model.bin obtained from a trusted source.
with open('model.bin', 'rb') as f_in:
    dv, model = pickle.load(f_in)

In [5]:
categorical = ['PULocationID', 'DOLocationID']


def read_data(filename):
    """Load a trip parquet file and prepare it for scoring.

    Steps:
      1. read the parquet file at ``filename`` (path or URL);
      2. add a ``duration`` column: dropoff minus pickup time, in minutes;
      3. keep only trips lasting between 1 and 60 minutes (both inclusive);
      4. cast the ``categorical`` columns to strings, with missing IDs
         encoded as ``'-1'``.

    Returns the filtered frame; the original row labels are preserved.
    """
    trips = pd.read_parquet(filename)

    # Trip length as a float number of minutes.
    elapsed = trips.tpep_dropoff_datetime - trips.tpep_pickup_datetime
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # Work on an independent copy so the column assignment below is safe.
    within_limits = trips.duration.between(1, 60)
    trips = trips.loc[within_limits].copy()

    # Location IDs become string categories; NaN -> -1 -> '-1'.
    location_ids = trips[categorical].fillna(-1)
    trips[categorical] = location_ids.astype('int').astype('str')

    return trips

In [6]:
# Load and prepare the February 2022 yellow-taxi trips. The month is hard-coded
# in this URL and again as year/month in the Q2 cell that builds ride_id, so the
# two must be kept in sync by hand.
df = read_data('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-02.parquet')
df.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee', 'duration'],
      dtype='object')

In [7]:
# Turn the categorical columns into one dict per trip, encode them with the
# loaded `dv`, and predict with the loaded `model`.
# NOTE(review): the name X_val is kept from the starter; here it is the scoring
# input rather than a validation split.
dicts = df[categorical].to_dict(orient='records')
X_val = dv.transform(dicts)
y_pred = model.predict(X_val)

In [11]:
# Q1. What's the standard deviation of the predicted duration for this dataset?
# Series.std() is the sample standard deviation (ddof=1), i.e. the same figure
# describe() reports as "std", without computing the other summary statistics.
pd.Series(y_pred).std()

5.281404481465351

In [15]:
# Q2. Preparing the output
year = 2022
month = 2

# ride_id is "<yyyy>/<mm>_<row label>"; the row labels are those kept by
# read_data, so the ids stay unique after filtering.
df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str')

# One row per scored trip: its id and the predicted duration. y_pred is a
# positional array with one prediction per row of df.
df_results = pd.DataFrame({'ride_id': df['ride_id'], 'y_pred': y_pred})

# Write uncompressed parquet without the index column.
df_results.to_parquet("df_results.parquet", engine='pyarrow', compression=None, index=False)

In [16]:
# List the working directory to check the size of the df_results.parquet written by the Q2 cell.
! ls -l

total 58616
-rw-rw-rw- 1 codespace codespace 59994831 Jun 12 20:24 df_results.parquet
-rw-rw-rw- 1 codespace codespace    17369 Jun 12 20:00 model.bin
-rw-rw-rw- 1 codespace codespace     3774 Jun 12 20:15 week4.ipynb


In [None]:
# Q3. Creating the scoring script
# https://stackoverflow.com/questions/17077494/how-do-i-convert-a-ipython-notebook-into-a-python-file-via-commandline
# Convert this notebook (week4.ipynb) into week4.py, the script that Q5 runs.
! jupyter nbconvert --to script week4.ipynb

In [None]:
# Q4. Virtual environment
# Install the packages listed in requirements.txt into the project's pipenv environment.
# NOTE(review): requirements.txt does not appear in the directory listing captured
# after Q2 - make sure it exists before running this cell.
! pipenv install -r requirements.txt

In [18]:
# Q5. Mean predicted duration for March 2022 Yellow dataset
# Runs the script form of this notebook (see the nbconvert step in Q3).
# NOTE(review): no year/month is passed on the command line, while the notebook
# cells use February 2022 - week4.py presumably selects March 2022 itself; confirm.
! python week4.py

12.758556818790902


In [None]:
# Q6. Docker container
# Build the image using the current directory (the trailing ".") as build context
# and tag it ride-duration-prediction-april-2022:v1.
# NOTE(review): the tag refers to April 2022 while the cells above score
# February/March 2022 - confirm the tag matches the month the image is meant for.
! docker build -t ride-duration-prediction-april-2022:v1 .