In [1]:
!pip freeze | grep scikit-learn

scikit-learn==1.1.1


In [2]:
import pickle
import pandas as pd

In [3]:
# Load the fitted artifacts: the pickle holds two objects, unpacked as dv and lr;
# later cells call dv.transform(...) and lr.predict(...).
# SECURITY: pickle.load can execute arbitrary code contained in the file, so only
# load a model.bin that comes from a trusted source.
with open('model.bin', 'rb') as f_in:
    dv, lr = pickle.load(f_in)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [4]:
categorical = ['PUlocationID', 'DOlocationID']


def read_data(filename):
    """Load a trip-data parquet file and prepare it for scoring.

    Args:
        filename: path or URL handed directly to ``pd.read_parquet``.

    Returns:
        A DataFrame with an added ``duration`` column (minutes between
        ``pickup_datetime`` and ``dropOff_datetime``), restricted to rows
        whose duration lies in [1, 60], and with the ``categorical`` columns
        converted to strings (missing values become ``'-1'``). The original
        row index is kept.
    """
    trips = pd.read_parquet(filename)

    # Trip length in minutes, derived from the two timestamp columns.
    elapsed = trips.dropOff_datetime - trips.pickup_datetime
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # Keep rides lasting between 1 and 60 minutes, both bounds inclusive.
    within_limits = trips.duration.between(1, 60)
    trips = trips[within_limits].copy()

    # Missing location IDs become -1; IDs are then rendered as integer strings.
    location_ids = trips[categorical].fillna(-1)
    trips[categorical] = location_ids.astype('int').astype('str')

    return trips

In [7]:
# Load and clean the February 2021 FHV trip file directly from the nyc-tlc S3 URL.
df = read_data('https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2021-02.parquet')

In [8]:
# Turn each row's categorical columns into a dict, encode the dicts with the
# loaded transformer `dv`, and predict with the loaded model `lr`.
dicts = df[categorical].to_dict(orient='records')
X_val = dv.transform(dicts)
y_pred = lr.predict(X_val)

In [15]:
y_pred

array([14.53986486, 13.74042222, 15.59333908, ..., 15.83492293,
       16.78317605, 19.65462607])

In [18]:
# Collect the predictions into an output frame with a single 'travel_time' column.
df_out = pd.DataFrame({'travel_time': y_pred})

In [20]:
# Build a ride identifier of the form 'YYYY/MM_<row>'.
# Note: df_out was created from y_pred alone, so its index is a fresh 0..n-1
# range; <row> is therefore the position within the filtered predictions, not
# the row index of the source file.
year = 2021
month = 2
df_out['ride_id'] = f'{year:04d}/{month:02d}_' + df_out.index.astype('str')

In [21]:
df_out

Unnamed: 0,travel_time,ride_id
0,14.539865,2021/02_0
1,13.740422,2021/02_1
2,15.593339,2021/02_2
3,15.188118,2021/02_3
4,13.817206,2021/02_4
...,...,...
990108,12.433246,2021/02_990108
990109,12.910885,2021/02_990109
990110,15.834923,2021/02_990110
990111,16.783176,2021/02_990111


In [23]:
! pip install pyarrow

Collecting pyarrow
  Using cached pyarrow-8.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
Installing collected packages: pyarrow
Successfully installed pyarrow-8.0.0


In [24]:
# Persist the predictions with the pyarrow engine, uncompressed and without
# writing the DataFrame index.
df_out.to_parquet(
    'output_q2.parquet',
    engine='pyarrow',
    compression=None,
    index=False
)

In [27]:
! ls -lh

total 19M
-rw-rw-r-- 1 ubuntu ubuntu   72 Jun 27 16:48 Dockerfile
-rw-rw-r-- 1 ubuntu ubuntu  18K Jun 27 16:48 model.bin
-rw-rw-r-- 1 ubuntu ubuntu  19M Jun 27 18:31 output_q2.parquet
-rw-rw-r-- 1 ubuntu ubuntu 8.4K Jun 27 18:29 starter.ipynb


In [29]:
! jupyter nbconvert --to script starter.ipynb

[NbConvertApp] Converting notebook starter.ipynb to script
[NbConvertApp] Writing 1396 bytes to starter.py


In [34]:
%%writefile starter.py

import pickle
import pandas as pd
import sys


# Load the fitted artifacts at import time: the pickle holds two objects,
# unpacked as dv and lr, which run() uses via dv.transform(...) and lr.predict(...).
# SECURITY: pickle.load can execute arbitrary code contained in the file, so only
# ship a model.bin that comes from a trusted source.
with open('model.bin', 'rb') as f_in:
    dv, lr = pickle.load(f_in)


categorical = ['PUlocationID', 'DOlocationID']


def read_data(filename):
    """Load a trip-data parquet file and prepare it for scoring.

    Args:
        filename: path or URL handed directly to ``pd.read_parquet``.

    Returns:
        A DataFrame with an added ``duration`` column (minutes between
        ``pickup_datetime`` and ``dropOff_datetime``), restricted to rows
        whose duration lies in [1, 60], and with the ``categorical`` columns
        converted to strings (missing values become ``'-1'``). The original
        row index is kept.
    """
    trips = pd.read_parquet(filename)

    # Trip length in minutes, derived from the two timestamp columns.
    elapsed = trips.dropOff_datetime - trips.pickup_datetime
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # Keep rides lasting between 1 and 60 minutes, both bounds inclusive.
    within_limits = trips.duration.between(1, 60)
    trips = trips[within_limits].copy()

    # Missing location IDs become -1; IDs are then rendered as integer strings.
    location_ids = trips[categorical].fillna(-1)
    trips[categorical] = location_ids.astype('int').astype('str')

    return trips


def run():
    """Score one month of FHV trip data and write the predictions to a file.

    The year and month are taken from the command line (``sys.argv[1]`` and
    ``sys.argv[2]``); any further arguments are ignored. The month's parquet
    file is loaded through ``read_data``, its categorical columns are encoded
    with the module-level ``dv`` and scored with ``lr``, the mean prediction
    is printed, and the predictions are written to ``output.parquet`` in the
    current working directory.

    Raises:
        SystemExit: if fewer than two arguments are given, if either one is
            not an integer, or if the month is outside 1-12.
    """
    # Fail with a readable message instead of an IndexError/ValueError traceback.
    if len(sys.argv) < 3:
        sys.exit(f'usage: {sys.argv[0]} YEAR MONTH')

    try:
        year = int(sys.argv[1])
        month = int(sys.argv[2])
    except ValueError:
        sys.exit(
            f'YEAR and MONTH must be integers, got {sys.argv[1]!r} and {sys.argv[2]!r}'
        )

    # Reject impossible months before attempting a download that cannot succeed.
    if not 1 <= month <= 12:
        sys.exit(f'MONTH must be between 1 and 12, got {month}')

    # NOTE(review): confirm this bucket URL is still reachable from where the job runs.
    df = read_data(f'https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_{year:04d}-{month:02d}.parquet')

    dicts = df[categorical].to_dict(orient='records')
    X_val = dv.transform(dicts)
    y_pred = lr.predict(X_val)

    print('mean predicted travel duration: ', y_pred.mean())

    df_out = pd.DataFrame({'travel_time': y_pred})

    # df_out has a fresh 0..n-1 index, so the ride_id suffix is the position
    # within the filtered predictions, not the row index of the source file.
    df_out['ride_id'] = f'{year:04d}/{month:02d}_' + df_out.index.astype('str')

    df_out.to_parquet(
        'output.parquet',
        engine='pyarrow',
        compression=None,
        index=False
    )

    
    
if __name__ == '__main__':
    run()


Overwriting starter.py


In [35]:
! ls -a

.		    Dockerfile2   model.bin	     starter.ipynb
..		    Pipfile	  output.parquet     starter.py
.ipynb_checkpoints  Pipfile.lock  output_q2.parquet


In [4]:
%%writefile Dockerfile

# Base image for the batch-scoring job.
# NOTE(review): starter.py opens model.bin but it is not copied below, so it is
# presumably already present in this base image under /app; confirm.
FROM agrigorev/zoomcamp-model:mlops-3.9.7-slim

# Upgrade pip first, then use the upgraded pip to install pipenv (single layer).
RUN pip install -U pip && pip install pipenv

WORKDIR /app

# Copy only the dependency manifests first, so the install layer below is
# reused from the build cache when just starter.py changes.
COPY [ "Pipfile", "Pipfile.lock", "./" ]

# Install the locked dependencies into the system interpreter; --deploy fails
# the build if Pipfile.lock is out of date.
RUN pipenv install --system --deploy

# Application code goes last because it changes most often.
COPY [ "starter.py", "./" ]

ENTRYPOINT ["python3", "starter.py"]

Overwriting Dockerfile
