# 1. Deployment as containers

Load the trained model and define the function that downloads and preprocesses the trip data:

In [None]:
import os
import pickle

import pandas as pd

# Columns fed to the model as categorical features.
categorical = ['PULocationID', 'DOLocationID']

# model.bin holds a pickled (dv, model) pair.
# NOTE(review): pickle.load can execute arbitrary code; only load a model.bin from a trusted source.
with open('model.bin', mode='rb') as model_file:
    dv, model = pickle.load(model_file)

def read_data(filename):
    """Load a trip parquet file and prepare it for scoring.

    Adds a ``duration`` column (minutes between pickup and dropoff), keeps
    only trips lasting between 1 and 60 minutes inclusive, and converts the
    columns listed in the module-level ``categorical`` to strings, with
    missing values encoded as ``'-1'``.

    Args:
        filename: Path or URL of the parquet file, passed to ``pd.read_parquet``.

    Returns:
        A new DataFrame containing the filtered trips (original index kept).
    """
    trips = pd.read_parquet(filename)

    # Trip length in minutes.
    elapsed = trips.tpep_dropoff_datetime - trips.tpep_pickup_datetime
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # Drop trips shorter than a minute or longer than an hour.
    long_enough = trips.duration >= 1
    short_enough = trips.duration <= 60
    trips = trips[long_enough & short_enough].copy()

    # Missing location IDs become -1 before the int -> str conversion.
    as_labels = trips[categorical].fillna(-1).astype('int').astype('str')
    trips[categorical] = as_labels

    return trips

In [None]:
# Year and month of the Yellow Taxi trip file to score.
year, month = 2022, 2

# Zero-pad with format specs instead of a literal '0' so months 10-12 also
# resolve to a valid file name (e.g. 2022-11, not 2022-011).
input_file = f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year:04d}-{month:02d}.parquet'
df = read_data(input_file)

In [None]:
# One {column: value} dict per ride, restricted to the categorical columns,
# then transformed by the loaded `dv` and scored by the loaded `model`.
feature_dicts = df[categorical].to_dict(orient='records')
X_val = dv.transform(feature_dicts)
y_pred = model.predict(X_val)

### Q1: Standard deviation of predicted duration

In [None]:
import numpy as np

# Standard deviation of the predicted durations (np.std with its default arguments).
predicted_std = np.std(y_pred)
predicted_std

5.28140357655334

### Q2: Preparing output parquet with pyarrow

In [None]:
# Build ride IDs of the form '<year>/<month>_<row index>', e.g. '2022/02_17'.
ride_prefix = f'{year:04d}/{month:02d}_'
df['ride_id'] = ride_prefix + df.index.astype('str')

In [None]:
# Output frame: one row per ride with its ID and predicted duration.
result_columns = {
    'ride_id': df['ride_id'],
    'predicted_duration': y_pred,
}
df_result = pd.DataFrame(result_columns)

In [None]:
# Derive the output path from the scored period so that changing `year` or
# `month` above does not overwrite (or mislabel) another month's predictions.
output_dir = 'data'
output_file = f'{output_dir}/{year:04d}-{month:02d}-predictions.parquet'

# to_parquet does not create missing parent directories.
os.makedirs(output_dir, exist_ok=True)

df_result.to_parquet(
    output_file,
    engine='pyarrow',
    compression=None,
    index=False
)

In [None]:
# List the parquet files under data/ with human-readable sizes to check the size of the written output.
!ls -lhtra data/*.parquet

-rw-r--r-- 1 root root 58M Jun 20 11:57 data/2022-02-predictions.parquet


### Q3. Convert to script

In [None]:
# Export the homework notebook as a plain Python script with nbconvert.
!jupyter nbconvert --to python homework.ipynb

Note: I added a statement to the resulting `score.py` script that prints the mean predicted duration (needed for Q5).

### Q4. Creating a virtual environment

In [None]:
# Install pipenv through the same `python3` interpreter that the later cells
# invoke as `python3 -m pipenv`; a bare `pip` may belong to a different Python.
!python3 -m pip install --quiet pipenv

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m88.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.5/468.5 kB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Create the project's pipenv environment with mlflow and scikit-learn pinned to 1.2.2.
# NOTE(review): the pin presumably matches the scikit-learn version used to pickle model.bin — confirm.
!python3 -m pipenv install mlflow scikit-learn==1.2.2

[1mCreating a virtualenv for this project...[0m
Pipfile: [33m[1m/content/mlops-zoomcamp/week4/Pipfile[0m
[1mUsing default python from[0m [33m[1m/usr/bin/python3[0m [32m(3.10.12)[0m [1mto create virtualenv...[0m
[2K[32m⠦[0m Creating virtual environment...[36mcreated virtual environment CPython3.10.12.final.0-64 in 1822ms
  creator CPython3Posix(dest=/root/.local/share/virtualenvs/week4-C3sBiBYm, clear=False, no_vcs_ignore=False, global=False)
  seeder FromAppData(download=False, pip=bundle, setuptools=bundle, wheel=bundle, via=copy, app_data_dir=/root/.local/share/virtualenv)
    added seed packages: pip==23.1.2, setuptools==67.8.0, wheel==0.40.0
  activators BashActivator,CShellActivator,FishActivator,NushellActivator,PowerShellActivator,PythonActivator
[0m
✔ Successfully created virtual environment!
[2K[32m⠧[0m Creating virtual environment...
[1A[2K[32mVirtualenv location: /root/.local/share/virtualenvs/week4-C3sBiBYm[0m
[1mCreating a Pipfile for this projec

### Q5. Mean predicted duration for March 2022 Yellow dataset

In [None]:
# Run the exported script inside the pipenv environment for March 2022.
!python3 -m pipenv run python score.py --year 2022 --month 3

Mean of predicted duration 2022-03: 12.758556818790902


Note: I had to change the model path from `models/model.bin` back to `model.bin` for the next exercise (the Dockerfile).

In [None]:
# Score April 2022 locally under pipenv as a baseline to compare with the Docker run in Q6.
!python3 -m pipenv run python score.py --year 2022 --month 4

Mean of predicted duration 2022-04: 12.865128336784926


### Q6. Mean of April 2022 Yellow dataset with Docker

I had to finish this task on my local machine; here is the Dockerfile:
```dockerfile
# Base image for the homework.
FROM svizor/zoomcamp-model:mlops-3.10.0-slim

WORKDIR /app
# Pre-create /app/data; presumably score.py writes its predictions there, as the notebook does.
RUN ["mkdir", "data"]

# Install the locked dependencies into the system interpreter (no virtualenv inside the image).
RUN ["pip", "install", "pipenv"]
COPY [ "Pipfile", "Pipfile.lock", "./" ]
RUN pipenv install --system --deploy

# Copy and run the scoring script used throughout this notebook; this matches
# the `COPY score.py .` step shown in the build log.
COPY score.py .
ENTRYPOINT ["python", "score.py"]
```

Then I built the image and ran the container:
```bash
$ docker build --tag nyc_taxi_deploy:0.0.1 .
[+] Building 54.6s (12/12) FINISHED
 => [internal] load build definition from Dockerfile                                                               0.0s
 => => transferring dockerfile: 291B                                                                               0.0s
 => [internal] load .dockerignore                                                                                  0.0s
 => => transferring context: 2B                                                                                    0.0s
 => [internal] load metadata for docker.io/svizor/zoomcamp-model:mlops-3.10.0-slim                                 1.9s
 => [internal] load build context                                                                                  0.0s
 => => transferring context: 90B                                                                                   0.0s
 => [1/7] FROM docker.io/svizor/zoomcamp-model:mlops-3.10.0-slim@sha256:595bf690875f5b9075550b61c609be10f05e69156  0.0s
 => CACHED [2/7] WORKDIR /app                                                                                      0.0s
 => [3/7] RUN ["mkdir", "data"]                                                                                    0.5s
 => [4/7] RUN ["pip", "install", "pipenv"]                                                                         6.5s
 => [5/7] COPY [ Pipfile, Pipfile.lock, ./ ]                                                                       0.0s
 => [6/7] RUN pipenv install --system --deploy                                                                    40.8s
 => [7/7] COPY score.py .                                                                                          0.0s
 => exporting to image                                                                                             4.6s
 => => exporting layers                                                                                            4.6s
 => => writing image sha256:5e2a2ef27ac4bc4a1c4b4a4c0c10c0dbcc6baec7d767a5ff0764bb19757dae25                       0.0s
 => => naming to docker.io/library/nyc_taxi_deploy:0.0.1                                                           0.0s
$ docker run --rm --name nyc_taxi nyc_taxi_deploy:0.0.1 --year=2022 --month=4
Mean of predicted duration 2022-04: 12.827242870079969
```
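The mean from the Docker run (≈12.827) differs from the local pipenv run for the same month (≈12.865), and it is indeed closer to the expected answer.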