In [1]:
!pip freeze | grep scikit-learn

scikit-learn==1.2.2


In [2]:
import pickle
import pandas as pd
import numpy as np

In [3]:
with open('model.bin', 'rb') as f_in:
    dv, model = pickle.load(f_in)

In [4]:
categorical = ['PULocationID', 'DOLocationID']

def read_data(filename):
    df = pd.read_parquet(filename)
    
    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df['duration'] = df.duration.dt.total_seconds() / 60

    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    df[categorical] = df[categorical].fillna(-1).astype('int').astype('str')
    
    return df

In [5]:
df = read_data('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-02.parquet')

In [6]:
dicts = df[categorical].to_dict(orient='records')
X_val = dv.transform(dicts)
y_pred = model.predict(X_val)

### Q1

In [7]:
np.std(y_pred)

5.28140357655334

### Q2

In [23]:
ride_id = (f'{2022:04d}/{2:02d}_' + df.index.astype('str')).to_numpy()
data = np.concatenate((ride_id[:, np.newaxis], y_pred[:, np.newaxis]), axis=1)
output_df = pd.DataFrame(data, columns=['ride_id', 'predicted_duration'])

In [24]:
output_df.head

<bound method NDFrame.head of                  ride_id predicted_duration
0              2022/02_0          18.527783
1              2022/02_1          23.065782
2              2022/02_2          33.686359
3              2022/02_3          23.757436
4              2022/02_4          21.492904
...                  ...                ...
2918182  2022/02_2979426          12.038225
2918183  2022/02_2979427          11.441569
2918184  2022/02_2979428          11.890459
2918185  2022/02_2979429          15.102681
2918186  2022/02_2979430           9.460592

[2918187 rows x 2 columns]>

In [25]:
output_df.to_parquet('yellow_taxi.parquet', engine='pyarrow', compression=None, index=False)

In [26]:
!du -sh yellow_taxi.parquet

58M	yellow_taxi.parquet


### Q3
See [starter.py](starter.py)

In [None]:
!jupyter nbconvert --to script starter.ipynb

### Q4

See [Pipfile](Pipfile) and [Pipfile.lock](Pipfile.lock).

In [None]:
!python -m pipenv shell && pipenv install -r requirements.txt

In [6]:
!grep -n scikit-learn Pipfile.lock

2484:        "scikit-learn": {


In [7]:
!grep -n scikit-learn Pipfile

175:scikit-learn = "==1.2.2"


### Q5

[starter.py](starter.py)

In [5]:
!python starter.py --year 2022 --month 3 --taxi_type yellow

[2023-06-19 22:24:12,314: INFO] Mean predicted duration for yellow_tripdata_2022-03.parquet = 12.7586 min.


### Q6

[Dockerfile_predictions](Dockerfile_predictions)

In [4]:
#docker build -t "taxi_test" -f Dockerfile_predictions .
!docker run -it --rm --name taxi_test -v "$PWD/starter.py":/app/starter.py -w /app taxi_test python starter.py --year 2022 --month 4 --taxi_type yellow

[2023-06-19 19:23:36,812: INFO] Mean predicted duration for yellow_tripdata_2022-04.parquet = 12.8272 min.
