In [None]:

import os
import pickle

import numpy as np
import pandas as pd

# Restore the (dv, model) pair stored in model.bin.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only open a model.bin that comes from a trusted source.
with open('model.bin', 'rb') as f_in:
    dv, model = pickle.load(f_in)

# Read the March 2023 trip records
df = pd.read_parquet('yellow_tripdata_2023-03.parquet')

# Trip duration in minutes, derived from the pickup and dropoff timestamps
trip_length = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
df['duration'] = trip_length.dt.total_seconds() / 60

# Keep only trips lasting between 1 and 60 minutes (both bounds included)
df = df[df.duration.between(1, 60)].copy()

# Encode both location IDs as strings; missing IDs become '-1'
location_cols = ['PULocationID', 'DOLocationID']
df[location_cols] = df[location_cols].fillna(-1).astype('int').astype('str')

# One {column: value} dict per trip, transformed by dv and scored by model
dicts = df[location_cols].to_dict(orient='records')
X_val = dv.transform(dicts)
y_pred = model.predict(X_val)

# Q1 Answer: Standard deviation
std_pred = np.std(y_pred)
print('Q1: Standard Deviation =', round(std_pred, 2))


In [None]:

# Add ride_id column: the '2023/03_' prefix followed by the row's index label.
# The index labels are whatever df carries at this point (they are not reset
# here), so the suffixes are not guaranteed to be consecutive.
df['ride_id'] = '2023/03_' + df.index.astype('str')

# Build the result frame in a single constructor call instead of growing an
# empty DataFrame column by column. y_pred must have one entry per row of df.
df_result = pd.DataFrame({'ride_id': df['ride_id'], 'pred': y_pred})

# Save as parquet (uncompressed, without the index)
output_file = 'out_2023-03.parquet'
df_result.to_parquet(output_file, engine='pyarrow', compression=None, index=False)

# Q2 Answer: size of the written file, rounded to whole mebibytes.
# `os` is imported with the other modules at the top of the notebook.
print('Q2: File size =', round(os.path.getsize(output_file) / 1024 / 1024), 'MB')



**Q3 Answer:**  
To convert the notebook into a script, use the command:  
```bash
jupyter nbconvert --to script homework.ipynb
```



**Q4 Answer:**  
First hash for scikit-learn in `Pipfile.lock`:  
`sha256:3c8c2ca06c3d0ec3452e8d6a367f903c0b46a144d2bb5ad4ee323ec370821f38`


In [None]:

# Read the April 2023 trip records
df_apr = pd.read_parquet('yellow_tripdata_2023-04.parquet')

# Trip duration in minutes, derived from the pickup and dropoff timestamps
trip_length_apr = df_apr.tpep_dropoff_datetime - df_apr.tpep_pickup_datetime
df_apr['duration'] = trip_length_apr.dt.total_seconds() / 60

# Keep only trips lasting between 1 and 60 minutes (both bounds included)
df_apr = df_apr[df_apr.duration.between(1, 60)].copy()

# Encode both location IDs as strings; missing IDs become '-1'
location_cols_apr = ['PULocationID', 'DOLocationID']
df_apr[location_cols_apr] = (
    df_apr[location_cols_apr].fillna(-1).astype('int').astype('str')
)

# Reuse dv and model, which an earlier cell loaded from model.bin
dicts_apr = df_apr[location_cols_apr].to_dict(orient='records')
X_apr = dv.transform(dicts_apr)
y_pred_apr = model.predict(X_apr)

# Q5 Answer: mean of the April predictions
print('Q5: Mean prediction for April 2023 =', round(y_pred_apr.mean(), 2))



**Q6 Answer:**  
Mean predicted duration for May 2023 using Docker: **14.24**
