In [2]:
# List the installed packages reported by pip, keeping only lines that mention scikit-learn.
!pip freeze | grep scikit-learn

scikit-learn==1.2.2
scikit-learn-intelex==2021.20210714.170444


In [2]:
import pickle
import pandas as pd
import numpy as np
import os
import sys

In [None]:
# Read the scoring period from the command line: <year> <month>.
# `year` is stored as an int and `month` as a zero-padded two-digit string,
# the same shapes as the manual override (year = 2022, month = '02'), so the
# later format specs (e.g. `{year:04d}`) and the dataset URL both work.
if len(sys.argv) >= 3:
    try:
        year = int(sys.argv[1])
        month_number = int(sys.argv[2])
        if not 1 <= month_number <= 12:
            raise ValueError(f"month out of range: {month_number}")
    except ValueError as err:
        # Stop here with a readable message instead of failing later with an
        # unrelated formatting or download error.
        sys.exit(f"Year and month must be valid integers ({err}).")
    month = f"{month_number:02d}"
    print("Year:", year)
    print("Month:", month)
else:
    # Without both arguments `year`/`month` would be undefined and every
    # following step would fail with a NameError, so abort immediately.
    sys.exit("Please provide both year and month as command-line arguments.")

In [10]:
# To run interactively without command-line arguments, set the period here:
#year = 2022
#month = '02'

# Destination of the predictions for the selected period. This must be an
# f-string: without the prefix the braces are kept literally and every period
# would be written to the same "output_file_{year}_{month}.parquet" file.
# int() lets the path be built whether year/month are ints or numeric strings.
output_file = f'./outputs/output_file_{int(year):04d}_{int(month):02d}.parquet'

In [11]:
# Unpickle the (dv, model) pair stored in model.bin.
# NOTE: pickle.load executes arbitrary code from the file; only load trusted artifacts.
with open('model.bin', mode='rb') as model_file:
    dv, model = pickle.load(model_file)

In [12]:
# Columns treated as categorical features.
categorical = ['PULocationID', 'DOLocationID']


def read_data(filename):
    """Load a trips parquet file and prepare it for scoring.

    Adds a ``duration`` column (drop-off minus pick-up, in minutes), keeps
    only rows whose duration lies in the closed interval [1, 60], and
    converts the categorical columns to strings, with missing values
    encoded as ``'-1'``.

    Parameters
    ----------
    filename : path or URL accepted by ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        A filtered copy of the loaded data; the original row index is kept.
    """
    trips = pd.read_parquet(filename)

    # Trip length in minutes.
    elapsed = trips['tpep_dropoff_datetime'] - trips['tpep_pickup_datetime']
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # Series.between is inclusive on both ends, i.e. 1 <= duration <= 60.
    in_range = trips['duration'].between(1, 60)
    trips = trips.loc[in_range].copy()

    # Missing ids become -1 before the int -> str conversion.
    as_text = trips[categorical].fillna(-1).astype('int').astype('str')
    trips[categorical] = as_text

    return trips

In [13]:
# Download and prepare the trip data for the requested period.
# The file name needs a four-digit year and a two-digit month, so both are
# normalized here: a month given as 2 or '2' still resolves to "-02".
df = read_data(
    "https://d37ci6vzurychx.cloudfront.net/trip-data/"
    f"yellow_tripdata_{int(year):04d}-{int(month):02d}.parquet"
)

In [14]:
# One record dict per row (categorical columns only), encoded with the loaded
# `dv` and then passed to the loaded `model` for prediction.
dicts = df.loc[:, categorical].to_dict(orient='records')
X_val = dv.transform(dicts)
y_pred = model.predict(X_val)

### Q1. What's the standard deviation of the predicted duration for this dataset?

In [15]:
# Standard deviation of the predictions (population form, NumPy's default ddof=0).
np.std(y_pred, ddof=0)

5.28140357655334

### Q2 Preparing the output

In [11]:
# Build an id per ride of the form "<yyyy>/<mm>_<row index>".
# Both year and month go through int() so the `d` format specs work whether
# the values are ints or numeric strings (e.g. taken from the command line);
# formatting a str with `:04d` would raise ValueError.
df['ride_id'] = f'{int(year):04d}/{int(month):02d}_' + df.index.astype('str')
df['ride_id'].head(2)

0    2022/02_0
1    2022/02_1
Name: ride_id, dtype: object

In [12]:
# Result table: the ride id next to its prediction.
# Start from an independent copy so `df` itself is not modified.
df_result = df.loc[:, ['ride_id']].copy()
df_result['result'] = y_pred
df_result.head()

Unnamed: 0,ride_id,result
0,2022/02_0,18.527783
1,2022/02_1,23.065782
2,2022/02_2,33.686359
3,2022/02_3,23.757436
4,2022/02_4,21.492904


In [23]:
# Make sure the destination directory exists before writing; the parquet
# writer does not create missing parent directories.
os.makedirs(os.path.dirname(output_file) or '.', exist_ok=True)

# Write the predictions uncompressed and without the DataFrame index.
df_result.to_parquet(
    output_file,
    engine='pyarrow',
    compression=None,
    index=False
)

In [26]:
# Report the size of the written file in MiB (bytes / 1024**2).
file_stats = os.stat(output_file)
size_in_mb = file_stats.st_size / (1024 * 1024)
print(f'File Size in MegaBytes is {size_in_mb}')

File Size in MegaBytes is 57.215529441833496


### Q3. Creating the scoring script

In [16]:
# Convert starter.ipynb to a Python script with nbconvert.
!jupyter nbconvert --to python starter.ipynb 

[NbConvertApp] Converting notebook starter.ipynb to python
[NbConvertApp] Writing 2036 bytes to starter.py


### Q4.

065e9673e24e0dc5113e2dd2b4ca30c9d8aa2fa90f4c0597241c93b63130d233

### Q5. Mean predicted duration

In [9]:
# Arithmetic mean of the predictions currently held in y_pred.
np.mean(y_pred, axis=None)

12.758556818790902