In [19]:
!pip freeze | grep scikit-learn

scikit-learn==1.5.0


In [20]:
!python -V

Python 3.9.6


In [21]:
import pickle
import pandas as pd
import os

In [22]:
# model.bin holds a pickled pair: a feature vectorizer (dv) and the fitted model.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load a model.bin that comes from a trusted source.
with open('model.bin', 'rb') as model_file:
    dv, model = pickle.load(model_file)

In [23]:
categorical = ['PULocationID', 'DOLocationID']

def read_data(filename):
    """Load a trip parquet file and prepare it for scoring.

    Adds a ``duration`` column (drop-off time minus pick-up time, in minutes),
    keeps only trips lasting between 1 and 60 minutes inclusive, and converts
    the ``categorical`` columns to strings, with missing values encoded as
    ``'-1'``.

    Parameters
    ----------
    filename : path or URL handed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        A filtered copy of the trips. The index is not reset, so rows keep the
        labels they had in the loaded frame.
    """
    trips = pd.read_parquet(filename)

    elapsed = trips['tpep_dropoff_datetime'] - trips['tpep_pickup_datetime']
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # Series.between is inclusive on both ends by default.
    in_range = trips['duration'].between(1, 60)
    trips = trips[in_range].copy()

    # -1 stands in for a missing location ID before the int -> str conversion.
    encoded = trips[categorical].fillna(-1).astype('int').astype('str')
    trips[categorical] = encoded

    return trips

In [24]:
# Scoring-run parameters: which taxi dataset and which month to predict for.
taxi_type = 'yellow'
month = 3
year = 2023

# The source URL and the output path both embed the zero-padded YYYY-MM period.
input_file = f'https://d37ci6vzurychx.cloudfront.net/trip-data/{taxi_type}_tripdata_{year:04d}-{month:02d}.parquet'
output_file = f'output/{taxi_type}/{year:04d}-{month:02d}.parquet'

# Reads the parquet file directly from the URL (needs network access) and
# applies the duration filter and categorical encoding done by read_data.
df = read_data(input_file)


In [25]:
# One feature dict per trip, built from the categorical columns only.
dicts = df[categorical].to_dict(orient='records')
# Encode the dicts with the vectorizer loaded from model.bin, then predict;
# y_pred holds one prediction per row of df, in row order.
X_val = dv.transform(dicts)
y_pred = model.predict(X_val)

## Q1. Notebook
What's the standard deviation of the predicted duration for this dataset?

In [26]:
# NOTE(review): assumes model.predict returned a NumPy array, in which case
# .std() with no arguments is the population standard deviation (ddof=0) — confirm.
print(f'the standard deviation is {y_pred.std():.2f}')

the standard deviation is 6.25


### Adding Ride ID

In [27]:
# Per-trip identifier of the form 'YYYY/MM_<index label>'. The labels come from
# df's index, which read_data leaves un-reset after filtering.
df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str')

### Saving ride IDs and predictions to a results DataFrame

In [28]:
# Pair every ride_id with its prediction. The frame takes its index from the
# ride_id Series; y_pred is matched to it by position.
df_result = pd.DataFrame({
    'ride_id': df['ride_id'],
    'prediction': y_pred,
})

In [31]:
def create_outfolder(output_file):
    """Create the directory that will contain ``output_file`` if it is missing.

    Parameters
    ----------
    output_file : str
        Path of the file about to be written. Only its directory part is used.
        A relative path is resolved against the current working directory by
        the OS, and a bare file name (no directory part) is a no-op.

    Raises
    ------
    FileExistsError
        If the directory path already exists but is not a directory.
    """
    folder = os.path.dirname(output_file)
    if folder:
        # exist_ok=True makes the call idempotent and avoids the
        # check-then-create race of testing os.path.exists() first.
        os.makedirs(folder, exist_ok=True)

# Make sure the destination folder exists before writing.
create_outfolder(output_file)

# Write the ride_id/prediction pairs with the pyarrow engine and no compression;
# index=False keeps the DataFrame index out of the file.
df_result.to_parquet(
    output_file,
    engine='pyarrow',
    compression=None,
    index=False
)

## Q2. Preparing the output
What is the size of the output?

In [45]:
# Report the on-disk size of the saved predictions; st_size is in bytes and is
# divided by 1024 * 1024 before printing.
bytes_per_unit = 1024 * 1024
file_stats = os.stat(output_file)
size_in_units = file_stats.st_size / bytes_per_unit
print(f'File Size in MegaBytes is {size_in_units:.1f}')

File Size in MegaBytes is 65.5


## Q3. Creating the scoring script
Now let's turn the notebook into a script. Which command do you need to execute for that?

```bash
jupyter nbconvert --to script starter.ipynb
```