In [2]:
!pip freeze 

alembic==1.13.1
aniso8601==9.0.1
anyio==4.4.0
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
arrow==1.3.0
asttokens==2.4.1
async-lru==2.0.4
attrs==23.2.0
Babel==2.15.0
beautifulsoup4==4.12.3
bleach==6.1.0
blinker==1.8.2
boto3==1.34.113
botocore==1.34.113
cachetools==5.3.3
certifi==2024.2.2
cffi==1.16.0
charset-normalizer==3.3.2
click==8.1.7
cloudpickle==3.0.0
colorama==0.4.6
comm==0.2.2
contourpy==1.2.1
cramjam==2.8.3
cycler==0.12.1
debugpy==1.8.1
decorator==5.1.1
defusedxml==0.7.1
Deprecated==1.2.14
docker==7.1.0
entrypoints==0.4
exceptiongroup==1.2.1
executing==2.0.1
fastjsonschema==2.19.1
fastparquet==2024.5.0
Flask==3.0.3
fonttools==4.51.0
fqdn==1.5.1
fsspec==2024.5.0
future==1.0.0
gitdb==4.0.11
GitPython==3.1.43
graphene==3.3
graphql-core==3.2.3
graphql-relay==3.2.0
greenlet==3.0.3
h11==0.14.0
httpcore==1.0.5
httpx==0.27.0
hyperopt==0.2.7
idna==3.7
importlib-metadata==7.0.0
importlib_resources==6.4.0
ipykernel==6.29.4
ipython==8.18.1
ipywidgets==8.1.2
isoduration==20.11.0
itsdan

In [3]:
# Record the Python version used for this run.
# NOTE(review): `python` is resolved through the shell PATH, which may not be
# the interpreter backing this kernel — confirm they match.
!python -V

Python 3.9.13


In [24]:
import pickle
import pandas as pd
import math
import os

In [5]:
# Load the pre-trained artifacts from 'model.bin'. The pickle holds a two-item
# sequence that is unpacked into `dv` (used below via `dv.transform`) and
# `model` (used below via `model.predict`).
# SECURITY: pickle.load can execute arbitrary code while deserializing; only
# load a model.bin that comes from a trusted source.
with open('model.bin', 'rb') as f_in:
    dv, model = pickle.load(f_in)

In [6]:
# Columns treated as categorical features (cast to strings by read_data).
categorical = ['PULocationID', 'DOLocationID']


def read_data(filename):
    """Read a trip-record parquet file and prepare it for prediction.

    Steps:
      1. Load the parquet file at ``filename`` (path or URL).
      2. Add a ``duration`` column: drop-off minus pick-up time, in minutes.
      3. Keep only trips whose duration is between 1 and 60 minutes inclusive.
      4. Fill missing values in the ``categorical`` columns with -1 and cast
         them to integer-valued strings.

    Returns the filtered DataFrame; the original index labels are preserved.
    """
    trips = pd.read_parquet(filename)

    elapsed = trips.tpep_dropoff_datetime - trips.tpep_pickup_datetime
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # Copy after filtering so the column assignment below works on an
    # independent frame rather than a view of the unfiltered data.
    within_range = trips.duration.between(1, 60)
    trips = trips[within_range].copy()

    as_labels = trips[categorical].fillna(-1).astype('int').astype('str')
    trips[categorical] = as_labels

    return trips

In [7]:
# Fetch and preprocess the yellow-taxi trip file for 2023-03 straight from the
# URL; read_data keeps only trips lasting between 1 and 60 minutes.
df = read_data('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet')

In [8]:
# One dict per trip, mapping each categorical column name to its string value.
dicts = df[categorical].to_dict(orient='records')
# Encode the records with the loaded vectorizer.
# NOTE(review): presumably a fitted scikit-learn DictVectorizer — confirm.
X_val = dv.transform(dicts)
# Predicted trip durations, one per row of df.
# NOTE(review): presumably in minutes, like the `duration` column — confirm.
y_pred = model.predict(X_val)

In [9]:
# Arithmetic mean of the predictions, computed with the builtin sum().
# NOTE(review): if y_pred is a NumPy array, y_pred.mean() would be a
# vectorized equivalent — confirm the type before switching.
mean_duration = sum(y_pred) / len(y_pred)

In [10]:
# Population variance of the predictions: the mean squared deviation from
# mean_duration (divides by N, not N - 1).
variance = sum(
    (prediction - mean_duration) ** 2
    for prediction in y_pred
) / len(y_pred)

In [12]:
# Population standard deviation: square root of the variance computed above
# (which divides by the number of predictions).
standard_deviation = math.sqrt(variance)

print("Standard Deviation of the predicted durations:", standard_deviation)

Standard Deviation of the predicted durations: 6.2474888522395675


In [16]:
# Period used as the prefix of every ride_id.
# NOTE: the data URL passed to read_data hard-codes 2023-03 separately; keep
# these values in sync with it.
year = 2023
month = 3

In [17]:
# Add a ride identifier of the form 'YYYY/MM_<index label>' as a new column on
# df. read_data filters rows without resetting the index, so the labels are the
# original row positions and are not contiguous.
df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str')

In [18]:
# Display the generated identifiers (pandas truncates the repr of long Series).
df['ride_id']

0                2023/03_0
1                2023/03_1
2                2023/03_2
3                2023/03_3
4                2023/03_4
                ...       
3403761    2023/03_3403761
3403762    2023/03_3403762
3403763    2023/03_3403763
3403764    2023/03_3403764
3403765    2023/03_3403765
Name: ride_id, Length: 3316216, dtype: object

In [19]:
# Start from an empty frame; its columns are assigned in the next cell.
df_result = pd.DataFrame()

In [20]:
# Assigning the ride_id Series to the empty frame gives df_result the same
# index as df; the predictions are then attached positionally, so y_pred must
# have exactly one value per row of df.
df_result['ride_id'] = df['ride_id']
df_result['predictions'] = y_pred

In [21]:
# Preview the first five rows of the result frame.
df_result.head()

Unnamed: 0,ride_id,predictions
0,2023/03_0,16.245906
1,2023/03_1,26.134796
2,2023/03_2,11.884264
3,2023/03_3,11.99772
4,2023/03_4,10.234486


In [22]:
# Destination path for the saved predictions.
# NOTE(review): the file is written with to_parquet below, so its content is
# Parquet despite the '.bin' extension — consider a '.parquet' name.
output_file = 'result.bin'

In [23]:
# Persist ride_id/predictions as an uncompressed Parquet file via pyarrow,
# leaving the DataFrame index out of the file.
df_result.to_parquet(output_file, engine='pyarrow', index=False, compression=None)

In [25]:
# Size of the written output file on disk, in bytes.
file_size = os.path.getsize(output_file)

In [26]:
# Show the raw size in bytes.
print(file_size)

68641783


In [28]:
 # Convert bytes to mebibytes (1 MiB = 1024 * 1024 bytes); divide by 1_000_000 instead for decimal megabytes.
file_size_mb = file_size / (1024 * 1024)
file_size_mb

65.46190547943115