# Storage cost of GTFS-RT Data

In [41]:
import pandas as pd
import os
import time
from IPython.display import Markdown
import time
from google.transit import gtfs_realtime_pb2
import requests
from google.protobuf.json_format import MessageToJson, MessageToDict

## Reading in some sample GTFS-RT

In [38]:
# Download the GTFS for big blue
FEED_URL = 'http://gtfs.bigbluebus.com/tripupdates.bin'

feed = gtfs_realtime_pb2.FeedMessage()
response = requests.get(FEED_URL)
feed.ParseFromString(response.content)

82938

In [43]:
# dict version
trip_updates_dict = MessageToDict(feed)

In [50]:
# extrememly naive conversion to DF for writing parquet 
df = pd.DataFrame(trip_updates_dict['entity'])
df.head()

Unnamed: 0,id,tripUpdate
0,846423_1323_55620,"{'trip': {'tripId': '846423', 'startTime': '15..."
1,845102_1704_54840,"{'trip': {'tripId': '845102', 'startTime': '15..."
2,845919_1707_51540,"{'trip': {'tripId': '845919', 'startTime': '14..."
3,845104_1564_56640,"{'trip': {'tripId': '845104', 'startTime': '15..."
4,845028_5310_54300,"{'trip': {'tripId': '845028', 'startTime': '15..."


In [51]:
df.to_parquet('./sample_trip_updates.parquet')

ArrowInvalid: Nested column branch had multiple children: struct<stopTimeUpdate: list<item: struct<arrival: struct<time: string, uncertainty: int64>, departure: struct<time: string, uncertainty: int64>, scheduleRelationship: string, stopId: string, stopSequence: int64>>, timestamp: string, trip: struct<directionId: int64, routeId: string, scheduleRelationship: string, startDate: string, startTime: string, tripId: string>, vehicle: struct<id: string>>

In [2]:
df = pandas.read_parquet("atsac_det_data_20200311.parquet")

In [3]:
df = df.assign(
    TIMESTAMP=pandas.to_datetime(df.TIMESTAMP),
    ANGLE=df.ANGLE.astype("float32"),
)

## I/O Stats

In [4]:
def parquet_stats(df, **kwargs):
    display(Markdown(f"##### args: {kwargs}"))
    fname = "out.parquet"
    s = time.time()
    df.to_parquet(fname, **kwargs)
    e = time.time()
    display(Markdown(f"##### Write time: {e-s:.0f} seconds"))
    s = time.time()
    pandas.read_parquet(fname)
    e = time.time()
    display(Markdown(f"##### Read time: {e-s:.0f} seconds"))
    size = os.stat(fname).st_size/1024/1024
    display(Markdown(f"##### Storage: {size:.0f} MB"))


### Baseline value (snappy compression, dictionary encoding)

In [5]:
parquet_stats(df)

##### args: {}

##### Write time: 31 seconds

##### Read time: 79 seconds

##### Storage: 313 MB

### Snappy compression, no dictionary encoding

In [6]:
parquet_stats(df, use_dictionary=False, compression='snappy')

##### args: {'use_dictionary': False, 'compression': 'snappy'}

##### Write time: 72 seconds

##### Read time: 92 seconds

##### Storage: 995 MB

### gzip compression, no dictionary encoding

In [7]:
parquet_stats(df, use_dictionary=False, compression='gzip')

##### args: {'use_dictionary': False, 'compression': 'gzip'}

##### Write time: 192 seconds

##### Read time: 88 seconds

##### Storage: 426 MB

### gzip compression, dictionary encoding

In [8]:
parquet_stats(df, use_dictionary=True, compression='snappy')

##### args: {'use_dictionary': True, 'compression': 'snappy'}

##### Write time: 66 seconds

##### Read time: 86 seconds

##### Storage: 313 MB

### No compression, no dictionary encoding

In [9]:
parquet_stats(df, use_dictionary=False, compression='none')

##### args: {'use_dictionary': False, 'compression': 'none'}

##### Write time: 60 seconds

##### Read time: 114 seconds

##### Storage: 5965 MB

### No compression, dictionary encoding

In [10]:
parquet_stats(df, use_dictionary=True, compression='none')

##### args: {'use_dictionary': True, 'compression': 'none'}

##### Write time: 66 seconds

##### Read time: 92 seconds

##### Storage: 795 MB

## Storage costs

The above shows that the industry standard of dictionary encoding with snappy compression
is an excellent combination of I/O performance and on-disk size. We will use that
as the representative storage size when estimating cloud storage costs:

In [11]:
s3_cost_per_GB_per_month = 0.023
s3_cost_per_GB_per_year = s3_cost_per_GB_per_month * 12.
daily_data_size = 0.313
yearly_data_size = daily_data_size * 365

s3_cost_per_year = yearly_data_size * s3_cost_per_GB_per_year
display(Markdown(f"#### Approximate size of one year of data: {yearly_data_size:.0f} GB"))
display(Markdown(f"#### Approximate yearly cost per year of data: ${s3_cost_per_year:.0f}"))

#### Approximate size of one year of data: 114 GB

#### Approximate yearly cost per year of data: $32

These costs assume the data is reported at 30 minute intervals.
We can use them to approximately scale to different intervals.
For instance, if the data are reported once an hour,
the costs are halved.
If the data are reported every five minutes,
the costs go up by a factor of six.