In [1]:
import os
# Sets the CALITP_BQ_MAX_BYTES environment variable to 800_000_000_000 bytes,
# i.e. 800 GB in decimal units (about 745 GiB).
# NOTE(review): presumably a BigQuery bytes-scanned cap read by the calitp package;
# it is set before the calitp imports below - confirm whether that ordering is required.
os.environ["CALITP_BQ_MAX_BYTES"] = str(800_000_000_000)  # 800 GB (decimal)

from calitp.tables import tbl
from calitp import query_sql
import calitp.magics
import branca

import shared_utils
import utils

from siuba import *
import pandas as pd

import datetime as dt
import time
from zoneinfo import ZoneInfo

import importlib

import gcsfs
fs = gcsfs.GCSFileSystem()

from tqdm import tqdm_notebook
from tqdm.notebook import trange, tqdm

from IPython.display import display, Markdown



In [2]:
# Show up to 100 columns when a DataFrame is rendered.
pd.options.display.max_columns = 100

In [3]:
# Analysis window as ISO date strings; both are interpolated into the
# trip-updates query as the inclusive BETWEEN bounds and reused to filter `trips`.
analysis_date1 = '2022-05-25'
analysis_date2 = '2022-05-26'
# Cal-ITP operator id under analysis (the trips table shows 300 as "Big Blue Bus").
itp_id = 300


In [4]:
# Pull a capped sample of trip updates for the operator within the analysis window.
# Caveat: LIMIT without ORDER BY lets the warehouse return any 2000 matching rows,
# so the sample is not guaranteed to cover both dates or to repeat between runs.
trip_updates_sql = f'''
SELECT * FROM `cal-itp-data-infra-staging.natalie_staging.stg_rt__trip_updates`
WHERE date BETWEEN '{analysis_date1}' AND '{analysis_date2}'
AND calitp_itp_id = {itp_id}
LIMIT 2000
'''
updates = query_sql(trip_updates_sql)

In [5]:
# Preview the first five rows of the trip-updates sample.
updates.head(n=5)

Unnamed: 0,calitp_itp_id,calitp_url_number,original_file_path,date,id,timestamp,delay,vehicle_id,vehicle_label,vehicle_license_plate,trip_id,trip_route_id,trip_direction_id,trip_start_time,trip_start_date,trip_schedule_relationship,stop_time_updates,key
0,300,0,gtfs-data/rt/2022-05-25T21:00:01/300/0/gtfs_rt...,2022-05-25,881544_1356_47520,1653512301,,1356,,,881544,3483,1,13:12:00,20220525,SCHEDULED,"[{'stopSequence': 44, 'stopId': '441', 'arriva...",fd662beb31e46345a9202397684c61cf
1,300,0,gtfs-data/rt/2022-05-25T21:00:01/300/0/gtfs_rt...,2022-05-25,882601_1319_48600,1653512197,,1319,,,882601,3489,1,13:30:00,20220525,SCHEDULED,"[{'stopSequence': 17, 'stopId': '1409', 'arriv...",50469620c374658e0421f071864aa0df
2,300,0,gtfs-data/rt/2022-05-25T21:00:01/300/0/gtfs_rt...,2022-05-25,883101_1709_47400,1653512169,,1709,,,883101,3493,0,13:10:00,20220525,SCHEDULED,"[{'stopSequence': 36, 'stopId': '1653', 'arriv...",378c8a7b4850b62f3672f112d60be112
3,300,0,gtfs-data/rt/2022-05-25T21:00:01/300/0/gtfs_rt...,2022-05-25,882516_1332_48960,1653512161,,1332,,,882516,3489,0,13:36:00,20220525,SCHEDULED,"[{'stopSequence': 14, 'stopId': '1184', 'arriv...",22ac405ec79a84d15961911297334fa5
4,300,0,gtfs-data/rt/2022-05-25T21:00:01/300/0/gtfs_rt...,2022-05-25,882882_1305_48240,1653512217,,1305,,,882882,3490,1,13:24:00,20220525,SCHEDULED,"[{'stopSequence': 34, 'stopId': '1348', 'arriv...",6f1348482838ad8f2246a1d3332f6b0b


In [6]:
# Distribution of schedule-relationship values in the sample.
updates["trip_schedule_relationship"].value_counts()

SCHEDULED    2000
Name: trip_schedule_relationship, dtype: int64

In [7]:
# Rows per date: shows which dates of the requested window the sample covers.
(
    updates
    >> count(_.date)
)

Unnamed: 0,date,n
0,2022-05-25,2000


In [8]:
# Number of update rows per trip_id.
updates["trip_id"].value_counts()

881486    12
882518    12
883343    12
883342    12
880853    12
          ..
881544     4
881900     2
882601     1
883144     1
882516     1
Name: trip_id, Length: 185, dtype: int64

In [9]:
# Number of update rows per trip direction.
updates["trip_direction_id"].value_counts()

0    1011
1     989
Name: trip_direction_id, dtype: int64

In [10]:
# Number of update rows per vehicle_id.
updates["vehicle_id"].value_counts()

2912                            35
block_4304_schedBasedVehicle    34
1822                            34
1824                            34
1821                            34
                                ..
1812                            11
1505                            11
1330                            11
1352                            11
2104                            10
Name: vehicle_id, Length: 100, dtype: int64

In [11]:
# Column dtypes and non-null counts for the trip-updates sample.
updates.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 18 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   calitp_itp_id               2000 non-null   int64 
 1   calitp_url_number           2000 non-null   int64 
 2   original_file_path          2000 non-null   object
 3   date                        2000 non-null   object
 4   id                          2000 non-null   object
 5   timestamp                   2000 non-null   int64 
 6   delay                       0 non-null      object
 7   vehicle_id                  2000 non-null   object
 8   vehicle_label               0 non-null      object
 9   vehicle_license_plate       0 non-null      object
 10  trip_id                     2000 non-null   object
 11  trip_route_id               2000 non-null   object
 12  trip_direction_id           2000 non-null   int64 
 13  trip_start_time             2000 non-null   obje

In [12]:
# Load the trips table through the project-local utils.read_data() helper.
# NOTE(review): per the original note this holds vehicle-position and scheduled-trip
# data; the helper's data source is not visible from this notebook - confirm.
trips = utils.read_data()

In [13]:
# Narrow `trips` to the operator under analysis (itp_id; Big Blue Bus in this run)
# and to service dates from analysis_date1 through analysis_date2, inclusive.
# This rebinds `trips`, so the unfiltered table is only available by re-running
# the read_data() cell.

trips = trips >> filter(
    _.calitp_itp_id == itp_id,
    _.service_date >= analysis_date1,
    _.service_date <= analysis_date2,
)

In [14]:
# Rows per service date after filtering.
(
    trips
    >> count(_.service_date)
)

Unnamed: 0,service_date,n
0,2022-05-25,19
1,2022-05-26,19


In [15]:
# Total number of rows left in `trips`.
trips.shape[0]

38

In [16]:
# Preview the first five rows of the filtered trips table.
trips.head(n=5)

Unnamed: 0,calitp_itp_id,agency_name,calitp_url_number,route_id,route_short_name,service_date,calitp_extracted_at,calitp_deleted_at,num_sched,num_vp,pct_w_vp,weekday,month
263,300,Big Blue Bus,0,3501,44,2022-05-25,2022-03-08,2022-09-01,104,0,0.0,Wednesday,May
268,300,Big Blue Bus,0,3501,44,2022-05-26,2022-03-08,2022-09-01,104,0,0.0,Thursday,May
2884,300,Big Blue Bus,0,3488,R10,2022-05-25,2022-03-08,2022-09-01,6,5,0.83,Wednesday,May
2891,300,Big Blue Bus,0,3488,R10,2022-05-26,2022-03-08,2022-09-01,6,6,1.0,Thursday,May
4245,300,Big Blue Bus,0,3483,5,2022-05-26,2022-03-08,2022-09-01,24,21,0.88,Thursday,May


In [17]:
# Show one randomly chosen update row.
updates.sample(n=1)

Unnamed: 0,calitp_itp_id,calitp_url_number,original_file_path,date,id,timestamp,delay,vehicle_id,vehicle_label,vehicle_license_plate,trip_id,trip_route_id,trip_direction_id,trip_start_time,trip_start_date,trip_schedule_relationship,stop_time_updates,key
298,300,0,gtfs-data/rt/2022-05-25T21:00:41/300/0/gtfs_rt...,2022-05-25,880850_1315_49200,1653512364,,1315,,,880850,3479,1,13:40:00,20220525,SCHEDULED,"[{'stopSequence': 14, 'stopId': '317', 'arriva...",8a9db446d07d22e8f59df2f4b511cf01


In [18]:
## joining updates and trips

In [19]:
# Rows per date (pandas view), checked before joining on a single date.
updates["date"].value_counts()

2022-05-25    2000
Name: date, dtype: int64

In [20]:
# Same per-date row count, expressed with siuba.
(
    updates
    >> count(_.date)
)

Unnamed: 0,date,n
0,2022-05-25,2000


In [21]:
# dtype of the date column; it determines what type the date filter must compare against.
updates["date"].dtype

dtype('O')

In [22]:
import datetime

In [23]:
# `updates.date` is an object column that matches datetime.date values (not the
# ISO strings used for the SQL query), so build the date object from
# analysis_date1 instead of repeating the literal; the two can no longer drift
# apart when the analysis window in the config cell is changed.
single_analysis_date = datetime.date.fromisoformat(analysis_date1)

In [24]:
# How many update rows fall on the single analysis date.
updates_on_date = updates >> filter(_.date == single_analysis_date)
len(updates_on_date)

2000

In [25]:
# Outer-join the first analysis date's trips rows to that date's trip updates.
# Note: the merge key is route_id (changed from the earlier trip_id merge).
# The `have_updates` indicator records whether each row came from the left
# frame only, the right frame only, or both.
trips_on_date = trips >> filter(_.service_date == analysis_date1)
updates_on_date = updates >> filter(_.date == single_analysis_date)

join = pd.merge(
    trips_on_date,
    updates_on_date,
    how='outer',
    left_on='route_id',
    right_on='trip_route_id',
    indicator='have_updates',
)

In [26]:
# Preview the first five rows of the merged frame.
join.head(n=5)

Unnamed: 0,calitp_itp_id_x,agency_name,calitp_url_number_x,route_id,route_short_name,service_date,calitp_extracted_at,calitp_deleted_at,num_sched,num_vp,pct_w_vp,weekday,month,calitp_itp_id_y,calitp_url_number_y,original_file_path,date,id,timestamp,delay,vehicle_id,vehicle_label,vehicle_license_plate,trip_id,trip_route_id,trip_direction_id,trip_start_time,trip_start_date,trip_schedule_relationship,stop_time_updates,key,have_updates
0,300,Big Blue Bus,0,3501,44,2022-05-25,2022-03-08,2022-09-01,104,0,0.0,Wednesday,May,,,,,,,,,,,,,,,,,,,left_only
1,300,Big Blue Bus,0,3488,R10,2022-05-25,2022-03-08,2022-09-01,6,5,0.83,Wednesday,May,,,,,,,,,,,,,,,,,,,left_only
2,300,Big Blue Bus,0,3483,5,2022-05-25,2022-03-08,2022-09-01,24,18,0.75,Wednesday,May,300.0,0.0,gtfs-data/rt/2022-05-25T21:00:01/300/0/gtfs_rt...,2022-05-25,881544_1356_47520,1653512301.0,,1356.0,,,881544.0,3483.0,1.0,13:12:00,20220525.0,SCHEDULED,"[{'stopSequence': 44, 'stopId': '441', 'arriva...",fd662beb31e46345a9202397684c61cf,both
3,300,Big Blue Bus,0,3483,5,2022-05-25,2022-03-08,2022-09-01,24,18,0.75,Wednesday,May,300.0,0.0,gtfs-data/rt/2022-05-25T21:00:21/300/0/gtfs_rt...,2022-05-25,881544_1356_47520,1653512301.0,,1356.0,,,881544.0,3483.0,1.0,13:12:00,20220525.0,SCHEDULED,"[{'stopSequence': 45, 'stopId': '1345', 'arriv...",939af2b8936c7c0a3281a56381a65fce,both
4,300,Big Blue Bus,0,3483,5,2022-05-25,2022-03-08,2022-09-01,24,18,0.75,Wednesday,May,300.0,0.0,gtfs-data/rt/2022-05-25T21:00:41/300/0/gtfs_rt...,2022-05-25,881544_1356_47520,1653512301.0,,1356.0,,,881544.0,3483.0,1.0,13:12:00,20220525.0,SCHEDULED,"[{'stopSequence': 45, 'stopId': '1345', 'arriv...",381edeecb63b4ae29506baa6100a48af,both


In [27]:
# Merge outcome counts: both / left_only / right_only.
join["have_updates"].value_counts()

both          2000
left_only        2
right_only       0
Name: have_updates, dtype: int64

In [28]:
# Rows from `trips` that found no trip update with the same route id.
join >> filter(_.have_updates == 'left_only')

Unnamed: 0,calitp_itp_id_x,agency_name,calitp_url_number_x,route_id,route_short_name,service_date,calitp_extracted_at,calitp_deleted_at,num_sched,num_vp,pct_w_vp,weekday,month,calitp_itp_id_y,calitp_url_number_y,original_file_path,date,id,timestamp,delay,vehicle_id,vehicle_label,vehicle_license_plate,trip_id,trip_route_id,trip_direction_id,trip_start_time,trip_start_date,trip_schedule_relationship,stop_time_updates,key,have_updates
0,300,Big Blue Bus,0,3501,44,2022-05-25,2022-03-08,2022-09-01,104,0,0.0,Wednesday,May,,,,,,,,,,,,,,,,,,,left_only
1,300,Big Blue Bus,0,3488,R10,2022-05-25,2022-03-08,2022-09-01,6,5,0.83,Wednesday,May,,,,,,,,,,,,,,,,,,,left_only


Note (how the query `LIMIT` and the date filter change the number of unmatched rows):
* With a limit of 1000, there were three trips that did not match.
* With a limit of 2000 and two dates, there were two trips that did not match.
* With a limit of 2000 and one date, there was one trip that did not match.

In [29]:
# Show one randomly chosen row of the merged frame.
join.sample(n=1)

Unnamed: 0,calitp_itp_id_x,agency_name,calitp_url_number_x,route_id,route_short_name,service_date,calitp_extracted_at,calitp_deleted_at,num_sched,num_vp,pct_w_vp,weekday,month,calitp_itp_id_y,calitp_url_number_y,original_file_path,date,id,timestamp,delay,vehicle_id,vehicle_label,vehicle_license_plate,trip_id,trip_route_id,trip_direction_id,trip_start_time,trip_start_date,trip_schedule_relationship,stop_time_updates,key,have_updates
1970,300,Big Blue Bus,0,3479,1,2022-05-25,2022-03-08,2022-09-01,180,179,0.99,Wednesday,May,300.0,0.0,gtfs-data/rt/2022-05-25T21:00:41/300/0/gtfs_rt...,2022-05-25,880851_1344_48600,1653512371.0,,1344,,,880851,3479,1.0,13:30:00,20220525,SCHEDULED,"[{'stopSequence': 23, 'stopId': '325', 'arriva...",dc02d99f38a91b370cf52dc52e35da63,both


In [30]:
# Non-null summary of `delay` among rows matched on both sides.
matched_rows = join >> filter(_.have_updates == 'both')
matched_rows["delay"].info()

<class 'pandas.core.series.Series'>
Int64Index: 2000 entries, 2 to 2001
Series name: delay
Non-Null Count  Dtype 
--------------  ----- 
0 non-null      object
dtypes: object(1)
memory usage: 31.2+ KB


In [31]:
# Summary statistics for `delay` over the whole merged frame.
join["delay"].describe()

count       0
unique      0
top       NaN
freq      NaN
Name: delay, dtype: object

In [32]:
# Schedule-relationship values present after the merge.
join["trip_schedule_relationship"].value_counts()

SCHEDULED    2000
Name: trip_schedule_relationship, dtype: int64

In [33]:
# Matched rows per trip_id, largest counts first.
(
    join
    >> filter(_.have_updates == 'both')
    >> count(_.trip_id)
    >> arrange(-_.n)
)

Unnamed: 0,trip_id,n
1,880757,12
2,880758,12
3,880759,12
5,880761,12
6,880762,12
...,...,...
133,883101,4
62,881900,2
90,882516,1
98,882601,1


In [34]:
# Matched rows per route_id, largest counts first.
(
    join
    >> filter(_.have_updates == 'both')
    >> count(_.route_id)
    >> arrange(-_.n)
)

Unnamed: 0,route_id,n
0,3479,280
2,3481,190
7,3489,172
8,3490,153
4,3485,138
13,3495,136
11,3493,127
6,3487,123
1,3480,118
5,3486,113


In [35]:
# Show one more randomly chosen row of the merged frame.
join.sample(n=1)

Unnamed: 0,calitp_itp_id_x,agency_name,calitp_url_number_x,route_id,route_short_name,service_date,calitp_extracted_at,calitp_deleted_at,num_sched,num_vp,pct_w_vp,weekday,month,calitp_itp_id_y,calitp_url_number_y,original_file_path,date,id,timestamp,delay,vehicle_id,vehicle_label,vehicle_license_plate,trip_id,trip_route_id,trip_direction_id,trip_start_time,trip_start_date,trip_schedule_relationship,stop_time_updates,key,have_updates
329,300,Big Blue Bus,0,3494,18,2022-05-25,2022-03-08,2022-09-01,53,52,0.98,Wednesday,May,300.0,0.0,gtfs-data/rt/2022-05-25T21:02:22/300/0/gtfs_rt...,2022-05-25,883260_block_1804_schedBasedVehicle_55740,1653512523.0,,block_1804_schedBasedVehicle,,,883260,3494,0.0,15:29:00,20220525,SCHEDULED,"[{'stopSequence': 1, 'stopId': '1410', 'arriva...",92bda038de29f9d7e1b2ff7db342bd6a,both


In [36]:
from shared_utils import geography_utils

In [37]:
# Aggregate the matched rows (present in both trips and updates) per route and
# service date using the shared aggregate_by_geography helper.
# NOTE(review): the rename below implies the helper emits a `trip_route_id_count`
# column when rename_cols=True - confirm against shared_utils.geography_utils.
matched_rows = join >> filter(_.have_updates == "both")

route_group_cols = [
    "calitp_itp_id_x",
    "agency_name",
    "calitp_url_number_x",
    "route_id",
    "route_short_name",
    "service_date",
    "num_sched",
    "num_vp",
    "pct_w_vp",
    "weekday",
    "month",
]

updates_agg = geography_utils.aggregate_by_geography(
    matched_rows,
    group_cols=route_group_cols,
    count_cols=["trip_route_id"],
    nunique_cols=["delay"],
    rename_cols=True,
).rename(columns={"trip_route_id_count": "num_trip_updates"})

In [38]:
# Keep only the identifying columns and the three measures to be compared.
updates_agg_subset = updates_agg >> select(
    _.agency_name,
    _.service_date,
    _.num_sched,
    _.num_vp,
    _.num_trip_updates,
)

In [39]:
import altair as alt

In [40]:
# Reshape to long form: one row per (agency, service date, measure) with the
# measure name in `measure` and its number in `value`.
# updates_agg_subset already holds exactly agency_name, service_date and the
# three measure columns, so it is gathered directly without re-selecting them.
updates_agg_long = updates_agg_subset >> gather(
    'measure',
    'value',
    _.num_sched,
    _.num_vp,
    _.num_trip_updates,
)

In [41]:
# Show one randomly chosen row of the long-form table.
updates_agg_long.sample(n=1)

Unnamed: 0,agency_name,service_date,measure,value
28,Big Blue Bus,2022-05-25,num_vp,88


In [42]:
# Bar chart of the total of each measure over all rows of updates_agg_long.
# The aggregation is written out as sum(value) so each measure is drawn as a
# single bar instead of relying on Altair stacking one segment per input row
# (the bar heights are the same). Axis and chart titles are added so the figure
# can be read on its own.
alt.Chart(updates_agg_long).mark_bar().encode(
    x=alt.X('measure:N', title='Measure'),
    y=alt.Y('sum(value):Q', title='Total across aggregated rows'),
    color='measure:N',
).properties(
    title='Totals of num_sched, num_vp and num_trip_updates'
)