In [1]:
import os
# 800 * 10**9 bytes = 800 GB (decimal). Set before the calitp imports below.
# NOTE(review): presumably calitp reads this as its BigQuery byte limit - confirm.
os.environ["CALITP_BQ_MAX_BYTES"] = f"{800 * 10**9}"

from calitp.tables import tbl
from calitp import query_sql, magics
import calitp.magics
import branca

import shared_utils

from siuba import *
import pandas as pd

import datetime as dt
import time
from zoneinfo import ZoneInfo

import importlib

import gcsfs
# Google Cloud Storage filesystem handle. `fs` is not referenced by any cell
# visible in this notebook.
fs = gcsfs.GCSFileSystem()

from tqdm import tqdm_notebook
from tqdm.notebook import trange, tqdm

from IPython.display import display, Markdown




In [2]:
import utils

In [3]:
# Cal-ITP agency ID under analysis; in the query output below, 300 is "Big Blue Bus".
# Only the commented-out helper calls near the end of the notebook reference it;
# the query that loads rt_sched does not filter on it.
itp_id = 300

In [4]:
# Show up to 100 columns when a DataFrame is rendered.
pd.options.display.max_columns = 100


In [5]:
# Pull every row and column of the RT-vs-schedule route view (staging project).
# The query text is kept exactly as before; it is only named for readability.
rt_vs_sched_sql = """
SELECT *
FROM `cal-itp-data-infra-staging.natalie_views.gtfs_rt_vs_sched_routes`
"""
rt_sched = query_sql(rt_vs_sched_sql)



In [6]:
# Preview the first rows: one row per route and service date, with scheduled
# trips (num_sched), trips with vehicle positions (num_vp) and their ratio.
rt_sched.head()

Unnamed: 0,calitp_itp_id,agency_name,calitp_url_number,route_id,route_short_name,service_date,num_sched,num_vp,pct_w_vp
0,300,Big Blue Bus,0,3479,1,2022-06-26,116,73,0.62931
1,300,Big Blue Bus,0,3479,1,2022-06-12,116,80,0.689655
2,300,Big Blue Bus,0,3479,1,2022-06-19,116,63,0.543103
3,300,Big Blue Bus,0,3479,1,2022-04-10,116,75,0.646552
4,300,Big Blue Bus,0,3479,1,2022-04-17,116,81,0.698276


In [7]:
# Check dtypes and null counts. service_date arrives as object (string), which
# is why it is converted with pd.to_datetime in the next cell.
rt_sched.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1491 entries, 0 to 1490
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   calitp_itp_id      1491 non-null   int64  
 1   agency_name        1491 non-null   object 
 2   calitp_url_number  1491 non-null   int64  
 3   route_id           1491 non-null   object 
 4   route_short_name   1491 non-null   object 
 5   service_date       1491 non-null   object 
 6   num_sched          1491 non-null   int64  
 7   num_vp             1491 non-null   int64  
 8   pct_w_vp           1491 non-null   float64
dtypes: float64(1), int64(4), object(4)
memory usage: 105.0+ KB


In [8]:
# Parse the string dates so the .dt accessor can be used in the following cells.
rt_sched["service_date"] = pd.to_datetime(rt_sched.service_date)

In [9]:
# Day-of-week name (e.g. "Monday") for each service date.
rt_sched["weekday"] = rt_sched["service_date"].dt.day_name()

In [10]:
# Month name (e.g. "June") for each service date.
rt_sched["month"] = rt_sched["service_date"].dt.month_name()

In [11]:
# Spot-check one random row to confirm the new weekday/month columns.
# No random_state is passed, so the row shown changes between runs.
rt_sched.sample()

Unnamed: 0,calitp_itp_id,agency_name,calitp_url_number,route_id,route_short_name,service_date,num_sched,num_vp,pct_w_vp,weekday,month
1333,300,Big Blue Bus,0,3498,41,2022-05-12,41,41,1.0,Thursday,May


## Charting

In [12]:
import altair as alt
from dla_utils import _dla_utils

from shared_utils import altair_utils
from shared_utils import geography_utils
from shared_utils import calitp_color_palette as cp
from shared_utils import styleguide

In [13]:
# Another random spot-check of rt_sched (same call as the earlier sample cell;
# unseeded, so output is not reproducible).
rt_sched.sample()

Unnamed: 0,calitp_itp_id,agency_name,calitp_url_number,route_id,route_short_name,service_date,num_sched,num_vp,pct_w_vp,weekday,month
1167,300,Big Blue Bus,0,3495,R3,2022-06-20,51,44,0.86,Monday,June


In [14]:
# Aggregate the route-level rows by date using the project helper.
# NOTE(review): judging by the day_pct sample output further down, the result has
# one row per service_date with total_num_sched, total_num_vp and pct_w_vp -
# confirm against utils.agg_by_date.
day_pct = utils.agg_by_date(rt_sched, 'num_sched', 'num_vp')

In [15]:
# Sanity check: count rows per service_date, largest count first. Any n > 1
# would mean a date appears more than once in day_pct.
(
    day_pct
    >> count(_.service_date)
    >> arrange(-_.n)
)

Unnamed: 0,service_date,n
0,2022-04-01,1
1,2022-04-02,1
2,2022-04-03,1
3,2022-04-04,1
4,2022-04-05,1
...,...,...
86,2022-06-26,1
87,2022-06-27,1
88,2022-06-28,1
89,2022-06-29,1


In [16]:
# Daily share of scheduled trips that had RT vehicle-position data:
# one bar per service date, coloured by that same percentage.
bar = (
    alt.Chart(day_pct)
    .mark_bar(size=5)
    .encode(
        x=alt.X('service_date', title=utils.labeling('service_date'), sort=("x")),
        y=alt.Y('pct_w_vp:Q', title=utils.labeling('pct_w_vp'), axis=alt.Axis(format='%')),
        color=alt.Color(
            'pct_w_vp',
            scale=alt.Scale(range=altair_utils.CALITP_SEQUENTIAL_COLORS),
            legend=alt.Legend(title=(_dla_utils.labeling('pct_w_vp')), symbolLimit=10),
        ),
    )
    # Plain title text. The previous literal wrapped the text in an extra pair of
    # double quotes, so the quote characters were rendered in the chart title.
    .properties(title="Percent of Scheduled Trips with RT Vehicle Position Data")
)

# Apply the shared styleguide preset, then widen the chart for the ~90 daily bars.
chart = styleguide.preset_chart_config(bar)
chart.properties(width=900)

In [17]:
# Daily percentage bars (same data as the previous chart, without the colour scale).
bar = (
    alt.Chart(day_pct)
    .mark_bar(size=5)
    .encode(
        x=alt.X('service_date', title=utils.labeling('service_date'), sort=("x")),
        y=alt.Y('pct_w_vp', title=utils.labeling('pct_w_vp')),
    )
    .properties(title="Percent of Scheduled Trips with RT Vehicle Position Data")
)

# Mean line. It is computed from day_pct, the frame the bars are drawn from, so
# the rule marks the average of the plotted daily values. It previously used
# rt_sched, whose pct_w_vp is per route per day, which gives a different
# (route-level) average than the one the bars imply.
rule = alt.Chart(day_pct).mark_rule(color='red').encode(
    y=alt.Y('mean(pct_w_vp):Q', axis=alt.Axis(format='%'))
)

# Disabled alternative: centred rolling mean (7 days either side).
# line = alt.Chart(day_pct).mark_line(
#     color='red',
#     size=2
# ).transform_window(
#     rolling_mean='mean(pct_w_vp)',
#     frame=[-7, 7]
# ).encode(
#     x='service_date:T',
#     y='rolling_mean:Q'
# )

# Layer the rule over the bars; both layers share the y axis.
(bar + rule).properties(width=900)

In [18]:
# Spot-check one random daily row (unseeded, so the row differs between runs).
day_pct.sample()

Unnamed: 0,calitp_itp_id,agency_name,calitp_url_number,service_date,weekday,month,total_num_sched,total_num_vp,pct_w_vp
68,300,Big Blue Bus,0,2022-06-08,Wednesday,June,1610,1416,0.88


In [19]:
# Reshape the daily totals from wide to long: one row per (service_date, measure),
# where measure is either total_num_sched or total_num_vp.
df_long = (
    day_pct
    >> select(_.service_date, _.total_num_sched, _.total_num_vp)
    >> gather("measure", "value", _.total_num_sched, _.total_num_vp)
)

In [20]:
# Display the long-format frame (pandas truncates the middle rows in the rendering).
df_long

Unnamed: 0,service_date,measure,value
0,2022-04-01,total_num_sched,1562
1,2022-04-02,total_num_sched,871
2,2022-04-03,total_num_sched,771
3,2022-04-04,total_num_sched,1518
4,2022-04-05,total_num_sched,1616
...,...,...,...
177,2022-06-26,total_num_vp,500
178,2022-06-27,total_num_vp,1017
179,2022-06-28,total_num_vp,1457
180,2022-06-29,total_num_vp,1454


In [21]:
# Scheduled trips vs. trips with vehicle positions, per day. stack=None draws
# the two measures from a common zero baseline instead of stacking them.
bar = (
    alt.Chart(df_long)
    .mark_bar(opacity=1, size=5)
    .encode(
        x=alt.X("service_date", title=_dla_utils.labeling("service_date")),
        y=alt.Y("value", stack=None, title=_dla_utils.labeling("number of trips")),
        color="measure",
    )
)

chart = styleguide.preset_chart_config(bar)
chart.properties(width=900)

In [22]:
# Another random look at day_pct before grouping by weekday (unseeded sample).
day_pct.sample()

Unnamed: 0,calitp_itp_id,agency_name,calitp_url_number,service_date,weekday,month,total_num_sched,total_num_vp,pct_w_vp
33,300,Big Blue Bus,0,2022-05-04,Wednesday,May,1622,1487,0.92


In [23]:
# Average daily percentage per weekday. This is an unweighted mean of the daily
# pct_w_vp values, not total_num_vp / total_num_sched over the weekday.
week = (
    day_pct
    >> group_by(_.weekday)
    >> summarize(avg_pct_w_vp=_.pct_w_vp.mean())
)

In [24]:
# Weekday averages as returned by the group-by: weekdays are in alphabetical order here.
week

Unnamed: 0,weekday,avg_pct_w_vp
0,Friday,0.92
1,Monday,0.73
2,Saturday,0.71
3,Sunday,0.7
4,Thursday,0.92
5,Tuesday,0.9
6,Wednesday,0.92


In [25]:
# Calendar order for weekdays, Monday first; also reused as the axis sort order
# in the charts below.
cats_day = [
    "Monday",
    "Tuesday",
    "Wednesday",
    "Thursday",
    "Friday",
    "Saturday",
    "Sunday",
]

# Make weekday an ordered categorical so sorting follows the calendar rather
# than the alphabet, then sort the rows.
week = week.assign(
    weekday=pd.Categorical(week["weekday"], categories=cats_day, ordered=True)
).sort_values("weekday")

In [26]:
# Same frame after sorting: rows now run Monday through Sunday.
week

Unnamed: 0,weekday,avg_pct_w_vp
1,Monday,0.73
5,Tuesday,0.9
6,Wednesday,0.92
4,Thursday,0.92
0,Friday,0.92
2,Saturday,0.71
3,Sunday,0.7


In [27]:
# Average percentage of trips with vehicle positions, one bar per weekday,
# with the x axis forced into Monday-to-Sunday order.
bar = (
    alt.Chart(week)
    .mark_bar()
    .encode(
        x=alt.X("weekday", sort=cats_day, title=utils.labeling("weekday")),
        y=alt.Y(
            "avg_pct_w_vp:Q",
            axis=alt.Axis(format="%"),
            title=utils.labeling("avg_pct_w_vp"),
        ),
        color="weekday",
    )
)
chart = styleguide.preset_chart_config(bar)
chart

In [28]:
# Average daily percentage for every (month, weekday) pair; like `week`, this
# is an unweighted mean of the daily pct_w_vp values.
week_month = (
    day_pct
    >> group_by(_.month, _.weekday)
    >> summarize(avg_pct_w_vp=_.pct_w_vp.mean())
)

In [29]:
# Spot-check one random (month, weekday) row (unseeded sample).
week_month.sample()

Unnamed: 0,month,weekday,avg_pct_w_vp
14,May,Friday,0.91


In [30]:
# Weekday averages, one column facet per month.
bar = alt.Chart(week_month).mark_bar().encode(
    x=alt.X('weekday', title=utils.labeling('weekday'), sort=cats_day),
    y=alt.Y('avg_pct_w_vp:Q', title=utils.labeling('avg_pct_w_vp'), axis=alt.Axis(format='%')),
    color="weekday",
    # Order the facets by calendar month. Month names are plain strings, so with
    # no explicit sort the facets come out alphabetically (April, June, May).
    # The order is spelled out here because the shared month list is only
    # defined in a later cell.
    column=alt.Column(
        "month",
        sort=[
            'January', 'February', 'March', 'April', 'May', 'June',
            'July', 'August', 'September', 'October', 'November', 'December',
        ],
    ),
)
chart = styleguide.preset_chart_config(bar)
chart

In [31]:
# Calendar order for month names; passed as the sort order to the month
# facets and axes in the charts that follow.
cats_month = [
    "January",
    "February",
    "March",
    "April",
    "May",
    "June",
    "July",
    "August",
    "September",
    "October",
    "November",
    "December",
]

In [32]:
# Horizontal version of the weekday chart: one row facet per month, with
# months in calendar order.
bar = (
    alt.Chart(week_month)
    .mark_bar()
    .encode(
        y=alt.Y("weekday", sort=cats_day, title=utils.labeling("weekday")),
        x=alt.X(
            "avg_pct_w_vp:Q",
            axis=alt.Axis(format="%"),
            title=utils.labeling("avg_pct_w_vp"),
        ),
        color="weekday",
        row=alt.Row("month", sort=cats_month),
    )
)
chart = styleguide.preset_chart_config(bar)
chart

In [33]:
# One line per month across the week; month is encoded twice (colour and dash
# pattern) so the lines remain distinguishable without colour.
line = (
    alt.Chart(week_month)
    .mark_line()
    .encode(
        x=alt.X("weekday", sort=cats_day, title=_dla_utils.labeling("weekday")),
        y=alt.Y(
            "avg_pct_w_vp:Q",
            axis=alt.Axis(format="%"),
            title=_dla_utils.labeling("avg_pct_w_vp"),
        ),
        color="month",
        strokeDash="month",
    )
)
line_chart = styleguide.preset_chart_config(line)
line_chart.properties(width=600)

In [34]:
# Days on which more than 95% of scheduled trips had vehicle-position data.
# `filter` here is the siuba verb brought in by the star import, not the builtin.
(
    day_pct
    >> filter(_.pct_w_vp > 0.95)
)

Unnamed: 0,calitp_itp_id,agency_name,calitp_url_number,service_date,weekday,month,total_num_sched,total_num_vp,pct_w_vp
0,300,Big Blue Bus,0,2022-04-01,Friday,April,1562,1512,0.97
7,300,Big Blue Bus,0,2022-04-08,Friday,April,1558,1492,0.96
11,300,Big Blue Bus,0,2022-04-12,Tuesday,April,1507,1477,0.98
12,300,Big Blue Bus,0,2022-04-13,Wednesday,April,1512,1459,0.96
13,300,Big Blue Bus,0,2022-04-14,Thursday,April,1508,1480,0.98
59,300,Big Blue Bus,0,2022-05-30,Monday,May,829,800,0.97
75,300,Big Blue Bus,0,2022-06-15,Wednesday,June,1505,1437,0.95
76,300,Big Blue Bus,0,2022-06-16,Thursday,June,1514,1487,0.98
77,300,Big Blue Bus,0,2022-06-17,Friday,June,1516,1507,0.99
83,300,Big Blue Bus,0,2022-06-23,Thursday,June,1558,1481,0.95


In [35]:
# Daily rows for 1-3 June 2022 (both bounds inclusive), in date order.
(
    day_pct
    >> filter(_.service_date >= "2022-06-01", _.service_date <= "2022-06-03")
    >> arrange(_.service_date)
)

Unnamed: 0,calitp_itp_id,agency_name,calitp_url_number,service_date,weekday,month,total_num_sched,total_num_vp,pct_w_vp
61,300,Big Blue Bus,0,2022-06-01,Wednesday,June,1613,1463,0.91
62,300,Big Blue Bus,0,2022-06-02,Thursday,June,1614,1458,0.9
63,300,Big Blue Bus,0,2022-06-03,Friday,June,1609,1387,0.86


## Group By Month

In [36]:
# Average pct_w_vp per month via the project helper.
# NOTE(review): per the output below it returns `month` and `avg` columns - confirm
# against utils.groupby_onecol.
utils.groupby_onecol(day_pct, 'month', 'pct_w_vp')

Unnamed: 0,month,avg
0,April,0.83
2,May,0.82
1,June,0.83


In [37]:
# Monthly average percentage, one bar per month in calendar order. The helper
# is called again here rather than reusing the previous cell's displayed result.
bar = (
    alt.Chart(utils.groupby_onecol(day_pct, "month", "pct_w_vp"))
    .mark_bar()
    .encode(
        x=alt.X("month", sort=cats_month, title=utils.labeling("month")),
        y=alt.Y(
            "avg:Q",
            axis=alt.Axis(format="%"),
            title=utils.labeling("pct_w_vp"),
        ),
        color="month",
    )
)
chart = styleguide.preset_chart_config(bar)
chart

## Using Functions (without DBT tables)

In [38]:
#gtfs_daily = utils.load_schedule_data(analysis_date_start, analysis_date_end, itp_id)

In [39]:
#gtfs_daily

In [40]:
#gtfs_daily.service_date.min()

In [41]:
#gtfs_daily.service_date.max()

In [42]:
#rt = utils.load_rt_data(analysis_date_start, analysis_date_end)

In [43]:
#rt

In [44]:
#rt['str_len'] = rt.trip_id.str.len()

In [45]:
#rt.str_len.value_counts()

In [46]:
#len(rt)

In [47]:
#rt.trip_id.nunique()

In [48]:
#len(gtfs_daily)

In [49]:
#gtfs_daily.trip_id.nunique()

In [50]:
#date_list =['2022-05-01','2022-05-02', '2022-05-31']

In [51]:

# from datetime import date, timedelta
# start_date = date(2022, 5, 1)
# end_date = date(2022, 5, 31)

In [52]:
# daterange = pd.date_range(start_date, end_date)

In [53]:
# (daterange)

In [54]:
# NOTE: the pd.date_range result above does not work with the function; the next cell builds a set of 'YYYY-MM-DD' date strings instead.

In [55]:
# dates = set()

# def daterange2(start, end):
#     for n in range(int((end - start).days) + 1):
#         yield start + timedelta(n)

# for single_date in daterange2(start_date, end_date):
#     dates.add(single_date.strftime('%Y-%m-%d'))

In [56]:
# (dates)

In [57]:
# rt['date'] = pd.to_datetime(rt['date'])

In [58]:
# gtfs_daily['service_date'] = pd.to_datetime(gtfs_daily['service_date'])

In [59]:
# single_date2 = '2022-05-08'

# (rt>>filter(_.date == single_date2))


In [60]:
# def get_pct_ran_df2(itp_id, list_of_dates):
#     pcts = []
#     for single_date in list_of_dates:
#         gtfs_daily2 = (gtfs_daily>>filter(_.service_date == single_date))
#         rt2 = (rt>>filter(_.date == single_date))
#         sched_rt_df = (pd.merge(gtfs_daily2, rt2, how='outer', on='trip_id', indicator='have_rt'))
#         #pct_ran = (utils.get_pct_ran(sched_rt_df, single_date))
#         day_pct_ran = {}
#         day_pct_ran['date'] = single_date
#         if ((len(sched_rt_df))!=0):
#             day_pct_ran['pct_trips_ran'] = ((len(sched_rt_df>>filter(_.have_rt=='both')))/(len(gtfs_daily2)))
#         elif ((len(sched_rt_df))==0):
#             day_pct_ran['pct_trips_ran'] = ''
#         pct_ran = pd.DataFrame([day_pct_ran])
#         pct_ran['n_have_rt'] = (len(sched_rt_df>>filter(_.have_rt=='both')))
#         pct_ran['n_missing_rt'] = (len(sched_rt_df>>filter(_.have_rt=='right_only')))
#         pct_ran['unmatched_rt'] = (len(sched_rt_df>>filter(_.have_rt=='left_only')))
#         pct_ran['nunique_sched'] = (gtfs_daily2.trip_id.nunique())
#         pct_ran['nunique_rt'] = (rt2.trip_id.nunique())

#         pcts.append(pct_ran)                                                    
#         #code help from: https://stackoverflow.com/questions/28669482/appending-pandas-dataframes-generated-in-a-for-loop
#     pcts = pd.concat(pcts)
#     pcts = pcts>>arrange(_.date)
#     return pd.DataFrame(pcts)


In [61]:
# may = utils.get_pct_ran_df(itp_id, dates, gtfs_daily, rt)

In [62]:
# may