# Report for May 4: {District}

In [1]:
import utils
import shared_utils
from dla_utils import _dla_utils as dla_utils
from shared_utils import styleguide, altair_utils, portfolio_utils

from siuba import *
import pandas as pd

from IPython.display import display, Markdown, HTML

import altair as alt

import os
# 800 * 10**9 bytes is 800 GB (decimal).
# NOTE(review): presumably this caps the bytes a calitp BigQuery query may scan — confirm.
os.environ["CALITP_BQ_MAX_BYTES"] = str(800 * 10**9)

from calitp.tables import tbl

from calitp import query_sql, magics




In [9]:
# Report parameter: the Caltrans district this notebook filters to and names in its headings.
district = "04 - Oakland"

In [10]:
# Load every row of the May 4 RT-vs-schedule trips sample view into a DataFrame.
may4_sample_sql = '''
SELECT * FROM `cal-itp-data-infra.views.gtfs_rt_vs_schedule_trips_may4_sample`
'''
may4 = query_sql(may4_sample_sql)

In [11]:
# Parse the service date once, then derive the weekday and month names from it.
service_dates = pd.to_datetime(may4['service_date'])
may4['service_date'] = service_dates
may4['weekday'] = service_dates.dt.day_name()
may4['month'] = service_dates.dt.month_name()

In [21]:
# Look up the Caltrans district for each calitp_itp_id and attach it to the trips.
itpid_district = portfolio_utils.add_caltrans_district()

# Re-running this cell would otherwise merge onto an already-merged `may4`; pandas
# then suffixes the overlapping columns (e.g. caltrans_district_x / _y), which
# breaks the later filter on `caltrans_district`. Dropping any lookup columns that
# are already present keeps the cell idempotent; on a fresh run nothing is dropped.
stale_cols = [
    col for col in itpid_district.columns
    if col != 'calitp_itp_id' and col in may4.columns
]
may4 = pd.merge(
    may4.drop(columns=stale_cols),
    itpid_district,
    on='calitp_itp_id',
    how='left',  # keep every trip row even when no district is found
)

In [23]:
# NOTE(review): `get_correct_url` comes from the local `utils` module; presumably it
# keeps a single feed URL per agency (see `calitp_url_number`) — confirm against utils.
df_all = utils.get_correct_url(may4)

In [24]:
# Preview a few rows; the fixed seed keeps the displayed sample stable across re-runs.
df_all.sample(5, random_state=0)

Unnamed: 0,calitp_itp_id,agency_name,calitp_url_number,route_id,route_short_name,service_date,calitp_extracted_at,calitp_deleted_at,num_sched,num_vp,pct_w_vp,weekday,month,caltrans_district
239,273,Sacramento Regional Transit,0,084,84,2022-05-04,2021-04-15,2099-01-01,62,0,0.0,Wednesday,May,03 - Marysville
1030,294,VTA,1,ACE Green,ACE Green,2022-05-04,2021-12-29,2099-01-01,8,0,0.0,Wednesday,May,04 - Oakland
363,87,Culver CityBus,0,6,CC 6,2022-05-04,2021-04-15,2099-01-01,96,0,0.0,Wednesday,May,07 - Los Angeles
1241,290,,0,250-196,250,2022-05-04,2022-01-11,2022-06-19,66,0,0.0,Wednesday,May,04 - Oakland
1670,278,MTS,0,834,834,2022-05-04,2022-01-30,2022-05-19,13,13,1.0,Wednesday,May,11 - San Diego


In [25]:
# Keep only the trips that belong to the district this report covers.
d = (
    df_all
    >> filter(_.caltrans_district == district)
)

* TODO: add functions to `utils` that build these sub-dataframes directly, for the:
    * district average
    * agency average
    * overall average

In [27]:
# Aggregate scheduled and vehicle-position trip counts per Caltrans district.
# NOTE(review): this aggregates `may4`, while the district subset `d` is built from
# `df_all` (the output of get_correct_url) — confirm the two sources are meant to differ.
all_dist_avg = utils.get_agg_pct(
    may4,
    groupings=['caltrans_district'],
    sum_sched='num_sched',
    sum_vp='num_vp',
)

# Percentage string with two decimals, used by the summary table.
all_dist_avg['District Average'] = (
    all_dist_avg['avg']
    .astype(float)
    .map("{:.2%}".format)
)

In [28]:
# District-average table: rows without a district are dropped, the rest sorted by district.
district_table = (
    all_dist_avg
    >> select(_.caltrans_district, _['District Average'])
    >> filter(_.caltrans_district.notnull())
    >> arrange(_.caltrans_district)
)
display(HTML(dla_utils.pretify_tables(district_table)))

Caltrans District,District Average
01 - Eureka,25.59%
02 - Redding,29.60%
03 - Marysville,4.26%
04 - Oakland,50.21%
05 - San Luis Obispo,26.97%
06 - Fresno,39.14%
07 - Los Angeles,68.81%
08 - San Bernardino,43.46%
09 - Bishop,62.90%
10 - Stockton,68.09%


In [29]:
# Per-agency totals and average for the selected district.
agency_groupings = ['agency_name', 'calitp_itp_id']
d_avg = utils.get_agg_pct(
    d,
    groupings=agency_groupings,
    sum_sched='num_sched',
    sum_vp='num_vp',
)

In [30]:
# Agency scatter: scheduled trips on x, trips with vehicle positions on y.
display(HTML('<strong>Agency Average Scheduled & RT Vehicle Position Data</strong>'))

# Reuse the shared bar-chart helper for the encodings, then swap the mark for circles.
agency_base_chart = utils.bar_chart_over_time(
    d_avg,
    x_col='num_sched',
    y_col='num_vp',
    color_col='agency_name',
    yaxis_format='',
    sort='x',
    title_txt='',
)
scatter = agency_base_chart.mark_circle(size=60).interactive()

utils.add_tooltip(scatter, 'agency_name', 'calitp_itp_id', 'avg').properties(width=700)

In [31]:
display(HTML('<strong>Districts Average of Scheduled & RT Vehicle Position Data</strong>'))

# Colour rule: the report's district is drawn in orange, every other district in steelblue.
# Technique from: https://stackoverflow.com/questions/71082262/highlighting-specific-columns-in-bar-chart-in-python-using-altair
is_report_district = alt.FieldOneOfPredicate('caltrans_district', [f'{district}'])
highlight_color = alt.condition(
    is_report_district,
    alt.value('orange'),
    alt.value('steelblue'),
)

x_district = alt.X('caltrans_district', title=utils.labeling('caltrans_district'))
y_average = alt.Y('avg', title=utils.labeling('Average'), axis=alt.Axis(format='%'))

bar = (
    alt.Chart(all_dist_avg)
    .mark_bar(size=50)
    .encode(x=x_district, y=y_average, color=highlight_color)
)
bar.properties(width=700)

In [34]:
# Preview a few district rows; the fixed seed keeps the displayed sample stable across re-runs.
all_dist_avg.sample(3, random_state=0)

Unnamed: 0,caltrans_district,num_sched,num_vp,avg,District Average
5,03 - Marysville,6075.0,259.0,0.04,4.26%
3,04 - Oakland,54044.0,27134.0,0.5,50.21%
4,10 - Stockton,2908.0,1980.0,0.68,68.09%


In [35]:
# Reshape the district totals to long form: one row per (district, measure) pair.
trip_count_labels = {
    'num_sched': 'Total Number Scheduled Trips',
    'num_vp': 'Total Number Vehicle Position Trips',
}
dist_avg_long = (
    all_dist_avg.rename(columns=trip_count_labels)  # rename returns a new frame
    >> select(
        _.caltrans_district,
        _['Total Number Scheduled Trips'],
        _['Total Number Vehicle Position Trips'],
    )
    >> gather(
        'measure',
        'value',
        _['Total Number Scheduled Trips'],
        _['Total Number Vehicle Position Trips'],
    )
)

In [36]:
# Heading typo fixed ("Postion" -> "Position").
# NOTE(review): the heading names a single district, but the chart plots
# `dist_avg_long` with caltrans_district on the x axis — confirm the wording.
display(HTML(f"<strong>Number of Scheduled and Vehicle Position Trips in {district}</strong>"))

district_trip_bars = utils.bar_chart_over_time(
    dist_avg_long, 'caltrans_district', 'value', 'measure', ',f', 'x', ''
)
# stack=None draws the two measures from a common baseline instead of stacking them.
(
    district_trip_bars
    .mark_bar(size=50)
    .encode(y=alt.Y('value', stack=None))
    .properties(width=800)
)

In [37]:
# District-wide totals and average per service date.
d_overall_avg = utils.get_agg_pct(
    d,
    groupings=['service_date'],
    sum_sched='num_sched',
    sum_vp='num_vp',
)

In [38]:
# Reshape the per-date totals to long form: one row per (service_date, measure) pair.
trip_count_labels = {
    'num_sched': 'Total Number Scheduled Trips',
    'num_vp': 'Total Number Vehicle Position Trips',
}
dist_overall_avg_long = (
    d_overall_avg.rename(columns=trip_count_labels)  # rename returns a new frame
    >> select(
        _.service_date,
        _['Total Number Scheduled Trips'],
        _['Total Number Vehicle Position Trips'],
    )
    >> gather(
        'measure',
        'value',
        _['Total Number Scheduled Trips'],
        _['Total Number Vehicle Position Trips'],
    )
)

In [39]:
# Heading typo fixed ("Postion" -> "Position").
display(HTML(f"<strong>Number of Scheduled and Vehicle Position Trips in {district} by Day</strong>"))

daily_trip_bars = utils.bar_chart_over_time(
    dist_overall_avg_long, 'service_date', 'value', 'measure', ',f', 'x', ''
)
# stack=None draws the two measures from a common baseline instead of stacking them.
(
    daily_trip_bars
    .mark_bar(size=50)
    .encode(y=alt.Y('value', stack=None))
    .properties(width=800)
)

In [40]:
# Totals and average per agency and service date within the district.
agency_date_groupings = ['calitp_itp_id', 'agency_name', 'service_date']
d_agency_date_avg = utils.get_agg_pct(
    d,
    groupings=agency_date_groupings,
    sum_sched='num_sched',
    sum_vp='num_vp',
)

In [41]:
# Preview one row; the fixed seed keeps the displayed sample stable across re-runs.
d_agency_date_avg.sample(random_state=0)

Unnamed: 0,calitp_itp_id,agency_name,service_date,num_sched,num_vp,avg
4,247,Petaluma Transit,2022-05-04,233.0,0.0,0.0


In [42]:
# The heading and hint now say "Agency"/"agencies": the chart is coloured, dashed and
# selectable by agency_name, not by route. Also fixes the "Postions" typo.
display(HTML(f"<strong>{district} "
             f"Average Percent of Scheduled Trips with Vehicle Positions Data "
             f"by Agency Over Time</strong>"))
display(Markdown("To utilize the multi-select, use `shift` when clicking agencies in the legend"))

line = (
    alt.Chart(d_agency_date_avg)
    .mark_line(point={"filled": False, "fill": "white"})
    .encode(
        x=alt.X('service_date', title=utils.labeling('service_date')),
        y=alt.Y('avg', title=('Percent with Vehicle Positions'), axis=alt.Axis(format='%')),
        # symbolLimit=0 lifts the legend's symbol cap so every agency is listed.
        color=alt.Color('agency_name', title=['Agency Name'],
                        legend=alt.Legend(columns=3, symbolLimit=0)),
        strokeDash='agency_name',
    )
)

# Add the tooltip and the agency selection via the helpers in `utils`.
line_chart = utils.add_tooltip(line, 'agency_name', 'service_date', 'avg')
line_chart = utils.add_chart_selection(line_chart, 'agency_name')
line_chart.properties(width=700)

To utilize the multi-select, use `shift` when clicking routes in the legend

In [43]:
# Count the agency rows in the district, and how many of them have zero vehicle-position trips.
n_agencies = len(d_avg)
n_without_vp = len(d_avg >> filter(_.num_vp == 0))

display(HTML(f"Out of <strong>{n_agencies} organizations </strong>"
             f"with Scheduled Data, there are "
             f"<strong>{n_without_vp} organizations</strong> "
             "with <strong>no</strong> "
             "vehicle position data."))

In [44]:
## NOTE(review): 'Commute.org Shuttles' was flagged as a "funky" agency worth a closer look;
## the exploratory filter is kept below, commented out, for reference.
#may4>>filter(_.agency_name== 'Commute.org Shuttles')