# Report for May 4: {District}

In [1]:
import utils
import shared_utils
from dla_utils import _dla_utils as dla_utils
from shared_utils import styleguide, altair_utils, portfolio_utils

from siuba import *
import pandas as pd

from IPython.display import display, Markdown, HTML

import altair as alt

import os
# Set the CALITP_BQ_MAX_BYTES environment variable to 800_000_000_000 (800 GB, decimal).
# NOTE(review): presumably this is the BigQuery scan cap read by calitp, and presumably it
# must be set before the calitp imports below — confirm.
os.environ["CALITP_BQ_MAX_BYTES"] = str(800_000_000_000)

from calitp.tables import tbl

from calitp import query_sql, magics




In [2]:
# Caltrans district this report is scoped to; later cells filter on it and use it in headings.
district = "04 - Oakland"

In [3]:
# Load every row of the gtfs_rt_vs_schedule_trips_may4_sample view.
MAY4_SAMPLE_SQL = """
SELECT * FROM `cal-itp-data-infra.views.gtfs_rt_vs_schedule_trips_may4_sample`
"""
may4 = query_sql(MAY4_SAMPLE_SQL)

In [4]:
# Parse the service date once, then derive the weekday and month labels from it.
service_dates = pd.to_datetime(may4['service_date'])
may4['service_date'] = service_dates
may4['weekday'] = service_dates.dt.day_name()
may4['month'] = service_dates.dt.month_name()

In [5]:
# Attach the Caltrans district to each row by ITP ID; the left join keeps rows
# that have no match in the lookup.
# NOTE(review): presumably the lookup holds one row per calitp_itp_id — confirm,
# since duplicates would multiply rows here.
itpid_district = portfolio_utils.add_caltrans_district()
may4 = may4.merge(itpid_district, how='left', on='calitp_itp_id')

In [6]:
# NOTE(review): `utils.get_correct_url` is opaque from this notebook — presumably it keeps
# a single feed URL per operator; confirm. Later cells build on `df_all`.
df_all = utils.get_correct_url(may4)

In [7]:
# Peek at a few rows; the fixed seed keeps the preview identical across re-runs.
df_all.sample(5, random_state=0)

Unnamed: 0,calitp_itp_id,agency_name,calitp_url_number,route_id,route_short_name,service_date,calitp_extracted_at,calitp_deleted_at,num_sched,num_vp,pct_w_vp,weekday,month,caltrans_district
1162,270,Rosemead Explorer,0,Route1,,2022-05-04,2022-01-04,2099-01-01,15,0,0.0,Wednesday,May,07 - Los Angeles
710,231,Simi Valley Transit,0,4763,20,2022-05-04,2021-04-15,2099-01-01,36,0,0.0,Wednesday,May,07 - Los Angeles
1703,188,Madera County Connection,0,578,,2022-05-04,2022-01-31,2099-01-01,10,6,0.6,Wednesday,May,06 - Fresno
2776,200,San Francisco Bay Ferry,0,SB:HB,HB,2022-05-04,2022-04-01,2099-01-01,22,0,0.0,Wednesday,May,
3563,231,VCTC Intercity,0,4132,73X,2022-05-04,2021-08-25,2099-01-01,1,0,0.0,Wednesday,May,07 - Los Angeles


In [38]:
# Rows with no agency name, tallied per ITP ID (suspected to be duplicates).
(
    df_all
    >> filter(_.agency_name.isnull())
    >> count(_.calitp_itp_id)
)

Unnamed: 0,calitp_itp_id,n
0,61,28
1,123,23
2,127,14
3,168,11
4,170,35
5,182,121
6,208,28
7,226,35
8,279,12
9,284,40


In [8]:
# Keep only the rows belonging to the report's district.
d = (
    df_all
    >> filter(_.caltrans_district == district)
)

* **TODO:** add functions in `utils` that create these sub-dataframes directly, for the:
    * district average
    * agency average 
    * overall average

In [42]:
# Aggregate scheduled and vehicle-position trip counts per Caltrans district.
all_dist_avg = utils.get_agg_pct(
    df_all,
    groupings=['caltrans_district'],
    sum_sched='num_sched',
    sum_vp='num_vp',
)
# Two-decimal percentage string used by the display table.
all_dist_avg['District Average'] = (
    all_dist_avg['avg']
    .astype(float)
    .map(lambda share: f"{share:.2%}")
)

In [43]:
# District averages, dropping the row with no district and sorting by district label.
district_avg_table = (
    all_dist_avg
    >> select(_.caltrans_district, _['District Average'])
    >> filter(_.caltrans_district.notnull())
    >> arrange(_.caltrans_district)
)
display(HTML(dla_utils.pretify_tables(district_avg_table)))

Caltrans District,District Average
01 - Eureka,25.59%
02 - Redding,29.60%
03 - Marysville,4.26%
04 - Oakland,64.13%
05 - San Luis Obispo,26.97%
06 - Fresno,39.14%
07 - Los Angeles,68.81%
08 - San Bernardino,52.52%
09 - Bishop,62.90%
10 - Stockton,68.23%


In [11]:
# Aggregate trip counts per agency within the selected district.
agency_keys = ['agency_name', 'calitp_itp_id']
d_avg = utils.get_agg_pct(
    d,
    groupings=agency_keys,
    sum_sched='num_sched',
    sum_vp='num_vp',
)

In [12]:
display(HTML('<strong>Agency Average Scheduled & RT Vehicle Position Data</strong>'))

# Start from the shared bar-chart helper, then switch the mark to circles so each
# agency is plotted as scheduled trips (x) against vehicle-position trips (y).
agency_base_chart = utils.bar_chart_over_time(
    d_avg,
    x_col='num_sched',
    y_col='num_vp',
    color_col='agency_name',
    yaxis_format='',
    sort='x',
    title_txt='',
)
scatter = agency_base_chart.mark_circle(size=60).interactive()

scatter_with_tooltip = utils.add_tooltip(scatter, 'agency_name', 'calitp_itp_id', 'avg')
scatter_with_tooltip.properties(width=700)

In [13]:
display(HTML('<strong>Districts Average of Scheduled & RT Vehicle Position Data</strong>'))

# Colour the report's district orange and every other bar steelblue.
# Approach adapted from:
# https://stackoverflow.com/questions/71082262/highlighting-specific-columns-in-bar-chart-in-python-using-altair
is_report_district = alt.FieldOneOfPredicate('caltrans_district', [f'{district}'])
highlight = alt.condition(
    is_report_district,
    alt.value('orange'),
    alt.value('steelblue'),
)

district_axis = alt.X('caltrans_district', title=utils.labeling('caltrans_district'))
average_axis = alt.Y('avg', title=utils.labeling('Average'), axis=alt.Axis(format='%'))

bar = (
    alt.Chart(all_dist_avg)
    .mark_bar(size=50)
    .encode(x=district_axis, y=average_axis, color=highlight)
)
bar.properties(width=700)

In [14]:
# Spot-check a few district rows; the fixed seed keeps the preview reproducible.
all_dist_avg.sample(3, random_state=0)

Unnamed: 0,caltrans_district,num_sched,num_vp,avg,District Average
2,07 - Los Angeles,29481.0,20287.0,0.69,68.81%
5,03 - Marysville,6075.0,259.0,0.04,4.26%
6,11 - San Diego,9084.0,8692.0,0.96,95.68%


In [15]:
# Reshape the district totals to long form: one row per (district, measure).
trip_count_labels = {
    'num_sched': 'Total Number Scheduled Trips',
    'num_vp': 'Total Number Vehicle Position Trips',
}
dist_avg_long = (
    all_dist_avg.rename(columns=trip_count_labels)  # rename returns a new frame
    >> select(
        _.caltrans_district,
        _['Total Number Scheduled Trips'],
        _['Total Number Vehicle Position Trips'],
    )
    >> gather(
        'measure',
        'value',
        _['Total Number Scheduled Trips'],
        _['Total Number Vehicle Position Trips'],
    )
)

In [16]:
# This chart compares every district (its x-axis is caltrans_district), so the heading
# describes the district breakdown rather than naming only the report's district.
display(HTML("<strong>Number of Scheduled and Vehicle Position Trips by District</strong>"))

trips_by_district = utils.bar_chart_over_time(
    dist_avg_long, 'caltrans_district', 'value', 'measure', ',f', 'x', ''
)
# stack=None turns off stacking, so the two measures share one baseline per district.
(
    trips_by_district
    .mark_bar(size=50)
    .encode(y=alt.Y('value', stack=None))
    .properties(width=800)
)

In [17]:
# Aggregate the selected district's trip counts per service date.
d_overall_avg = utils.get_agg_pct(
    d,
    groupings=['service_date'],
    sum_sched='num_sched',
    sum_vp='num_vp',
)

In [18]:
# Reshape the per-day district totals to long form: one row per (service date, measure).
daily_trip_count_labels = {
    'num_sched': 'Total Number Scheduled Trips',
    'num_vp': 'Total Number Vehicle Position Trips',
}
dist_overall_avg_long = (
    d_overall_avg.rename(columns=daily_trip_count_labels)  # rename returns a new frame
    >> select(
        _.service_date,
        _['Total Number Scheduled Trips'],
        _['Total Number Vehicle Position Trips'],
    )
    >> gather(
        'measure',
        'value',
        _['Total Number Scheduled Trips'],
        _['Total Number Vehicle Position Trips'],
    )
)

In [19]:
# Heading for the per-day chart of the selected district ("Position" spelling corrected).
display(HTML(f"<strong>Number of Scheduled and Vehicle Position Trips in {district} by Day</strong>"))

trips_by_day = utils.bar_chart_over_time(
    dist_overall_avg_long, 'service_date', 'value', 'measure', ',f', 'x', ''
)
# stack=None turns off stacking, so the two measures share one baseline per day.
(
    trips_by_day
    .mark_bar(size=50)
    .encode(y=alt.Y('value', stack=None))
    .properties(width=800)
)

In [20]:
# Aggregate the selected district's trip counts per agency and service date.
agency_date_keys = ['calitp_itp_id', 'agency_name', 'service_date']
d_agency_date_avg = utils.get_agg_pct(
    d,
    groupings=agency_date_keys,
    sum_sched='num_sched',
    sum_vp='num_vp',
)

In [21]:
# Spot-check one agency/date row; the fixed seed keeps the preview reproducible.
d_agency_date_avg.sample(random_state=0)

Unnamed: 0,calitp_itp_id,agency_name,service_date,num_sched,num_vp,avg
36,280,San Francisco Bay Ferry,2022-05-04,141.0,0.0,0.0


In [22]:
display(HTML(f"<strong>{district} "
             f"Average Percent of Scheduled Trips with Vehicle Postions Data "
             f"by Route Over Time</strong>"))
display(Markdown("To utilize the multi-select, use `shift` when clicking routes in the legend"))

line = alt.Chart(d_agency_date_avg).mark_line(point={"filled": False, "fill":"white"}).encode(
    x=alt.X('service_date', title=utils.labeling('service_date')),
    y=alt.Y('avg', title=('Percent with Vehicle Positions'), axis=alt.Axis(format='%')),
    color=alt.Color('agency_name', title=['Agency Name'],  legend=alt.Legend(columns=3, symbolLimit=0)),
    strokeDash='agency_name')

line_chart = utils.add_tooltip(line, 'agency_name','service_date', 'avg')
line_chart = utils.add_chart_selection(line_chart, 'agency_name')
line_chart.properties(width=700)

To utilize the multi-select, use `shift` when clicking routes in the legend

In [24]:
## funky one
#may4>>filter(_.agency_name== 'Commute.org Shuttles')

In [None]:
## 

In [68]:
# Summarise how many of the district's organizations report no vehicle positions.
# The count is computed once, an empty district no longer divides by zero, and the
# share is rendered as a two-decimal percentage instead of a raw float.
n_orgs = len(d_avg)
n_without_vp = len(d_avg >> filter(_.num_vp == 0))
share_without_vp = n_without_vp / n_orgs if n_orgs else 0.0

display(HTML(f"Out of <strong>{n_orgs} organizations in District {district}</strong> "
             f"with Scheduled Data, "
             f"{n_without_vp} (<strong>{share_without_vp:.2%} of organizations</strong>) "
             "have <strong>no</strong> "
             "vehicle position data."))

In [44]:
# Aggregate trip counts per organization across the whole dataset.
org_keys = ['calitp_itp_id', 'agency_name']
df_all_avg = utils.get_agg_pct(
    df_all,
    groupings=org_keys,
    sum_sched='num_sched',
    sum_vp='num_vp',
)

In [91]:
# Names of organizations with a nonzero vehicle-position count, ordered by ITP ID;
# rows without an agency name are left out.
orgs_with_vp = (
    df_all_avg
    >> filter(_.num_vp != 0)
    >> arrange(_.calitp_itp_id)
    >> filter(_.agency_name.notnull())
)
have_vp_list = orgs_with_vp.agency_name.to_list()

In [92]:
# Share of named organizations that have any vehicle-position data.
# Guarded so an empty dataset reports 0.00% instead of raising ZeroDivisionError;
# "Position" spelling corrected in the displayed sentence.
named_orgs = df_all_avg >> filter(_.agency_name.notnull())
share_with_vp = len(have_vp_list) / len(named_orgs) if len(named_orgs) else 0.0

display(HTML(f"For all organizations in this dataset, only <strong>"
             f"{share_with_vp:.2%}"
             f"</strong> of the organizations have RT Vehicle Position Data present"))

In [93]:
# Flag every row whose agency name appears in have_vp_list (adds a column to df_all in place).
# NOTE(review): this matches on agency_name rather than calitp_itp_id, and rows with a
# null agency_name end up False — confirm that is intended.
df_all['has_vp'] = df_all['agency_name'].isin(have_vp_list)


In [94]:
# Subset of rows belonging to organizations flagged as having vehicle-position data.
has_vp = (
    df_all
    >> filter(_.has_vp == True)
)

In [103]:
display(HTML("<strong> District Averages filtered for Organizations with RT Present </strong>"))
(utils.get_agg_pct(has_vp,
                            groupings = ['caltrans_district'],
                            sum_sched = 'num_sched',
                            sum_vp = 'num_vp'))>>select(_.caltrans_district, _.avg)>>arrange(_.caltrans_district)

Unnamed: 0,caltrans_district,avg
10,01 - Eureka,0.31
4,02 - Redding,0.4
2,03 - Marysville,0.09
1,04 - Oakland,0.86
11,05 - San Luis Obispo,0.71
7,06 - Fresno,0.92
3,07 - Los Angeles,0.88
5,08 - San Bernardino,0.99
6,09 - Bishop,0.63
12,10 - Stockton,0.88


In [106]:
display(HTML("<strong> District Averages NOT filtered for Organizations with RT Present </strong>"))

# District aggregates over all organizations, for comparison with the RT-only table.
(
    all_dist_avg
    >> select(_.caltrans_district, _.avg)
    >> arrange(_.caltrans_district)
)

Unnamed: 0,caltrans_district,avg
11,01 - Eureka,0.26
7,02 - Redding,0.3
5,03 - Marysville,0.04
3,04 - Oakland,0.64
1,05 - San Luis Obispo,0.27
9,06 - Fresno,0.39
2,07 - Los Angeles,0.69
8,08 - San Bernardino,0.53
10,09 - Bishop,0.63
4,10 - Stockton,0.68
