# Report for May 4: {District}

In [1]:
import utils
import shared_utils
from dla_utils import _dla_utils as dla_utils
from shared_utils import styleguide, altair_utils, portfolio_utils

from siuba import *
import pandas as pd

from IPython.display import display, Markdown, HTML

import altair as alt

import os
os.environ["CALITP_BQ_MAX_BYTES"] = str(800_000_000_000) ## 800GB?

from calitp.tables import tbl

from calitp import query_sql, magics




In [2]:
district = '04 - Oakland'

In [3]:
district_short_name = 'District 4'

In [4]:
## alternatively 
may4 = query_sql(f'''
SELECT * FROM `cal-itp-data-infra.views.gtfs_rt_vs_schedule_trips_may4_sample`
''')

In [5]:
may4['service_date'] = pd.to_datetime(may4['service_date'])
may4['weekday'] = pd.Series(may4.service_date).dt.day_name()    
may4['month'] =  pd.Series(may4.service_date).dt.month_name()

In [6]:
itpid_district = portfolio_utils.add_caltrans_district()
may4 = pd.merge(may4, itpid_district, on='calitp_itp_id', how='left')

In [7]:
df_all = utils.get_correct_url(may4)

In [8]:
## organizations with no agency names 
## probably duplicates
#df_all>>filter(_.agency_name.isnull())>>count(_.calitp_itp_id)

In [9]:
d = df_all>>filter(_.caltrans_district== district)

* Note to add functions in `utils` that create these sub dataframes already for 
    * district average
    * agency average 
    * overall average

In [10]:
all_dist_avg = (utils.get_agg_pct(df_all, groupings = ['caltrans_district'], sum_sched = 'num_sched', sum_vp = 'num_vp'))
all_dist_avg['District Average'] = all_dist_avg['avg'].astype(float).map("{:.2%}".format)

In [11]:
display(HTML("<h2>District Stats</h2>"))

In [12]:
display(HTML(dla_utils.pretify_tables((all_dist_avg>>select(_.caltrans_district, _['District Average'])>>filter(_.caltrans_district.notnull())>>arrange(_.caltrans_district)))))

Caltrans District,District Average
01 - Eureka,25.59%
02 - Redding,29.60%
03 - Marysville,4.26%
04 - Oakland,64.13%
05 - San Luis Obispo,26.97%
06 - Fresno,39.14%
07 - Los Angeles,68.81%
08 - San Bernardino,52.52%
09 - Bishop,62.90%
10 - Stockton,68.23%


In [13]:
d_avg = (utils.get_agg_pct(d,
                            groupings = ['agency_name', 'calitp_itp_id'],
                            sum_sched = 'num_sched',
                            sum_vp = 'num_vp'))

In [14]:
display(HTML(f"<h3>{district_short_name} Scheduled Data vs Vehicle Position Data</h3>"))


In [15]:
#trying using utils bar chart
display(HTML('<strong>Agency Average Scheduled & RT Vehicle Position Data</strong>'))
scatter = ((utils.bar_chart_over_time(d_avg,
                          x_col = 'num_sched', 
                          y_col = 'num_vp', 
                          color_col = 'agency_name', 
                          yaxis_format = '',
                          sort = 'x', 
                          title_txt = ''))
           .mark_circle(size=60)
           .interactive())
(utils.add_tooltip(scatter, 'agency_name', 'calitp_itp_id', 'avg')).properties(width=700)

In [16]:
display(HTML('<strong>Districts Average of Scheduled & RT Vehicle Position Data</strong>'))
bar = (alt.Chart(all_dist_avg)
        .mark_bar(size=50)
        .encode(
            x=alt.X('caltrans_district', title=utils.labeling('caltrans_district')),
            y=alt.Y('avg', title=utils.labeling('Average'), axis=alt.Axis(format='%')),
            color=alt.condition(
        alt.FieldOneOfPredicate('caltrans_district', [f'{district}']),  # If the district is parameter then return true,
        alt.value('orange'),     # which sets the bar orange.
        alt.value('steelblue')   # And if it's not true it sets the bar steelblue.
            )
        ))
## code help: https://stackoverflow.com/questions/71082262/highlighting-specific-columns-in-bar-chart-in-python-using-altair
bar.properties(width=700)

In [17]:
dist_avg_long = all_dist_avg.copy()
dist_avg_long = dist_avg_long.rename(columns={ 'num_sched':'Total Number Scheduled Trips',
                                            'num_vp':'Total Number Vehicle Position Trips'})
dist_avg_long =  (dist_avg_long>>select(_.caltrans_district,
                         _['Total Number Scheduled Trips'],
                         _['Total Number Vehicle Position Trips']
                        ) 
             >> gather('measure',
                       'value',
                       _['Total Number Scheduled Trips'],
                       _['Total Number Vehicle Position Trips']
                      )
            )

In [18]:
display(HTML(f"<strong>Number of Scheduled and Vehicle Postion Trips in {district}</strong>"))

(utils.bar_chart_over_time((dist_avg_long),
                           'caltrans_district','value','measure',',f', 'x', '')).mark_bar(size=50).encode(y=alt.Y('value', stack = None,)).properties(width=800)

In [19]:
d_overall_avg = (utils.get_agg_pct(d,
                            groupings = ['service_date'],
                            sum_sched = 'num_sched',
                            sum_vp = 'num_vp'))

In [20]:
dist_overall_avg_long = d_overall_avg.copy()
dist_overall_avg_long = dist_overall_avg_long.rename(columns={ 'num_sched':'Total Number Scheduled Trips',
                                            'num_vp':'Total Number Vehicle Position Trips'})
dist_overall_avg_long = (dist_overall_avg_long>>select(_.service_date,
                         _['Total Number Scheduled Trips'],
                         _['Total Number Vehicle Position Trips']
                        ) 
             >> gather('measure',
                       'value',
                       _['Total Number Scheduled Trips'],
                       _['Total Number Vehicle Position Trips']
                      ))

In [21]:
display(HTML(f"<strong>Number of Scheduled and Vehicle Postion Trips in {district} by Day</strong>"))

(utils.bar_chart_over_time((dist_overall_avg_long),
                           'service_date','value','measure',',f', 'x', '')).mark_bar(size=50).encode(y=alt.Y('value', stack = None,)).properties(width=800)

In [22]:
display(HTML(f"<h3>Organization Averages in {district_short_name}</h3>"))

In [23]:
d_agency_date_avg = (utils.get_agg_pct(d,
                            groupings = ['calitp_itp_id', 'agency_name', 'service_date'],
                            sum_sched = 'num_sched',
                            sum_vp = 'num_vp'))

In [24]:
display(HTML(f"<strong>{district_short_name} "
             f"Average Percent of Scheduled Trips with Vehicle Postions Data "
             f"by Route Over Time</strong>"))
display(Markdown("To utilize the multi-select, use `shift` when clicking routes in the legend"))

line = alt.Chart(d_agency_date_avg).mark_line(point={"filled": False, "fill":"white"}).encode(
    x=alt.X('service_date', title=utils.labeling('service_date')),
    y=alt.Y('avg', title=('Percent with Vehicle Positions'), axis=alt.Axis(format='%')),
    color=alt.Color('agency_name', title=['Agency Name'],  legend=alt.Legend(columns=3, symbolLimit=0)),
    strokeDash='agency_name')

line_chart = utils.add_tooltip(line, 'agency_name','service_date', 'avg')
line_chart = utils.add_chart_selection(line_chart, 'agency_name')
line_chart.properties(width=700)

To utilize the multi-select, use `shift` when clicking routes in the legend

In [25]:
## funky one
#may4>>filter(_.agency_name== 'Commute.org Shuttles')

In [26]:
display((HTML("<h2>How Much Data is Available in the Current Model?</h2>")))

In [27]:
itp_id_avg = ("{:.2f}%".format(((len(d_avg>>filter(_.num_vp==0)>>arrange(_.calitp_itp_id)))/(len(d_avg)))*100))

In [28]:
display(HTML(f"Out of <strong>{len(d_avg)} organizations in District {district_short_name} </strong>"
             f"with Scheduled Data, there are "
             f"{(len(d_avg>>filter(_.num_vp==0)>>arrange(_.calitp_itp_id)))} or "
             f"<strong>{itp_id_avg} of organizations</strong> "
             "have <strong>no</strong> "
             "vehicle position data."))

In [29]:
df_all_avg = (utils.get_agg_pct(df_all,
                            groupings = ['calitp_itp_id', 'agency_name'],
                            sum_sched = 'num_sched',
                            sum_vp = 'num_vp'))

In [30]:
have_vp_list = ((df_all_avg>>filter(_.num_vp!=0)>>arrange(_.calitp_itp_id))>>filter(_.agency_name.notnull())).agency_name.to_list()

In [31]:
display(HTML(f"For all organizations in this dataset, only <strong>"
             f"{('{:.2%}'.format((len(have_vp_list))/(len(df_all_avg>>filter(_.agency_name.notnull())))))}"
             f"</strong> of the organizations have RT Vehicle Postion Data present"))

In [32]:
df_all['has_vp'] = df_all['agency_name'].isin(have_vp_list)


In [33]:
has_vp = df_all>>filter(_.has_vp==True)

In [34]:
display(HTML("<strong> District Averages filtered for Organizations with RT Present </strong>"))
with_rt = (utils.get_agg_pct(has_vp,
                            groupings = ['caltrans_district'],
                            sum_sched = 'num_sched',
                            sum_vp = 'num_vp'))>>select(_.caltrans_district, _.avg)>>arrange(_.caltrans_district)
with_rt['avg_pct'] = with_rt['avg'].astype(float).map("{:.2%}".format)
display(HTML(dla_utils.pretify_tables(with_rt>>select(_.caltrans_district, _.avg_pct))))

Caltrans District,Avg Pct
01 - Eureka,31.01%
02 - Redding,40.36%
03 - Marysville,9.20%
04 - Oakland,86.21%
05 - San Luis Obispo,71.33%
06 - Fresno,92.43%
07 - Los Angeles,88.23%
08 - San Bernardino,98.75%
09 - Bishop,62.90%
10 - Stockton,88.16%


In [35]:
display(HTML("<strong> District Averages NOT filtered for Organizations with RT Present </strong>"))
all_dist_avg['avg_pct'] = all_dist_avg['avg'].astype(float).map("{:.2%}".format)
display(HTML(dla_utils.pretify_tables(all_dist_avg>>select(_.caltrans_district, _.avg_pct)>>arrange(_.caltrans_district))))

Caltrans District,Avg Pct
01 - Eureka,25.59%
02 - Redding,29.60%
03 - Marysville,4.26%
04 - Oakland,64.13%
05 - San Luis Obispo,26.97%
06 - Fresno,39.14%
07 - Los Angeles,68.81%
08 - San Bernardino,52.52%
09 - Bishop,62.90%
10 - Stockton,68.23%


In [36]:
display((HTML(f"<h2>Operators in {district_short_name}</h2>")))

In [37]:
from shared_utils import calitp_color_palette as cp

In [193]:
#from Amanda's utils for Project Prioritization
def dual_chart_with_dropdown(
    df,
    dropdown_list: list,
    dropdown_field: str,
    #reference_dict: dict,
    itp_id:str
):
    """
    Two bar charts controlled by a dropdown
    Args:
        df: the dataframe subsetted to district
        dropdown_list(list): a list of all the values in the dropdown menu,
        dropdown_field(str): column where the dropdown menu's values are drawn from,
    Returns:
        Returns two  bar charts that are controlled by a dropdown
    """
    # Create drop down menu
    input_dropdown = alt.binding_select(options=dropdown_list, name="Select")

    # The field tied to the drop down menu
    selection = alt.selection_single(fields=[dropdown_field], bind=input_dropdown)
    print(selection)
    
    #chart to get single agency vs total in district
    chart1 = ((utils.total_average_with_1op_chart(df, itp_id))
              .mark_trail()
              .add_selection(selection)
              .transform_filter(selection))

#     #for second chart: subset df
#     df_short = df>>select(_.agency_name, _.route_id,
#                               _.route_short_name, _.service_date, _.pct_w_vp)
#     #title
#     display(HTML(f"<strong>{(df.iloc[0]['agency_name'])} "
#              f"Average Percent of Scheduled Trips with Vehicle Postions Data "
#              f"by Route Over Time</strong>"))
#     display(Markdown("To utilize the multi-select, use `shift` when clicking routes in the legend"))
    
#     #line chart
#     line = alt.Chart(df_short).mark_line().encode(
#         x=alt.X('service_date', title=utils.labeling('service_date')),
#         y=alt.Y('pct_w_vp', title=('Percent with Vehicle Positions'), axis=alt.Axis(format='%')),
#         color=alt.Color('route_short_name', title=['Route Name']),
#         strokeDash='route_short_name').add_selection(selection)

#     line_chart = utils.add_tooltip(line, 'route_short_name','service_date', 'pct_w_vp')
#     line_chart = utils.add_chart_selection(line_chart, 'route_short_name')
#     line_chart.properties(width=700)
    
    # return input_dropdown
    return chart1 
# | line_chart

In [199]:
list(d_avg.agency_name.unique())

['Tri Delta Transit',
 None,
 'Stanford Marguerite Shuttle',
 'Sonoma County Transit',
 'Petaluma Transit',
 'San Francisco International Airport',
 'SamTrans',
 'Union City Transit',
 'Livermore Amador Valley Transit Authority',
 'Mission Bay TMA',
 'County Connection',
 'WestCat (Western Contra Costa)',
 'Rio Vista Delta Breeze',
 'VTA',
 'Capitol Corridor Joint Powers Authority',
 'Sonoma Marin Area Rail Transit',
 'Vacaville City Coach',
 'Golden Gate Transit',
 'Marin Transit',
 'VINE Transit',
 'Caltrain',
 'Petaluma',
 'SolTrans',
 'Santa Rosa CityBus',
 'PresidiGo',
 'Emery Go-Round',
 'Fairfield and Suisun Transit',
 'Commute.org Shuttles',
 'San Francisco Municipal Transportation Agency',
 'AC TRANSIT',
 'Bay Area Rapid Transit',
 'San Francisco Bay Ferry']

In [200]:
d_agency_list = d_avg.agency_name.unique().tolist()

In [201]:
d_agency_list

['Tri Delta Transit',
 None,
 'Stanford Marguerite Shuttle',
 'Sonoma County Transit',
 'Petaluma Transit',
 'San Francisco International Airport',
 'SamTrans',
 'Union City Transit',
 'Livermore Amador Valley Transit Authority',
 'Mission Bay TMA',
 'County Connection',
 'WestCat (Western Contra Costa)',
 'Rio Vista Delta Breeze',
 'VTA',
 'Capitol Corridor Joint Powers Authority',
 'Sonoma Marin Area Rail Transit',
 'Vacaville City Coach',
 'Golden Gate Transit',
 'Marin Transit',
 'VINE Transit',
 'Caltrain',
 'Petaluma',
 'SolTrans',
 'Santa Rosa CityBus',
 'PresidiGo',
 'Emery Go-Round',
 'Fairfield and Suisun Transit',
 'Commute.org Shuttles',
 'San Francisco Municipal Transportation Agency',
 'AC TRANSIT',
 'Bay Area Rapid Transit',
 'San Francisco Bay Ferry']

In [197]:
d_agency_list[:1]

['Tri Delta Transit']

In [198]:
d_agencies>>filter(_.agency_name=='Tri Delta Transit')

Unnamed: 0,calitp_itp_id,agency_name,n
30,336,Tri Delta Transit,15


In [186]:
d_agencies = (d>>group_by(_.calitp_itp_id, _.agency_name)>>count(_.agency_name))

In [187]:
d_agencies_list2 = d_agencies.set_index('agency_name')['calitp_itp_id'].to_dict()

In [188]:
#d_agencies_list3 = d_agencies_list2['Tri Delta Transit']

In [189]:
d_agencies_list3 = {'Tri Delta Transit':336}

In [190]:
d_agencies_list3['Tri Delta Transit']

336

In [192]:
d.sample()

Unnamed: 0,calitp_itp_id,agency_name,calitp_url_number,route_id,route_short_name,service_date,calitp_extracted_at,calitp_deleted_at,num_sched,num_vp,pct_w_vp,weekday,month,caltrans_district
2101,4,AC TRANSIT,1,65,65,2022-05-04,2022-03-01,2099-01-01,26,20,0.77,Wednesday,May,04 - Oakland


In [195]:
dual_chart_with_dropdown(
    d,
    dropdown_list = d_agency_list,
    dropdown_field= 'agency_name',
   # reference_dict= d_agencies_list3
    itp_id = 4)


Selection('selector026', SelectionDef({
  bind: BindRadioSelect({
    input: 'select',
    name: 'Select',
    options: ['Tri Delta Transit', None, 'Stanford Marguerite Shuttle', 'Sonoma County Transit', 'Petaluma Transit', 'San Francisco International Airport', 'SamTrans', 'Union City Transit', 'Livermore Amador Valley Transit Authority', 'Mission Bay TMA', 'County Connection', 'WestCat (Western Contra Costa)', 'Rio Vista Delta Breeze', 'VTA', 'Capitol Corridor Joint Powers Authority', 'Sonoma County Transit', 'Sonoma Marin Area Rail Transit', 'Vacaville City Coach', 'Golden Gate Transit', 'Marin Transit', 'VINE Transit', 'Caltrain', 'Petaluma', None, 'SolTrans', 'Santa Rosa CityBus', 'PresidiGo', 'Emery Go-Round', 'Fairfield and Suisun Transit', 'Commute.org Shuttles', 'San Francisco Municipal Transportation Agency', None, 'AC TRANSIT', 'SamTrans', None, 'Bay Area Rapid Transit', 'San Francisco Bay Ferry', 'SamTrans']
  }),
  fields: ['agency_name'],
  type: 'single'
}))


In [59]:
#using dataframe with the just the entries in the same district
(utils.total_average_with_1op_chart(d, 282)).mark_trail().encode(size='Percent with Vehicle Position Data:Q')

In [230]:
def dual_chart_with_dropdown2(
    df,
    dropdown_list: list,
    dropdown_field: str,
    x_axis_chart1: str,
    y_axis_chart1: str,
    color_col1: str,
    chart1_tooltip_cols: list,
    x_axis_chart2: str,
    y_axis_chart2: str,
    color_col2: str,
    chart2_tooltip_cols: list
):
    """Two bar charts controlled by a dropdown
    Args:
        df: the dataframe
        dropdown_list(list): a list of all the values in the dropdown menu,
        dropdown_field(str): column where the dropdown menu's values are drawn from,
        x_axis_chart1(str): x axis value for chart 1 - encode as Q or N,
        y_axis_chart1(str): y axis valuefor chart 1 - encode as Q or N,
        color_col1(str): column to color the graphs for chart 1,
        chart1_tooltip_cols(list): list of all the columns to populate the tooltip,
        x_axis_chart2(str): x axis value for chart 2 - encode as Q or N,
        y_axis_chart2(str): x axis value for chart 2 - encode as Q or N,
        color_col2(str): column to color the graphs for chart 2,
        chart2_tooltip_cols(list): list of all the columns to populate the tooltip,
    Returns:
        Returns two  bar charts that are controlled by a dropdown
    """
    # Create drop down menu
    input_dropdown = alt.binding_select(options=dropdown_list, name="Select ")

    # The field tied to the drop down menu
    selection = alt.selection_single(fields=[dropdown_field], bind=input_dropdown)
    df = (df>>filter(_.agency_name.notnull()))
    df_agg = (utils.get_agg_pct(df,
                            groupings = ['calitp_itp_id', 'agency_name', 'service_date'],
                            sum_sched = 'num_sched',
                            sum_vp = 'num_vp'))
    chart1 = (
        alt.Chart(df_agg)
        .mark_bar()
        .encode(
            x=x_axis_chart1,
            y=(y_axis_chart1),
            color=alt.Color(color_col1, scale=alt.Scale(range=cp.CALITP_DIVERGING_COLORS)))
        .properties(width=300)
        .add_selection(selection)
        .transform_filter(selection)
    )
    
    df_short = df>>select(_.agency_name, _.route_id, _.route_short_name, _.service_date, _.pct_w_vp)

    chart2 = (
        alt.Chart(df_short)
        .mark_bar()
        .encode(
            x=x_axis_chart2,
            y=(y_axis_chart2),
            color=alt.Color(
                color_col2, scale=alt.Scale(range=cp.CALITP_DIVERGING_COLORS), legend = None))
        .properties(width=300)
        .add_selection(selection)
        .transform_filter(selection)
    )

    return chart1 | chart2

In [231]:
dual_chart_with_dropdown2(
    d,
    dropdown_list = d_agency_list,
    dropdown_field= 'agency_name',
    x_axis_chart1 = 'service_date:O',
    y_axis_chart1= 'avg:Q',
    color_col1= 'avg:Q',
    chart1_tooltip_cols = ['agency_name','service_date', 'avg'],
    x_axis_chart2= 'service_date:O',
    y_axis_chart2= 'avg:Q',
    color_col2= 'route_short_name',
    chart2_tooltip_cols= ['agency_name', 'service_date', 'route_short_name', 'avg']
)