# Script work for Generating Charting Outputs

In [3]:
#! pip install cpi

In [2]:
import numpy as np
import pandas as pd
from siuba import *

import altair as alt
import altair_saver
from plotnine import *

from IPython.display import Markdown

from shared_utils import altair_utils
from shared_utils import geography_utils
from shared_utils import calitp_color_palette as cp
from shared_utils import styleguide

from calitp import to_snakecase
import intake

import _clean_data
import _dla_utils

In [3]:
pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:,.2f}'.format

In [4]:
df= pd.read_parquet("dla_df.parquet")

In [5]:
len(df)

21059

In [8]:
#subset= 11

In [9]:
#df = df>>filter(_.dist==subset)

In [6]:
def labeling(word):
    # Add specific use cases where it's not just first letter capitalized
    LABEL_DICT = { "prepared_y": "Year",
              "dist": "District",
              "total_requested": "Total Requested",
              "fed_requested":"Fed Requested",
              "ac_requested": "Advance Construction Requested",
              "nunique":"Number of Unique",
              "project_no": "Project Number"}
    
    if (word == "mpo") or (word == "rtpa"):
        word = word.upper()
    elif word in LABEL_DICT.keys():
        word = LABEL_DICT[word]
    else:
        word = word.replace('n_', 'Number of ').title()
        word = word.replace('unique_', "Number of Unique ").title()
        word = word.replace('_', ' ').title()
    
    return word


In [166]:
def basic_bar_chart_test(df, x_col, y_col, color_col, subset, chart_title=''):

    if chart_title == "":
        chart_title = (f"{labeling(x_col)} by {labeling(y_col)}")

    
    chart = (alt.Chart(df)
             .mark_bar()
             .encode(
                 x=alt.X(x_col, title=labeling(x_col), sort=('-y')),
                 y=alt.Y(y_col, title=labeling(y_col)),
                 color = alt.Color(color_col,
                                  scale=alt.Scale(
                                      range=altair_utils.CALITP_CATEGORY_BRIGHT_COLORS),
                                      legend=alt.Legend(title=(labeling(color_col)))
                                  ))
             .properties( 
                          title=chart_title)
    )

    chart=styleguide.preset_chart_config(chart)
   # chart.save(f"./chart_outputs/d{subset}_outputs/bar_{x_col}_by_{y_col}.png")
    
    return chart


def basic_line_chart_test(df, x_col, y_col, subset, chart_title=''):
    
    if chart_title == "":
        chart_title = (f"{labeling(x_col)} by {labeling(y_col)}")
    
    
    chart = (alt.Chart(df)
             .mark_line()
             .encode(
                 x=alt.X(x_col, title=labeling(x_col)),
                 y=alt.Y(y_col, title=labeling(y_col))
                                   )
              ).properties( 
                          title=chart_title)

    chart=styleguide.preset_chart_config(chart)
    #chart.save(f"./chart_outputs/d{subset}_outputs/line_{x_col}_by_{y_col}.png")
    
    return chart


In [167]:
#df_years= _dla_utils.count_all_years(df)

In [168]:
#basic_bar_chart_test(df_years, 'prepared_y', 'total_requested_sum', 'prepared_y', subset, 'Total Requested Funds by Year')


In [169]:
#df = (df>>filter(_.dist==subset))
#df_top = _dla_utils.find_top(df)

#basic_bar_chart_test((df_top>>filter(_.variable=='primary_agency_name')), 'value', 'count', 'count', subset, chart_title='Primary Agency name')

In [170]:
df_top = _dla_utils.find_top(df)

In [171]:
#df_top>>filter(_.variable=='prefix')


## Transit Agencies

In [172]:
# transit_agencies = df[df['primary_agency_name'].str.contains(
#               'Transit|tranist|Rail',
#               case=False, na=False)]

In [173]:
#transit_agencies>>count(_.primary_agency_name)

In [174]:
##Add to chart function
# chart_title='Obligations by Year'
# chart_title.replace(" ", "_")

## Charts

In [187]:
def make_charts(df, subset):     
    df= pd.read_parquet("dla_df.parquet")
    
    df = (df>>filter(_.dist==subset))
    
    #subsetting the data
    df_years = _dla_utils.count_all_years(df)
    df_top = _dla_utils.find_top(df)
    
    
    ##print statements
    display(Markdown(f"# **District {subset} Analysis**"))
    
    display(Markdown(f"**Unique Agencies Funding**"))
    display(Markdown(f'There are {(df.primary_agency_name.nunique())} Unique Agencies'))
    
    
    display(Markdown(f"Out of {len(df)} obligations, {len(transit)} are transit-related."))
    display(Markdown(f"{(transit>>count(_.primary_agency_name)>>arrange(-_.n)).iloc[0, 0]} has the highest Transit obligations"))
    
    q = df>>count(_.primary_agency_name)>>arrange(_.n)
    
    q2 = q.n.quantile(.95)
    display(Markdown(f"There are {len(q>>filter(_.n> (q2)))} agencies have over {q2} obligations (95th percentile) since {(df.prepared_y.min())}")
           )   
    
    q3 = q.n.quantile(.1)
    display(Markdown((f"There are {len(q>>filter(_.n< (q3)))} agencies have less than {q3} obligations since {(df.prepared_y.min())}")))


    ##tables
    
    display(Markdown(f"**Number of Unique Prefix Codes by Agency**"))
    display((_dla_utils.get_nunique(df, 'prefix', 'primary_agency_name')).head(5))
    
    display(Markdown(f"**Number of Unique Agencies by Prefix Codes**"))
    display((_dla_utils.get_nunique(df, 'primary_agency_name', 'prefix')).head(5))
    
    display(Markdown(f"**Top 5 Types of Work**"))
    display((transit>>count(_.type_of_work)>>arrange(-_.n)>>select(_.type_of_work)).head(5))
    
    
#     ###charting df_years
    
#     column_names = list(df_years)

#     for column in column_names:
    
#         display(Markdown(f"**{labeling(column)} Over Prepared Year**"))
#         display(basic_bar_chart_test(df_years, "prepared_y", column, "prepared_y", subset))
    
    
#     ###charting df_tops 
    
#     values = sorted(df_top['variable'].unique())

#     for value in values:
#         display(Markdown(f"**Top Values in {labeling(value)}**"))
#         display(basic_bar_chart_test((df_top>>filter(_.variable==value)), 'value', 'count', 'count', subset,
#                                     chart_title=f'Top 20 {labeling(value)}'))
        
        
       
   #Other Charting
    display(Markdown(f"**Number of Obligations**"))
    
    #Line chart for Obligations by Year
    chart1= (basic_line_chart_test((df_top>>filter(_.variable=='prepared_y')), 'value', 'count', subset, 'Obligations by Year'))
    display(chart1)
    
    
    #Bar chart Agencies With The Most Obligations
    chart2= (basic_bar_chart_test((df_top>>filter(_.variable=='primary_agency_name')), 'value', 'count', 'value', subset, 'Agencies With The Most Obligations'))
    display(chart2)
    
    
    display(Markdown(f"**Prefix Codes**"))
    
    #Bar chart Agencies With The Most Unique Prefix Codes
    chart3 = (basic_bar_chart_test(((_dla_utils.get_nunique(df, 'prefix', 'primary_agency_name')).head(30)),
                            'primary_agency_name', 'n', 'primary_agency_name', subset, 'Agencies With The Most Unique Prefix Codes'))
    display(chart3)
    
    # Bar chart Average Total Requested Funds by Prefix
    chart8 = (basic_bar_chart_test((((_dla_utils.calculate_data_all(df, 'total_requested', 'prefix', aggfunc="mean"))
                          >>arrange(-_.total_requested)).head(30)), 'prefix','total_requested', 'prefix', subset,
                           'Average Total Requested Funds by Prefix'
                       ))
    display(chart8)
    
    #Bar chart with the Most Used Prefix Counts
    chart9= (basic_bar_chart_test((df_top>>filter(_.variable=='prefix')), 'value', 'count', 'value', subset, 'Most Used Prefix Codes'))
    display(chart9)
    
    
    display(Markdown(f"**Funding Amounts**"))
    
    #Bar chart Average Total Requested Funds by Agency
    chart4=(basic_bar_chart_test((((_dla_utils.calculate_data_all(df, 'total_requested', 'primary_agency_name', aggfunc="mean"))
                          >>arrange(-_.total_requested)).head(30)
                        ), 'primary_agency_name','total_requested', 'primary_agency_name', subset,
                           'Average Total Requested Funds by Agency'
                       ))
    display(chart4)
    
    
    #Bar chart Bottom Average Total Requested Funds by Agency
    avg_funds_bottom = (df>>group_by(_.primary_agency_name)>>summarize(avg_funds=_.total_requested.mean())>>arrange(-_.avg_funds)).tail(50)

    chart5=( basic_bar_chart_test((avg_funds_bottom.tail(40)), 'primary_agency_name','avg_funds', 'primary_agency_name', subset, 
                          'Lowest Average Total Funds by Agency'))
    display(chart5)
   

    
    #Bar chart Average Total Requested Funds by Agency: Transit Related Funding
#     chart6 = (basic_bar_chart_test((((_dla_utils.calculate_data_all(transit, 'total_requested', 'primary_agency_name', aggfunc="mean"))
#                           >>arrange(-_.total_requested)).head(30)
#                         ), 'primary_agency_name','total_requested', 'primary_agency_name', subset,
#                            'Average Total Requested Funds by Agency: Transit Related Funding'
#                        ))
#     display(chart6)
    
    # #Bar chart Transit Agencies Average Funding
    # chart7 = ( basic_bar_chart_test((_dla_utils.calculate_data_all(transit_agencies, 'total_requested', aggregate_by='primary_agency_name', aggfunc="mean")),
    #                        'primary_agency_name', 'total_requested', 'primary_agency_name', subset,
    #                        'Transit Agencies Average Funding'))
    # display(chart7)
    
    
    
    #work categories info and charts
    display(Markdown(f"**Work Categories**"))
    
    work_cat = ['active_transp', 'transit', 'bridge', 'street','freeway', 'infra_resiliency_er',
       'congestion_relief']
    
    for i in work_cat:
        subset = (df>>filter(_[i]==1))
        subset_2 = ((_dla_utils.find_top(subset))>>filter(_.variable=='primary_agency_name')
                >>select(_.value,_.count)).head(5)
        subset_2['Percent of Category'] = (((subset_2['count'])/(len(subset)))*100)
        subset_2 =subset_2.rename(columns = {'value':'Agency', 'count':f'{_dla_utils.labeling(i)} Obligations'})
    
    
        #generate chart:
    
        subset_3= ((subset.groupby(['primary_agency_name']).agg({i:'sum',
                                                    'process_days':'mean',
                                                    'adjusted_total_requested':'mean',
                                                    'adjusted_fed_requested':'mean',
                                                    'adjusted_ac_requested':'mean'}).reset_index())>>arrange(-_[i])).head(5)
    
        subset_3 =subset_3.rename(columns = {'primary_agency_name':'Agency',
                                         'adjusted_total_requested':'Total Requested',
                                         'adjusted_fed_requested':'Fed Requested',
                                         'adjusted_ac_requested':'AC Requested'})
    
        subset_4 = pd.melt(subset_3, id_vars=['Agency'],
            value_vars=['Total Requested','Fed Requested','AC Requested'],
            var_name='Categories', value_name='value'
           )

        ## following cell block makes it hard to change name
        # subset_4 = (subset_3 >> gather('category', 'value', _.adjusted_total_requested,
        #                 _.adjusted_fed_requested,
        #                 _.adjusted_ac_requested,
        #                 ))
    
        chart = (alt.Chart(subset_4).mark_bar().encode(
            x=alt.X('value', axis=alt.Axis(format='$', title='Obligated Funding ($2021)')),
            y=alt.Y("Agency"),
            color='Categories:N',
            row='Categories:N'
        ))
    
        display(Markdown(f'**Top Agencies using {_dla_utils.labeling(i)} Projects**'))
        display(subset_2.style.format(formatter={('Percent of Category'): "{:.2f}%"}))
        display(chart)
    
    

In [188]:
#parameters cell
subset = 4

In [189]:
make_charts(df, subset)

# **District 4 Analysis**

**Unique Agencies Funding**

There are 151 Unique Agencies

Out of 3404 obligations, 432 are transit-related.

Los Angeles has the highest Transit obligations

There are 8 agencies have over 92.5 obligations (95th percentile) since 2011.0

There are 9 agencies have less than 2.0 obligations since 2011.0

**Number of Unique Prefix Codes by Agency**

Unnamed: 0,primary_agency_name,n
28,Contra Costa County,18
107,San Francisco County,17
110,San Jose,17
119,Santa Clara County,17
2,Alameda County,16


**Number of Unique Agencies by Prefix Codes**

Unnamed: 0,prefix,n
91,STPL,112
19,CML,90
62,HSIPL,52
16,BRLS,32
75,RPSTPL,30


**Top 5 Types of Work**

Unnamed: 0,type_of_work
137,"Public Outreach And Marketing For ""the Bus"" (tc)"
12,Bridge Rail Replacement
13,Bridge Rail Replacement (tc)
208,Upgrade Traffic System Controller; Brt Phase( Tc)
212,Yarts Public Outreach And Marketing (tc)


**Number of Obligations**

**Prefix Codes**

**Funding Amounts**

**Work Categories**

**Top Agencies using Active Transp Projects**

Unnamed: 0,Agency,Active Transp Obligations,Percent of Category
0,San Jose,79,7.41%
1,Oakland,65,6.10%
2,"City & County of San Francisco, MTA/Parking & Traffic",46,4.32%
3,Sunnyvale,45,4.22%
4,Contra Costa County,39,3.66%


**Top Agencies using Transit Projects**

Unnamed: 0,Agency,Transit Obligations,Percent of Category
0,Metropolitan Transportation Commission,18,23.08%
1,Contra Costa County,6,7.69%
2,Sunnyvale,6,7.69%
3,Marin County,4,5.13%
4,Oakland,4,5.13%


**Top Agencies using Bridge Projects**

Unnamed: 0,Agency,Bridge Obligations,Percent of Category
0,Santa Clara County,68,14.23%
1,Contra Costa County,44,9.21%
2,Sonoma County,33,6.90%
3,Napa County,25,5.23%
4,Oakland,22,4.60%


**Top Agencies using Street Projects**

Unnamed: 0,Agency,Street Obligations,Percent of Category
0,Oakland,82,6.25%
1,Contra Costa County,81,6.17%
2,San Jose,66,5.03%
3,San Francisco County,45,3.43%
4,Sunnyvale,45,3.43%


**Top Agencies using Freeway Projects**

Unnamed: 0,Agency,Freeway Obligations,Percent of Category
0,Metropolitan Transportation Commission,11,28.21%
1,Contra Costa County,7,17.95%
2,Caltrans,4,10.26%
3,Metropolitan Transportation Commission - SAFE,3,7.69%
4,Alameda County Congestion Management Agency,2,5.13%


**Top Agencies using Infra Resiliency Er Projects**

Unnamed: 0,Agency,Infra Resiliency Er Obligations,Percent of Category
0,Contra Costa County,102,5.24%
1,Napa County,88,4.52%
2,Santa Clara County,82,4.21%
3,Oakland,70,3.60%
4,Sonoma County,69,3.55%


**Top Agencies using Congestionumber Of Relief Projects**

Unnamed: 0,Agency,Congestionumber Of Relief Obligations,Percent of Category
0,Contra Costa Transportation Authority,6,22.22%
1,Metropolitan Transportation Commission,5,18.52%
2,Sonoma County Transportation Authority,4,14.81%
3,San Mateo,3,11.11%
4,Solano Transportation Authority,3,11.11%


In [165]:
transit = (df>>filter(_.transit==1))

In [13]:
len(transit)

432

In [15]:
print(f"Out of {len(df)} obligations, {len(transit)} are transit-related.")

Out of 21059 obligations, 432 are transit-related.


In [16]:
print(f"")




In [19]:
q = df>>count(_.primary_agency_name)

In [52]:
q2 = q.n.quantile(.95)

In [54]:
len(q>>filter(_.n> (q2)))

30

In [60]:
print(f"There are {len(q>>filter(_.n> (q2)))} agencies have over {q2} obligations (95th percentile) since {(df.prepared_y.min())} ")

There are 30 agencies have over 133.0 obligations (95th percentile) since 2010.0 


In [108]:
q3 = q.n.quantile(.05)

In [109]:
len(q>>filter(_.n< (q3)))

31

In [113]:
print(f"There are {len(q>>filter(_.n< (q3)))} agencies have less than {q3} obligations since {(df.prepared_y.min())} ")


There are 31 agencies have less than 1.9000000000000021 obligations since 2010.0 


In [95]:
((transit>>count(_.primary_agency_name)>>arrange(-_.n)).iloc[0, 0])

'Los Angeles'

In [98]:
print(f"{((transit>>count(_.primary_agency_name)>>arrange(-_.n)).iloc[0, 0])} had the highest Transit obligations")

Los Angeles had the highest Transit obligations


## To add
* most agencies use # of prefix codes
* fix chart output labels