# Script work for Generating Charting Outputs

In [2]:
#! pip install cpi

In [1]:
import numpy as np
import pandas as pd
from siuba import *

import altair as alt
import altair_saver
from plotnine import *

from IPython.display import Markdown

from shared_utils import altair_utils
from shared_utils import geography_utils
from shared_utils import calitp_color_palette as cp
from shared_utils import styleguide

from calitp import to_snakecase
import intake

import _clean_data
import _dla_utils

E0401 21:12:56.722668268    1421 fork_posix.cc:70]           Fork support is only compatible with the epoll1 and poll polling strategies
E0401 21:12:57.155898279    1421 fork_posix.cc:70]           Fork support is only compatible with the epoll1 and poll polling strategies
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [2]:
pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:,.2f}'.format

In [3]:
df= pd.read_parquet("dla_df.parquet")

In [4]:
len(df)

21117

## Charts

In [5]:
def make_charts(df, subset):     
    df= pd.read_parquet("dla_df.parquet")
    
    df = (df>>filter(_.dist==subset))
    
    #subsetting the data
    df_years = _dla_utils.count_all_years(df)
    df_top = _dla_utils.find_top(df)
    
    
    ##print statements
    display(Markdown(f"# **District {subset} Analysis**"))
    
    
    display(Markdown(f"## **Quick Stats**"))
    
    display(Markdown(f'There are **{(df.primary_agency_name.nunique())} Unique Agencies**'))
    
    transit = (df>>filter(_.transit==1))
    display(Markdown(f"Out of **{len(df)}** obligations, **{len(transit)} are transit-related**."))
    display(Markdown(f"**{(transit>>count(_.primary_agency_name)>>arrange(-_.n)).iloc[0, 0]}** has the **highest transit** obligations"))
    
    q = df>>count(_.primary_agency_name)>>arrange(_.n)
    
    q2 = q.n.quantile(.95)
    display(Markdown(f"There are **{len(q>>filter(_.n> (q2)))} agencies have over {q2}** obligations (95th percentile) since {(df.prepared_y.min())}")
           )   
    
    q3 = q.n.quantile(.1)
    display(Markdown((f"There are **{len(q>>filter(_.n< (q3)))} agencies have less than {q3}** obligations (5th percentile) since {(df.prepared_y.min())}")))


    ##tables
    
    display(Markdown(f"**Number of Unique Prefix Codes by Agency**"))
    display((_dla_utils.get_nunique(df, 'prefix', 'primary_agency_name'))
            .rename(columns={'primary_agency_name':'Agency',
                            'n':'Number of Unqiue Prefix Codes'})
            .head(5))
    
    display(Markdown(f"**Number of Unique Agencies by Prefix Codes**"))
    display((_dla_utils.get_nunique(df, 'primary_agency_name', 'prefix'))
            .rename(columns={'prefix':'Prefix',
                            'n':'Number of Unqiue Agencies'})
            .head(5))
    
    display(Markdown(f"**Top 5 Types of Work**"))
    display((transit>>count(_.type_of_work)>>arrange(-_.n)>>select(_.type_of_work))
            .rename(columns={'type_of_work':'Type of Work'})
            .head(5))

       
    #Visual Charts

    display(Markdown(f"## **Number of Obligations**"))
    
    #Line chart for Obligations by Year
    chart_df = (df_top>>filter(_.variable=='prepared_y')).rename(columns= {"value":"Year"})
    
    chart1= (_dla_utils.basic_line_chart_test_no_save(chart_df, 'Year:O', 'count', subset, 'Obligations by Year'))
    
    display(chart1)
    
    # Unique Agencies by Dist
    dist_years_agency = ((
        df
        >> group_by(_.prepared_y, _.dist)
        >> summarize(n=_.primary_agency_name.nunique())
        >> arrange(-_.prepared_y)
    )
        .rename(columns={'dist':'District', 'n':'Count'})
    )
    

    chart10 = (alt.Chart(dist_years_agency).mark_bar().encode(
        column='District:N',
        x=alt.X('prepared_y:O', title='Prepared Year'),
        y=alt.Y('Count:Q', title='Number of Unique Agencies'),
        color = alt.Color("District:N", 
                              scale=alt.Scale(
                                  range=altair_utils.CALITP_SEQUENTIAL_COLORS),  
                               legend=alt.Legend(title="Prepared Year")
                              )
                              ).properties( 
                       title="Number of Unique Agencies by District"))
    chart10 = styleguide.preset_chart_config(chart10)
    chart10 = _dla_utils.add_tooltip(chart10, 'prepared_y', 'Count')
    display(chart10)
    
    # Unique Prefixes by Dist
    dist_years_prefix = ((
        df
        >> group_by(_.prepared_y, _.dist)
        >> summarize(n=_.prefix.nunique())
        >> arrange(-_.prepared_y)
    ).rename(columns={'dist':'District', 'n':'Count'}))

    chart11 = (alt.Chart(dist_years_prefix).mark_bar().encode(
        column='District:N',
        x=alt.X('prepared_y:O', title='Prepared Year'),
        y=alt.Y('Count:Q', title='Number of Unique Agencies'),
        color = alt.Color("District:N", 
                              scale=alt.Scale(
                                  range=altair_utils.CALITP_SEQUENTIAL_COLORS),  
                               legend=alt.Legend(title="District")
                              )
                              ).properties( 
                       title="Number of Unique Prefix Codes by District"))
    chart11 = styleguide.preset_chart_config(chart11)
    chart11 = _dla_utils.add_tooltip(chart11, 'prepared_y','Count')
    display(chart11)

    
    #Bar chart Agencies With The Most Obligations
    chart_df = (df_top>>filter(_.variable=='primary_agency_name')).rename(columns={"value":"Agency",
                                 "count":"Number of Obligations"})
    chart2= (_dla_utils.basic_bar_chart_no_save(chart_df, 'Agency', 'Number of Obligations', 'Agency', subset, 'Agencies With The Most Obligations'))
    display(chart2)
    
    
    display(Markdown(f"## **Prefix Codes**"))
    
    #Bar chart Agencies With The Most Unique Prefix Codes
    
    chart3 = (_dla_utils.basic_bar_chart_no_save(((_dla_utils.get_nunique(df, 'prefix', 'primary_agency_name')).head(30)),
                            'primary_agency_name', 'n', 'primary_agency_name', subset, 'Agencies With The Most Unique Prefix Codes'))
    display(chart3)
    
    # Bar chart Average Total Requested Funds by Prefix
    chart8 = (_dla_utils.basic_bar_chart_no_save((((_dla_utils.calculate_data_all(df, 'adjusted_total_requested', 'prefix', aggfunc="mean"))
                          >>arrange(-_.adjusted_total_requested)).head(30)), 'prefix','adjusted_total_requested', 'prefix', subset,
                           'Average Total Requested Funds by Prefix ($2021)'
                       ))
    display(chart8)
    
    #Bar chart with the Most Used Prefix Counts
    chart_df = (df_top>>filter(_.variable=='prefix')).rename(columns={"value":"Prefix",
                                 "count":"Number of Obligations"})
    chart9= (_dla_utils.basic_bar_chart_no_save(chart_df, 'Prefix', 'Number of Obligations', 'Prefix', subset, 'Most Used Prefix Codes'))
    display(chart9)
    
    
    display(Markdown(f"## **Funding Distribution**"))
    
    #Bar chart Average Total Requested Funds by Agency
    chart4=(_dla_utils.basic_bar_chart_no_save((((_dla_utils.calculate_data_all(df, 'adjusted_total_requested', 'primary_agency_name', aggfunc="mean"))
                          >>arrange(-_.adjusted_total_requested)).head(30)
                        ), 'primary_agency_name','adjusted_total_requested', 'primary_agency_name', subset,
                           'Average Total Requested Funds by Agency ($2021)'
                       ))
    display(chart4)
    
    
    #Bar chart Bottom Average Total Requested Funds by Agency
    avg_funds_bottom = (df>>group_by(_.primary_agency_name)>>summarize(avg_funds=_.adjusted_total_requested.mean())>>arrange(-_.avg_funds)).tail(50)

    chart5=( _dla_utils.basic_bar_chart_no_save((avg_funds_bottom.tail(40)), 'primary_agency_name','avg_funds', 'primary_agency_name', subset, 
                          'Lowest Average Total Funds by Agency ($2021'))
    display(chart5)
    
    
    #work categories info and charts
    display(Markdown(f"## **Work Categories**"))
    
    work_cat = ['active_transp', 'transit', 'bridge', 'street','freeway', 'infra_resiliency_er',
       'congestion_relief']
    
    for i in work_cat:
        subset = (df>>filter(_[i]==1))
        subset_2 = ((_dla_utils.find_top(subset))>>filter(_.variable=='primary_agency_name')
                >>select(_.value,_.count)).head(5)
        subset_2['Percent of Category'] = (((subset_2['count'])/(len(subset)))*100)
        subset_2 =subset_2.rename(columns = {'value':'Agency', 'count':f'{_dla_utils.labeling(i)} Obligations'})
    
    
        #generate chart:
    
        subset_3= ((subset.groupby(['primary_agency_name']).agg({i:'sum',
                                                    'process_days':'mean',
                                                    'adjusted_total_requested':'mean',
                                                    'adjusted_fed_requested':'mean',
                                                    'adjusted_ac_requested':'mean'}).reset_index())>>arrange(-_[i])).head(5)
    
        subset_3 =subset_3.rename(columns = {'primary_agency_name':'Agency',
                                         'adjusted_total_requested':'Total Requested',
                                         'adjusted_fed_requested':'Fed Requested',
                                         'adjusted_ac_requested':'AC Requested'})
    
        subset_4 = pd.melt(subset_3, id_vars=['Agency'],
                           value_vars=['Total Requested','Fed Requested','AC Requested'],
                           var_name='Categories', value_name='Funding Amount')

    
        chart = (alt.Chart(subset_4).mark_bar().encode(
            x=alt.X('Funding Amount', axis=alt.Axis(format='$.2s', title='Obligated Funding ($2021)')),
            y=alt.Y("Agency"),
            color = alt.Color("Categories:N", 
                                   scale=alt.Scale(
                                      range=altair_utils.CALITP_CATEGORY_BRIGHT_COLORS)),
            row='Categories:N'
        ))
        
        chart = _dla_utils.add_tooltip(chart, 'Agency','Funding Amount')
    
        display(Markdown(f'**Top Agencies using {_dla_utils.labeling(i)} Projects**'))
        display(subset_2.style.format(formatter={('Percent of Category'): "{:.2f}%"}))
        display(chart)


In [6]:
#parameters cell
subset = 4

In [7]:
make_charts(df, subset)

# **District 4 Analysis**

## **Quick Stats**

There are **151 Unique Agencies**

Out of **3417** obligations, **63 are transit-related**.

**Metropolitan Transportation Commission** has the **highest transit** obligations

There are **8 agencies have over 93.0** obligations (95th percentile) since 2011.0

There are **9 agencies have less than 2.0** obligations (5th percentile) since 2011.0

**Number of Unique Prefix Codes by Agency**

Unnamed: 0,Agency,Number of Unqiue Prefix Codes
28,Contra Costa County,18
107,San Francisco County,17
110,San Jose,17
119,Santa Clara County,17
2,Alameda County,16


**Number of Unique Agencies by Prefix Codes**

Unnamed: 0,Prefix,Number of Unqiue Agencies
91,STPL,112
19,CML,90
62,HSIPL,52
16,BRLS,32
75,RPSTPL,30


**Top 5 Types of Work**

Unnamed: 0,Type of Work
6,Construct Pedestrian Safety And Transit Access
14,"Implement Bike, Pedestrian, And Transit Access..."
28,"Ped. Crossing, Bike Racks, Bus Shelter"
4,Clipper Fare Collection System Phase 3 (tc)
7,Construction Of Transit Center Depot Building


## **Number of Obligations**

## **Prefix Codes**

## **Funding Distribution**

## **Work Categories**

**Top Agencies using Active Transportation Projects**

Unnamed: 0,Agency,Active Transportation Obligations,Percent of Category
0,San Jose,79,7.38%
1,Oakland,65,6.07%
2,"City & County of San Francisco, MTA/Parking & Traffic",46,4.30%
3,Sunnyvale,45,4.21%
4,Contra Costa County,39,3.64%


**Top Agencies using Transit Projects**

Unnamed: 0,Agency,Transit Obligations,Percent of Category
0,Metropolitan Transportation Commission,18,28.57%
1,Sunnyvale,6,9.52%
2,Oakland,4,6.35%
3,San Jose,4,6.35%
4,San Leandro,4,6.35%


**Top Agencies using Bridge Projects**

Unnamed: 0,Agency,Bridge Obligations,Percent of Category
0,Santa Clara County,68,14.17%
1,Contra Costa County,44,9.17%
2,Sonoma County,34,7.08%
3,Napa County,25,5.21%
4,Oakland,22,4.58%


**Top Agencies using Street Projects**

Unnamed: 0,Agency,Street Obligations,Percent of Category
0,Oakland,82,6.23%
1,Contra Costa County,81,6.16%
2,San Jose,66,5.02%
3,San Francisco County,45,3.42%
4,Sunnyvale,45,3.42%


**Top Agencies using Freeway Projects**

Unnamed: 0,Agency,Freeway Obligations,Percent of Category
0,Metropolitan Transportation Commission,11,28.21%
1,Contra Costa County,7,17.95%
2,Caltrans,4,10.26%
3,Metropolitan Transportation Commission - SAFE,3,7.69%
4,Alameda County Congestion Management Agency,2,5.13%


**Top Agencies using Infrastructure & Emergency Relief Projects**

Unnamed: 0,Agency,Infrastructure & Emergency Relief Obligations,Percent of Category
0,Contra Costa County,102,5.22%
1,Napa County,88,4.50%
2,Santa Clara County,82,4.20%
3,Sonoma County,71,3.63%
4,Oakland,70,3.58%


**Top Agencies using Congestion Relief Projects**

Unnamed: 0,Agency,Congestion Relief Obligations,Percent of Category
0,Contra Costa Transportation Authority,6,22.22%
1,Metropolitan Transportation Commission,5,18.52%
2,Sonoma County Transportation Authority,4,14.81%
3,San Mateo,3,11.11%
4,Solano Transportation Authority,3,11.11%


## Other

### test transit cat

In [8]:
transit = (df>>filter(_.transit==1))

In [9]:
len(transit)

316

In [10]:
print(f"Out of {len(df)} obligations, {len(transit)} are transit-related.")

Out of 21117 obligations, 316 are transit-related.


In [11]:
print(f"")




### test quantiles

In [12]:
q = df>>count(_.primary_agency_name)

In [13]:
q2 = q.n.quantile(.95)

In [14]:
len(q>>filter(_.n> (q2)))

31

In [15]:
print(f"There are {len(q>>filter(_.n> (q2)))} agencies have over {q2} obligations (95th percentile) since {(df.prepared_y.min())} ")

There are 31 agencies have over 132.10000000000002 obligations (95th percentile) since 2010.0 


In [16]:
q3 = q.n.quantile(.05)

In [17]:
len(q>>filter(_.n< (q3)))

31

In [18]:
print(f"There are {len(q>>filter(_.n< (q3)))} agencies have less than {q3} obligations since {(df.prepared_y.min())} ")


There are 31 agencies have less than 1.9000000000000021 obligations since 2010.0 


In [19]:
((transit>>count(_.primary_agency_name)>>arrange(-_.n)).iloc[0, 0])

'Los Angeles'

In [20]:
print(f"{((transit>>count(_.primary_agency_name)>>arrange(-_.n)).iloc[0, 0])} had the highest Transit obligations")

Los Angeles had the highest Transit obligations


In [21]:
dist_years_prefix = (
    df
    >> group_by(_.prepared_y, _.dist)
    >> summarize(n=_.prefix.nunique())
    >> arrange(-_.prepared_y)
).rename(columns={"dist": "District"})

In [22]:
dist_years_prefix

Unnamed: 0,prepared_y,District,n
117,2022.00,1,8
118,2022.00,2,9
119,2022.00,3,25
120,2022.00,4,19
121,2022.00,5,7
...,...,...,...
6,2012.00,8,2
1,2011.00,4,1
2,2011.00,8,3
3,2011.00,11,1


In [23]:
## adding the amount of agencies and prefix codes by year: what districts workload by year looks like

In [24]:

dist_years_agency = ((
    df
    >> group_by(_.prepared_y, _.dist)
    >> summarize(n=_.primary_agency_name.nunique())
    >> arrange(-_.prepared_y)
).rename(columns={"dist":"District"}))

In [25]:
dist_years_agency

Unnamed: 0,prepared_y,District,n
117,2022.00,1,5
118,2022.00,2,10
119,2022.00,3,28
120,2022.00,4,52
121,2022.00,5,13
...,...,...,...
6,2012.00,8,2
1,2011.00,4,1
2,2011.00,8,2
3,2011.00,11,1


In [26]:


chart = alt.Chart((dist_years_agency)).mark_bar().encode(
        column='District:N',
        x=alt.X('prepared_y:O', title='Prepared Year'),
        y=alt.Y('n:Q', title='Number of Unique Agencies'),
        color = alt.Color("District:N", 
                              scale=alt.Scale(
                                  range=altair_utils.CALITP_SEQUENTIAL_COLORS),  
                               legend=alt.Legend(title="District")
                              )
                              ).properties(title="Number of Unique Prefix Codes by District")

chart = styleguide.preset_chart_config(chart)
    
chart

In [27]:
#of agencies using certain prefix code

In [29]:
display(Markdown(f"**Test Work Categories Axis Modification**"))
    
work_cat = ['active_transp', 'transit', 'bridge', 'street','freeway', 'infra_resiliency_er',
       'congestion_relief']
    
for i in work_cat:
    subset = (df>>filter(_[i]==1))
    subset_2 = ((_dla_utils.find_top(subset))>>filter(_.variable=='primary_agency_name')
                >>select(_.value,_.count)).head(5)
    subset_2['Percent of Category'] = (((subset_2['count'])/(len(subset)))*100)
    subset_2 =subset_2.rename(columns = {'value':'Agency', 'count':f'{_dla_utils.labeling(i)} Obligations'})
    
    
    #generate chart:
    
    subset_3= ((subset.groupby(['primary_agency_name']).agg({i:'sum',
                                                    'process_days':'mean',
                                                    'adjusted_total_requested':'mean',
                                                    'adjusted_fed_requested':'mean',
                                                    'adjusted_ac_requested':'mean'}).reset_index())>>arrange(-_[i])).head(5)
    
    subset_3 =subset_3.rename(columns = {'primary_agency_name':'Agency',
                                         'adjusted_total_requested':'Total Requested',
                                         'adjusted_fed_requested':'Fed Requested',
                                         'adjusted_ac_requested':'AC Requested'})
    
    subset_4 = pd.melt(subset_3, id_vars=['Agency'],
        value_vars=['Total Requested','Fed Requested','AC Requested'],
        var_name='Categories', value_name='value'
        )

    chart = (alt.Chart(subset_4).mark_bar().encode(
        x=alt.X('value', axis=alt.Axis(format='$.2s', title='Obligated Funding ($2021)')),
        y=alt.Y("Agency"),
        color='Categories:N',
        row='Categories:N'
    ))
    
    display(Markdown(f'**Top Agencies using {_dla_utils.labeling(i)} Projects**'))
    display(subset_2.style.format(formatter={('Percent of Category'): "{:.2f}%"}))
    display(chart)
    

**Test Work Categories Axis Modification**

**Top Agencies using Active Transportation Projects**

Unnamed: 0,Agency,Active Transportation Obligations,Percent of Category
0,Los Angeles,153,3.11%
1,Sacramento County,112,2.28%
2,Los Angeles County,96,1.95%
3,Fresno,94,1.91%
4,El Dorado County,92,1.87%


**Top Agencies using Transit Projects**

Unnamed: 0,Agency,Transit Obligations,Percent of Category
0,Los Angeles,35,11.08%
1,Metropolitan Transportation Commission,18,5.70%
2,Stanislaus County,17,5.38%
3,Stockton,14,4.43%
4,Yosemite Area Regional Transportation System JPA,14,4.43%


**Top Agencies using Bridge Projects**

Unnamed: 0,Agency,Bridge Obligations,Percent of Category
0,Tulare County,110,3.14%
1,Fresno County,108,3.08%
2,Humboldt County,104,2.96%
3,El Dorado County,94,2.68%
4,Lake County,94,2.68%


**Top Agencies using Street Projects**

Unnamed: 0,Agency,Street Obligations,Percent of Category
0,Santa Cruz County,303,3.51%
1,Los Angeles,214,2.48%
2,Los Angeles County,195,2.26%
3,Fresno,194,2.24%
4,Stockton,181,2.09%


**Top Agencies using Freeway Projects**

Unnamed: 0,Agency,Freeway Obligations,Percent of Category
0,Caltrans,38,16.38%
1,San Bernardino Associated Governments,19,8.19%
2,Placer County Transportation Planning Agency,15,6.47%
3,Orange County Transportation Authority,14,6.03%
4,Bakersfield,13,5.60%


**Top Agencies using Infrastructure & Emergency Relief Projects**

Unnamed: 0,Agency,Infrastructure & Emergency Relief Obligations,Percent of Category
0,Humboldt County,596,4.77%
1,Santa Cruz County,416,3.33%
2,Los Angeles County,318,2.55%
3,Los Angeles,219,1.75%
4,Mendocino County,215,1.72%


**Top Agencies using Congestion Relief Projects**

Unnamed: 0,Agency,Congestion Relief Obligations,Percent of Category
0,Kern County Council of Governments,20,12.66%
1,Stanislaus Council of Governments,18,11.39%
2,Modesto,17,10.76%
3,San Diego Association of Governments,12,7.59%
4,Ventura County Transportation Commission,10,6.33%


## To add
* most agencies use # of prefix codes
* ~~fix chart output labels~~
* ~~change axis on charts for Project Categories to $M~~
    * [currency formatter](https://github.com/d3/d3-format#precisionRound)
* ~~add tool tips~~
* ~~alt colors in work cat charts~~
