# Project Categories Analysis  

In [1]:
import pandas as pd
from siuba import *
import numpy as np

In [2]:
import altair as alt
from shared_utils import geography_utils
from shared_utils import altair_utils
from shared_utils import calitp_color_palette as cp
from shared_utils import styleguide

E0317 15:12:03.567689804     863 fork_posix.cc:70]           Fork support is only compatible with the epoll1 and poll polling strategies
E0317 15:12:06.221515688     863 fork_posix.cc:70]           Fork support is only compatible with the epoll1 and poll polling strategies


In [3]:
from IPython.display import Markdown

In [4]:
import _dla_utils

In [5]:
pd.set_option("display.max_columns", 100)
pd.set_option('display.max_colwidth', 1000)

In [6]:
df= pd.read_parquet("dla_df.parquet")

In [7]:
# Total processing time per record: district + HQ + FHWA processing days.
# Plain `+` propagates missing values, so a record missing any one stage gets NaN here.
df['process_days']= (df['dist_processing_days'] + df['hq_processing_days'] + df['fhwa_processing_days'])

In [8]:
df.sample()

Unnamed: 0,location,prefix,project_no,agency,prepared_date,submit__to_hq_date,hq_review_date,submit_to_fhwa_date,to_fmis_date,fed_requested,ac_requested,total_requested,status_comment,locode,dist,status,dist_processing_days,hq_processing_days,fhwa_processing_days,ftip_no,project_location,type_of_work,seq,date_request_initiated,date_completed_request,mpo,projectID,prepared_y,primary_agency_name,adjusted_total_requested,adjusted_fed_requested,adjusted_ac_requested,active_transp,transit,bridge,street,freeway,infra_resiliency_er,congestion_relief,work_categories,process_days
15828,Obligated,CMLN,5466(021),Yucca Valley,2019-11-12,2019-12-12,2019-12-30,2019-12-30,2020-01-24,0.0,0.0,21052.34,Authorized,5466,8,E-76 approved on,44.0,18.0,25.0,20150301,Twentynine Palms Highway (sr 62) From Sage Avenue To The Intersection Of Yucca,Traffic Light Synchronization,2,2019-10-29,2019-12-12,SCAG,5466,2019.0,Yucca Valley,22313.27,0.0,0.0,0,0,0,1,0,0,0,1,87.0


In [9]:
df.work_categories.value_counts()

1    8816
2    8731
0    1958
3    1444
4      68
5       7
Name: work_categories, dtype: int64

## Group By: Prepared Year 

In [10]:
# For each prepared year, add up every work-category column so each cell holds the
# yearly total of that category's values. The result is indexed by `prepared_y`.
category_cols = ['active_transp', 'transit', 'bridge', 'street', 'freeway',
                 'infra_resiliency_er', 'congestion_relief']
sum_groups = df.groupby('prepared_y')[category_cols].sum()
sum_groups

Unnamed: 0_level_0,active_transp,transit,bridge,street,freeway,infra_resiliency_er,congestion_relief
prepared_y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010.0,0,0,0,1,0,0,0
2011.0,0,0,1,3,1,2,0
2012.0,4,0,0,2,0,0,0
2013.0,39,11,36,58,2,102,0
2014.0,631,112,431,1045,12,1487,14
2015.0,600,105,455,924,23,1427,16
2016.0,606,118,459,952,52,1360,19
2017.0,599,113,420,976,25,1363,19
2018.0,576,138,423,1142,22,1781,28
2019.0,586,158,419,1167,31,1697,17


In [11]:
# Add a 'total' row holding the column sums across all years.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the row is
# built explicitly and attached with pd.concat instead.
# The guard keeps the cell safe to re-run: without it a second run would fold the
# existing 'total' row into a new, doubled total.
if 'total' not in sum_groups.index:
    total_row = (
        sum_groups.sum()
        .rename('total')
        .to_frame()
        .T
        .rename_axis(sum_groups.index.name)  # keep the index name so reset_index() still yields 'prepared_y'
    )
    sum_groups = pd.concat([sum_groups, total_row])



In [12]:
pct_years = sum_groups.copy()

In [13]:
pct_years['totals'] = pct_years.sum(numeric_only=True, axis=1)

In [14]:

# Express every category count as a percentage of its row's `totals` value,
# stored in new `pct_<category>` columns.
cols = ['active_transp', 'transit', 'bridge', 'street','freeway', 'infra_resiliency_er',
       'congestion_relief']
pct_years = pct_years.assign(
    **{f'pct_{name}': (pct_years[name] / pct_years['totals']) * 100 for name in cols}
)


In [15]:
# Remove the aggregate 'total' row so only per-year rows remain.
# Dropping by label instead of "whatever row is last" keeps this cell safe to re-run:
# a second run is a no-op rather than silently deleting the most recent year.
pct_years = pct_years.drop(index='total', errors='ignore')

In [16]:
pct_years= pct_years.reset_index()

In [17]:
def labeling(word):
    """Turn a column or metric name into a human-readable chart label.

    Rules, applied in order:
      * "mpo" / "rtpa" are acronyms and are upper-cased.
      * Names listed in LABEL_DICT get their hand-written label.
      * Otherwise a leading "unique_" becomes "Number of Unique " and a leading
        "n_" becomes "Number of "; underscores become spaces and the result is
        title-cased.

    The "n_" / "unique_" markers are only honoured as prefixes. Replacing them
    anywhere in the string mangled names that merely contain "n_", e.g.
    "congestion_relief" was rendered as "Congestionumber Of Relief".

    Parameters
    ----------
    word : str
        Column or metric name, e.g. "prepared_y" or "infra_resiliency_er".

    Returns
    -------
    str
        Display label for axis titles, legends and headings.
    """
    # Add specific use cases where it's not just first letter capitalized
    LABEL_DICT = { "prepared_y": "Year",
              "dist": "District",
              "total_requested": "Total Requested",
              "fed_requested":"Fed Requested",
              "ac_requested": "Advance Construction Requested",
              "nunique":"Number of Unique",
              "project_no": "Project Number"}

    if word in ("mpo", "rtpa"):
        return word.upper()
    if word in LABEL_DICT:
        return LABEL_DICT[word]

    # Check "unique_" first: it is the more specific prefix.
    if word.startswith('unique_'):
        word = 'Number of Unique ' + word[len('unique_'):]
    elif word.startswith('n_'):
        word = 'Number of ' + word[len('n_'):]

    return word.replace('_', ' ').title()
    

#### selecting just the percentages

In [18]:
pct_years= pct_years>>select(_.prepared_y, _.pct_active_transp, _.pct_transit, _.pct_bridge,
                    _.pct_street, _.pct_freeway, _.pct_infra_resiliency_er,
                    _.pct_congestion_relief)

In [19]:
# Reshape wide -> long: one row per (remaining columns, category) pair, with the
# category name in `category` and its percentage in `percent`.
pct_value_cols = ['pct_active_transp', 'pct_transit', 'pct_bridge', 'pct_street',
                  'pct_freeway', 'pct_infra_resiliency_er', 'pct_congestion_relief']
pct_years = pd.melt(
    pct_years,
    id_vars=[name for name in pct_years.columns if name not in pct_value_cols],
    value_vars=pct_value_cols,
    var_name='category',
    value_name='percent',
)


In [20]:
pct_years

Unnamed: 0,prepared_y,category,percent
0,2010.00,pct_active_transp,0.00
1,2011.00,pct_active_transp,0.00
2,2012.00,pct_active_transp,66.67
3,2013.00,pct_active_transp,15.73
4,2014.00,pct_active_transp,16.91
...,...,...,...
86,2018.00,pct_congestion_relief,0.68
87,2019.00,pct_congestion_relief,0.42
88,2020.00,pct_congestion_relief,0.47
89,2021.00,pct_congestion_relief,0.68


#### Charting

In [21]:
# Hover selection: marks matching the hovered (category, year) get the larger line size.
highlight = alt.selection(type='single', on="mouseover",
                              fields=['category', "prepared_y"], nearest=True)
# Legend selection: categories picked in the legend stay opaque, the others fade to 0.2.
selection = alt.selection_multi(
                          fields=['category'], bind='legend')


# One line per work category showing its percentage share for each prepared year.
chart = (alt.Chart(pct_years).mark_line().encode(
    x=alt.X('prepared_y:O', title=labeling('prepared_y')),
    y=alt.Y('percent', title=labeling('percent')),
    color=alt.Color("category:N", title = "Work Category"),
    tooltip = alt.Tooltip(["prepared_y", 'percent', "category"]),
    opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
    size=alt.condition(~highlight, alt.value(2), alt.value(5))
    ).properties(title={
                 # Title typo fixed: "Catergories" -> "Categories".
                 "text": ["Percent Breakdown of Work Categories"]}).add_selection(
                 selection, highlight)
    )

# Apply the shared chart styling before rendering.
chart = styleguide.preset_chart_config(chart)
display(chart)

## Totals over the Years

In [22]:
# Look the totals row up by its label. The hard-coded position 13 only pointed at
# 'total' for one particular number of years and would silently return a year row
# (or raise) once the data gains or loses a year.
sum_groups.loc['total']

active_transp           4892
transit                 1115
bridge                  3483
street                  8600
freeway                  230
infra_resiliency_er    12417
congestion_relief        158
Name: total, dtype: int64

In [23]:
# Transpose so each work category is a row and each year (plus 'total') is a column.
# The earlier bare `sums.reset_index()` is gone: its result was discarded, so it had no effect.
sums = sum_groups.transpose()
sums

prepared_y,2010.0,2011.0,2012.0,2013.0,2014.0,2015.0,2016.0,2017.0,2018.0,2019.0,2020.0,2021.0,2022.0,total
active_transp,0,0,4,39,631,600,606,599,576,586,536,605,110,4892
transit,0,0,0,11,112,105,118,113,138,158,140,172,48,1115
bridge,0,1,0,36,431,455,459,420,423,419,351,403,85,3483
street,1,3,2,58,1045,924,952,976,1142,1167,1078,1032,220,8600
freeway,0,1,0,2,12,23,52,25,22,31,22,33,7,230
infra_resiliency_er,0,2,0,102,1487,1427,1360,1363,1781,1697,1446,1388,364,12417
congestion_relief,0,0,0,0,14,16,19,19,28,17,17,25,3,158


In [24]:
sums['total'].sum()

30895

#### get a total percentage over all the years

In [25]:
sums['total_pct'] = (sums['total']/(sums['total'].sum()))*100

In [26]:
sums

prepared_y,2010.0,2011.0,2012.0,2013.0,2014.0,2015.0,2016.0,2017.0,2018.0,2019.0,2020.0,2021.0,2022.0,total,total_pct
active_transp,0,0,4,39,631,600,606,599,576,586,536,605,110,4892,15.83
transit,0,0,0,11,112,105,118,113,138,158,140,172,48,1115,3.61
bridge,0,1,0,36,431,455,459,420,423,419,351,403,85,3483,11.27
street,1,3,2,58,1045,924,952,976,1142,1167,1078,1032,220,8600,27.84
freeway,0,1,0,2,12,23,52,25,22,31,22,33,7,230,0.74
infra_resiliency_er,0,2,0,102,1487,1427,1360,1363,1781,1697,1446,1388,364,12417,40.19
congestion_relief,0,0,0,0,14,16,19,19,28,17,17,25,3,158,0.51


#### select just the total numbers and percentage

In [27]:
sums_short = (sums>>select(_.total, _.total_pct)).reset_index()

In [28]:
sums_short.rename(columns={'index':'cat'}, inplace=True)

In [29]:
sums_short

prepared_y,cat,total,total_pct
0,active_transp,4892,15.83
1,transit,1115,3.61
2,bridge,3483,11.27
3,street,8600,27.84
4,freeway,230,0.74
5,infra_resiliency_er,12417,40.19
6,congestion_relief,158,0.51


#### Charting

In [30]:
# Donut chart of `total_pct` per category, coloured with the Cal-ITP bright palette.
category_colors = alt.Color(
    "cat:N",
    scale=alt.Scale(range=altair_utils.CALITP_CATEGORY_BRIGHT_COLORS),
    legend=alt.Legend(title='Project Categories'),
)
chart = (
    alt.Chart(sums_short)
    .mark_arc(innerRadius=50)
    .encode(theta=alt.Theta("total_pct:Q"), color=category_colors)
)
chart

## By District

In [31]:
# Sum every work-category column per district; the result is indexed by `dist`.
sum_dist = (df.groupby(['dist'])
               .agg({'active_transp':'sum', 'transit':'sum', 'bridge':'sum',
                      'street':'sum','freeway':'sum','infra_resiliency_er':'sum',
                     'congestion_relief':'sum'}).reset_index())
sum_dist.set_index('dist', inplace=True)

# Add a 'total' row with the column sums across all districts.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the row is
# built explicitly and attached with pd.concat. The index name is kept so a later
# reset_index() still produces a 'dist' column.
dist_total_row = (
    sum_dist.sum()
    .rename('total')
    .to_frame()
    .T
    .rename_axis(sum_dist.index.name)
)
sum_dist = pd.concat([sum_dist, dist_total_row])

# Row-wise sum across all categories, used below as the percentage denominator.
sum_dist['totals'] = sum_dist.sum(numeric_only=True, axis=1)
sum_dist



Unnamed: 0_level_0,active_transp,transit,bridge,street,freeway,infra_resiliency_er,congestion_relief,totals
dist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0,0,0,0,1,3,0,4
1,133,30,282,129,2,1017,0,1593
2,69,58,272,224,7,481,2,1113
3,799,156,618,947,26,1613,12,4171
4,1065,193,475,1313,39,1943,27,5055
5,165,120,349,546,4,997,4,2185
6,720,137,351,1498,29,1800,20,4555
7,821,178,298,1432,40,1493,10,4272
8,253,53,250,576,34,816,15,1997
9,58,24,12,168,0,152,0,414


In [32]:
dpct = sum_dist.copy()

#### get percentages by districts

In [33]:

# Convert each row's category counts into a percentage of that row's `totals`.
cols = ['active_transp', 'transit', 'bridge', 'street','freeway', 'infra_resiliency_er',
       'congestion_relief']
for name in cols:
    dpct[f'pct_{name}'] = dpct[name].div(dpct['totals']).mul(100)


In [34]:
dpct

Unnamed: 0_level_0,active_transp,transit,bridge,street,freeway,infra_resiliency_er,congestion_relief,totals,pct_active_transp,pct_transit,pct_bridge,pct_street,pct_freeway,pct_infra_resiliency_er,pct_congestion_relief
dist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,0,0,0,0,1,3,0,4,0.0,0.0,0.0,0.0,25.0,75.0,0.0
1,133,30,282,129,2,1017,0,1593,8.35,1.88,17.7,8.1,0.13,63.84,0.0
2,69,58,272,224,7,481,2,1113,6.2,5.21,24.44,20.13,0.63,43.22,0.18
3,799,156,618,947,26,1613,12,4171,19.16,3.74,14.82,22.7,0.62,38.67,0.29
4,1065,193,475,1313,39,1943,27,5055,21.07,3.82,9.4,25.97,0.77,38.44,0.53
5,165,120,349,546,4,997,4,2185,7.55,5.49,15.97,24.99,0.18,45.63,0.18
6,720,137,351,1498,29,1800,20,4555,15.81,3.01,7.71,32.89,0.64,39.52,0.44
7,821,178,298,1432,40,1493,10,4272,19.22,4.17,6.98,33.52,0.94,34.95,0.23
8,253,53,250,576,34,816,15,1997,12.67,2.65,12.52,28.84,1.7,40.86,0.75
9,58,24,12,168,0,152,0,414,14.01,5.8,2.9,40.58,0.0,36.71,0.0


In [35]:

dpct = (dpct>>select( _.pct_active_transp, _.pct_transit, _.pct_bridge,
                    _.pct_street, _.pct_freeway, _.pct_infra_resiliency_er,
                    _.pct_congestion_relief))

In [36]:
#dpct = dpct.transpose()

In [37]:
dpct = dpct.reset_index()

In [38]:
dpct

Unnamed: 0,dist,pct_active_transp,pct_transit,pct_bridge,pct_street,pct_freeway,pct_infra_resiliency_er,pct_congestion_relief
0,0,0.0,0.0,0.0,0.0,25.0,75.0,0.0
1,1,8.35,1.88,17.7,8.1,0.13,63.84,0.0
2,2,6.2,5.21,24.44,20.13,0.63,43.22,0.18
3,3,19.16,3.74,14.82,22.7,0.62,38.67,0.29
4,4,21.07,3.82,9.4,25.97,0.77,38.44,0.53
5,5,7.55,5.49,15.97,24.99,0.18,45.63,0.18
6,6,15.81,3.01,7.71,32.89,0.64,39.52,0.44
7,7,19.22,4.17,6.98,33.52,0.94,34.95,0.23
8,8,12.67,2.65,12.52,28.84,1.7,40.86,0.75
9,9,14.01,5.8,2.9,40.58,0.0,36.71,0.0


In [39]:
# df_select= (dpct>>filter(_.dist==2))

In [40]:
# alt.Chart(df_select).mark_arc().encode(
#     theta=alt.Theta(field=cols, type="quantitative"),
#     color=(alt.Color(field=('pct_active_transp','pct_transit'),type="nominal", scale=alt.Scale(
#         range=altair_utils.CALITP_CATEGORY_BRIGHT_COLORS)
#                     )
#           ))

In [41]:
# dist_row = [0,1,2,3,4,5,6,7,8,9,10,11,12]

# for row in dist_row: 
    
#     cols = ['pct_active_transp', 'pct_transit', 'pct_bridge',
#                     'pct_street', 'pct_freeway', 'pct_infra_resiliency_er',
#                     'pct_congestion_relief']
    
#     df_select= (dpct>>filter(_.dist==row))
    
#     chart = alt.Chart(df_select).mark_arc().encode(
#         theta=alt.Theta(field=cols, type="quantitative"),
#         color=(alt.Color(field=cols, type="nominal", scale=alt.Scale(
#                                           range=altair_utils.CALITP_CATEGORY_BRIGHT_COLORS)
#                       #   ,
#                       #  legend=alt.Legend(title=(f'{_dla_utils.labeling(col)} Project Categories'))
#                         )
#               )
#     )
#     display(chart)

In [42]:
# dist_col = ['0', '1', '2', '3','4', '5','6','7','8','9','10','11','12']

# for col in dist_col: 

#     chart = alt.Chart(dpct).mark_arc().encode(
#         theta=alt.Theta(field=[col], type="quantitative"),
#         color=(alt.Color(field="index", type="nominal", scale=alt.Scale(
#                                           range=altair_utils.CALITP_CATEGORY_BRIGHT_COLORS)
#                       #   ,
#                       #  legend=alt.Legend(title=(f'{_dla_utils.labeling(col)} Project Categories'))
#                         )
#               )
#     )
#     display(chart)

## Top Agencies by Project Category

In [43]:
# For every work category: show the five agencies with the most records in that
# category (table) and the mean requested funding of the five leading agencies (chart).
work_cat = ['active_transp', 'transit', 'bridge', 'street','freeway', 'infra_resiliency_er',
       'congestion_relief']

for i in work_cat:
    # Records flagged with the current category.
    subset = (df>>filter(_[i]==1))
    # NOTE(review): find_top appears to return long-format counts with `variable`,
    # `value` and `count` columns (inferred from the filter/select below) - confirm in _dla_utils.
    subset_2 = ((_dla_utils.find_top(subset))>>filter(_.variable=='primary_agency_name')
            >>select(_.value,_.count)).head(5)
    # Share of this category's records that belong to each of the listed agencies.
    subset_2['Percent of Category'] = (((subset_2['count'])/(len(subset)))*100)
    # NOTE(review): the rendered output shows "Congestionumber Of Relief" for
    # `congestion_relief`, which suggests _dla_utils.labeling replaces 'n_' mid-word - verify there.
    subset_2 =subset_2.rename(columns = {'value':'Agency', 'count':f'{_dla_utils.labeling(i)} Obligations'})
    
    
    #generate chart:
    
    # Per agency: number of records in this category (sum of the flag column) plus mean
    # processing days and mean requested amounts; keep the five agencies with the most records.
    subset_3= ((subset.groupby(['primary_agency_name']).agg({i:'sum',
                                                    'process_days':'mean',
                                                    'adjusted_total_requested':'mean',
                                                    'adjusted_fed_requested':'mean',
                                                    'adjusted_ac_requested':'mean'}).reset_index())>>arrange(-_[i])).head(5)
    
    subset_3 =subset_3.rename(columns = {'primary_agency_name':'Agency',
                                         'adjusted_total_requested':'Total Requested',
                                         'adjusted_fed_requested':'Fed Requested',
                                         'adjusted_ac_requested':'AC Requested'})
    
    # Long format (one row per agency and funding type) so each funding type gets its own chart row.
    subset_4 = pd.melt(subset_3, id_vars=['Agency'],
        value_vars=['Total Requested','Fed Requested','AC Requested'],
        var_name='Categories', value_name='value'
       )

    ## following cell block makes it hard to change name
    # subset_4 = (subset_3 >> gather('category', 'value', _.adjusted_total_requested,
    #                 _.adjusted_fed_requested,
    #                 _.adjusted_ac_requested,
    #                 ))
    
    # NOTE(review): the x-domain is fixed at (-50000, 2000000) for every category; means
    # outside that range will not fit the axis - confirm this is intended.
    chart = (alt.Chart(subset_4).mark_bar().encode(
        x=alt.X('value', axis=alt.Axis(format='$', title='Obligated Funding ($2021)'), scale=alt.Scale(domain=(-50000, 2000000))),
        y=alt.Y("Agency"),
        color='Categories:N',
        row='Categories:N'
    ))
    
    # Heading, top-5 table (percent column formatted to two decimals), then the chart.
    display(Markdown(f'**Top Agencies using {_dla_utils.labeling(i)} Projects**'))
    display(subset_2.style.format(formatter={('Percent of Category'): "{:.2f}%"}))
    display(chart)

**Top Agencies using Active Transp Projects**

Unnamed: 0,Agency,Active Transp Obligations,Percent of Category
0,Los Angeles,153,3.13%
1,Sacramento County,111,2.27%
2,Los Angeles County,96,1.96%
3,Fresno,94,1.92%
4,El Dorado County,92,1.88%


**Top Agencies using Transit Projects**

Unnamed: 0,Agency,Transit Obligations,Percent of Category
0,Los Angeles County,44,3.94%
1,Los Angeles,39,3.49%
2,Monterey County,38,3.40%
3,Fresno,32,2.86%
4,Visalia,29,2.59%


**Top Agencies using Bridge Projects**

Unnamed: 0,Agency,Bridge Obligations,Percent of Category
0,Tulare County,110,3.15%
1,Fresno County,108,3.10%
2,Humboldt County,104,2.98%
3,El Dorado County,94,2.70%
4,Lake County,94,2.70%


**Top Agencies using Street Projects**

Unnamed: 0,Agency,Street Obligations,Percent of Category
0,Santa Cruz County,300,3.49%
1,Los Angeles,211,2.45%
2,Fresno,194,2.26%
3,Los Angeles County,192,2.23%
4,Stockton,180,2.09%


**Top Agencies using Freeway Projects**

Unnamed: 0,Agency,Freeway Obligations,Percent of Category
0,Caltrans,37,16.09%
1,San Bernardino Associated Governments,19,8.26%
2,Placer County Transportation Planning Agency,15,6.52%
3,Orange County Transportation Authority,14,6.09%
4,Bakersfield,13,5.65%


**Top Agencies using Infra Resiliency Er Projects**

Unnamed: 0,Agency,Infra Resiliency Er Obligations,Percent of Category
0,Humboldt County,594,4.78%
1,Santa Cruz County,413,3.32%
2,Los Angeles County,315,2.53%
3,Los Angeles,216,1.74%
4,Mendocino County,215,1.73%


**Top Agencies using Congestionumber Of Relief Projects**

Unnamed: 0,Agency,Congestionumber Of Relief Obligations,Percent of Category
0,Kern County Council of Governments,20,12.66%
1,Stanislaus Council of Governments,18,11.39%
2,Modesto,17,10.76%
3,San Diego Association of Governments,12,7.59%
4,Ventura County Transportation Commission,10,6.33%


#### Using transit to get the code

In [44]:
subset3 = (df>>filter(_.transit==1))
subset4 = ((_dla_utils.find_top(subset3))>>filter(_.variable=='primary_agency_name')
            >>select(_.value,_.count)).head(10)

In [45]:
subset4

Unnamed: 0,value,count
0,Los Angeles County,44
1,Los Angeles,39
2,Monterey County,38
3,Fresno,32
4,Visalia,29
5,Santa Cruz County,28
6,Butte County,22
7,Shasta County,22
8,Stockton,22
9,El Dorado County,18


In [46]:
subset4 = ((subset3.groupby(['primary_agency_name']).agg({'transit':'sum','process_days':'mean',
                                               'adjusted_total_requested':'mean',
                                               'adjusted_fed_requested':'mean',
                                               'adjusted_ac_requested':'mean'}).reset_index())>>arrange(-_.transit)).head(5)

In [47]:
subset4

Unnamed: 0,primary_agency_name,transit,process_days,adjusted_total_requested,adjusted_fed_requested,adjusted_ac_requested
72,Los Angeles County,44,34.4,567736.76,439146.57,27615.01
71,Los Angeles,39,32.86,1771717.01,805674.56,0.0
90,Monterey County,38,42.16,91884.3,83555.32,663.08
41,Fresno,32,21.52,296260.0,141386.67,0.0
185,Visalia,29,28.11,264946.39,186105.09,200.87


In [48]:
## this did not work
# alt.Chart(subset4).mark_bar().encode(
#     x=alt.X("primary_agency_name"),
#     y=alt.Y(alt.repeat('layer'), title="Obligated Funding"),
#     color=alt.ColorDatum(alt.repeat('layer'))
# ).repeat(layer=["adjusted_total_requested", "adjusted_fed_requested", "adjusted_ac_requested"])

In [49]:
subset5 = (subset4 >> gather('category', 'value', _.adjusted_total_requested,
                    _.adjusted_fed_requested,
                    _.adjusted_ac_requested,
                    # _.process_days,
                    # _.transit
                    ))

In [50]:
subset5

Unnamed: 0,primary_agency_name,transit,process_days,category,value
0,Los Angeles County,44,34.4,adjusted_total_requested,567736.76
1,Los Angeles,39,32.86,adjusted_total_requested,1771717.01
2,Monterey County,38,42.16,adjusted_total_requested,91884.3
3,Fresno,32,21.52,adjusted_total_requested,296260.0
4,Visalia,29,28.11,adjusted_total_requested,264946.39
5,Los Angeles County,44,34.4,adjusted_fed_requested,439146.57
6,Los Angeles,39,32.86,adjusted_fed_requested,805674.56
7,Monterey County,38,42.16,adjusted_fed_requested,83555.32
8,Fresno,32,21.52,adjusted_fed_requested,141386.67
9,Visalia,29,28.11,adjusted_fed_requested,186105.09


In [51]:
alt.Chart(subset5).mark_bar().encode(
    x=alt.X("primary_agency_name"),
    y=alt.Y('value', title="Obligated Funding"),
    color='category:N',
     column='category:N'
)

## Project Time-frames
- How long does it take to get a project approved in each category?
- Does processing time differ by prefix code?

In [52]:
df.sample()

Unnamed: 0,location,prefix,project_no,agency,prepared_date,submit__to_hq_date,hq_review_date,submit_to_fhwa_date,to_fmis_date,fed_requested,ac_requested,total_requested,status_comment,locode,dist,status,dist_processing_days,hq_processing_days,fhwa_processing_days,ftip_no,project_location,type_of_work,seq,date_request_initiated,date_completed_request,mpo,projectID,prepared_y,primary_agency_name,adjusted_total_requested,adjusted_fed_requested,adjusted_ac_requested,active_transp,transit,bridge,street,freeway,infra_resiliency_er,congestion_relief,work_categories,process_days
12213,FTA Transferred,FTACML,6005(047),Sacramento Regional Transit District,2014-03-27,2014-03-27,2014-03-27,2014-04-11,2014-04-11,1571200.0,0.0,1571200.0,Prog Code M400,6005,3,FTA transferred on 4/11/2014,0.0,15.0,0.0,,,FTA Transfer,1,NaT,NaT,SACOG,6005,2014.0,Sacramento Regional Transit District,1798407.46,1798407.46,0.0,0,0,0,0,0,1,0,1,15.0


In [53]:
## grouping by prefix

In [54]:
time = (df.groupby(['prefix'])
               .agg({'primary_agency_name':'nunique', 'process_days':'mean', 'dist_processing_days':'mean', 'hq_processing_days':'mean',
                      'fhwa_processing_days':'mean'}).reset_index())

In [55]:
# Keep the 50 prefixes used by the largest number of distinct agencies
# (`primary_agency_name` here holds the per-prefix nunique count computed above).
time = (time>>arrange(-_.primary_agency_name)).head(50)

In [56]:
time>>arrange(_.process_days)

Unnamed: 0,prefix,primary_agency_name,process_days,dist_processing_days,hq_processing_days,fhwa_processing_days
315,STPLH,9,8.4,0.67,4.8,2.93
131,ESPL,16,8.66,0.74,4.0,3.97
313,STPLER,13,14.95,4.86,5.76,5.84
289,SRTSLNI,63,16.31,6.34,3.85,5.29
194,HPLU,16,19.61,4.22,4.97,9.87
156,FTACML,72,21.25,0.0,21.7,9.61
272,RPSTPL,148,22.49,10.42,7.48,4.24
204,HRRRL,40,25.82,11.02,9.43,4.87
56,CASB,8,26.08,18.54,2.54,5.5
174,FTASTPL,26,27.86,0.0,33.78,7.49


In [57]:
# Mean of the overall and per-stage processing-day columns for each district.
processing_cols = ['process_days', 'dist_processing_days', 'hq_processing_days',
                   'fhwa_processing_days']
by_dist = df.groupby('dist', as_index=False)[processing_cols].mean()

In [58]:
by_dist>>arrange(_.process_days)

Unnamed: 0,dist,process_days,dist_processing_days,hq_processing_days,fhwa_processing_days
0,0,19.0,0.0,6.0,13.0
9,9,25.23,11.69,7.06,6.22
6,6,28.19,14.12,7.41,6.38
3,3,33.73,16.19,12.93,4.72
4,4,34.92,21.53,9.41,4.19
10,10,35.38,13.59,16.47,4.83
5,5,35.66,14.91,14.39,6.51
7,7,38.37,21.15,10.3,7.27
11,11,39.94,24.67,7.62,7.49
1,1,40.87,23.62,12.29,4.22


- District 9, Bishop, and District 6, Fresno, are some of the more rural districts in the State, and have the lowest mean processing days for E-76s. 
- Districts 4 and 7 fall in the middle of the range.

In [59]:
#interested in seeing the mpo breakdown

mpos = (df.groupby(['mpo'])
               .agg({'primary_agency_name':'nunique', 'process_days':'mean', 'dist_processing_days':'mean', 'hq_processing_days':'mean',
                      'fhwa_processing_days':'mean'}).reset_index())
mpos>>arrange(_.process_days)

Unnamed: 0,mpo,primary_agency_name,process_days,dist_processing_days,hq_processing_days,fhwa_processing_days
11,MNOLTC,1,0.0,0.0,0.0,0.0
6,FSTIP,1,8.0,0.0,8.0,0.0
27,STNCOG,3,8.67,2.0,5.67,1.0
25,STACOG,10,9.0,1.5,6.9,0.6
3,COFCG,22,11.12,1.27,7.38,2.46
4,ER NONE,15,13.38,6.81,3.44,3.12
20,SHASTA,1,19.0,9.0,8.0,2.0
8,KCOG,44,24.87,10.82,7.24,6.53
16,SANDAG,74,25.93,15.34,9.51,1.08
2,CFCG,24,28.1,13.84,7.77,6.17


## Project Categories by Agency Size 
(Agency Size based on obligation)

In [60]:
df.sample()

Unnamed: 0,location,prefix,project_no,agency,prepared_date,submit__to_hq_date,hq_review_date,submit_to_fhwa_date,to_fmis_date,fed_requested,ac_requested,total_requested,status_comment,locode,dist,status,dist_processing_days,hq_processing_days,fhwa_processing_days,ftip_no,project_location,type_of_work,seq,date_request_initiated,date_completed_request,mpo,projectID,prepared_y,primary_agency_name,adjusted_total_requested,adjusted_fed_requested,adjusted_ac_requested,active_transp,transit,bridge,street,freeway,infra_resiliency_er,congestion_relief,work_categories,process_days
1328,Obligated,DEM10L,5209(008),Beaumont,2018-02-08,2018-05-08,2018-05-18,2018-06-11,2018-06-15,0.0,0.0,-6115477.08,Authorized,5209,8,E-76 approved on,117.0,34.0,4.0,RIV050535,Intersection Of State Route 60 And Potrero Boulevard,"New Overcrossing For Future Interchange, Phase 1",2,2018-01-11,2018-03-05,SCAG,5209,2018.0,Beaumont,-6599225.99,0.0,0.0,0,0,0,0,0,1,0,1,155.0


In [61]:
group = (df>>count(_.primary_agency_name)>>arrange(_.n))

In [62]:
group

Unnamed: 0,primary_agency_name,n
4,Alameda Corridor Transportation Authority,1
8,Alameda County Transportation Improvement Authority,1
9,Alameda County Waste Management Authority,1
20,Antelope Valley Transit Authority,1
89,Chula Vista Elementary School District,1
...,...,...
175,Fresno,468
486,Santa Cruz County,479
277,Los Angeles County,481
276,Los Angeles,504


In [63]:
group.describe()

Unnamed: 0,n
count,619.0
mean,33.96
std,63.59
min,1.0
25%,6.0
50%,14.0
75%,34.5
max,709.0


In [64]:
print(group.n.quantile(0.33))
print(group.n.quantile(0.66))
print(group.n.quantile(0.9))

8.0
24.0
83.0


In [65]:
q33 = group.n.quantile(0.33).astype(float)
q66 = group.n.quantile(0.66).astype(float)

In [66]:
def fleet_size(row, small_max=None, medium_max=None):
    """Classify an agency as "Small", "Medium" or "Large" from its record count.

    Parameters
    ----------
    row : object with an ``n`` attribute
        Typically a row of ``group`` (as passed by ``DataFrame.apply(axis=1)``),
        where ``n`` is the number of records for the agency.
    small_max : float, optional
        Upper bound (inclusive) of the "Small" bucket. Defaults to the
        module-level ``q33`` when omitted.
    medium_max : float, optional
        Upper bound (inclusive) of the "Medium" bucket. Defaults to the
        module-level ``q66`` when omitted.

    Returns
    -------
    str
        "Small" for 0 < n <= small_max, "Medium" for small_max < n <= medium_max,
        "Large" for n > medium_max, otherwise "No Info" (n <= 0 or n is NaN).
    """
    # Fall back to the notebook-level thresholds so existing one-argument calls
    # (e.g. group.apply(lambda x: fleet_size(x), axis=1)) behave exactly as before.
    if small_max is None:
        small_max = q33
    if medium_max is None:
        medium_max = q66

    n = row.n
    if 0 < n <= small_max:
        return "Small"
    if small_max < n <= medium_max:
        return "Medium"
    if n > medium_max:
        return "Large"
    # Reached when every comparison is False: non-positive or NaN counts.
    return "No Info"

In [67]:
group["agency_size"] = group.apply(lambda x: fleet_size(x), axis=1)

In [68]:
group

Unnamed: 0,primary_agency_name,n,agency_size
4,Alameda Corridor Transportation Authority,1,Small
8,Alameda County Transportation Improvement Authority,1,Small
9,Alameda County Waste Management Authority,1,Small
20,Antelope Valley Transit Authority,1,Small
89,Chula Vista Elementary School District,1,Small
...,...,...,...
175,Fresno,468,Large
486,Santa Cruz County,479,Large
277,Los Angeles County,481,Large
276,Los Angeles,504,Large


In [69]:
# make a dictionary to map the agency names to the main df

In [70]:
agency_map = dict(zip(group['primary_agency_name'], 
                          group['agency_size']))


In [71]:
df['agency_size'] = df['primary_agency_name'].map(agency_map)

In [72]:
df.sample()

Unnamed: 0,location,prefix,project_no,agency,prepared_date,submit__to_hq_date,hq_review_date,submit_to_fhwa_date,to_fmis_date,fed_requested,ac_requested,total_requested,status_comment,locode,dist,status,dist_processing_days,hq_processing_days,fhwa_processing_days,ftip_no,project_location,type_of_work,seq,date_request_initiated,date_completed_request,mpo,projectID,prepared_y,primary_agency_name,adjusted_total_requested,adjusted_fed_requested,adjusted_ac_requested,active_transp,transit,bridge,street,freeway,infra_resiliency_er,congestion_relief,work_categories,process_days,agency_size
8083,Obligated,CML,5170(040),Kingsburg,2015-10-28,2015-10-28,2015-10-28,2015-10-28,2015-11-05,-0.93,0.0,-0.63,Authorized,5170,6,E-76 approved on E-76 approved on,0.0,0.0,8.0,FRE090125,14th Avenue Bikelanes - From Sierra To Stroud,Bikelanes And Reconstruct Pavement,4,NaT,NaT,CFCG,5170,2015.0,Kingsburg,-0.72,-1.06,0.0,1,0,0,1,0,1,0,3,8.0,Large


In [73]:
((df.groupby(['agency_size']).agg({'process_days':'mean',
                                   'adjusted_total_requested':'mean',
                                   'adjusted_fed_requested':'mean',
                                   'adjusted_ac_requested':'mean'}).reset_index()))

Unnamed: 0,agency_size,process_days,adjusted_total_requested,adjusted_fed_requested,adjusted_ac_requested
0,Large,38.18,1084719.95,670649.68,49431.53
1,Medium,39.17,789045.77,625984.18,7876.88
2,Small,32.73,638717.08,528074.71,6304.76


In [74]:
## looking at the category breakdowns by agency size

In [75]:
# For every work category: summarise its records by agency size (record count, mean
# processing days, mean requested amounts) and chart the mean requested funding.
work_cat = ['active_transp', 'transit', 'bridge', 'street','freeway', 'infra_resiliency_er',
       'congestion_relief']

for i in work_cat:
    # Records flagged with the current category.
    subset = (df>>filter(_[i]==1))

    #generate chart:
    
    # One row per agency size, ordered by number of records in this category (descending).
    # fleet_size yields at most four labels, so head(5) does not drop any group.
    subset_3= ((subset.groupby(['agency_size']).agg({i:'sum',
                                                     'process_days':'mean',
                                                     'adjusted_total_requested':'mean',
                                                     'adjusted_fed_requested':'mean',
                                                     'adjusted_ac_requested':'mean'}).reset_index())>>arrange(-_[i])).head(5)
    
    subset_3 =subset_3.rename(columns = {'agency_size':'Agency Size',
                                         'adjusted_total_requested':'Total Requested',
                                         'adjusted_fed_requested':'Fed Requested',
                                         'adjusted_ac_requested':'AC Requested'})
    
    # Long format (one row per agency size and funding type) so each funding type gets its own chart row.
    subset_4 = pd.melt(subset_3, id_vars=['Agency Size'],
        value_vars=['Total Requested','Fed Requested','AC Requested'],
        var_name='Categories', value_name='value'
       )
    
    # NOTE(review): the x-domain is fixed at (-5000, 10000000) for every category; the
    # tables below include negative AC means beyond -5000 - confirm the range is intended.
    chart = (alt.Chart(subset_4).mark_bar().encode(
        x=alt.X('value', axis=alt.Axis(format='$', title='Obligated Funding ($2021)'), scale=alt.Scale(domain=(-5000, 10000000))),
        y=alt.Y("Agency Size"),
        color='Categories:N',
        row='Categories:N'
    ))
    
    # Heading, formatted summary table, then the chart.
    # NOTE(review): the rendered heading reads "Congestionumber Of Relief" for
    # `congestion_relief` - looks like a mid-word 'n_' replacement in _dla_utils.labeling; verify there.
    display(Markdown(f'**Breakdown {_dla_utils.labeling(i)} Projects**'))
    display(subset_3.style.format(formatter={('process_days'): "{:.2f}",
                                             ('Total Requested'): "${:.2f}",
                                             ('Fed Requested'): "${:.2f}",
                                             ('AC Requested'): "${:.2f}"}))
    display(chart)

**Breakdown Active Transp Projects**

Unnamed: 0,Agency Size,active_transp,process_days,Total Requested,Fed Requested,AC Requested
0,Large,3738,33.87,$504379.83,$337392.95,$-2425.40
1,Medium,924,38.06,$507226.80,$273770.56,$-2491.32
2,Small,229,27.52,$622995.06,$397299.47,$6.96


**Breakdown Transit Projects**

Unnamed: 0,Agency Size,transit,process_days,Total Requested,Fed Requested,AC Requested
0,Large,878,34.96,$608183.68,$422509.91,$3048.62
1,Medium,189,38.33,$810408.71,$368701.23,$-12166.68
2,Small,51,29.77,$273958.18,$217196.61,$31.25


**Breakdown Bridge Projects**

Unnamed: 0,Agency Size,bridge,process_days,Total Requested,Fed Requested,AC Requested
0,Large,3038,53.54,$1197550.52,$708743.26,$244022.68
1,Medium,367,46.37,$878433.02,$642636.65,$54739.20
2,Small,82,43.05,$821259.04,$778165.64,$68460.57


**Breakdown Street Projects**

Unnamed: 0,Agency Size,street,process_days,Total Requested,Fed Requested,AC Requested
0,Large,6867,34.81,$582099.59,$366675.57,$25145.37
1,Medium,1391,40.26,$500532.46,$296977.55,$188.40
2,Small,345,37.68,$421440.81,$268147.21,$4.40


**Breakdown Freeway Projects**

Unnamed: 0,Agency Size,freeway,process_days,Total Requested,Fed Requested,AC Requested
0,Large,208,47.81,$9557793.95,$5092038.01,$1215744.30
1,Medium,15,38.07,$2503297.47,$2203327.18,$0.00
2,Small,7,35.71,$871102.93,$589068.12,$0.00


**Breakdown Infra Resiliency Er Projects**

Unnamed: 0,Agency Size,infra_resiliency_er,process_days,Total Requested,Fed Requested,AC Requested
0,Large,10323,39.19,$988466.55,$719249.07,$68268.00
1,Medium,1565,40.89,$1138484.28,$957938.81,$16569.88
2,Small,538,36.31,$760741.77,$649974.76,$10699.67


**Breakdown Congestionumber Of Relief Projects**

Unnamed: 0,Agency Size,congestion_relief,process_days,Total Requested,Fed Requested,AC Requested
0,Large,121,27.7,$896154.52,$862727.35,$1605.28
1,Medium,31,24.78,$185992.98,$182251.77,$0.00
2,Small,6,10.2,$23033.54,$16515.52,$0.00


### Other Metrics for Agency Size

In [76]:
from shared_utils import geography_utils

In [77]:
_dla_utils.get_nunique(df, 'primary_agency_name', 'agency_size')

Unnamed: 0,agency_size,n
2,Small,215
0,Large,208
1,Medium,196


In [78]:
# Summary metrics per agency size, built with the shared aggregation helper.
# NOTE(review): output column names and shapes are decided by
# geography_utils.aggregate_by_geography - confirm there.
geography_utils.aggregate_by_geography(
    df,
    group_cols = ['agency_size'],
    count_cols=['location'], # counting `location` to see how many obligations there are
    sum_cols = ["adjusted_total_requested", "adjusted_ac_requested", "adjusted_fed_requested", 'work_categories'],
    mean_cols = ["adjusted_total_requested", "adjusted_ac_requested", "adjusted_fed_requested", 'process_days'],
    nunique_cols = ["primary_agency_name", "prefix"],
    rename_cols=True)

In [79]:
# Same summary metrics, now per agency size AND prepared year; feeds the line chart below.
# NOTE(review): the chart reads `adjusted_total_requested_mean`, so rename_cols=True
# presumably suffixes columns with the aggregation name - confirm in geography_utils.
years_df = (geography_utils.aggregate_by_geography(
    df,
    group_cols = ['agency_size', 'prepared_y'],
    count_cols=['location'], # counting `location` to see how many obligations there are
    sum_cols = ["adjusted_total_requested", "adjusted_ac_requested", "adjusted_fed_requested", 'work_categories'],
    mean_cols = ["adjusted_total_requested", "adjusted_ac_requested", "adjusted_fed_requested", 'process_days'],
    nunique_cols = ["primary_agency_name", "prefix"],
    rename_cols=True))

In [80]:
# Hover selection on (agency_size, year); matching marks are drawn with the larger size.
highlight = alt.selection(
    type='single',
    on="mouseover",
    fields=['agency_size', "prepared_y"],
    nearest=True,
)
# Legend selection; agency sizes that are not selected fade to 0.2 opacity.
selection = alt.selection_multi(fields=['agency_size'], bind='legend')

# Encoding channels, named separately so the chart definition below reads top-down.
year_axis = alt.X('prepared_y:O', title=labeling('prepared_y'))
mean_axis = alt.Y('adjusted_total_requested_mean:Q', title=('Total Requested Mean (in $2021)'))
size_color = alt.Color("agency_size:N", title="Agency Size")
hover_fields = alt.Tooltip(["prepared_y", 'adjusted_total_requested_mean', "agency_size"])

chart = (
    alt.Chart(years_df)
    .mark_line()
    .encode(
        x=year_axis,
        y=mean_axis,
        color=size_color,
        tooltip=hover_fields,
        opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
        size=alt.condition(~highlight, alt.value(2), alt.value(5)),
    )
    .properties(title={"text": ["Average Total Requested Funds by Year"]})
    .add_selection(selection, highlight)
)

# Apply the shared chart styling before rendering.
chart = styleguide.preset_chart_config(chart)
display(chart)