# Preliminary Analysis of DLA Functions

In [1]:
import numpy as np
import pandas as pd
from siuba import *
import _clean_data
import altair as alt
import altair_saver
from plotnine import *

from shared_utils import altair_utils
# Use Altair's "fivethirtyeight" theme for the charts rendered in this notebook.
alt.themes.enable("fivethirtyeight")



ThemeRegistry.enable('fivethirtyeight')

In [2]:
# Read the data, then run it through each _clean_data step in the same
# order as before: clean_data -> prefix_cleaning -> clean_agency_names.
df = _clean_data.read_data()
for cleaning_step in (
    _clean_data.clean_data,
    _clean_data.prefix_cleaning,
    _clean_data.clean_agency_names,
):
    df = cleaning_step(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [52]:
# Peek at a single row. A fixed random_state makes the displayed row the
# same on every Restart & Run All instead of changing with each execution.
df.sample(n=1, random_state=0)

Unnamed: 0,location,prefix,project_no,agency,prepared_date,submit__to_hq_date,hq_review_date,submit_to_fhwa_date,to_fmis_date,fed_requested,ac_requested,total_requested,status_comment,locode,dist,status,dist_processing_days,hq_processing_days,fhwa_processing_days,ftip_no,project_location,type_of_work,seq,date_request_initiated,date_completed_request,mpo,projectID,prepared_y,primary_agency_name
1703,FTA Transferred,FTASTPL,6429(023),Napa County Transportation Planning Agency,2018-04-05,2018-04-05,2018-04-05,2018-05-04,2018-05-07,89685.61,0.0,89685.61,Prog Code M23E,6429,4,FTA transferred on 5/7/2018,,29.0,3.0,,,FTA Transfer,1,NaT,NaT,MTC,6429,2018.0,Napa County Transportation Planning Agency


In [4]:
# Compare how many distinct agency labels exist before (`agency`) and
# after (`primary_agency_name`) the agency-name cleaning.
agencies_raw = df >> count(_.agency) >> arrange(-_.n)
agencies_clean = df >> count(_.primary_agency_name) >> arrange(-_.n)

print("Number of Unique Agencies with uncleaned Data")
print(len(agencies_raw))
print("Number of True Unique Agencies")
print(len(agencies_clean))

Number of Unique Agencies with uncleaned Data
671
Number of True Unique Agencies
619


## By District

In [5]:
# Number of rows per district, largest first.
dist = (
    df
    >> count(_.dist)
    >> arrange(-_.n)
)

In [6]:
# Show the per-district row counts computed above.
dist

Unnamed: 0,dist,n
4,4,3248
6,6,3084
7,7,2787
3,3,2517
10,10,2203
5,5,1337
8,8,1309
1,1,1161
11,11,920
12,12,666


In [7]:
# Give the count column a descriptive name. Rebinding the result (rather
# than inplace=True) avoids mutating the frame an earlier cell displayed.
dist = dist.rename(columns={"n": "total_obligations"})

In [8]:
# Number of distinct cleaned agency names per district, largest first.
dist_agencyn = (
    df
    >> group_by(_.dist)
    >> summarize(n=_.primary_agency_name.nunique())
    >> arrange(-_.n)
)

In [9]:
# Name the count column for what it holds before joining it onto `dist`.
# Rebinding replaces the inplace=True mutation.
dist_agencyn = dist_agencyn.rename(columns={"n": "n_agency_unique"})

In [10]:
# Number of distinct prefixes per district, largest first.
dist_prefixn = (
    df
    >> group_by(_.dist)
    >> summarize(n=_.prefix.nunique())
    >> arrange(-_.n)
)

In [11]:
# Name the count column for what it holds before joining it onto `dist`.
# Rebinding replaces the inplace=True mutation.
dist_prefixn = dist_prefixn.rename(columns={"n": "n_prefix_unique"})

In [12]:
# Add the unique-agency counts to the per-district table, matching on "dist".
# NOTE(review): this rebinds `dist`, so re-running the cell joins onto the
# already-joined table.
dist = full_join(
    dist,
    dist_agencyn,
    on="dist",
)

In [13]:
# Add the unique-prefix counts to the per-district table, matching on "dist".
# NOTE(review): this rebinds `dist`, so re-running the cell joins onto the
# already-joined table.
dist = full_join(
    dist,
    dist_prefixn,
    on="dist",
)

In [14]:
# Show the per-district table after both joins.
dist

Unnamed: 0,dist,total_obligations,n_agency_unique,n_prefix_unique
0,4,3248,151,110
1,6,3084,55,71
2,7,2787,108,107
3,3,2517,62,98
4,10,2203,56,65
5,5,1337,39,63
6,8,1309,64,92
7,1,1161,20,25
8,11,920,31,70
9,12,666,36,41


In [15]:
# Bar chart of total_obligations per district, colored by district.
district_color = alt.Color(
    "dist",
    scale=alt.Scale(range=altair_utils.FIVETHIRTYEIGHT_DIVERGING_COLORS),
    legend=alt.Legend(title="District"),
)

chart = (
    alt.Chart(dist)
    .mark_bar()
    .encode(
        x=alt.X("dist", title="District", scale=alt.Scale(domain=(0, 12))),
        y=alt.Y("total_obligations", title="Number of Obligations"),
        color=district_color,
    )
    .properties(title="Number of Obligations by District")
)

# The last expression displays the chart with a fixed bar size of 20.
chart.mark_bar(size=20)

In [16]:
# Distinct cleaned agency names per (prepared year, district), newest year first.
dist_years1 = (
    df
    >> group_by(_.prepared_y, _.dist)
    >> summarize(n=_.primary_agency_name.nunique())
    >> arrange(-_.prepared_y)
)

In [17]:
# Name the count column for what it holds. Rebinding replaces the
# inplace=True mutation.
dist_years1 = dist_years1.rename(columns={"n": "n_agency_unique"})

In [18]:
# Year/district pairs ordered by unique-agency count, largest first.
(
    dist_years1
    >> arrange(-_.n_agency_unique)
)

Unnamed: 0,prepared_y,dist,n_agency_unique
47,2016.0,4,113
34,2015.0,4,109
22,2014.0,4,108
60,2017.0,4,98
72,2018.0,4,97
...,...,...,...
43,2016.0,0,1
5,2012.0,7,1
1,2011.0,4,1
3,2011.0,11,1


In [19]:

# One bar-chart panel per district: unique agencies per prepared year,
# with bars colored by district.
agency_facet_color = alt.Color(
    "dist:N",
    scale=alt.Scale(range=altair_utils.FIVETHIRTYEIGHT_DIVERGING_COLORS),
    legend=alt.Legend(title="District"),
)

chart = (
    alt.Chart(dist_years1)
    .mark_bar()
    .encode(
        column="dist:N",
        x=alt.X("prepared_y:O", title="Prepared Year"),
        y=alt.Y("n_agency_unique:Q", title="Number of Unique Agencies"),
        color=agency_facet_color,
    )
    .properties(title="Number of Unique Agencies by District")
)

chart


In [20]:
# Unique agencies per prepared year, with bars colored by district.
#
# Fixes relative to the previous version of this cell:
# - The title said "Number of Obligations by District", but the chart plots
#   n_agency_unique against year.
# - Year and district are identifiers, not magnitudes, so they are encoded as
#   ordinal/nominal, matching the faceted chart built from the same table.
#   This gives a discrete axis and legend rather than continuous numeric scales.
chart = (
    alt.Chart(dist_years1)
    .mark_bar()
    .encode(
        x=alt.X("prepared_y:O", title="Year"),
        y=alt.Y("n_agency_unique:Q", title="Number of Unique Agencies"),
        color=alt.Color(
            "dist:N",
            scale=alt.Scale(range=altair_utils.FIVETHIRTYEIGHT_DIVERGING_COLORS),
            legend=alt.Legend(title="District"),
        ),
    )
    .properties(title="Number of Unique Agencies by Year and District")
)

chart

In [21]:
# Distinct prefixes per (prepared year, district), newest year first.
dist_years2 = (
    df
    >> group_by(_.prepared_y, _.dist)
    >> summarize(n=_.prefix.nunique())
    >> arrange(-_.prepared_y)
)

In [22]:
# Name the count column for what it holds. Rebinding replaces the
# inplace=True mutation.
dist_years2 = dist_years2.rename(columns={"n": "n_prefix_unique"})

In [23]:
# Year/district pairs ordered by unique-prefix count, largest first.
(
    dist_years2
    >> arrange(-_.n_prefix_unique)
)

Unnamed: 0,prepared_y,dist,n_prefix_unique
22,2014.0,4,52
83,2019.0,3,51
72,2018.0,4,51
47,2016.0,4,50
34,2015.0,4,48
...,...,...,...
56,2017.0,0,1
5,2012.0,7,1
1,2011.0,4,1
3,2011.0,11,1


In [24]:
# One bar-chart panel per district: unique prefix codes per prepared year,
# with bars colored by district.
prefix_facet_color = alt.Color(
    "dist:N",
    scale=alt.Scale(range=altair_utils.FIVETHIRTYEIGHT_DIVERGING_COLORS),
    legend=alt.Legend(title="District"),
)

chart = (
    alt.Chart(dist_years2)
    .mark_bar()
    .encode(
        column="dist:N",
        x=alt.X("prepared_y:O", title="Prepared Year"),
        y=alt.Y("n_prefix_unique:Q", title="Number of Unique Prefix Codes"),
        color=prefix_facet_color,
    )
    .properties(title="Number of Unique Prefix Codes by District")
)

chart

In [25]:
# Number of rows per prepared year, largest first.
(
    df
    >> count(_.prepared_y)
    >> arrange(-_.n)
)

Unnamed: 0,prepared_y,n
8,2018.0,2827
4,2014.0,2724
9,2019.0,2665
5,2015.0,2575
6,2016.0,2505
10,2020.0,2350
7,2017.0,2333
11,2021.0,1917
3,2013.0,193
2,2012.0,7


In [26]:
# Sum of total_requested per district, largest first. The column is named
# `n` here even though it holds a sum rather than a count.
sum_funds = (
    df
    >> group_by(_.dist)
    >> summarize(n=_.total_requested.sum())
    >> arrange(-_.n)
)

# Display with two decimals, thousands separators, and "MISSING" for NaN.
sum_funds.style.format(
    precision=2,
    na_rep="MISSING",
    thousands=",",
)

Unnamed: 0,dist,n
7,7,4264832122.16
12,12,2858146336.25
4,4,2820381522.31
8,8,2589841597.99
6,6,1597301076.08
3,3,1561960671.34
11,11,1159054839.35
10,10,1069134164.63
5,5,492622627.53
2,2,228882744.46


## Charts

In [53]:
# The 50 cleaned agency names with the most rows, largest first.
df_agency = (
    df
    >> count(_.primary_agency_name)
    >> arrange(-_.n)
).head(50)

In [67]:
# Bar chart of the agencies in df_agency, with the count printed on each bar.
#
# Fixes relative to the previous version of this cell:
# - Bars are sorted by descending count (sort="-y"). Altair otherwise orders a
#   nominal x axis alphabetically, which loses the ordering df_agency was built with.
# - The labels used the settings for horizontal bars (align="left", dx=3),
#   which put the text beside the bar top. For vertical bars the label is
#   centered on the bar and sits just above it.
barchart = (
    alt.Chart(df_agency)
    .mark_bar()
    .encode(
        x=alt.X("primary_agency_name", title="Agency Name", sort="-y"),
        y=alt.Y("n", title="Number of Obligations"),
        color=alt.Color(
            "n",
            scale=alt.Scale(range=altair_utils.FIVETHIRTYEIGHT_DIVERGING_COLORS),
        ),
    )
    .properties(title="Most Obligations by Agency")
)

# The text layer reuses the bar chart's encodings, so labels share its x/y positions.
text = barchart.mark_text(
    align="center",
    baseline="bottom",
    dy=-3,  # Nudges text upward so it doesn't overlap the top of the bar
).encode(
    text="n:Q"
)

chart = (barchart + text).properties(height=900)

chart

In [56]:
# Rows per prefix for Humboldt County, largest first.
(
    df
    >> filter(_.primary_agency_name == "Humboldt County")
    >> count(_.prefix)
    >> arrange(-_.n)
)

Unnamed: 0,prefix,n
9,ER,379
0,ACSTER,64
4,BPMP,50
1,ACSTP,31
5,BRLO,22
7,BRLS,20
12,HSIPL,19
13,RPSTPL,18
6,BRLOZB,9
2,ATPL,5


In [57]:
# Keep the Humboldt County prefix counts for charting.
humboldt = (
    df
    >> filter(_.primary_agency_name == "Humboldt County")
    >> count(_.prefix)
    >> arrange(-_.n)
)

In [58]:
# Bar chart of row counts per prefix for Humboldt County; bars are colored by count.
humboldt_color = alt.Color(
    "n",
    scale=alt.Scale(range=altair_utils.FIVETHIRTYEIGHT_DIVERGING_COLORS),
)

chart = (
    alt.Chart(humboldt)
    .mark_bar()
    .encode(
        x=alt.X("prefix", title="Prefix"),
        y=alt.Y("n", title="Number of Obligations"),
        color=humboldt_color,
    )
    .properties(title="Prefixes used by Humboldt County")
)

chart

In [59]:
# Number of rows per MPO, largest first.
mpo = (
    df
    >> count(_.mpo)
    >> arrange(-_.n)
)

In [60]:
# Bar chart of row counts per MPO; bars are colored by count.
mpo_color = alt.Color(
    "n",
    scale=alt.Scale(range=altair_utils.FIVETHIRTYEIGHT_DIVERGING_COLORS),
)

chart = (
    alt.Chart(mpo)
    .mark_bar()
    .encode(
        x=alt.X("mpo", title="MPOs"),
        y=alt.Y("n", title="Number of Obligations"),
        color=mpo_color,
    )
    .properties(title="Obligations by MPO")
)

chart

In [61]:
# Number of rows per prefix, largest first.
prefix = (
    df
    >> count(_.prefix)
    >> arrange(-_.n)
)

In [62]:
# The ten most frequent prefixes (the first ten rows of the sorted counts).
prefix.iloc[:10]

Unnamed: 0,prefix,n
68,CML,3293
208,HSIPL,3282
306,STPL,2779
128,ER,1590
40,BRLO,1402
46,BRLS,904
10,ATPL,587
269,RPSTPL,576
194,HPLUL,332
284,SRTSL,331


In [63]:
# Inspect every row whose prefix is exactly "FTACML".
(
    df
    >> filter(_.prefix == "FTACML")
)

Unnamed: 0,location,prefix,project_no,agency,prepared_date,submit__to_hq_date,hq_review_date,submit_to_fhwa_date,to_fmis_date,fed_requested,ac_requested,total_requested,status_comment,locode,dist,status,dist_processing_days,hq_processing_days,fhwa_processing_days,ftip_no,project_location,type_of_work,seq,date_request_initiated,date_completed_request,mpo,projectID,prepared_y,primary_agency_name
732,FTA Transferred,FTACML,5108(186),Long Beach,2018-07-17,2018-07-17,2018-07-17,2018-08-02,2018-08-24,2102405.0,0.0,2102405.0,Prog Code Z400,5108,7,FTA transferred on 8/24/2018,,16.0,22.0,,,FTA Transfer,1,NaT,NaT,SCAG,5108,2018.0,Long Beach
829,FTA Transferred,FTACML,5107(048),Santa Monica,2018-07-09,2018-07-09,2018-07-09,2018-07-25,2018-08-09,1765344.0,0.0,1765344.0,Prog Code Z400,5107,7,FTA transferred on 8/9/2018,,16.0,15.0,,,FTA Transfer,1,NaT,NaT,SCAG,5107,2018.0,Santa Monica
910,FTA Transferred,FTACML,6065(231),Los Angeles County Metropolitan Transportation...,2018-07-03,2018-07-03,2018-07-03,2018-07-24,2018-07-26,25000000.0,0.0,25000000.0,Prog Code Z400,6065,7,FTA transferred on 7/26/2018,,21.0,2.0,,,FTA Transfer,1,NaT,NaT,SCAG,6065,2018.0,Los Angeles County Metropolitan Transportation...
911,FTA Transferred,FTACML,6046(015),Omnitrans,2018-07-02,2018-07-02,2018-07-02,2018-07-23,2018-07-26,17818000.0,0.0,17818000.0,Prog Code Z400,6046,8,FTA transferred on 7/26/2018,,21.0,3.0,,,FTA Transfer,1,NaT,NaT,NON-MPO,6046,2018.0,OmniTrans
1030,FTA Transferred,FTACML,5132(048),Fairfield,2018-04-24,2018-04-24,2018-04-24,2018-07-03,2018-07-16,1333719.0,0.0,1333719.0,Prog Code Z003,5132,4,FTA transferred on 7/16/2018,,70.0,13.0,,,FTA Transfer,1,NaT,NaT,MTC,5132,2018.0,Fairfield
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19819,FTA Transferred,FTACML,7505(001),Gold Coast Transit District,2021-07-01,2021-07-01,2021-07-01,2021-07-22,2021-08-03,4587076.0,0.0,4587076.0,Prog Code Z40E,7505,7,FTA transferred on 8/3/2021,,21.0,12.0,,,FTA Transfer,1,NaT,NaT,SCAG,7505,2021.0,
19820,FTA Transferred,FTACML,7505(002),Gold Coast Transit District,2021-07-06,2021-07-06,2021-07-06,2021-07-22,2021-08-03,77000.0,0.0,77000.0,Prog Code Z40E,7505,7,FTA transferred on 8/3/2021,,16.0,12.0,,,FTA Transfer,1,NaT,NaT,SCAG,7505,2021.0,
19821,FTA Transferred,FTACML,6187(008),Southern California Regional Rail Authority,2021-07-26,2021-07-26,2021-07-26,2021-08-03,2021-08-03,25000000.0,0.0,25000000.0,Prog Code Z40E,6187,7,FTA transferred on 8/3/2021,,8.0,0.0,,,FTA Transfer,1,NaT,NaT,SCAG,6187,2021.0,Southern California Regional Rail Authority
19822,FTA Transferred,FTACML,6261(025),Victor Valley Transit Authority,2021-07-19,2021-07-19,2021-07-19,2021-08-02,2021-08-03,5115000.0,0.0,5115000.0,Prog Code Z400,6261,8,FTA transferred on 8/3/2021,,14.0,1.0,,,FTA Transfer,1,NaT,NaT,SCAG,6261,2021.0,Victor Valley Transit Authority


In [69]:
# Number of rows per type_of_work value, largest first.
(
    df
    >> count(_.type_of_work)
    >> arrange(-_.n)
)

Unnamed: 0,type_of_work,n
592,Bridge Replacement (tc),679
582,Bridge Replacement,519
2014,FTA Transfer,462
3979,Permanent Restoration,282
5289,Road Rehabilitation,269
...,...,...
6873,Work Occur In 2 Road Segments; 1st Segment Is ...,1
6874,Work Zone Data Exchange Demonstration Project....,1
6877,Yarts: Public Outreach And Marketing (yosemite...,1
6878,Years 2 To 5 Environmental Compliance (mitigat...,1


In [72]:
# The 20 most frequent type_of_work values, kept for charting.
type_of_work = (
    df
    >> count(_.type_of_work)
    >> arrange(-_.n)
).head(20)

In [74]:
# Bar chart of the most frequent type_of_work values; bars are colored by count.
work_type_color = alt.Color(
    "n",
    scale=alt.Scale(range=altair_utils.FIVETHIRTYEIGHT_DIVERGING_COLORS),
)

chart = (
    alt.Chart(type_of_work)
    .mark_bar()
    .encode(
        x=alt.X("type_of_work", title="Type of Work"),
        y=alt.Y("n", title="Number of Obligations"),
        color=work_type_color,
    )
    .properties(title="Most Used Types of Work")
)

chart

In [75]:
# The 20 most frequent project_location values, kept for charting.
project_location = (
    df
    >> count(_.project_location)
    >> arrange(-_.n)
).head(20)


In [80]:
# Bar chart of the most frequent project_location values; bars are colored by count.
location_color = alt.Color(
    "n",
    scale=alt.Scale(range=altair_utils.FIVETHIRTYEIGHT_DIVERGING_COLORS),
)

chart = (
    alt.Chart(project_location)
    .mark_bar()
    .encode(
        x=alt.X("project_location", title="Project Location"),
        y=alt.Y("n", title="Number of Obligations"),
        color=location_color,
    )
    .properties(title="Project Locations Most Used")
)

chart