# Preliminary Analysis of DLA Functions

In [1]:
import numpy as np
import pandas as pd
from siuba import *
import _clean_data
import altair as alt
import altair_saver
from plotnine import *

from shared_utils import altair_utils
# Enable the "fivethirtyeight" Altair theme for the charts rendered in this notebook.
alt.themes.enable("fivethirtyeight")



ThemeRegistry.enable('fivethirtyeight')

In [2]:
import _dla_utils

In [3]:
# Load the DLA table from the parquet file that sits next to this notebook.
DLA_PARQUET_PATH = "dla_df.parquet"
df = pd.read_parquet(DLA_PARQUET_PATH)

In [4]:
# Peek at one randomly chosen row; no random_state is passed, so the row differs between runs.
df.sample()

Unnamed: 0,location,prefix,project_no,agency,prepared_date,submit__to_hq_date,hq_review_date,submit_to_fhwa_date,to_fmis_date,fed_requested,...,ftip_no,project_location,type_of_work,seq,date_request_initiated,date_completed_request,mpo,projectID,prepared_y,primary_agency_name
13882,Obligated,CML,5006(779),Los Angeles,2019-01-04,2019-01-25,2019-01-28,2019-01-28,2019-01-30,1853000.0,...,LAF3653,Pasadena Ave Pedestrian Connection To Gold Lin...,"Side Walk Improvements, Safety Lighting Etc",2,2018-12-19,2019-01-25,SCAG,5006,2019.0,Los Angeles


In [5]:
# Compare how many distinct values the raw `agency` column has against the
# `primary_agency_name` column (presumably the cleaned agency name -- confirm
# against _clean_data). Only the number of rows in each count table matters,
# so the tables are not sorted before their length is taken.
n_raw_agencies = len(df >> count(_.agency))
n_primary_agencies = len(df >> count(_.primary_agency_name))

print("Number of Unique Agencies with uncleaned Data")
print(n_raw_agencies)
print("Number of True Unique Agencies")
print(n_primary_agencies)

Number of Unique Agencies with uncleaned Data
671
Number of True Unique Agencies
619


## Get Aggregates

In [6]:
# Build the aggregate table with the project helper `count_all_years`.
df_agg = _dla_utils.count_all_years(df)

In [7]:
# Display the aggregate table returned by count_all_years.
df_agg

Unnamed: 0,prepared_y,dist,ac_requested_sum,fed_requested_sum,total_requested_sum,ac_requested_mean,fed_requested_mean,total_requested_mean,unique_mpo,unique_prefix,unique_primary_agency_name,unique_project_location,unique_project_no,unique_type_of_work
120,2021,0,0.000000e+00,-1.140550e+03,-1.140550e+03,0.000000,-1140.550000,-1140.550000,1.0,1.0,1.0,1.0,1.0,1.0
106,2021,1,6.484611e+06,4.167653e+06,1.217239e+07,67548.033437,43413.056250,126795.768542,1.0,13.0,9.0,86.0,85.0,49.0
107,2021,2,-1.135074e+06,6.084834e+06,7.705425e+06,-15134.315867,81131.117467,102739.006133,5.0,15.0,13.0,69.0,69.0,55.0
108,2021,3,1.035237e+08,1.089277e+08,2.378141e+08,417434.091169,439224.708387,958927.647621,5.0,34.0,45.0,191.0,198.0,164.0
109,2021,4,2.533793e+06,1.061114e+08,1.602708e+08,7541.050387,315807.822321,476996.483929,3.0,37.0,91.0,272.0,277.0,253.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,2010,8,0.000000e+00,3.400000e+05,3.850000e+05,0.000000,340000.000000,385000.000000,1.0,1.0,1.0,1.0,1.0,1.0
118,,3,,,,,,,,,,,,
119,,4,,,,,,,,,,,,
36,,7,,,,,,,,,,,,


In [8]:
# Build the "top values" table with the project helper `find_top`; later cells
# filter it by its `variable` column and plot its `value` / `count` columns.
df_top = _dla_utils.find_top(df)

In [9]:
# Peek at one randomly chosen row of the top-values table (unseeded, so it differs between runs).
df_top.sample()

Unnamed: 0,value,count,variable
4,Throughout Kern County,22,project_location


## By District

In [10]:
# Keep only the top-values rows that describe the district (`dist`) column.
is_dist_row = _.variable == 'dist'
df_top >> filter(is_dist_row)

Unnamed: 0,value,count,variable
0,4,3248,dist
1,6,3084,dist
2,7,2787,dist
3,3,2517,dist
4,10,2203,dist
5,5,1337,dist
6,8,1309,dist
7,1,1161,dist
8,11,920,dist
9,12,666,dist


In [11]:
# Bar chart of the `dist` rows of df_top (value on one axis, count on the other).
# The rows plotted are districts, so the title says "District".
top_districts = df_top >> filter(_.variable == 'dist')

_dla_utils.basic_bar_chart(top_districts, "value", "count").properties(
    title="Most Obligations by District")

In [12]:
# Another single random row of the top-values table (unseeded, so it differs between runs).
df_top.sample()

Unnamed: 0,value,count,variable
15,BHLS,261,prefix


In [16]:
# Number of distinct primary agency names for every (prepared year, district)
# pair, ordered with the latest prepared year first.
dist_years1 = (
    df
    >> group_by(_.prepared_y, _.dist)
    >> summarize(n=_.primary_agency_name.nunique())
    >> arrange(-_.prepared_y)
)

In [17]:
# Give the summary column a descriptive name; rebinding keeps the cell re-runnable.
dist_years1 = dist_years1.rename(columns={'n': 'n_agency_unique'})

In [18]:
# Show the year/district pairs with the most distinct agencies first.
dist_years1 >> arrange(-_.n_agency_unique)

Unnamed: 0,prepared_y,dist,n_agency_unique
47,2016.0,4,113
34,2015.0,4,109
22,2014.0,4,108
60,2017.0,4,98
72,2018.0,4,97
...,...,...,...
43,2016.0,0,1
5,2012.0,7,1
1,2011.0,4,1
3,2011.0,11,1


In [19]:

# Faceted bar chart: one column panel per district, showing the number of
# distinct agencies for each prepared year.
district_color = alt.Color(
    "dist:N",
    scale=alt.Scale(range=altair_utils.FIVETHIRTYEIGHT_DIVERGING_COLORS),
    legend=alt.Legend(title="District"),
)

chart = (
    alt.Chart(dist_years1)
    .mark_bar()
    .encode(
        column='dist:N',
        x=alt.X('prepared_y:O', title='Prepared Year'),
        y=alt.Y('n_agency_unique:Q', title='Number of Unique Agencies'),
        color=district_color,
    )
    .properties(title="Number of Unique Agencies by District")
)

chart


In [21]:
# Scatter chart of the same year/district table via the project helper
# (argument meaning is defined in _dla_utils.basic_scatter_chart).
_dla_utils.basic_scatter_chart(
    dist_years1,
    'n_agency_unique',
    'prepared_y',
    'dist',
)

In [21]:
# Bar chart of distinct agencies per prepared year, coloured by district.
# Field types are spelled out (`:O` for the year, `:N` for the district) so the
# year is drawn as discrete categories and each district gets its own colour,
# matching the faceted chart built from the same table. The title names the
# measure that is actually plotted on the y-axis.
chart = (
    alt.Chart(dist_years1)
    .mark_bar()
    .encode(
        x=alt.X("prepared_y:O", title="Year"),
        y=alt.Y("n_agency_unique:Q", title="Number of Unique Agencies"),
        color=alt.Color(
            "dist:N",
            scale=alt.Scale(range=altair_utils.FIVETHIRTYEIGHT_DIVERGING_COLORS),
            legend=alt.Legend(title="District"),
        ),
    )
    .properties(title="Number of Unique Agencies by District")
)


chart

In [25]:
# Same table through the project's basic bar chart helper, passing the
# `prepared_y` and `n_agency_unique` column names.
_dla_utils.basic_bar_chart(dist_years1, "prepared_y", "n_agency_unique")

In [26]:
# Number of distinct prefix codes for every (prepared year, district) pair,
# ordered with the latest prepared year first.
dist_years2 = (
    df
    >> group_by(_.prepared_y, _.dist)
    >> summarize(n=_.prefix.nunique())
    >> arrange(-_.prepared_y)
)

In [27]:
# Give the summary column a descriptive name; rebinding keeps the cell re-runnable.
dist_years2 = dist_years2.rename(columns={'n': 'n_prefix_unique'})

In [28]:
# Show the year/district pairs with the most distinct prefix codes first.
dist_years2 >> arrange(-_.n_prefix_unique)

Unnamed: 0,prepared_y,dist,n_prefix_unique
22,2014.0,4,52
83,2019.0,3,51
72,2018.0,4,51
47,2016.0,4,50
34,2015.0,4,48
...,...,...,...
56,2017.0,0,1
5,2012.0,7,1
1,2011.0,4,1
3,2011.0,11,1


In [29]:
# Faceted bar chart: one column panel per district, showing the number of
# distinct prefix codes for each prepared year.
prefix_district_color = alt.Color(
    "dist:N",
    scale=alt.Scale(range=altair_utils.FIVETHIRTYEIGHT_DIVERGING_COLORS),
    legend=alt.Legend(title="District"),
)

chart = (
    alt.Chart(dist_years2)
    .mark_bar()
    .encode(
        column='dist:N',
        x=alt.X('prepared_y:O', title='Prepared Year'),
        y=alt.Y('n_prefix_unique:Q', title='Number of Unique Prefix Codes'),
        color=prefix_district_color,
    )
    .properties(title="Number of Unique Prefix Codes by District")
)

chart

In [30]:
# Rows per prepared year, busiest year first.
df >> count(_.prepared_y) >> arrange(-_.n)

Unnamed: 0,prepared_y,n
8,2018.0,2827
4,2014.0,2724
9,2019.0,2665
5,2015.0,2575
6,2016.0,2505
10,2020.0,2350
7,2017.0,2333
11,2021.0,1917
3,2013.0,193
2,2012.0,7


In [35]:
# Add up the per-year `total_requested_sum` values of the aggregate table for each district.
(
    df_agg
    >> group_by(_.dist)
    >> summarize(sum_total_requested=_.total_requested_sum.sum())
)

Unnamed: 0,dist,sum_total_requested
0,0,3152859.0
1,1,175141600.0
2,2,228882700.0
3,3,1561211000.0
4,4,2817360000.0
5,5,492622600.0
6,6,1597301000.0
7,7,4263828000.0
8,8,2589842000.0
9,9,56075220.0


## Charts

In [36]:
# Bar chart of the `primary_agency_name` rows of df_top (no title set here).
top_agencies = df_top >> filter(_.variable == 'primary_agency_name')
_dla_utils.basic_bar_chart(top_agencies, "value", "count")


In [37]:
# Bar chart of the `primary_agency_name` rows of df_top, with a title.
top_agency_rows = df_top >> filter(_.variable == 'primary_agency_name')
_dla_utils.basic_bar_chart(top_agency_rows, "value", "count").properties(
    title="Most Obligations by Agency")

In [38]:
# Bar chart of the `prefix` rows of df_top.
top_prefix_rows = df_top >> filter(_.variable == 'prefix')
_dla_utils.basic_bar_chart(top_prefix_rows, "value", "count").properties(
    title="Most Obligations by Prefix")

In [31]:
# Prefix codes used by Humboldt County, most frequent first.
(
    df
    >> filter(_.primary_agency_name == 'Humboldt County')
    >> count(_.prefix)
    >> arrange(-_.n)
)

Unnamed: 0,prefix,n
9,ER,379
0,ACSTER,64
4,BPMP,50
1,ACSTP,31
5,BRLO,22
7,BRLS,20
12,HSIPL,19
13,RPSTPL,18
6,BRLOZB,9
2,ATPL,5


In [34]:
# Recompute the top-values table on Humboldt County rows only (this does not
# reuse df_top), then chart its `prefix` rows.
humboldt_rows = df >> filter(_.primary_agency_name == 'Humboldt County')
humboldt_top = _dla_utils.find_top(humboldt_rows)
humboldt_prefix_rows = humboldt_top >> filter(_.variable == 'prefix')

_dla_utils.basic_bar_chart(humboldt_prefix_rows, "value", "count").properties(
    title="Most Used Prefix Codes in Humboldt County")

In [35]:
# Bar chart of the `mpo` rows of df_top.
top_mpo_rows = df_top >> filter(_.variable == 'mpo')
_dla_utils.basic_bar_chart(top_mpo_rows, "value", "count").properties(
    title="Most Obligations by MPO")

In [36]:
# Bar chart of the `type_of_work` rows of df_top.
top_work_type_rows = df_top >> filter(_.variable == 'type_of_work')
_dla_utils.basic_bar_chart(top_work_type_rows, "value", "count").properties(
    title="Most Used Type of Work")

In [37]:
%%html
<!-- Imports the Lato font stylesheet from Google Fonts into the notebook page. -->
<style>
@import url('https://fonts.googleapis.com/css?family=Lato');
</style>


In [38]:
# Bar chart of the `project_location` rows of df_top.
top_location_rows = df_top >> filter(_.variable == 'project_location')
_dla_utils.basic_bar_chart(top_location_rows, "value", "count").properties(
    title="Most Used Project Locations")