# Obligations in District 7

Using data from the Division of Local Assistance [Obligation list](https://dot.ca.gov/programs/local-assistance/reports/e-76-obligated)

In [1]:
import numpy as np
import pandas as pd
from siuba import *
import clean_data
#import _dla_utils    #_dla_utils taking a long time to read... too long
import altair as alt
import altair_saver
from calitp import *
from plotnine import *

from shared_utils import altair_utils
alt.themes.enable("fivethirtyeight")





ThemeRegistry.enable('fivethirtyeight')

In [2]:
#df = clean_data.make_clean_data()
# df = clean_data.read_data()
# df = clean_data.clean_data(df)
# df = clean_data.prefix_cleaning(df)
# df = clean_data.clean_agency_names(df)



KeyboardInterrupt: 

In [None]:
df.sample()

In [None]:
la_df = df>>filter(_.dist==7)

In [None]:
la_df.head()

## Adding Catalog Datasets

In [None]:
import intake
import geopandas as gpd

In [None]:
def read_catalog(df):    
    catalog = intake.open_catalog("catalog.yml")

    city_boundary = catalog.ca_open_data.city_boundary.read()
    county_bound = catalog.ca_open_data.county_boundary.read()
 
    district_bound= catalog.district_bound.read()
    rtpa_bound= catalog.rtpa_bound.read()
    locode_df = pd.concat(pd.read_excel('gs://calitp-analytics-data/data-analyses/dla/e-76Obligated/locodes_updated7122021.xlsx', sheet_name=None), ignore_index=True)
    locode_df = to_snakecase(locode_df)

    #renaming
    county_bound['name'] =  county_bound['name'] + ' County'
    county_bound.rename(columns={'name': 'county_name', 'geometry': 'geometry2'}, inplace=True)

    # deleting Calaveras County because the location of the project is not in district 7
    delete_row = df[df["primary_agency_name"]== 'Calaveras County'].index
    df = df.drop(delete_row)

    new_df1 = pd.merge(df, locode_df,  how='left', left_on=['primary_agency_name'], right_on = ['agency_name'])
    new_df2 = pd.merge(new_df1, city_bound,  how='left', left_on=['primary_agency_name'], right_on = ['NAME'])
    new_df3 = left_join(new_df2, county_bound, on = "county_name")

    return new_df3



In [None]:
df_test = read_catalog(la_df)

In [None]:
df_test.sample()

### number of unique agency names that are **cities**

In [None]:
len(df_test>>filter(_.NAME.notnull())>>count(_.primary_agency_name))

### number of unique agency names that are **not** cities

In [None]:
len(df_test>>filter(_.NAME.isnull())>>count(_.primary_agency_name))

In [None]:
topten_city =(df_test>>filter(_.NAME.notnull())>>count(_.primary_agency_name)>>arrange(-_.n)).head(10)
topten_city.rename(columns={'primary_agency_name': 'Primary Agency Name', 'n': 'Count'}, inplace=True)

### Cities with the most obligations

dataframes to test in function in place of `df`
* df_test


In [18]:
def totalfunds_chart(df, col):
    test= (df
           >>group_by(_.primary_agency_name)
           >>summarize(Total_Funds = _.col.sum())
           >>arrange(-_.Total_Funds))
    #test.rename(columns={'primary_agency_name': 'Primary Agency Name', 'col': 'Sum Funds'}, inplace=True)
    test.style.format(precision=2, na_rep='MISSING', thousands=",")
    chart = (alt.Chart(test.head(20))
             .mark_bar()
             .encode(
                 x=alt.X("primary_agency_name", title="col"),
                 y=alt.Y("Total Funds", title="Total Funds Obligated"),
                 #column = "payment:N",
                 color = alt.Color("primary_agency_name", 
                                  scale=alt.Scale(
                                      range=altair_utils.FIVETHIRTYEIGHT_DIVERGING_COLORS
                                  )
                                  )
             )
            .properties( 
                           title="District 7 Agencies with the Most Total Funds Obligated")

    )
    return chart

In [19]:
totalfunds_chart(df_test, "total_requested")

AttributeError: 'DataFrame' object has no attribute 'col'

In [21]:
test2= (df_test
       >>group_by(_.primary_agency_name)
       >>summarize(Total_fed_Funds = _.fed_requested.sum())
       >>arrange(-_.Total_fed_Funds))
test2.rename(columns={'primary_agency_name': 'Primary Agency Name', 'Total_fed_Funds': 'Total Federal Funds'}, inplace=True)
test2.style.format(precision=2, na_rep='MISSING', thousands=",")


test3= (df_test
       >>group_by(_.primary_agency_name)
       >>summarize(Total_ac_Funds = _.ac_requested.sum())
       >>arrange(-_.Total_ac_Funds))
test3.rename(columns={'primary_agency_name': 'Primary Agency Name', 'Total_ac_Funds': 'Total Advance Construction Funds'}, inplace=True)
test3.style.format(precision=2, na_rep='MISSING', thousands=",")

all_sum = full_join(test, test2, on = "Primary Agency Name")
all_sum2 = full_join(all_sum, test3, on = "Primary Agency Name")
all_sum3 = full_join(all_sum, test3, on = "Primary Agency Name")

all_sum3['Totals']= all_sum2['Total Funds']+all_sum2['Total Federal Funds'] +all_sum2['Total Advance Construction Funds']
all_sum3 = all_sum3>>arrange(-_.Totals)

all_sum3.style.format(precision=2, na_rep='MISSING', thousands=",")
all_sum2.style.format(precision=2, na_rep='MISSING', thousands=",")



In [72]:
chart = (alt.Chart(all_sum3.head(20))
         .mark_bar()
         .encode(
             x=alt.X("Primary Agency Name", title="Agency Name"),
             y=alt.Y("Totals", title="Sum of Funds Obligated"),
             #column = "payment:N",
             color = alt.Color("Primary Agency Name", 
                              scale=alt.Scale(
                                  range=altair_utils.FIVETHIRTYEIGHT_DIVERGING_COLORS
                              )
                              )
         )
        .properties( 
                       title="District 7 Agencies with the Highest Sum of Obligated Funds")

)


#chart.save("dist7chart.png")
chart

### Non-Cities with the most obligations

In [73]:
topten_noncity =(df_test
 >>filter(_.NAME.isnull())
 >>count(_.primary_agency_name)
 >>arrange(-_.n)).head(10)

In [74]:
topten_noncity.rename(columns={'primary_agency_name': 'Primary Agency Name', 'n': 'Count'}, inplace=True)

In [75]:
topten_noncity.style.set_caption('District 7 Agencies with the Most Oblgiations (non-city)')

Unnamed: 0,Primary Agency Name,Count
5,Los Angeles County,451
3,Caltrans,234
18,Ventura County,115
6,Los Angeles County Metropolitan Transportation Authority,85
10,San Buenaventura,35
19,Ventura County Transportation Commission,29
11,San Gabriel Valley Council of Governments,18
0,Access Services,17
14,Southern California Association of Governments,13
13,South Coast Area Transit,7


In [None]:
la_nunique= (la_df >> group_by(_.primary_agency_name) >> summarize(n=_.prefix.nunique()) >> arrange(-_.n)>>filter(_.n>=5))

In [112]:
chart = (alt.Chart(la_nunique)
         .mark_bar()
         .encode(
             x=alt.X("primary_agency_name", title="Agency Name"),
             y=alt.Y("n", title="Number of Unique Prefixes"),
             #column = "payment:N",
             color = alt.Color("n", 
                              scale=alt.Scale(
                                  range=altair_utils.FIVETHIRTYEIGHT_DIVERGING_COLORS
                              )
                              )
         )
         .properties( 
                      title="Number of Unique Prefixes by Agency in District 7")
)


#chart.save("dist7chart.png")
chart

## Most Common Types of Work by Agnecy

### Chart

In [125]:
chart = (alt.Chart(la_df_year)
         .mark_bar()
         .encode(
             x=alt.X("prepared_y", title="Prepared Year"),
             y=alt.Y("n", title="Number of Obligations in each Year"),
             #column = "payment:N",
             color = alt.Color("primary_agency_name", 
                              scale=alt.Scale(
                                  range=altair_utils.FIVETHIRTYEIGHT_DIVERGING_COLORS
                              )
                              )
         )
         .properties( 
                      title="Number of Obliations per Year")
)


#chart.save("dist7chart.png")
chart

## Functions to look by Prefix and by Agency: 

### By Agency:

In [7]:
    @interact
    
    def dla_get_prefix(place=la_df.primary_agency_name.sort_values().unique().tolist()):
        
        agencies = la_df[la_df.primary_agency_name==place]
    
        prefix_count_n = agencies >> count(_.prefix)
    
        display(Markdown(f"**Summary Statistics for {place}**"))
        display(Markdown(f"The number of obligations {place} has is {len(agencies)}"))
        
        display(Markdown(f"The number of prefix codes {place} uses is {len(prefix_count_n)}"))
        
        
        
        # for the table- using one as some agencies only have one entry
        pd.set_option("display.max_columns", None)
        display(df[df.agency == place][['fed_requested','ac_requested','total_requested']].describe())
        
        display(Markdown(f"**Top Project Types in {place}**"))
        display((la_df[la_df.primary_agency_name == place] >> count(_.type_of_work) >> arrange(-_.n)).head(5)) 
        # graphs 
    
         
        ax1 = (prefix_count_n
            >> ggplot(aes("prefix", "n", fill="prefix")) 
               + geom_col() 
               + theme(axis_text_x = element_text(angle = 45 , hjust=1))
               + labs(title='Agency Program Codes', x='Program Codes', y='Number of Obligations', fill="Program Type")
               #+ scale_fill_manual(altair_utils.FIVETHIRTYEIGHT_CATEGORY_COLORS)
               #FIVETHIRTYEIGHT_CATEGORY_COLORS doesnt have enough colors for values 
        )
        return ax1


NameError: name 'interact' is not defined

### _DLA_UTILS By Prefix:

In [8]:
import _dla_utils

In [9]:
import importlib

In [10]:
importlib.reload(_dla_utils)

<module '_dla_utils' from '/home/jovyan/data-analyses/dla/e76_obligated_funds/_dla_utils.py'>

In [None]:
_dla_utils.prefix_all_agencies_4(df, "CML")

In [19]:
from functools import partial

In [None]:
interact(partial(_dla_utils.prefix_all_agencies_4, df), prefix_unique=(df.prefix.sort_values().unique().tolist()))

AttributeError: 'functools.partial' object has no attribute '__name__'

In [None]:
#
@interact
def show_chart(prefix_unique=(df.prefix.sort_values().unique().tolist())):
    return _dla_utils.prefix_all_agencies_4(df, prefix_unique)

In [None]:
import ipywidgets as widgets
from ipywidgets import *
from IPython.display import Markdown
from IPython.core.display import display

In [22]:
prefix_unique=(df.prefix.sort_values().unique().tolist())

In [21]:
prefix_unique=(df.prefix.sort_values().unique().tolist())
    
    # graphs 
prefixes = df[df.prefix== prefix_unique("ER")]
    
prefix_count_num = (prefixes >> count(_.primary_agency_name) >> arrange(-_.n)).head(50)
    
prefix_count = (prefixes >> count(_.primary_agency_name) >> arrange(-_.n)).head(20)

display(Markdown(f"**The number of agencies using {prefix} is {len(prefix_count_num)}**"))

# for the table- using one as some agencies only have one entry
display(df[(df.prefix == prefix)].sample(1))


ax1 = (prefix_count
        >> ggplot(aes("primary_agency_name", "n", fill="primary_agency_name")) 
            + geom_col() 
            + theme(axis_text_x = element_text(angle = 45 , hjust=1))
            + labs(title='Top Agencies using Prefix', x='Agency', y='Number of Obligations', fill="Agency")
        )    


ValueError: ('Lengths must match to compare', (20117,), (351,))

In [11]:

def prefix_all_agencies_4(df):
    
    prefix=(df.prefix.sort_values().unique().tolist())
    
    # graphs 
    prefixes = df[df.prefix== prefix]
    
    prefix_count_num = (prefixes >> count(_.primary_agency_name) >> arrange(-_.n)).head(50)
    
    prefix_count = (prefixes >> count(_.primary_agency_name) >> arrange(-_.n)).head(20)
    
    display(Markdown(f"**The number of agencies using {prefix} is {len(prefix_count_num)}**"))
    
    # for the table- using one as some agencies only have one entry
    display(df[(df.prefix == prefix)].sample(1))
    
    
    ax1 = (prefix_count
            >> ggplot(aes("primary_agency_name", "n", fill="primary_agency_name")) 
                + geom_col() 
                + theme(axis_text_x = element_text(angle = 45 , hjust=1))
                + labs(title='Top Agencies using Prefix', x='Agency', y='Number of Obligations', fill="Agency")
            )    
    return ax1
    
              

In [12]:
interact(prefix_all_agencies_4(la_df))

ValueError: ('Lengths must match to compare', (2787,), (107,))

## Additional Information