In [1]:
import numpy as np
import pandas as pd
from siuba import *
import altair as alt
import altair_saver
from plotnine import *
from shared_utils import geography_utils
from shared_utils import altair_utils
from shared_utils import calitp_color_palette as cp
from shared_utils import styleguide

from dla_utils import _dla_utils



In [2]:
# Register the theme callable from shared_utils.styleguide under the name "calitp_theme".
alt.themes.register("calitp_theme", styleguide.calitp_theme)
# Make "calitp_theme" the active Altair theme for the charts built below.
alt.themes.enable("calitp_theme")

ThemeRegistry.enable('calitp_theme')

In [3]:
%%html
<style>
@import url('https://fonts.googleapis.com/css?family=Lato');
</style>


In [4]:
# Read the E-76 obligated dataset straight from its parquet file in GCS.
# Disabled alternative kept for reference: df = _clean_data.make_clean_data()
DLA_PARQUET_PATH = "gs://calitp-analytics-data/data-analyses/dla/e-76Obligated/dla_df.parquet"
df = pd.read_parquet(DLA_PARQUET_PATH)

In [5]:
df.sample(1)

Unnamed: 0,location,prefix,project_no,agency,prepared_date,submit__to_hq_date,hq_review_date,submit_to_fhwa_date,to_fmis_date,fed_requested,...,adjusted_ac_requested,obligation_cat,active_transp,transit,bridge,street,freeway,infra_resiliency_er,congestion_relief,work_categories
20114,Obligated,ATPL,5208(159),Clovis,2021-10-04,2021-10-04,2021-10-04,2021-10-04,2021-10-15,-3.86,...,0.0,Large,1,0,0,0,0,0,0,1


## Function to Count by Year

In [6]:
df_years = _dla_utils.count_all_years((df>>filter(_.dist==7)))

In [7]:
#from Tiffany's branch DLA functions
def count_all_years(df, groupedby=["dist"]):
    """Aggregate funding requests and unique counts per prepared year and group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``prepared_y``, the ``groupedby`` column(s), the three
        ``*_requested`` columns and the columns counted for uniqueness.
    groupedby : str or list of str, default ["dist"]
        Extra grouping column(s) used alongside ``prepared_y``. The default is
        never mutated. Previously this argument was ignored and ``"dist"`` was
        always used; the default therefore yields the same result as before.

    Returns
    -------
    pandas.DataFrame
        One row per (``prepared_y``, group) combination, newest year first,
        with ``*_sum``, ``*_mean`` and ``unique_*`` columns.
    """
    # Accept a bare column name as well as a list of names.
    group_keys = [groupedby] if isinstance(groupedby, str) else list(groupedby)
    group_cols = ["prepared_y"] + [c for c in group_keys if c != "prepared_y"]

    money_cols = ["total_requested", "ac_requested", "fed_requested"]
    # A grouping column cannot also be counted for uniqueness: it would be
    # renamed to ``unique_<col>`` below and stop being usable as a key.
    unique_cols = [
        c
        for c in ["primary_agency_name", "prefix", "project_no", "project_location", "type_of_work"]
        if c not in group_cols
    ]

    count_years = geography_utils.aggregate_by_geography(
        df,
        group_cols=group_cols,
        sum_cols=money_cols,
        mean_cols=money_cols,
        nunique_cols=unique_cols,
    ).sort_values(
        group_cols,
        # Newest year first, then each grouping column ascending.
        ascending=[False] + [True] * (len(group_cols) - 1),
    )

    # The same columns are requested as both sum and mean; the helper's output
    # apparently distinguishes them with _x (sum) / _y (mean) suffixes, which is
    # what the original rename mapping relied on -- TODO confirm against
    # geography_utils.aggregate_by_geography.
    renames = {"status": "counts"}
    for c in money_cols:
        renames[f"{c}_x"] = f"{c}_sum"
        renames[f"{c}_y"] = f"{c}_mean"
    for c in unique_cols:
        renames[c] = f"unique_{c}"

    return count_years.rename(columns=renames)

In [8]:
df1 = count_all_years(df)

In [9]:
df1

Unnamed: 0,prepared_y,dist,ac_requested_sum,fed_requested_sum,total_requested_sum,ac_requested_mean,fed_requested_mean,total_requested_mean,unique_prefix,unique_primary_agency_name,unique_project_location,unique_project_no,unique_type_of_work
120,2022.00,1,-3038306.36,12136866.49,10429434.12,-34526.21,137918.94,118516.30,9.00,6.00,83.00,83.00,29.00
121,2022.00,2,-1305386.31,3219171.96,2046330.61,-43512.88,107305.73,68211.02,9.00,11.00,29.00,29.00,25.00
122,2022.00,3,-5317762.02,83539755.73,104642173.73,-51132.33,803266.88,1006174.75,26.00,28.00,92.00,92.00,75.00
123,2022.00,4,-19637178.90,63931553.61,55685262.48,-160960.48,524029.13,456436.58,21.00,58.00,106.00,106.00,99.00
124,2022.00,5,-24197970.42,23001143.54,-1122576.85,-483959.41,460022.87,-22451.54,8.00,13.00,46.00,46.00,44.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
118,,3,,,,,,,,,,,
119,,4,,,,,,,,,,,
36,,7,,,,,,,,,,,
132,,11,,,,,,,,,,,


In [10]:
df2 = count_all_years((df>>filter(_.dist==4)))

In [11]:
df2

Unnamed: 0,prepared_y,dist,ac_requested_sum,fed_requested_sum,total_requested_sum,ac_requested_mean,fed_requested_mean,total_requested_mean,unique_prefix,unique_primary_agency_name,unique_project_location,unique_project_no,unique_type_of_work
12,2022.0,4,-19637178.9,63931553.61,55685262.48,-160960.48,524029.13,456436.58,21.0,58.0,106.0,106.0,99.0
10,2021.0,4,-7581867.92,150810954.28,207275549.66,-18537.57,368730.94,506786.18,42.0,102.0,319.0,329.0,292.0
9,2020.0,4,-16015853.21,215076760.46,380929514.28,-55999.49,752016.64,1331921.38,34.0,89.0,234.0,243.0,215.0
8,2019.0,4,-70929402.87,314851234.42,319675519.69,-177323.51,787128.09,799188.8,41.0,93.0,322.0,333.0,299.0
0,2018.0,4,-19612828.51,286153969.05,374039033.36,-43200.06,630295.09,823874.52,51.0,97.0,344.0,369.0,315.0
1,2017.0,4,-12268568.69,279804721.77,332810620.13,-33429.34,762410.69,906840.93,43.0,97.0,292.0,319.0,269.0
2,2016.0,4,59687248.94,304782072.96,502320621.82,144172.1,736188.58,1213334.84,50.0,113.0,336.0,361.0,300.0
3,2015.0,4,68364142.0,239952043.43,448655372.86,152258.67,534414.35,999232.46,48.0,109.0,374.0,394.0,300.0
4,2014.0,4,-21667366.0,178332621.75,229278385.0,-44491.51,366186.08,470797.51,52.0,108.0,376.0,400.0,295.0
5,2013.0,4,928825.0,48608230.09,57966808.58,24442.76,1279163.95,1525442.33,15.0,28.0,35.0,38.0,38.0


In [12]:
df3= count_all_years((df>>filter(_.prefix=="CML")))

In [13]:
df3

Unnamed: 0,prepared_y,dist,ac_requested_sum,fed_requested_sum,total_requested_sum,ac_requested_mean,fed_requested_mean,total_requested_mean,unique_prefix,unique_primary_agency_name,unique_project_location,unique_project_no,unique_type_of_work
90,2022.00,3,2840642.00,2103134.31,6560517.31,710160.50,525783.58,1640129.33,1,4,4,4,4
91,2022.00,4,0.00,6470029.09,9193648.33,0.00,647002.91,919364.83,1,7,10,10,10
92,2022.00,6,0.00,10629878.05,15576522.33,0.00,379638.50,556304.37,1,13,26,26,25
93,2022.00,7,0.00,2445000.00,1282505.01,0.00,244500.00,128250.50,1,6,10,10,10
95,2022.00,8,0.00,-405164.88,0.00,0.00,-101291.22,0.00,1,4,4,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,2013.00,10,0.00,-19261.00,-191630.00,0.00,-9630.50,-95815.00,1,2,2,2,2
55,2013.00,11,0.00,821000.00,927648.00,0.00,273666.67,309216.00,1,3,3,3,3
58,2013.00,12,0.00,546814.00,620993.00,0.00,182271.33,206997.67,1,1,3,3,1
49,2012.00,4,0.00,3842690.66,10576983.00,0.00,1280896.89,3525661.00,1,2,3,3,3


In [14]:
count_all_years(df>>filter(_.primary_agency_name=="Humboldt County"))

Unnamed: 0,prepared_y,dist,ac_requested_sum,fed_requested_sum,total_requested_sum,ac_requested_mean,fed_requested_mean,total_requested_mean,unique_prefix,unique_primary_agency_name,unique_project_location,unique_project_no,unique_type_of_work
9,2022.0,1,-2040454.75,10524514.3,9807289.53,-27206.06,140326.86,130763.86,7,1,70,70,20
8,2021.0,1,4776055.24,4824894.61,10617008.32,67268.38,67956.26,149535.33,9,1,63,62,32
7,2020.0,1,-3136109.97,12204768.4,10262741.19,-37334.64,145294.86,122175.49,9,1,59,60,32
6,2019.0,1,4946140.05,14222150.75,21295247.46,33647.21,96749.32,144865.63,9,1,77,78,43
0,2018.0,1,177241.53,3015726.53,3600444.63,1036.5,17635.83,21055.23,8,1,110,111,37
1,2017.0,1,0.0,5158080.59,5531542.79,0.0,184217.16,197555.1,8,1,24,24,16
3,2016.0,1,0.0,4649799.0,5028074.27,0.0,140903.0,152365.89,10,1,29,30,21
2,2015.0,1,0.0,4659366.97,5363881.78,0.0,89603.21,103151.57,8,1,35,35,22
4,2014.0,1,0.0,4554254.42,4981763.78,0.0,89299.11,97681.64,9,1,45,45,23
5,2013.0,1,0.0,100863.0,113931.0,0.0,100863.0,113931.0,1,1,1,1,1


## Chart Part 1: Aggregate the Data

In [15]:
def part1_chart(df, col, aggregate_by=["dist"]):
    """Count the unique values of ``col`` within each ``aggregate_by`` group.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    col : str
        Column whose distinct values are counted.
    aggregate_by : str or list of str, default ["dist"]
        Grouping column(s). Rows with a missing value in any of them are
        dropped before aggregating. The default is never mutated.

    Returns
    -------
    pandas.DataFrame
        Whatever ``geography_utils.aggregate_by_geography`` returns for the
        given grouping and ``nunique_cols=[col]``.
    """
    # Normalise to a flat list. The old code wrapped the argument in another
    # list, so the list default became [["dist"]], and it indexed df with a
    # DataFrame-shaped mask; only a plain string worked.
    group_cols = [aggregate_by] if isinstance(aggregate_by, str) else list(aggregate_by)

    return geography_utils.aggregate_by_geography(
        df.dropna(subset=group_cols),
        group_cols=group_cols,
        nunique_cols=[col],
    )

In [16]:
part1_chart(df, "type_of_work", aggregate_by="primary_agency_name")

Unnamed: 0,primary_agency_name,type_of_work
0,Humboldt County,90
1,Mendocino County,54
2,Sacramento County,84
3,Sutter County,12
4,Contra Costa County,60
...,...,...
614,Palos Verdes Est,1
615,San Diego Metropolitan Transit System,1
616,Corte Madera,1
617,Hillsborough,1


## Find Top Nuniques

In [17]:
## Goal: build one dataframe that holds, for each column of interest, its top twenty values and their counts
def find_top(df, cols=None, top_n=20):
    """Build a side-by-side table of the most frequent values of several columns.

    For every column in ``cols`` the rows of ``df`` are counted per value, the
    ``top_n`` most frequent values are kept, and the count column is named
    ``unique_<col>``. The per-column tables are then placed next to each other
    by row position, so row ``i`` holds the i-th most frequent value of every
    column. Columns with fewer than ``top_n`` distinct values are padded with
    NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    cols : list of str, optional
        Columns to rank. Defaults to the seven columns this function has always
        ranked. (The old unused local list also named ``"seq"``, but it was
        never ranked, so it is not part of the default.)
    top_n : int, default 20
        Number of values kept per column.

    Returns
    -------
    pandas.DataFrame
        Columns ``<col>, unique_<col>`` for each entry of ``cols``, in order.
    """
    if cols is None:
        cols = [
            "prefix",
            "prepared_y",
            "status_comment",
            "project_location",
            "type_of_work",
            "mpo",
            "primary_agency_name",
        ]

    ranked = [
        (df >> count(_[col]) >> arrange(-_.n))
        .head(top_n)
        .reset_index(drop=True)  # align the tables by rank, not by original index
        .rename(columns={"n": f"unique_{col}"})
        for col in cols
    ]

    # Outer alignment on the 0..top_n-1 rank index, same as the chained outer joins.
    return pd.concat(ranked, axis=1)

In [18]:
tops = find_top(df)
tops

Unnamed: 0,prefix,unique_prefix,prepared_y,unique_prepared_y,status_comment,unique_status_comment,project_location,unique_project_location,type_of_work,unique_type_of_work,mpo,unique_mpo,primary_agency_name,unique_primary_agency_name
0,HSIPL,3463,2018.0,2819.0,Authorized,20498,San Francisco Bay Area,68,Bridge Replacement (tc),727,SCAG,5235,Humboldt County,713
1,CML,3441,2014.0,2724.0,..,190,Various Locations,32,Bridge Replacement,553,MTC,3419,Los Angeles,508
2,STPL,2909,2019.0,2655.0,Prog Code Z400,108,Sacog Region,29,FTA Transfer,471,NON-MPO,2766,Los Angeles County,488
3,ER,1800,2015.0,2575.0,Prog Code M400,76,Golden Gate Bridge,27,Permanent Restoration,324,SACOG,1946,Santa Cruz County,482
4,BRLO,1496,2016.0,2503.0,Prog Code Z230,47,Throughout Kern County,24,Road Rehabilitation,273,CFCG,1455,Fresno,471
5,BRLS,957,2017.0,2329.0,Prog Code Z003,30,Citywide,18,Bridge Preventive Maintenance,178,KCOG,1045,El Dorado County,312
6,ATPL,627,2020.0,2326.0,Prog Code M40E,26,FTA transfer,16,Emergency Opening,162,AMBAG,797,Kern County,301
7,RPSTPL,591,2021.0,2317.0,Prog Code M0E3,22,Various Locations (see Comments),14,Storm Damage Restoration,129,STANCOG,795,Bakersfield,280
8,HPLUL,344,2022.0,740.0,Prog Code M23E,17,Various Locations In San Diego County,14,Pavement Rehabilitation,125,SJCG,553,Fresno County,280
9,SRTSL,332,2013.0,193.0,Prog Code Z301,14,County Of Madera,13,Seismic Retrofit,123,SDAG,518,Stockton,277


In [19]:
_dla_utils.basic_bar_chart(tops, "prepared_y","unique_prepared_y", "prepared_y", "subset")

In [20]:
df2.sample()

Unnamed: 0,prepared_y,dist,ac_requested_sum,fed_requested_sum,total_requested_sum,ac_requested_mean,fed_requested_mean,total_requested_mean,unique_prefix,unique_primary_agency_name,unique_project_location,unique_project_no,unique_type_of_work
6,2012.0,4,0.0,3652066.6,10376579.05,0.0,913016.65,2594144.76,2.0,3.0,4.0,4.0,4.0


In [21]:
_dla_utils.basic_bar_chart(
    (df2), "prepared_y", "unique_prefix", "prepared_y", "subset", ""
)

In [22]:
_dla_utils.basic_bar_chart(
    (tops),"unique_primary_agency_name", "primary_agency_name", "primary_agency_name", "subset", ""
)

In [23]:
_dla_utils.basic_scatter_chart(df1, "unique_primary_agency_name", "total_requested_sum", "dist", "subset", "")

In [32]:
_dla_utils.basic_line_chart(df1, "prepared_y:O", "total_requested_sum", "subset", "")

In [30]:
_dla_utils.basic_line_chart(df1, "prepared_y", "ac_requested_sum", "subset", "")

#### Issues/what to look for:

* get a df of information not grouped by year with 
    - total requested
    - ac requested 
    - fed requested
* fix name for charts of find_top 


## Chart Function

### Old Function with filtering in the function

In [39]:

def basic_agg(df, col, aggregate_by=["dist"]):
    """Bar chart of the 20 groups with the most unique values of ``col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    col : str
        Column whose distinct values are counted per group.
    aggregate_by : str or single-element list of str, default ["dist"]
        The one column to group by and plot on the x axis.

    Returns
    -------
    altair.Chart

    Raises
    ------
    ValueError
        If ``aggregate_by`` is a list that does not hold exactly one column.
    """
    # The x encoding and the siuba grouping both need a single column *name*.
    # The list default used to be passed through unchanged, so unwrap it here.
    if not isinstance(aggregate_by, str):
        if len(aggregate_by) != 1:
            raise ValueError("basic_agg plots a single grouping column; got %r" % (aggregate_by,))
        aggregate_by = aggregate_by[0]

    df1 = (
        df
        >> group_by(_[aggregate_by])
        >> summarize(n=_[col].nunique())
        >> arrange(-_.n)
    ).head(20)

    chart = (
        alt.Chart(df1)
        .mark_bar()
        .encode(
            x=alt.X(aggregate_by, title=f"{_dla_utils.labeling(aggregate_by)}"),
            y=alt.Y("n", title=f"Number of Unique {_dla_utils.labeling(col)}"),
            color=alt.Color(
                "n",
                scale=alt.Scale(range=altair_utils.CALITP_SEQUENTIAL_COLORS),
                legend=alt.Legend(title=f"{(_dla_utils.labeling(col))}"),
            ),
        )
        .properties(
            title=f"Number of Unique {_dla_utils.labeling(col)} by {_dla_utils.labeling(aggregate_by)}"
        )
    )

    return chart

In [40]:
basic_agg(df, "prefix", aggregate_by="primary_agency_name")

## Function for Selecting the Calculation Method

In [41]:
def calculate_data(df, col, aggregate_by=["dist"], aggfunc="sum", top_n=20):
    """Aggregate ``col`` per group and return the groups with the largest values.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    col : str
        Column to aggregate.
    aggregate_by : str or list of str, default ["dist"]
        Grouping column(s), passed to ``DataFrame.groupby``. The default is
        never mutated.
    aggfunc : str or callable, default "sum"
        Aggregation applied to ``col`` (e.g. ``"sum"``, ``"mean"``,
        ``"nunique"``).
    top_n : int, default 20
        Maximum number of rows returned.

    Returns
    -------
    pandas.DataFrame
        Grouping column(s) plus ``col``, sorted by ``col`` descending, at most
        ``top_n`` rows. The index is the row position from before sorting.
    """
    # A single descending sort is enough; the result used to be sorted a second
    # time with siuba's arrange before the head() call.
    return (
        df.groupby(aggregate_by)
        .agg({col: aggfunc})
        .reset_index()
        .sort_values(col, ascending=False)
        .head(top_n)
    )

In [42]:
calculate_data(df, "total_requested", aggregate_by="dist", aggfunc="sum")

Unnamed: 0,dist,total_requested
7,7,4511609279.52
8,8,3347009098.99
4,4,2922037710.95
12,12,2870621628.36
3,3,1663449026.64
6,6,1646031873.89
11,11,1154607072.07
10,10,1085645851.34
5,5,508086469.78
2,2,233866650.73


In [43]:
# #calculate_sum2 might be more versatile
# def calculate_sum(df, col, agg_col):
#     dftest = geography_utils.aggregate_by_geography(
#         df, 
#         group_cols = [agg_col],
#         sum_cols = [col])
#     dftest = (dftest>>arrange(-_[col])).head(20)
#     #dftest = (dftest>>filter(_[col] >0))
#     return dftest


In [44]:
# calculate_sum(df, "total_requested", agg_col="dist")

## Adding to Charting Function

In [45]:
def test_chartfunc_sum(df, col, aggregate_by, aggfunc):
    """Bar chart of the groups with the highest aggregated ``col``.

    The data comes from ``calculate_data(df, col, aggregate_by, aggfunc)``;
    bars are ordered by descending y value and coloured by the same value.

    Returns
    -------
    altair.Chart
    """
    top_values = calculate_data(df, col, aggregate_by, aggfunc)

    # Human-readable labels for the axes, legend and title.
    col_label = _dla_utils.labeling(col)
    group_label = _dla_utils.labeling(aggregate_by)
    agg_label = _dla_utils.labeling(aggfunc)

    x_axis = alt.X(aggregate_by, title=group_label, sort="-y")
    y_axis = alt.Y(col, title=col_label)
    bar_color = alt.Color(
        col,
        scale=alt.Scale(range=altair_utils.CALITP_SEQUENTIAL_COLORS),
        legend=alt.Legend(title=col_label),
    )

    bars = alt.Chart(top_values).mark_bar().encode(x=x_axis, y=y_axis, color=bar_color)
    return bars.properties(title=f"Highest {col_label} {agg_label}s by {group_label}")



In [46]:
test_chartfunc_sum(df, "total_requested", aggregate_by="type_of_work", aggfunc="mean")

In [47]:
test_chartfunc_sum(df, "prefix", aggregate_by="primary_agency_name", aggfunc="nunique")

In [48]:
#making sure it is right
df>>group_by(_.primary_agency_name)>> summarize(n=_.prefix.nunique()) >> arrange(-_.n)

Unnamed: 0,primary_agency_name,n
276,Los Angeles,35
277,Los Angeles County,31
434,San Bernardino Associated Governments,28
71,Caltrans,24
534,Stockton,23
...,...,...
591,Weed,1
595,Western Shasta Resource Conservation District,1
602,Willits,1
603,Willows,1


## Grouping Charts

In [49]:
#groupby_col_x axis = prepared_y
#agg_by_col = dist or mpo 
#sum_col= primary_agency_name or prefix
 
def group_chart_nunique(df, groupby_col_x, agg_by_col, sum_col):
    """Faceted bar chart of unique ``sum_col`` counts per x value and group.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw (un-aggregated) data.
    groupby_col_x : str
        Column shown on the x axis of every facet (e.g. ``"prepared_y"``).
        It is negated for sorting, so it must be numeric.
    agg_by_col : str
        Column that defines the facets and the bar colour (e.g. ``"dist"``
        or ``"mpo"``).
    sum_col : str
        Column whose distinct values are counted (e.g. ``"prefix"``).

    Returns
    -------
    altair.Chart
    """
    dist_years1 = (
        df
        >> group_by(_[groupby_col_x], _[agg_by_col])
        >> summarize(n=_[sum_col].nunique())
        >> arrange(-_[groupby_col_x])
    )

    chart = alt.Chart(dist_years1).mark_bar().encode(
        column=f"{agg_by_col}:N",
        x=alt.X(f"{groupby_col_x}:O", title=_dla_utils.labeling(groupby_col_x)),
        y=alt.Y("n:Q", title=f"Number of Unique {_dla_utils.labeling(sum_col)}"),
        color=alt.Color(
            f"{agg_by_col}:N",
            scale=alt.Scale(range=altair_utils.CALITP_SEQUENTIAL_COLORS),
            # The colour channel encodes agg_by_col, so the legend is titled
            # after it (it used to be titled after sum_col).
            legend=alt.Legend(title=_dla_utils.labeling(agg_by_col)),
        ),
    )
    # chart.save(f"./chart_outputs/grouped_{agg_by_col}_by_{groupby_col_x}.png")

    return chart


In [50]:
def group_chart_nunique_2(df, groupby_col_x, agg_by_col, sum_col):
    """Faceted bar chart for data that has already been aggregated.

    Unlike ``group_chart_nunique`` this function does no grouping itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Pre-aggregated data with one row per (``groupby_col_x``,
        ``agg_by_col``) pair and a numeric column named ``n`` holding the
        unique count to plot.
    groupby_col_x : str
        Column shown on the x axis of every facet.
    agg_by_col : str
        Column that defines the facets and the bar colour.
    sum_col : str
        Name of the column that was counted; used only for the y-axis title.

    Returns
    -------
    altair.Chart
    """
    chart = alt.Chart(df).mark_bar().encode(
        column=f"{agg_by_col}:N",
        x=alt.X(f"{groupby_col_x}:O", title=_dla_utils.labeling(groupby_col_x)),
        y=alt.Y("n:Q", title=f"Number of Unique {_dla_utils.labeling(sum_col)}"),
        color=alt.Color(
            f"{agg_by_col}:N",
            scale=alt.Scale(range=altair_utils.CALITP_SEQUENTIAL_COLORS),
            # The colour channel encodes agg_by_col, so the legend is titled
            # after it (it used to be titled after sum_col).
            legend=alt.Legend(title=_dla_utils.labeling(agg_by_col)),
        ),
    )
    # chart.save(f"./chart_outputs/grouped_{agg_by_col}_by_{groupby_col_x}.png")

    return chart

In [51]:
dist_years1 = (df >> group_by(_.prepared_y, _.mpo)
               >> summarize(n=_.prefix.nunique()) 
               >> arrange(-_.prepared_y))

In [52]:
dist_years1

Unnamed: 0,prepared_y,mpo,n
195,2022.00,AMBAG,6
196,2022.00,BCAG,7
197,2022.00,CFCG,10
198,2022.00,KCAG,3
199,2022.00,KCOG,11
...,...,...,...
1,2011.00,MTC,1
2,2011.00,NON-MPO,1
3,2011.00,SCAG,2
4,2011.00,STANCOG,1


In [53]:
df2

Unnamed: 0,prepared_y,dist,ac_requested_sum,fed_requested_sum,total_requested_sum,ac_requested_mean,fed_requested_mean,total_requested_mean,unique_prefix,unique_primary_agency_name,unique_project_location,unique_project_no,unique_type_of_work
12,2022.0,4,-19637178.9,63931553.61,55685262.48,-160960.48,524029.13,456436.58,21.0,58.0,106.0,106.0,99.0
10,2021.0,4,-7581867.92,150810954.28,207275549.66,-18537.57,368730.94,506786.18,42.0,102.0,319.0,329.0,292.0
9,2020.0,4,-16015853.21,215076760.46,380929514.28,-55999.49,752016.64,1331921.38,34.0,89.0,234.0,243.0,215.0
8,2019.0,4,-70929402.87,314851234.42,319675519.69,-177323.51,787128.09,799188.8,41.0,93.0,322.0,333.0,299.0
0,2018.0,4,-19612828.51,286153969.05,374039033.36,-43200.06,630295.09,823874.52,51.0,97.0,344.0,369.0,315.0
1,2017.0,4,-12268568.69,279804721.77,332810620.13,-33429.34,762410.69,906840.93,43.0,97.0,292.0,319.0,269.0
2,2016.0,4,59687248.94,304782072.96,502320621.82,144172.1,736188.58,1213334.84,50.0,113.0,336.0,361.0,300.0
3,2015.0,4,68364142.0,239952043.43,448655372.86,152258.67,534414.35,999232.46,48.0,109.0,374.0,394.0,300.0
4,2014.0,4,-21667366.0,178332621.75,229278385.0,-44491.51,366186.08,470797.51,52.0,108.0,376.0,400.0,295.0
5,2013.0,4,928825.0,48608230.09,57966808.58,24442.76,1279163.95,1525442.33,15.0,28.0,35.0,38.0,38.0


## Plot Nine

In [54]:
def prefix_all_agencies(df, prefix_unique):
    """Summarise and plot the agencies that use a given project prefix.

    Displays a one-line Markdown summary with the number of distinct agencies
    using ``prefix_unique``, displays one randomly sampled matching row, and
    returns a bar chart of the 20 agencies with the most rows for that prefix.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``prefix`` and ``primary_agency_name`` columns.
    prefix_unique : str
        Prefix value to filter on (e.g. ``"CML"``).

    Returns
    -------
    plotnine.ggplot

    Raises
    ------
    ValueError
        If no row of ``df`` has the requested prefix.
    """
    # Local import: the notebook never imports Markdown, so the original body
    # raised NameError. IPython is always present when this runs in Jupyter.
    from IPython.display import Markdown, display

    prefixes = df[df.prefix == prefix_unique]
    if prefixes.empty:
        # sample(1) below would otherwise fail with an opaque ValueError.
        raise ValueError(f"No rows found for prefix {prefix_unique!r}")

    # Count over ALL agencies. The reported number used to come from a frame
    # truncated with head(50), so it could never exceed 50.
    agency_counts = prefixes >> count(_.primary_agency_name) >> arrange(-_.n)
    prefix_count = agency_counts.head(20)

    display(Markdown(f"**The number of agencies using {prefix_unique} is {len(agency_counts)}**"))

    # A single row is sampled because some agencies have only one entry.
    display(prefixes.sample(1))

    ax1 = (
        prefix_count
        >> ggplot(aes("primary_agency_name", "n", fill="primary_agency_name"))
        + geom_col()
        + labs(title='Top Agencies using Prefix', x='Agency', y='Number of Obligations', fill="Agency")
    )

    # The inline theme block referenced constants (backgroundColor, axisColor,
    # font, labelFont, ...) that are not defined in this notebook. Use the
    # shared styleguide preset instead, as the disabled cell further down does.
    # NOTE(review): assumes styleguide.preset_plotnine_config(chart) returns the
    # themed chart, as that disabled cell uses it -- confirm.
    ax1 = styleguide.preset_plotnine_config(ax1)

    # Rotate the agency labels last; a complete theme added afterwards would
    # discard this setting, which is what happened with theme_538() before.
    return ax1 + theme(axis_text_x=element_text(angle=45, hjust=1))
  

In [55]:
# Rows for the CML prefix, ranked by row count per agency (largest first).
prefixes = df[df.prefix == "CML"]
agency_counts = prefixes >> count(_.primary_agency_name) >> arrange(-_.n)

# Same ranking, cut at two different lengths.
prefix_count_num = agency_counts.head(50)
prefix_count = agency_counts.head(20)

In [56]:
from shared_utils import calitp_color_palette as cp

In [57]:
# ax1 = (prefix_count
#             >> ggplot(aes("primary_agency_name", "n", fill="primary_agency_name")) 
#                 + geom_col() 
#                 + theme(axis_text_x = element_text(angle = 45 , hjust=1))
#                 + labs(title='Top Agencies using Prefix', x='Agency', y='Number of Obligations', fill="Agency")
#             )

# chart = (styleguide.preset_plotnine_config(ax1)
#          + scale_fill_manual(values=cp.CALITP_SEQUENTIAL_COLORS)
#         )

# chart

# Misc.

In [58]:
# def totalfunds_chart_(df, aggregate_by=[], col):
#      aggfunc = (
#         geography_utils.aggregate_by_geography(
#             df[df[aggregate_by].notna()], 
#             group_cols = [aggregate_by],
#             sum_cols = [col],
#             nunique_cols = [col]

            
    
#     test= (df
#            >>group_by(_.primary_agency_name)
#            >>summarize(Total_Funds = _[col].sum())
#            >>arrange(-_.Total_Funds))
#     #test.rename(columns={'primary_agency_name': 'Primary Agency Name', 'col': 'Sum Funds'}, inplace=True)
#     test.style.format(precision=2, na_rep='MISSING', thousands=",")
#     chart = (alt.Chart(test.head(20))
#              .mark_bar()
#              .encode(
#                  x=alt.X("primary_agency_name", title="col"),
#                  y=alt.Y("Total Funds", title="Total Funds Obligated"),
#                  #column = "payment:N",
#                  color = alt.Color("primary_agency_name", 
#                                   scale=alt.Scale(
#                                       range=altair_utils.FIVETHIRTYEIGHT_DIVERGING_COLORS
#                                   )
#                                   )
#              )
#             .properties( 
#                            title="District 7 Agencies with the Most Total Funds Obligated")

#     )
#     return chart