# DLA functions

In [1]:
import altair as alt
import pandas as pd

from siuba import *

import _clean_data
from shared_utils import geography_utils, styleguide
from shared_utils import calitp_color_palette as cp

# Register styleguide.calitp_theme with Altair under the name "calitp_theme".
alt.themes.register("calitp_theme", styleguide.calitp_theme)
# Switch Altair to that theme. This is the cell's last expression, so its
# return value is what the cell echoes.
alt.themes.enable("calitp_theme")



ThemeRegistry.enable('calitp_theme')

In [2]:
# One-time rebuild of the cached file (kept for reference, intentionally not run):
#   df = _clean_data.make_clean_data()
#   df.to_parquet("./data.parquet")

# Read the cached parquet file that the snippet above writes.
df = pd.read_parquet("./data.parquet")

## Aggregation

* Aggregate by some group, get count
* Also sort in descending order

In [3]:
# geography_utils.aggregate_by_geography returns every requested metric in a
# single dataframe, so the per-metric results never have to be merged.
dist_years1 = geography_utils.aggregate_by_geography(
    df,
    group_cols=["dist"],
    sum_cols=["total_requested"],
    count_cols=["project_no"],
    nunique_cols=["primary_agency_name", "prefix"],
)

# Earlier siuba one-liners, one per metric (kept for reference):
#   dist_agencyn = (df >> group_by(_.dist) >> summarize(n=_.primary_agency_name.nunique()) >> arrange(-_.n))
#   dist_prefixn = (df >> group_by(_.dist) >> summarize(n=_.prefix.nunique()) >> arrange(-_.n))
#   sum_funds = df>>group_by(_.dist)>>summarize(n=_.total_requested.sum()) >> arrange(-_.n)

dist_years1.head()


Unnamed: 0,dist,total_requested,project_no,prefix,primary_agency_name
0,1,175141600.0,1161,25,20
1,3,1561961000.0,2517,98,62
2,4,2820382000.0,3248,110,151
3,6,1597301000.0,3084,71,55
4,7,4264832000.0,2787,107,108


In [4]:
# Same metrics as above, but one row per (year, district).
# Newest year first, districts ascending within a year.
dist_years2 = (
    geography_utils.aggregate_by_geography(
        df,
        group_cols=["prepared_y", "dist"],
        sum_cols=["total_requested"],
        count_cols=["project_no"],
        nunique_cols=["primary_agency_name", "prefix"],
    )
    .sort_values(["prepared_y", "dist"], ascending=[False, True])
)

dist_years2.head()

Unnamed: 0,prepared_y,dist,total_requested,project_no,prefix,primary_agency_name
120,2021.0,0,-1140.55,1.0,1.0,1.0
106,2021.0,1,12172390.0,96.0,13.0,9.0
107,2021.0,2,7705425.0,75.0,15.0,13.0
108,2021.0,3,237814100.0,248.0,34.0,45.0
109,2021.0,4,160270800.0,336.0,37.0,91.0


## Put it together

In [5]:
def aggregate_datasets(df, aggregate_by=["dist"]):
    """
    Aggregate the cleaned data by a grouping, and by year within that grouping.

    Parameters:
    df: pandas.DataFrame
        Cleaned data, the result of _clean_data functions.
        Must contain the grouping column(s) plus `prepared_y`,
        `total_requested`, `primary_agency_name` and `prefix`.
    aggregate_by: str or list of str.
        Column name, or list of column names, to group by.
        Ex: "dist", "mpo", "agency", or ["dist"]

    Returns: two pandas.DataFrames.
            Metrics calculated:
            sum of total requested, # unique agencies, # unique prefixes

            first df: aggregated to group (each row is district),
                sorted by # unique agencies, descending
            second df: aggregated to year-group (each row is district-year),
                sorted by year descending, then group ascending
    """
    # Normalize to a fresh list so a bare column name and a list of names
    # behave the same, and the caller's (or default) list is never shared.
    if isinstance(aggregate_by, str):
        group_cols = [aggregate_by]
    else:
        group_cols = list(aggregate_by)

    # Row-wise masks: a row is kept only if every grouping column is present.
    has_group = df[group_cols].notna().all(axis=1)
    has_year = df["prepared_y"].notna()

    by_geography = (
        geography_utils.aggregate_by_geography(
            df[has_group],
            group_cols = group_cols,
            sum_cols = ["total_requested"],
            nunique_cols = ["primary_agency_name", "prefix"]
            ).sort_values(["primary_agency_name"], ascending=False)
    )

    year_group_cols = ["prepared_y"] + group_cols
    by_geography_year = (
        geography_utils.aggregate_by_geography(
            df[has_group & has_year],
            group_cols = year_group_cols,
            sum_cols = ["total_requested"],
            nunique_cols = ["primary_agency_name", "prefix"]
        ).sort_values(
            year_group_cols,
            ascending=[False] + [True] * len(group_cols)
        )
    )

    return by_geography, by_geography_year
  
    
# Run the whole pipeline in one go, starting from the cached data.
# Alternative ways to obtain `df`:
#   df = _clean_data.make_clean_data()
#   df = catalog.CLEANED_UP_DATASET.read()
df = pd.read_parquet("./data.parquet")


# Each call returns a (by-group, by-year-and-group) pair of tables.
dist_years1, dist_years2 = aggregate_datasets(df, "dist")
mpo_years1, mpo_years2 = aggregate_datasets(df, "mpo")
agency_years1, agency_years2 = aggregate_datasets(df, "agency")

## Chart Functions

In [6]:
%%html
<style>
/* Pull the Raleway, Nunito Sans and Bitter web fonts from Google Fonts into the
   notebook page. NOTE(review): presumably these are the fonts the chart theme
   refers to - confirm against shared_utils.styleguide. */
@import url('https://fonts.googleapis.com/css?family=Raleway');
@import url('https://fonts.googleapis.com/css?family=Nunito+Sans');
@import url('https://fonts.googleapis.com/css?family=Bitter');
</style>

With `altair`, you can reuse bits of the code to build a chart.

If there is a "base" — a piece that is used over and over, such as the core of a bar chart or a line chart — you can put that piece in its own function. Later functions can call that base function and then add further components, the same way `.properties()`, `.configure_title()`, etc. are chained onto a chart.

In [7]:
# Give the unique-count columns names that the `labeling` function below
# can turn into readable chart labels.
dist_years1 = dist_years1.rename(
    columns={"primary_agency_name": "n_agencies", "prefix": "unique_prefixes"}
)

# Or, use a dictionary to store the labels that are not simply the
# title-cased column name. Extend it for any other one-off labels.
LABEL_DICT = {
    "prepared_y": "Year",
    "dist": "District",
    "total_requested": "Total Requested",
}

# Column-name prefixes that expand to a phrase. Only a leading prefix counts,
# so an "n_" in the middle of a name (e.g. "obligation_count") is left alone.
_LABEL_PREFIXES = (
    ("unique_", "Number of Unique "),
    ("n_", "Number of "),
)


# Then, use a function to clean up labels
def labeling(word):
    """
    Turn a column name into a human-readable chart label.

    Rules, in order:
      1. "mpo" / "rtpa" are acronyms and are upper-cased.
      2. Names listed in LABEL_DICT use the label stored there.
      3. Names starting with "unique_" / "n_" become
         "Number of Unique ..." / "Number of ...".
      4. Anything else: underscores become spaces, words are title-cased.

    Examples:
      labeling("mpo")             -> "MPO"
      labeling("dist")            -> "District"
      labeling("n_agencies")      -> "Number of Agencies"
      labeling("unique_prefixes") -> "Number of Unique Prefixes"
    """
    # Add specific use cases where it's not just first letter capitalized
    if word in ("mpo", "rtpa"):
        return word.upper()

    if word in LABEL_DICT:
        return LABEL_DICT[word]

    for prefix, phrase in _LABEL_PREFIXES:
        if word.startswith(prefix):
            # Title-case only the remainder so the phrase keeps its lowercase "of".
            remainder = word[len(prefix):]
            return phrase + remainder.replace("_", " ").title()

    return word.replace("_", " ").title()


In [8]:
# Spot-check the acronym rule and two LABEL_DICT entries, one label per line.
print(*map(labeling, ("mpo", "dist", "total_requested")), sep="\n")

MPO
District
Total Requested


In [9]:
def base_bar(df, x_col, y_col):
    """
    Build the plain bar chart that the other chart functions extend.

    df: dataframe to plot.
    x_col: column for the x axis, encoded as nominal (":N").
    y_col: column for the y axis, encoded as quantitative (":Q").

    Both axis titles come from labeling(), so columns may need to be renamed
    beforehand for the titles to read well.

    Returns the altair chart, ready for further .encode() / .properties() calls.
    """
    x_axis = alt.X(f"{x_col}:N", title=labeling(x_col))
    y_axis = alt.Y(f"{y_col}:Q", title=labeling(y_col))

    return alt.Chart(df).mark_bar().encode(x=x_axis, y=y_axis)


def stacked_bar_chart(df, x_col, y_col, color_col, chart_title):
    """
    Bar chart from base_bar() with the bars colored by `color_col`.

    df, x_col, y_col: passed straight through to base_bar().
    color_col: column for the color encoding, treated as nominal (":N");
        its legend title comes from labeling().
    chart_title: title placed on the chart.

    Returns the altair chart.
    """
    bars = base_bar(df, x_col, y_col)

    color_encoding = alt.Color(
        f"{color_col}:N",
        scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
        legend=alt.Legend(title=labeling(color_col)),
    )

    return bars.encode(color=color_encoding).properties(title=chart_title)

In [10]:
# The base chart can be extended at the call site, e.g. with a title.
(
    base_bar(dist_years2, x_col="prepared_y", y_col="primary_agency_name")
    .properties(title="New Chart Title")
)

In [11]:
# y is the count of unique agencies (primary_agency_name after nunique), split
# by district within each year, so the title names that metric rather than
# "obligations".
stacked_bar_chart(
    dist_years2,
    x_col="prepared_y",
    y_col="primary_agency_name",
    color_col="dist",
    chart_title="Number of Unique Agencies by Year and District",
)

In [12]:
# Uses the renamed `unique_prefixes` column of dist_years1.
(
    base_bar(dist_years1, x_col="dist", y_col="unique_prefixes")
    .properties(title="Number of Unique Prefixes by District")
)

In [13]:
# mpo_years1 still carries the original `prefix` column name.
(
    base_bar(mpo_years1, x_col="mpo", y_col="prefix")
    .properties(title="Number of Unique Prefixes by MPO")
)

In [14]:
from plotnine import *

import ipywidgets as widgets
from ipywidgets import *
# `display` is imported from its public home, IPython.display; the
# IPython.core.display path is deprecated for this name.
from IPython.display import Markdown, display

In [15]:
def summarize_and_plot(df, select_col, place):
    """
    Display headline statistics for the rows of `df` where `select_col`
    equals `place`.

    df: dataframe holding the obligations.
    select_col: name of the column to filter on (e.g. "agency").
    place: value of `select_col` to report on.

    Displays two Markdown lines (a heading and the row count) and returns None.
    """
    funding_labels = {
        "fed_requested": "Federal",
        "ac_requested": "AC",
        "total_requested": "Total",
    }
    matches_place = df[select_col] == place
    subset = df[matches_place].rename(columns=funding_labels)

    display(Markdown(f"**Summary Statistics for {place}**"))
    display(Markdown(f"The number of obligations {place} has is {len(subset)}"))

    # Currently disabled (kept for reference):
    #
    # display(Markdown(
    #     f"The number of prefix codes {place} uses is {subset.prefix.nunique()}"))
    #
    # pd.set_option("display.max_columns", None)
    #
    # funds = subset[['Federal','AC','Total']].describe()
    # display(funds.style.format(precision=2, na_rep='MISSING', thousands=","))
    #
    # display(Markdown(f"**Top Project Types in {place}**"))
    #
    # work_df = subset >> count(_.type_of_work) >> arrange(-_.n)
    # display(work_df.head(5))

In [16]:
import ipywidgets

def interactive_widget(df, select_col):
    """
    Show a dropdown of the values in `df[select_col]` and, below it, the
    summary for the currently selected value.

    df: dataframe passed on to summarize_and_plot().
    select_col: column whose sorted, unique, non-missing values become the
        dropdown options; also used (title-cased) as the dropdown's label.

    The summary is drawn inside an Output widget and is replaced every time
    the selection changes. Returns None.
    """
    dropdown = ipywidgets.Dropdown(
        description=f"{select_col.title()}",
        # Missing values are dropped: selecting NaN could never match a row.
        options=df[select_col].dropna().sort_values().unique().tolist()
    )
    output = ipywidgets.Output()

    display(dropdown)
    display(output)

    def on_selection(*args):
        # Draw into the Output widget and clear it first, so the summary for
        # the previous selection is replaced instead of piling up.
        with output:
            output.clear_output(wait=True)
            summarize_and_plot(df, select_col, dropdown.value)

    # The Dropdown trait that changes on selection is "value"; observing
    # "values" registers a handler that is never called.
    dropdown.observe(on_selection, names="value")
    # Render the summary for the initial selection.
    on_selection()


In [17]:
# Dropdown of agencies with the summary for the selected one.
interactive_widget(df, select_col="agency")

Dropdown(description='Agency', options=('Access Services', 'Agoura Hills', 'Ala-Con Costa T', 'Alameda', 'Alam…

Output()

**Summary Statistics for Access Services**

The number of obligations Access Services has is 17

In [20]:
# Module-level (unrolled) version of interactive_widget(df, "agency").
select_col = "agency"

dropdown = ipywidgets.Dropdown(
    description=f"{select_col.title()}",
    # Missing values are dropped: selecting NaN could never match a row.
    options=df[select_col].dropna().sort_values().unique().tolist()
)
output = ipywidgets.Output()

display(dropdown)
display(output)


def on_selection(*args):
    # Draw into the Output widget and clear it first, so the summary for the
    # previous selection is replaced instead of piling up.
    with output:
        output.clear_output(wait=True)
        summarize_and_plot(df, select_col, dropdown.value)


# The Dropdown trait that changes on selection is "value"; observing "values"
# registers a handler that is never called.
dropdown.observe(on_selection, names="value")
# Render the summary for the initial selection.
on_selection()


Dropdown(description='Agency', options=('Access Services', 'Agoura Hills', 'Ala-Con Costa T', 'Alameda', 'Alam…

Output()

**Summary Statistics for Access Services**

The number of obligations Access Services has is 17

In [None]:
# Earlier @interact-based prototype of the agency summary, kept for reference.
# It is written as comments rather than as a bare string so that running this
# cell does not echo the whole snippet back as the cell's output.
#
# @interact
#
# def dla_get_prefix(place=df.agency.sort_values().unique().tolist()):
#
#     agencies = df[df.agency==place]
#
#     prefix_count_n = agencies >> count(_.prefix)
#
#     display(Markdown(f"**Summary Statistics for {place}**"))
#     display(Markdown(f"The number of obligations {place} has is {len(agencies)}"))
#
#     display(Markdown(f"The number of prefix codes {place} uses is {len(prefix_count_n)}"))
#
#     # for the table- using one as some agencies only have one entry
#     pd.set_option("display.max_columns", None)
#     display(df[df.agency == place][['fed_requested','ac_requested','total_requested']].describe())
#
#     display(Markdown(f"**Top Project Types in {place}**"))
#     display((df[df.agency == place] >> count(_.type_of_work) >> arrange(-_.n)).head(5))
#     # graphs