In [None]:
#import warnings
#warnings.filterwarnings("ignore")

import altair as alt
import pandas as pd

from siuba import *
from IPython.display import Markdown


import _clean_data
import _dla_utils
from shared_utils import styleguide, geography_utils
from shared_utils import calitp_color_palette as cp

# Register the shared styleguide's theme with Altair under the name "calitp_theme".
alt.themes.register("calitp_theme", styleguide.calitp_theme)
# Make the theme registered above the active Altair theme.
alt.themes.enable("calitp_theme")

In [None]:
# TODO: turn this into a proper parameters cell so the notebook can be run per district.
# District number used below to filter the data and to label the report.
district = 7

In [None]:
# Render the report title as a top-level Markdown heading for the chosen district.
display(Markdown(f"# District {district} Analysis"))

In [None]:
# Lowercase keywords that flag a description as belonging to each category.
# Matching is done by substring, so a keyword also matches inside a longer word.
ACTIVE_TRANSPORTATION = ['bike', 'bicycle', 'cyclist', 
                         'pedestrian', 'crosswalk', 
                         'bulb out', 'bulb-out', 
                         'active transp', 'traffic reduction', 
                         'speed reduction', 
                        ]
TRANSIT = ['bus', 'metro', 'station', 'transit']
BRIDGE = ["bridge", 'viaduct']

# Keywords that veto the transit flag even when a TRANSIT keyword is present.
NOT_TRANSIT = ['fueling', 'charging', 'chg']


def categorize_project_descriptions(row):
    """
    Flag which project categories a type-of-work description mentions.

    Parameters
    ----------
    row : a dataframe row (or any object) exposing a `type_of_work` attribute.

    Returns
    -------
    pd.Series of 0/1 integers indexed by 'active_transp', 'transit', 'bridge'.
    A description can be flagged for several categories at once. The transit
    flag is only set when a TRANSIT keyword is present and no NOT_TRANSIT
    keyword is. A missing or non-string description yields all zeros.
    """
    raw_description = row.type_of_work

    # Missing descriptions (None / NaN) have nothing to match against;
    # treat them as empty text instead of failing on `.lower()`.
    if isinstance(raw_description, str):
        description = raw_description.lower()
    else:
        description = ""

    def mentions_any(keywords):
        # True when at least one keyword occurs as a substring.
        return any(word in description for word in keywords)

    active_transp = int(mentions_any(ACTIVE_TRANSPORTATION))
    transit = int(mentions_any(TRANSIT) and not mentions_any(NOT_TRANSIT))
    bridge = int(mentions_any(BRIDGE))

    return pd.Series(
        [active_transp, transit, bridge], 
        index=['active_transp', 'transit', 'bridge']
    )

In [None]:
# Load the project data from a local parquet file.
# NOTE(review): the commented-out call below is presumably how the file is rebuilt — confirm.
#df = _clean_data.make_clean_data()
df= pd.read_parquet("dla_df.parquet")

# Keep only the rows whose `dist` equals the selected district (siuba pipe + filter).
df = df>>filter(_.dist==district)

Some of these additional data-wrangling steps could be moved into a script.

In [None]:
# One row of 0/1 category flags per project, derived from its description.
work_categories = df.apply(categorize_project_descriptions, axis=1)

# Attach the flags as new columns alongside the original data.
df = pd.concat([df, work_categories], axis=1)

df = df.assign(
    # Nullable integer dtype, so a missing year stays missing.
    prepared_y=lambda frame: frame.prepared_y.astype("Int64"),
    # Total processing time = district + HQ + FHWA processing days.
    processing_days=lambda frame: frame[
        ["dist_processing_days", "hq_processing_days", "fhwa_processing_days"]
    ].sum(axis=1),
)

In [None]:
# Summary tables built by project helpers in _dla_utils (their logic is not visible here).
# NOTE(review): df_top is used later with `variable`, `value` and `count` columns — confirm its schema.
df_years = _dla_utils.count_all_years(df)
df_top = _dla_utils.find_top(df)

In [None]:
# Add this chart function to take out saving it for now...display directly
def labeling(word):
    """
    Convert a column name into a human-readable label for chart axes and titles.

    Rules, applied in order:
      1. "mpo" / "rtpa" are acronyms and are upper-cased.
      2. Names listed in LABEL_DICT use their hand-written label.
      3. Otherwise a leading "n_" becomes "Number of " and a leading "unique_"
         becomes "Number of Unique ", underscores become spaces, and the
         result is title-cased.

    Parameters
    ----------
    word : str, the column name.

    Returns
    -------
    str, the display label.
    """
    # Add specific use cases where it's not just first letter capitalized
    LABEL_DICT = { "prepared_y": "Year",
              "dist": "District",
              "total_requested": "Total Requested",
              "fed_requested":"Fed Requested",
              "ac_requested": "Advance Construction Requested",
              "nunique":"Number of Unique",
              "project_no": "Project Number"}

    if word in ("mpo", "rtpa"):
        return word.upper()

    if word in LABEL_DICT:
        return LABEL_DICT[word]

    # Only expand the prefixes at the START of the name. A plain
    # str.replace would also hit "n_" inside names such as "mean_..." and
    # garble them, and title-casing between steps would stop "unique_" from
    # ever matching.
    if word.startswith("n_"):
        word = "Number of " + word[len("n_"):]
    elif word.startswith("unique_"):
        word = "Number of Unique " + word[len("unique_"):]

    return word.replace("_", " ").title()

def basic_bar_chart(df, x_col, y_col, title=""):
    """
    Display a bar chart of `y_col` for each value of `x_col`.

    NOTE: a later cell in this notebook defines `basic_bar_chart` again; this
    version is kept consistent with it (quantitative y-axis, optional title).

    Parameters
    ----------
    df : dataframe holding the data to plot.
    x_col : str, column shown on the x-axis as a nominal (categorical) field.
    y_col : str, column shown on the y-axis and used for the bar color.
    title : str, optional chart title. When left empty, a title of the form
        "Highest <x label> by <y label>" is generated.

    Returns
    -------
    None. The chart is rendered with `display`.
    """
    if title == "":
        title = f"Highest {labeling(x_col)} by {labeling(y_col)}"

    chart = (alt.Chart(df)
             .mark_bar()
             .encode(
                 # sort='-y' orders the bars by descending y value.
                 x=alt.X(f"{x_col}:N", title=labeling(x_col), sort=('-y')),
                 # The y value is a magnitude, so encode it as quantitative (:Q)
                 # rather than nominal; bar heights and the sort then follow
                 # the numeric value.
                 y=alt.Y(f"{y_col}:Q", title=labeling(y_col)),
                 color = alt.Color(y_col,
                                  scale=alt.Scale(
                                      range=cp.CALITP_SEQUENTIAL_COLORS),
                                      legend=alt.Legend(title=(labeling(y_col)))
                                  ),
                 tooltip=alt.Tooltip([x_col, y_col])
             )
             .properties(title=title)
    )

    chart=styleguide.preset_chart_config(chart)
    display(chart)

In [None]:
# Number of distinct primary agencies in the filtered data.
# NOTE(review): assumes calculate_data_all returns a frame whose first row holds the count — confirm.
unique_agencies = _dla_utils.calculate_data_all(
    df, 'primary_agency_name', aggfunc="nunique"
)["primary_agency_name"].iloc[0]

# Section heading followed by a one-sentence summary.
display(
    Markdown("## Obligations"),
    Markdown(f"There are {unique_agencies} unique agencies in District {district}."),
)

In [None]:
# Sub-section heading for the agency ranking.
display(Markdown("### Agencies with the Most Obligations"))

# Disabled for now: table of top agencies.
#filter_relabel(df_top, "primary_agency_name")

In [None]:
def make_funding_long(df, work_categories_list):
    """
    Build a long table of yearly funding aggregates, one block of rows per
    work category.

    Categories are not mutually exclusive, so a simple wide-to-long pivot
    would not work: each category is aggregated on its own subset
    (rows where the flag column equals 1) and the results are stacked.

    Parameters
    ----------
    df : dataframe with `prepared_y`, `project_no`, `processing_days`, the
        three `adjusted_*_requested` funding columns and one 0/1 flag column
        per entry of `work_categories_list`.
    work_categories_list : iterable of flag-column names to aggregate.

    Returns
    -------
    Dataframe with the aggregates returned by
    `geography_utils.aggregate_by_geography` per `prepared_y`, a
    `work_category` column naming the flag, and a `mean_<funding col>` column
    for each funding column (aggregated funding divided by the aggregated
    `project_no` value). Rows with a missing year are dropped and `project_no`
    is cast to int64.

    Raises
    ------
    ValueError if `work_categories_list` is empty.
    """
    funding_cols = [
        'adjusted_total_requested', 'adjusted_fed_requested', 'adjusted_ac_requested'
    ]

    # Collect the per-category frames and concatenate once at the end,
    # instead of growing a dataframe inside the loop.
    category_frames = []

    for category in work_categories_list:
        subset = df[df[category] == 1]
        agg_df = geography_utils.aggregate_by_geography(
            subset,
            group_cols = ['prepared_y'],
            sum_cols = funding_cols,
            count_cols = ['project_no'],
            mean_cols = ["processing_days"],
        ).assign(work_category = category)

        # Average funding per project for each funding column.
        for c in funding_cols:
            agg_df[f"mean_{c}"] = agg_df[c] / agg_df.project_no

        category_frames.append(agg_df)

    if not category_frames:
        raise ValueError(
            "work_categories_list is empty; at least one category column is required"
        )

    final = pd.concat(category_frames, axis=0, ignore_index=True)

    final = (final[final.prepared_y.notna()]
             .reset_index(drop=True)
             .astype({"project_no": "int64"}))

    return final

In [None]:
# Long-format funding aggregates for every category flag column.
by_work_categories = make_funding_long(df, work_categories.columns.tolist())

# Preview the first two rows.
by_work_categories.head(2)

In [None]:
category_cols = ["active_transp", "transit", "bridge"]

# Largest total requested for any (year, category-flag combination) group.
# The column is selected as a Series so `.max()` returns the scalar directly;
# indexing the result with `[0]` relied on deprecated positional lookup on a
# label-indexed Series.
MAX_Y = (df.groupby(["prepared_y"] + category_cols)
         ["total_requested"]
         .sum()
         .max()
        )
MAX_Y

In [None]:
from altair import datum

#https://stackoverflow.com/questions/61194028/adding-labels-at-end-of-line-chart-in-altair
# Might have to pull each additional line to visualize, then add label at end of line
# For legend to have encoding, dataset needs to be long
# Since each project can be tagged as several, making long dataset would require 
# additional step to create this new aggregated df, not a simple pivot from wide to long.
def line_chart(df, y_max=400_000_000):
    """
    Layered line chart of total requested funds per year, one line per
    work category.

    Each layer filters the same wide dataframe on one 0/1 flag column, so a
    project flagged for several categories contributes to several lines.

    Parameters
    ----------
    df : dataframe with `prepared_date`, `total_requested` and the flag
        columns `active_transp`, `transit` and `bridge`.
    y_max : upper bound of the y-axis domain. Defaults to the previously
        hard-coded 400,000,000.

    Returns
    -------
    The layered Altair chart (active transportation in blue, transit in
    green, bridge in orange).
    """
    # NOTE(review): the axis title says "(2021$)" but the plotted field is
    # `total_requested`, not an `adjusted_*` column — confirm which is intended.
    base = (alt.Chart(df)
        .mark_line()
        .encode(
            x=alt.X("year(prepared_date):O", title="Year"),
            y=alt.Y("sum(total_requested):Q", title="Total Requested (2021$)", 
                   scale=alt.Scale(domain=[0, y_max])), 
        )
       )

    def category_line(flag_col, color):
        # One line restricted to the rows where `flag_col` equals 1.
        return (base
                .encode(color=alt.value(color))
                .transform_filter(getattr(datum, flag_col) == 1)
               )

    chart = (category_line("active_transp", "blue")
             + category_line("transit", "green")
             + category_line("bridge", "orange"))
    return chart

In [None]:
#line_chart(df)

In [None]:
def line_chart2(df, y_col):
    """
    Line chart of `y_col` by year with one colored line per work category.

    Parameters
    ----------
    df : long-format dataframe with `prepared_y`, `work_category` and `y_col`.
    y_col : str, column plotted on the y-axis (summed per year and category).

    Returns
    -------
    The Altair chart after `styleguide.preset_chart_config` has been applied.
    """
    # Scale the y-axis from the dataframe being plotted, not from a notebook
    # global, so the function works for any dataframe passed in.
    y_max = df[y_col].max()

    chart = (alt.Chart(df)
             .mark_line()
             .encode(
                 x=alt.X("prepared_y:O", title="Year"),
                 y=alt.Y(f"sum({y_col}):Q", title=y_col, 
                        scale=alt.Scale(domain=[0, y_max])
                        ),
                 color=alt.Color("work_category:N")
             )
    )

    chart = styleguide.preset_chart_config(chart)
    return chart

In [None]:
# Plot the mean adjusted total requested per project, by year and work category.
line_chart2(by_work_categories, "mean_adjusted_total_requested")

In [None]:
def basic_bar_chart(df, x_col, y_col, title=""):
    """
    Display a bar chart of `y_col` (quantitative) for each value of `x_col`
    (nominal), with bars ordered by descending y and colored by `y_col`.

    Parameters
    ----------
    df : dataframe holding the data to plot.
    x_col : str, column for the x-axis categories.
    y_col : str, column for the bar heights and colors.
    title : str, chart title. An empty string (the default) produces
        "Highest <x label> by <y label>".

    Returns
    -------
    None. The chart is rendered with `display`.
    """
    x_label = labeling(x_col)
    y_label = labeling(y_col)

    if title == "":
        title = f"Highest {x_label} by {y_label}"

    # Build each encoding channel separately, then assemble the chart.
    x_axis = alt.X(f"{x_col}:N", title=x_label, sort=('-y'))
    y_axis = alt.Y(f"{y_col}:Q", title=y_label)
    bar_color = alt.Color(
        y_col,
        scale=alt.Scale(range=cp.CALITP_SEQUENTIAL_COLORS),
        legend=alt.Legend(title=y_label),
    )
    hover = alt.Tooltip([x_col, y_col])

    bars = (
        alt.Chart(df)
        .mark_bar()
        .encode(x=x_axis, y=y_axis, color=bar_color, tooltip=hover)
        .properties(title=title)
    )

    display(styleguide.preset_chart_config(bars))
    
# Subset df_top to one variable at a time and chart its values by count.
basic_bar_chart(
    df_top.loc[df_top["variable"] == "prefix"],
    "value",
    "count",
    title="Top 20 Prefixes",
)

basic_bar_chart(
    df_top.loc[df_top["variable"] == "primary_agency_name"],
    "value",
    "count",
    title="Top 20 Primary Agencies",
)