# District {district} ({district_name})

1. Unique obligations at district-level, summary sentence
1. Top agencies / core customers
1. Top prefixes / core customers
1. Type of work, ~5 main categories, show breakdown of spending and number of projects (aggregate values, inflation-adjusted). Need to consider whether normalizing to % is worthwhile.

In [None]:
%%capture
import warnings

warnings.filterwarnings("ignore")

import altair as alt
import pandas as pd
from IPython.display import Markdown
from shared_utils import calitp_color_palette as cp
from shared_utils import geography_utils, styleguide
from siuba import *

import _dla_utils

# Register the theme defined in `styleguide.calitp_theme` under the name "calitp_theme".
alt.themes.register("calitp_theme", styleguide.calitp_theme)
# Make the registered theme the active altair theme for this session.
alt.themes.enable("calitp_theme")

In [None]:
# TODO: turn this into a proper parameters cell so the district can be supplied per run.
# `district` is compared against the `dist` column when the data is filtered.
district = 7

In [None]:
# Read the dataset from a parquet file in the working directory.
df = pd.read_parquet("dla_df.parquet")

# Keep only the rows whose `dist` value equals the selected district (siuba filter).
df = df >> filter(_.dist == district)

## Data Wrangling

Some of these additional data wrangling steps might be able to be moved into a script.

The wrangling specific to making a chart in `altair` can stay in the notebook, but any data processing that is done on the entire dataset should be moved out into a script.

In [None]:
df = df.assign(
    # Nullable integer dtype: the year can be whole-numbered while still allowing missing values.
    prepared_y=df.prepared_y.astype("Int64"),
    # Total processing time is the sum of the three component columns.
    # min_count=1 keeps the total missing when all three components are missing;
    # without it such rows would be reported as 0 days and pull averages down.
    processing_days=df[
        ["dist_processing_days", "hq_processing_days", "fhwa_processing_days"]
    ].sum(axis=1, min_count=1),
)
# Rows without a prepared year cannot be placed on any by-year summary, so drop them.
df = df[df["prepared_y"].notna()]

In [None]:
# Helper tables built by the project-local `_dla_utils` module.
# `df_years` is not referenced again in the visible part of this notebook.
df_years = _dla_utils.count_all_years(df)
# `df_top` is read further down through its `variable`, `value` and `count` columns.
df_top = _dla_utils.find_top(df)

## Obligations

In [None]:
# Ask the shared helper for the "nunique" aggregate of the agency column,
# then read the figure from the first row of the returned column.
agency_counts = _dla_utils.calculate_data_all(
    df, "primary_agency_name", aggfunc="nunique"
)
unique_agencies = agency_counts["primary_agency_name"].iloc[0]

# Render the headline sentence as Markdown in the cell output.
agency_sentence = f"There are {unique_agencies} unique agencies in District {district}."
display(Markdown(agency_sentence))

### Core Customers

Of the top 20 primary agencies, what % of funds and projects are they responsible for?

Of the top 20 prefixes, what % of funds and projects are they responsible for?

In [None]:
def core_customer_stat(df, df_top, variable):
    """
    Share of projects and requested funds that comes from the top values of `variable`.

    Parameters
    ----------
    df : pandas.DataFrame
        Project-level data; must contain `variable`, `adjusted_total_requested`
        and `project_no`.
    df_top : pandas.DataFrame
        Long table with `variable` and `value` columns. The rows whose
        `variable` equals the `variable` argument supply the list of top values.
    variable : str
        Column of `df` to evaluate, e.g. "prefix" or "primary_agency_name".

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) with two columns, both percentages rounded to
        one decimal: `pct_top20_project` and `pct_top20_total_requested`.
        A value is NaN when the corresponding total over `df` is zero.
    """
    top_values = list(df_top[df_top.variable == variable].value)

    # One row per top value, with summed funds and counted projects.
    by_customer = geography_utils.aggregate_by_geography(
        df[df[variable].isin(top_values)],
        group_cols=[variable],
        sum_cols=["adjusted_total_requested"],
        count_cols=["project_no"],
    )

    def _pct(part, whole):
        # Percentage of `whole` represented by `part`; NaN when there is nothing to divide by.
        if whole == 0:
            return float("nan")
        return round(float(part) / float(whole) * 100, 1)

    # Work from the raw sums and round once at the end. Rounding each customer's
    # percentage first and then adding them up accumulates rounding error.
    return pd.DataFrame(
        {
            "pct_top20_project": [
                _pct(by_customer.project_no.sum(), df.project_no.count())
            ],
            "pct_top20_total_requested": [
                _pct(
                    by_customer.adjusted_total_requested.sum(),
                    df.adjusted_total_requested.sum(),
                )
            ],
        }
    )

In [None]:
# One summary per grouping: share of projects/funds attributable to the top prefixes
# and to the top primary agencies listed in `df_top`.
core_prefix = core_customer_stat(df, df_top, "prefix")
core_agency = core_customer_stat(df, df_top, "primary_agency_name")

In [None]:
# Add this chart function to take out saving it for now...display directly
def labeling(word):
    """
    Turn a column name into a human-readable chart label.

    Parameters
    ----------
    word : str
        Column name, typically snake_case.

    Returns
    -------
    str
        "mpo"/"rtpa" are upper-cased, names listed in LABEL_DICT use their
        fixed label, a leading "unique_" becomes "Number of Unique ", a leading
        "n_" becomes "Number of ", and everything else is title-cased with
        underscores replaced by spaces.
    """
    # Specific cases where the label is not just the title-cased column name
    LABEL_DICT = {
        "prepared_y": "Year",
        "dist": "District",
        "total_requested": "Total Requested",
        "fed_requested": "Fed Requested",
        "ac_requested": "Advance Construction Requested",
        "nunique": "Number of Unique",
        "project_no": "Project Number",
    }

    if word in ("mpo", "rtpa"):
        return word.upper()
    if word in LABEL_DICT:
        return LABEL_DICT[word]

    # Match the markers only at the start of the name: a plain replace() also
    # hits names that merely contain "n_" (e.g. "congestion_relief").
    if word.startswith("unique_"):
        remainder = word[len("unique_"):]
        return "Number of Unique " + remainder.replace("_", " ").title()
    if word.startswith("n_"):
        remainder = word[len("n_"):]
        return "Number of " + remainder.replace("_", " ").title()

    return word.replace("_", " ").title()


def basic_bar_chart(df, x_col, y_col, title=""):
    """
    Draw a bar chart of `y_col` for each `x_col` value and display it in the cell output.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding both columns.
    x_col : str
        Nominal column for the x axis; bars are sorted by descending y value.
    y_col : str
        Quantitative column for the y axis; also drives the bar color.
    title : str, optional
        Chart title. When empty, a title is built from the two column labels.

    Returns
    -------
    None
        The chart is shown with `display`, not returned.
    """
    x_label = labeling(x_col)
    y_label = labeling(y_col)

    if title == "":
        title = f"Highest {x_label} by {y_label}"

    encodings = dict(
        x=alt.X(f"{x_col}:N", title=x_label, sort=("-y")),
        y=alt.Y(f"{y_col}:Q", title=y_label),
        color=alt.Color(
            y_col,
            scale=alt.Scale(range=cp.CALITP_SEQUENTIAL_COLORS),
            legend=alt.Legend(title=(y_label)),
        ),
        tooltip=alt.Tooltip([x_col, y_col]),
    )

    bars = alt.Chart(df).mark_bar().encode(**encodings).properties(title=title)

    # Apply the shared styleguide configuration before rendering
    display(styleguide.preset_chart_config(bars))

In [None]:
# Headline sentence for the top prefixes, followed by the matching bar chart.
display(
    Markdown(
        f"{df.prepared_y.min()}-{df.prepared_y.max()}: "
        f"<br>The **top 20 prefixes** were responsible for "
        f"**{core_prefix.pct_top20_project[0]}% of all projects** and "
        f"**{core_prefix.pct_top20_total_requested[0]}%** "
        # "\\$" produces the same backslash-dollar text as before, without
        # relying on "\$", which is not a valid Python escape sequence.
        "**of total requested funds** (2021\\$)."
    )
)

basic_bar_chart(
    df_top[df_top.variable == "prefix"], "value", "count", title="Top 20 Prefixes"
)

In [None]:
# Headline sentence for the top primary agencies, followed by the matching bar chart.
top_agency_summary = (
    f"{df.prepared_y.min()}-{df.prepared_y.max()}: "
    f"<br>The **top 20 agencies** were responsible for "
    f"**{core_agency.pct_top20_project[0]}% of all projects** and "
    f"**{core_agency.pct_top20_total_requested[0]}%** "
    "**of total requested funds** (2021$)."
)
display(Markdown(top_agency_summary))

top_agencies = df_top[df_top.variable == "primary_agency_name"]
basic_bar_chart(top_agencies, "value", "count", title="Top 20 Primary Agencies")

In [None]:
def make_funding_long(df, work_categories_list):
    """
    Build a long table of yearly funding aggregates with one block of rows per work category.

    Categories are not mutually exclusive, so a project flagged in several
    categories contributes to each of them; a plain wide-to-long pivot would
    not give this result.

    Parameters
    ----------
    df : pandas.DataFrame
        Project-level data with `prepared_y`, `project_no`, `processing_days`,
        the three `adjusted_*_requested` columns and one flag column per
        category (rows with flag == 1 belong to the category).
    work_categories_list : iterable of str
        Names of the flag columns to include.

    Returns
    -------
    pandas.DataFrame
        The aggregates returned by `geography_utils.aggregate_by_geography`
        for each category, stacked, with a `category` column and one
        `mean_<funding column>` column per funding column.

    Raises
    ------
    ValueError
        If `work_categories_list` is empty.
    """
    funding_cols = [
        "adjusted_total_requested",
        "adjusted_fed_requested",
        "adjusted_ac_requested",
    ]

    # Collect the per-category frames and concatenate once at the end,
    # instead of re-copying a growing DataFrame on every iteration.
    per_category = []

    for category in work_categories_list:
        subset = df[df[category] == 1]
        agg_df = geography_utils.aggregate_by_geography(
            subset,
            group_cols=["prepared_y"],
            sum_cols=funding_cols,
            count_cols=["project_no"],
            mean_cols=["processing_days"],
        ).assign(category=category)

        # Average funding per project for each funding column
        for c in funding_cols:
            agg_df[f"mean_{c}"] = agg_df[c] / agg_df.project_no

        per_category.append(agg_df)

    if not per_category:
        raise ValueError("work_categories_list must contain at least one category")

    final = pd.concat(per_category, axis=0, ignore_index=True)

    final = (
        final[final.prepared_y.notna()]
        .reset_index(drop=True)
        .astype({"project_no": "int64"})
    )

    return final

In [None]:
# Flag columns in `df` that mark the type of work; passed to make_funding_long,
# which selects the rows where a flag equals 1.
WORK_CATEGORIES = [
    "active_transp",
    "transit",
    "bridge",
    "street",
    "freeway",
    "infra_resiliency_er",
    "congestion_relief",
]

# Long table: one row per (year, category) with funding sums, project counts and means.
by_work_categories = make_funding_long(df, WORK_CATEGORIES)

# Quick look at the first rows
by_work_categories.head(2)

There's a possibility of needing to set the max y-axis value, especially if charts are being displayed side-by-side. Don't want one to be between 1-1M and another to be 1-10_000. 

May need to use some kind of calculation within the chart function to set `MAX_Y`.

In [None]:
# Largest summed `adjusted_total_requested` over the groups formed by year and
# every combination of the work-category flags.
# The column is selected by name and reduced straight to a scalar, which avoids
# indexing the aggregated result by integer position.
MAX_Y = (
    df.groupby(["prepared_y"] + WORK_CATEGORIES)["adjusted_total_requested"]
    .sum()
    .max()
)
MAX_Y

### Multiple Lines on Line Chart

Two options for having multiple lines on `altair` chart

1. Use `transform_filter`, then label at the end of the chart what that line represents
* Labeling at the end of the line: https://stackoverflow.com/questions/61194028/adding-labels-at-end-of-line-chart-in-altair

2. To have legend appear, the dataset must be long. It's the `alt.Color` encoding that is passed into the legend.

Since each project can be tagged with several categories, making a long dataset requires an additional step to create a new aggregated df; it is not a simple pivot from wide to long.

In [None]:
from altair import datum


def line_chart(df, y_max=400_000_000):
    """
    Layered line chart of yearly total requested funds for three work categories.

    Each category is its own layer: the same base chart, filtered to the rows
    where that category's flag column equals 1 and drawn in a fixed color.
    Because the color is a constant value rather than a data field, no legend
    is generated.

    Parameters
    ----------
    df : pandas.DataFrame
        Project-level data with `prepared_date`, `total_requested` and the flag
        columns `active_transp`, `transit` and `bridge`.
    y_max : number, optional
        Upper bound of the y axis. Defaults to the value that was previously
        hard-coded, so existing calls render the same chart.

    Returns
    -------
    altair.LayerChart
        The three layers combined with `+`.
    """
    base = (
        alt.Chart(df)
        .mark_line()
        .encode(
            x=alt.X("year(prepared_date):O", title="Year"),
            y=alt.Y(
                "sum(total_requested):Q",
                title="Total Requested (2021$)",
                scale=alt.Scale(domain=[0, y_max]),
            ),
        )
    )

    # Flag column -> line color, in drawing order
    line_colors = {
        "active_transp": "blue",
        "transit": "green",
        "bridge": "orange",
    }

    chart = None
    for flag_col, color in line_colors.items():
        layer = base.encode(
            color=alt.value(color),
        ).transform_filter(getattr(alt.datum, flag_col) == 1)
        chart = layer if chart is None else chart + layer

    return chart

In [None]:
# line_chart(df)

Do some work to get the highlight/tooltip with selection/legend changes to work.


https://github.com/cal-itp/data-analyses/blob/main/msd_dashboard_metric/04_validation_errors.ipynb

https://stackoverflow.com/questions/66108224/combine-hover-and-click-selections-in-altair

https://altair-viz.github.io/gallery/interactive_legend.html

In [None]:
def line_chart2(df, y_col, chart_title):
    """
    Multi-line chart of `y_col` by year with one line per work category.

    Hovering thickens the nearest line, and clicking entries in the legend
    dims the categories that are not selected.

    Parameters
    ----------
    df : pandas.DataFrame
        Long table with `prepared_y`, `category` and `y_col` columns.
    y_col : str
        Column plotted on the y axis. Its label is derived from the name by
        dropping "adjusted_", replacing underscores and title-casing.
    chart_title : str
        First part of the chart title ("<chart_title> Funds"). When empty,
        the derived y label is used instead.

    Returns
    -------
    altair.Chart
        The chart with the styleguide configuration applied.
    """
    # e.g. "adjusted_total_requested" -> "Total Requested"
    y_title = y_col.replace("adjusted_", "").replace("_", " ").title()

    if chart_title == "":
        chart_title = y_title

    # https://altair-viz.github.io/gallery/multiline_highlight.html
    # https://altair-viz.github.io/gallery/interactive_legend.html
    highlight = alt.selection(
        type="single", on="mouseover", fields=["category", "prepared_y"], nearest=True
    )
    selection = alt.selection_multi(fields=["category"], bind="legend")

    chart = (
        alt.Chart(df)
        .mark_line()
        .encode(
            x=alt.X("prepared_y:O", title="Year"),
            y=alt.Y(
                f"{y_col}:Q",
                title=f"{y_title} (2021$)",
                # Scale to the data that was passed in rather than to a notebook global
                scale=alt.Scale(domain=[0, df[y_col].max()]),
            ),
            color=alt.Color("category:N", title="Work Category"),
            tooltip=alt.Tooltip(["prepared_y", y_col, "category"]),
            # Categories not selected in the legend are faded
            opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
            # The hovered line is drawn thicker
            size=alt.condition(~highlight, alt.value(2), alt.value(5)),
        )
        .properties(
            title={
                "text": [f"{chart_title} Funds", "by Work Categories"],
                "subtitle": "2021$",
            },
            width=500,
            height=200,
        )
        .add_selection(selection, highlight)
    )

    chart = styleguide.preset_chart_config(chart)
    return chart

In [None]:
# Yearly adjusted total requested funds, one line per work category.
line_chart2(by_work_categories, "adjusted_total_requested", "Total Requested")

### Side-by-side Charts

In [None]:
# Figure out which part of this function needs to be removed
# The hconcat needs to be done before, then chart title can be added on combined chart
# But sizing only be done on individual chart, not on combined chart

from shared_utils.styleguide import *


def preset_chart_config(chart):
    """
    Apply background, font, axis, title, header and legend configuration to a chart.

    This notebook-level version is the one called on the horizontally
    concatenated chart; the single charts above call
    `styleguide.preset_chart_config` instead. It sets no width or height.

    Parameters
    ----------
    chart : altair chart
        Chart (single or combined) to configure.

    Returns
    -------
    altair chart
        The configured chart.
    """
    axis_style = dict(
        domainColor=axisColor,
        grid=True,
        gridColor=axisColor,
        gridWidth=1,
        labelColor=guideLabelColor,
        labelFont=labelFont,
        labelFontSize=10,
        titleColor=guideTitleColor,
        titleFont=font,
        tickColor=axisColor,
        tickSize=10,
        titleFontSize=12,
        titlePadding=10,
        labelPadding=4,
    )

    title_style = dict(
        font=font,
        fontSize=font_size,
        anchor="middle",
        fontWeight=300,
        offset=20,
    )

    legend_style = dict(
        labelColor=blackTitle,
        labelFont=labelFont,
        labelFontSize=11,
        padding=1,
        symbolSize=30,
        symbolType="square",
        titleColor=blackTitle,
        titleFont=font,
        titleFontSize=14,
        titlePadding=10,
        labelLimit=0,
    )

    # Same configure_* calls in the same order, applied one step at a time
    configured = chart.configure(background=backgroundColor, font=font)
    configured = configured.configure_axis(**axis_style)
    configured = configured.configure_axisBand(grid=False)
    configured = configured.configure_title(**title_style)
    configured = configured.configure_header(labelFont=labelFont, titleFont=font)
    configured = configured.configure_legend(**legend_style)
    return configured

In [None]:
def setup_bar_chart(df, category_list):
    """
    Bar chart of average processing days for a chosen set of work categories.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with `category` and `processing_days` columns.
    category_list : list of str
        Category names (as stored in `df.category`) to include.

    Returns
    -------
    altair.Chart
        Unconfigured 300x200 bar chart, so it can be concatenated with other
        charts before the styleguide configuration is applied.

    Notes
    -----
    The bar height is `mean(processing_days)` over the rows of `df` for each
    category, so every input row carries equal weight.
    """
    subset = df[df.category.isin(category_list)]

    # Do the relabeling here in function
    # Easier to set up lists using column names that contain underscores
    CATEGORY_DICT = {
        "active_transp": "Active Transportation",
        "infra_resiliency_er": "Infrastructure Resiliency",
    }

    def category_labels(word):
        # Fixed label when one is defined, otherwise title-case the column name
        if word in CATEGORY_DICT:
            return CATEGORY_DICT[word]
        return word.replace("_", " ").title()

    # Map the single column directly; a row-wise DataFrame.apply is slower and
    # does not yield a Series when `subset` has no rows.
    subset = subset.assign(category=subset["category"].map(category_labels))

    chart = (
        alt.Chart(subset)
        .mark_bar()
        .encode(
            x=alt.X("category:N", title=""),
            y=alt.Y("mean(processing_days):Q", title="Average Processing Days"),
            color=alt.Color(
                "category:N", scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS)
            ),
            # https://stackoverflow.com/questions/68584375/how-do-i-limit-altair-tooltip-to-only-two-numbers-after-the-decimal-point
            tooltip=alt.Tooltip(
                ["category", "mean(processing_days)"], format={"number": ".2f"}
            ),
        )
        .properties(width=300, height=200)
    )

    return chart

In [None]:
# Two groups of work categories, one chart per group.
# Note: "infra_resiliency_er" from WORK_CATEGORIES is in neither group.
ACTIVE = ["active_transp", "transit", "congestion_relief"]
ROADS = ["bridge", "street", "freeway"]

active_bar = setup_bar_chart(by_work_categories, ACTIVE)
roads_bar = setup_bar_chart(by_work_categories, ROADS)

# Concatenate first, then configure and title the combined chart;
# width/height were already set on the individual charts.
combined_chart = alt.hconcat(active_bar, roads_bar)
(
    preset_chart_config(combined_chart).properties(
        title="Average Processing Time by Work Category"
    )
)

In [None]:
from pivottablejs import pivot_ui
from IPython.display import HTML

#pivot_ui(df[["agency", "transit", "bridge"]], outfile_path='pivottablejs.html')
#HTML('pivottablejs.html')

In [None]:
from urllib.request import urlopen
import json
# Scratch example that does not use the district data: it downloads a county GeoJSON
# file and a county-level unemployment CSV from the plotly datasets repository and
# draws a choropleth map. Requires network access.
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)

import pandas as pd
# Read `fips` as a string so the codes are not converted to numbers.
df2 = pd.read_csv("https://raw.githubusercontent.com/plotly/datasets/master/fips-unemp-16.csv",
                   dtype={"fips": str})

import plotly.express as px

# Color each county (matched on `fips`) by its `unemp` value.
fig = px.choropleth_mapbox(df2, geojson=counties, locations='fips', color='unemp',
                           color_continuous_scale="Viridis",
                           range_color=(0, 12),
                           mapbox_style="carto-positron",
                           zoom=3, center = {"lat": 37.0902, "lon": -95.7129},
                           opacity=0.5,
                           labels={'unemp':'unemployment rate'}
                          )
# Remove the margins around the figure.
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

In [None]:
from bokeh.plotting import figure, show, output_notebook
# Scratch example: render bokeh output inline and scatter-plot the iris sample data
# that ships with plotly express.
output_notebook()

import plotly.express as px
data = px.data.iris()
p = figure()
# NOTE(review): `fill_color` is given the `species` column, which presumably holds
# species names rather than color values - confirm the markers render as intended.
p.circle(data["sepal_width"], data["sepal_length"], fill_color=data["species"], size=data["sepal_length"])
show(p)

In [None]:
import ipywidgets as widgets

# Scratch example: a text box and a slider whose values are linked to each other.
a = widgets.FloatText()
b = widgets.FloatSlider()
display(a, b)

# jslink ties the two `value` traits together.
mylink = widgets.jslink((a, "value"), (b, "value"))

In [None]:
# Scratch example: a Tab widget holding five text boxes, titled tab_0 .. tab_4.
tab_contents = [f"P{n}" for n in range(5)]
children = [widgets.Text(description=name) for name in tab_contents]

tab = widgets.Tab()
tab.children = children

# Title each tab by its position
for ii, _child in enumerate(children):
    tab.set_title(ii, f"tab_{ii}")

tab