# Non-SHOPP Projects in Various Counties

In [19]:
# Turn off warnings
import warnings

warnings.filterwarnings("ignore")

# Normal packages
import geopandas as gpd
import numpy as np
import pandas as pd

# Format
from babel.numbers import format_currency

# Display
from IPython.display import HTML, Image, Markdown, display, display_html

# Settings: show up to 100 columns, every row, untruncated cell text,
# and floats with thousands separators and two decimals
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.float_format", "{:,.2f}".format)

# GCS location of the input workbook; delete later, since the data will presumably
# be read from a script that cleans it up
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/project_prioritization/"
FILE = "fake_data.xlsx"

# My utilities
import _utils

In [20]:
# Load the "fake" sheet of the workbook into the statewide dataframe
df_statewide = pd.read_excel(GCS_FILE_PATH + FILE, sheet_name="fake")

In [21]:
# Projects with no county name recorded are grouped under the label "Various"
df_statewide["full_county_name"] = df_statewide.full_county_name.fillna(
    value="Various"
)

In [22]:
# Notebook parameter: the full_county_name value this report is filtered to.
# "Various" is the label given above to projects with no county name.
county = "Various"

In [23]:
# Rows of the statewide data that belong to the parameter county, re-indexed from 0
in_parameter_county = df_statewide["full_county_name"] == county
df_parameter = df_statewide.loc[in_parameter_county].reset_index(drop=True)

In [24]:
# Statewide Objects/DF
# Number of distinct project names in the whole dataset
total_projects_statewide = df_statewide.project_name.nunique()

# Number of unique counties
unique_counties_statewide = df_statewide.full_county_name.nunique()

# Count of projects across ALL counties - for mapping
# Using county abbreviations.
counties_gdf_statewide = _utils.summarize_by_project_names(df_statewide, "county")

# Count of projects by county
counties_df_statewide = _utils.summarize_by_project_names(
    df_statewide, "full_county_name"
)

# Rank counties by number of total projects (1 = most projects).
# method="min" gives tied counties the same whole-number rank. The default
# "average" method yields fractional ranks (e.g. 1.5) that the int cast would
# silently truncate.
counties_df_statewide["Project Rank"] = (
    counties_df_statewide["Total Projects"]
    .rank(method="min", ascending=False)
    .astype("int64")
)

# Rank counties by total project costs (1 = highest cost).
# NOTE: despite its name, the "Project Cost" column holds a rank, not a dollar amount.
counties_df_statewide["Project Cost"] = (
    counties_df_statewide["Total Project Cost  $1,000"]
    .rank(method="min", ascending=False)
    .astype("int64")
)

# Median benefit score, truncated to a whole number by int()
statewide_benefit_score = int(df_statewide.fake_benefit_score.median())

In [88]:
# County Objects/Df
# Every table below summarizes the county subset, grouped by a different column.
_summarize_county = _utils.summarize_by_project_names

# By the phase each project is currently in
phases_df_county = _summarize_county(df_parameter, "current_phase")

# By rural versus urban
rural_urban_df_county = _summarize_county(df_parameter, "urban_rural")

# By project type
projects_df_county = _summarize_county(df_parameter, "primary_mode")

# By lead agency
agency_df_county = _summarize_county(df_parameter, "lead_agency")

# By district
district_df_county = _summarize_county(df_parameter, "district_full_name")

# By county
county_df_county = _summarize_county(df_parameter, "county")

In [61]:
# County Objects
# Number of projects in this county by project name
total_number_projects_county = df_parameter["project_name"].nunique()

# Median benefit score, truncated to a whole number by int()
median_benefit_score_county = int(df_parameter.fake_benefit_score.median())

# All currency strings pin locale="en_US", matching the project detail table,
# so the formatting does not depend on the machine's default locale.

# Total project cost
total_cost_county = format_currency(
    df_parameter["total_project_cost__$1,000_"].sum(),
    currency="USD",
    locale="en_US",
)

# Median project cost
median_cost_county = format_currency(
    df_parameter["total_project_cost__$1,000_"].median(),
    currency="USD",
    locale="en_US",
)

# Total Requested Funds
total_req_county = format_currency(
    df_parameter["current_fake_fund_requested"].sum(),
    currency="USD",
    locale="en_US",
)

# Median Requested Funds
median_req_county = format_currency(
    df_parameter["current_fake_fund_requested"].median(),
    currency="USD",
    locale="en_US",
)

# Project category with the most funding.
# nlargest skips NaN costs; an ascending sort followed by tail(1) would pick a
# NaN row, because NaN sorts last.
project_cat_most_money_county = projects_df_county.nlargest(
    1, "Total Project Cost  $1,000"
)["Primary Mode"].iloc[0]


# Get a line of where the county ranks.
# "Project Rank" and "Project Cost" are both rank columns of the statewide table.
county_rank_county = (
    counties_df_statewide[
        [
            "Full County Name",
            "Project Rank",
            "Project Cost",
        ]
    ]
    .loc[counties_df_statewide["Full County Name"] == county]
    .reset_index(drop=True)
)

# Find the agency that has the highest project cost among a county
agency_most_money = agency_df_county.nlargest(1, "Total Project Cost  $1,000")[
    "Lead Agency"
].iloc[0]

In [62]:
# Overview bullets for the parameter county.
# Each [0] lookup reads the first row of a summary table — presumably sorted with the
# largest project count first; confirm in _utils.summarize_by_project_names.
# The Primary Mode bullet is labelled "category" (it was mislabelled "phase"); the
# bullet after it reports the actual phase.
display(
    Markdown(
        f"""<h4>Overview for projects that fall in {county} Counties</h4>
         <li><b>NOTE</b>: the data below is partially composed of placeholder values.
        <li><b>{total_number_projects_county}</b> out of {total_projects_statewide} projects are in {county} Counties.
        <li>Agencies requested a total of <b>{total_req_county}</b> in funds.
        <li>The total cost of all the projects is <b>{total_cost_county}</b>.
        <li>The most common project category is <b>{projects_df_county['Primary Mode'][0]}</b>.
        <li>Most projects are in the <b>{phases_df_county['Current Phase'][0]}</b> phase.
        <li><b>{median_benefit_score_county}</b> is the median benefit score.
        """
    )
)

<h4>Overview for projects that fall in Various Counties</h4>
         <li><b>NOTE</b>: the data below is partially composed of placeholder values.
        <li><b>59</b> out of 716 projects are in Various Counties.
        <li>Agencies requested a total of <b>$16,432,545.79</b> in funds.
        <li>The total cost of all the projects is <b>$20,905,490.00</b>.
        <li>The most common project phase is <b>Rail (Passenger)</b>.
        <li>Most projects are in the <b>NONE</b> phase.
        <li><b>9</b> is the median benefit score.
        

In [63]:
# Compose the rural/urban headline from the first row of the summary, then render it
rural_urban_md = f"""<h4>Rural versus Urban</h4>
        Most projects in {county} Counties are in a(n) <b>{rural_urban_df_county['Urban Rural'][0]}</b> area, 
        totaling to <b>{rural_urban_df_county['Total Project ($1000) Formatted'][0]}</b> in project costs. 
        """
display(Markdown(rural_urban_md))

<h4>Rural versus Urban</h4>
        Most projects in Various Counties are in a(n) <b>Urban</b> area, 
        totaling to <b>$13,976,050.00</b> in project costs. 
        

In [64]:
# Bar chart built from the rural/urban summary by a project helper.
# NOTE(review): the helper's parameter names are not visible here. Judging by the values,
# the arguments look like (df, cost column, category column, tooltip column, color column,
# chart title) — confirm against _utils.basic_bar_chart_custom_tooltip.
total_urban_rural_bar = _utils.basic_bar_chart_custom_tooltip(
    rural_urban_df_county,
    "Total Project Cost  $1,000",
    "Urban Rural",
    "Total Project ($1000) Formatted",
    "Urban Rural",
    "Cost of Projects",
)

In [65]:
# Pie chart built from the same rural/urban summary.
# The ":Q" / ":N" suffixes look like Altair encoding shorthands (quantitative / nominal)
# — TODO confirm against _utils.basic_pie_chart.
total_urban_rural_pie = _utils.basic_pie_chart(
    rural_urban_df_county,
    "Total Projects:Q",
    "Urban Rural:N",
    "Total Projects",
    "Total Projects",
)

In [66]:
# Combine the pie and bar charts with `|` and display the result as the cell output.
# Assuming both helpers return Altair charts, `|` places them side by side — confirm.
total_urban_rural_pie | total_urban_rural_bar

In [81]:
# Compose the project category headline, then render it
project_categories_md = f"""<h4>Project Categories</h4>
        Most projects ({projects_df_county['Total Projects'][0]}) are in the <b>{projects_df_county['Primary Mode'][0]}</b> category.
        <b>{project_cat_most_money_county}</b> received the most money. 
        """
display(Markdown(project_categories_md))

<h4>Project Categories</h4>
        Most projects (29) are in the <b>Rail (Passenger)</b> category.
        <b>Rail (Passenger)</b> received the most money. 
        

In [68]:
# Two charts over the project category summary, produced by a project helper.
# The title says clicking the first graph highlights the second.
# NOTE(review): argument meanings are inferred from their values (selection field, then
# encodings and tooltips for each chart, then the title) — confirm in _utils.dual_bar_chart.
_utils.dual_bar_chart(
    projects_df_county,
    "Primary Mode",
    "Primary Mode:N",
    "Total Project Cost  $1,000:Q",
    "Primary Mode:N",
    "Total Projects:Q",
    ["Total Project ($1000) Formatted"],
    ["Total Projects"],
    "Categories by Cost and Total Projects - Click on the first graph to highlight the second",
)

In [93]:
# Lead agency headline. "Counties" (plural) matches the wording used in the other
# sections of this report.
display(
    Markdown(
        f"""<h4>Lead Agencies</h4>
        There are <b>{len(agency_df_county)}</b> unique agencies with projects located in {county} Counties. 
        <b>{agency_df_county['Lead Agency'][0]}</b> is the agency with the most projects and  
       <b>{agency_most_money}</b> is the agency with highest project costs. 
        """
    )
)
# Drop the raw numeric columns before styling the table; it is the last expression, so
# the notebook renders it.
_utils.styled_df(
    agency_df_county.drop(
        columns=["Total Project Cost  $1,000", "Current Fake Fund Requested"]
    )
)

<h4>Lead Agencies</h4>
        There are <b>18</b> unique agencies with projects located in Various County. 
        <b>Caltrans</b> is the agency with the most projects and  
       <b>Caltrans</b> is the agency with highest project costs. 
        

Lead Agency,Total Projects,Total Project ($1000) Formatted,Fake Fund Formatted
Caltrans,18,"$5,831,542.00","$5,071,067.54"
,9,"$3,619,452.00","$2,079,781.56"
California - Misc,6,"$18,019.00","$1,661,749.63"
Sjjpa/Sjrrc,6,"$375,408.00","$1,319,299.41"
Ccjpa,4,"$861,042.00","$1,590,839.15"
Metrolink,2,"$1,491,500.00","$734,833.80"
Mtc,2,"$552,335.00","$751,039.09"
Smart,2,"$2,388,888.00","$878,103.83"
"Bnsf, Octa, Rctc",1,"$888,888.00","$295,528.59"
Scvta,1,"$1,500,000.00","$129,086.59"


In [104]:
# Compose the district headline from the first row of the summary, then render it
districts_md = f"""<h4>Districts</h4>
        <b>{district_df_county['District Full Name'][0]}</b> is the district with the most projects that cross multiple counties.  
        """
display(Markdown(districts_md))

# Bar chart of the district summary; last expression, so the notebook renders it
_utils.basic_bar_chart_custom_tooltip(
    district_df_county,
    "District Full Name",
    "Total Projects",
    "Total Project ($1000) Formatted",
    "District Full Name",
    "Districts with the most Projects",
)

<h4>Districts</h4>
        <b>75 - HQ</b> is the district with the most projects that cross multiple counties.  
        

In [97]:
# Compose the headline from the first two rows of the county summary, then render it
counties_md = f"""<h4>Counties</h4>
        <b>{county_df_county['County'][0]}</b> counties have the most projects, followed by <b>{county_df_county['County'][1]}</b>.
        """
display(Markdown(counties_md))

# Table without the raw numeric columns; last expression, so the notebook renders it
county_table = county_df_county.drop(
    columns=["Total Project Cost  $1,000", "Current Fake Fund Requested"]
)
_utils.styled_df(county_table)

<h4>Counties</h4>
        <b>VAR</b> counties have the most projects, followed by <b>SAC, SJ, ALA, CC, STA, MER, MAD, FRE, KIN, TUL, KER</b>.
        

County,Total Projects,Total Project ($1000) Formatted,Fake Fund Formatted
VAR,9,"$1,491,740.00","$2,383,024.49"
"SAC, SJ, ALA, CC, STA, MER, MAD, FRE, KIN, TUL, KER",9,"$231,239.00","$2,019,179.82"
MULTI,5,"$1,831,526.00","$1,427,021.09"
"SCL, ALA, CC, SOL, YOL, SAC",3,"$889,914.00","$949,240.47"
"SAC, SJ",2,"$153,188.00","$716,928.91"
NONE,2,"$1,777,776.00","$499,969.36"
MUL,2,"$390,000.00","$297,570.61"
"SAC, PLA",2,"$481,600.00","$757,697.94"
LA VEN,2,"$7,000.00","$457,509.98"
"SD, RIV",1,"$33,400.00","$236,667.28"


In [69]:
# Compose the intro for the project table (county median vs statewide median), then render it
project_details_md = f"""<h4>Project Details</h4>
       All the projects in {county} are listed below, ranked by benefit score. 
       The median benefit score is <b>{median_benefit_score_county}</b>, 
       compared with {statewide_benefit_score} for projects across California.
        """
display(Markdown(project_details_md))

<h4>Project Details</h4>
       All the projects in Various are listed below, ranked by benefit score. 
       The median benefit score is <b>9</b>, 
       compared with 8 for projects across California.
        

In [70]:
# Subset: columns shown in the project detail table.
# .copy() makes df_subset an independent frame, so the column assignments below do not
# write into a slice of df_parameter (the cause of SettingWithCopyWarning).
df_subset = df_parameter[
    [
        "fake_benefit_score",
        "lead_agency",
        "primary_mode",
        "county",
        "district_full_name",
        "project_name",
        "current_fake_fund_requested",
        "total_project_cost__$1,000_",
    ]
].copy()

# Format the money columns as USD strings
for money_col in ["total_project_cost__$1,000_", "current_fake_fund_requested"]:
    df_subset[money_col] = df_subset[money_col].apply(
        lambda x: format_currency(x, currency="USD", locale="en_US")
    )

# Round the score on the table being displayed. Rounding df_parameter instead would leave
# df_subset unrounded and silently change the frame that other cells read.
df_subset["fake_benefit_score"] = df_subset["fake_benefit_score"].apply(
    lambda x: round(x, 2)
)

# Clean up Col Names
df_subset = _utils.clean_up_columns(df_subset)

# Sort by score, highest first
df_subset = df_subset.sort_values("Fake Benefit Score", ascending=False)

In [71]:
# Render the project table (already sorted by benefit score, highest first) as the cell output
_utils.styled_df(df_subset)

Fake Benefit Score,Lead Agency,Primary Mode,County,District Full Name,Project Name,Current Fake Fund Requested,"Total Project Cost $1,000"
32.02,"Uprr/Bnsf, Actc/Mtc",Grade Crossing,MUL,04 - Oakland,Railroad Grade Crossing Improvements And Grade Separations,"$110,526.63","$150,000.00"
25.43,Caltrans,Highway,KIN FRE,06 - Fresno,Excelsior Expressway Ii,"$139,630.36","$75,000.00"
20.98,California - Misc,Rail (Passenger),"SAC, SJ, ALA, CC, STA, MER, MAD, FRE, KIN, TUL, KER",75 - HQ,Waste System Overhaul On Surfliner Cars,"$151,430.98","$2,760.00"
19.54,Caltrans,Highway,PLA/SAC,03 - Marysville,Pla/Sac-80 Managed Lanes (Priority Managed Lane Project),"$125,223.80","$250,000.00"
19.32,Caltrans,Rail (Passenger),VAR,75 - HQ,Ze Rail Program - Hydrail (Pilot Project),"$120,156.44","$32,450.00"
17.58,California - Misc,Rail (Passenger),"SAC, SJ, ALA, CC, STA, MER, MAD, FRE, KIN, TUL, KER",75 - HQ,End Of Car Vestibule Diaphragm Repair,"$154,291.85",$801.00
16.87,Caltrans,Rail (Passenger),PLA SAC,03 - Marysville,Capital Corridor Regional Transit Improvements,"$182,485.20","$692,468.00"
16.81,"Uprr, Actc/Mtc",Rail (Freight),MUL,04 - Oakland,Rail Connectivity Improvements,"$187,043.98","$240,000.00"
16.41,,Bike/Pedestrian,RIV SBD,08 - San Bernardino,Reconnecting District 8 Tribal Communities,"$163,616.16","$888,888.00"
16.08,,Highway,MON SBT SCR,05 - San Luis Obispo,Sr 129 Watsonville-Pajaro Reconnecting Communities Plan,"$144,017.17","$888,888.00"


In [72]:
# Build the score card from the county subset, then tidy its column names
score_card = _utils.clean_up_columns(_utils.create_fake_score_card(df_parameter))

In [73]:
# Give the two generic value columns descriptive names
score_card_labels = {"Value X": "Measure", "Value Y": "Monetary Values"}
score_card = score_card.rename(columns=score_card_labels)

In [74]:
# Distinct project names, in order of first appearance, for the dropdown menu
unique_project_names = score_card["Project Name"].unique()
project_dropdown = unique_project_names.tolist()

In [75]:
# Tooltip fields passed to _utils.dual_chart_with_dropdown further down.
# Presumably each entry is a score_card column name — confirm against the helper.
score_card_tooltip = [
    "Project Name",
    "Total Category Score",
    "Factor Weight",
    "Weighted Factor Value",
    "Category Description",
]

In [76]:
# Second tooltip field list passed to _utils.dual_chart_with_dropdown further down.
# "Monetary Values" is the name given to "Value Y" by the rename above; the other entries
# are presumably score_card columns too — confirm.
project_desc_tooltip = [
    "Project Name",
    "Project Description",
    "Monetary",
    "Monetary Values",
]

In [77]:
# Usage hint shown above the dropdown chart.
# Plain string: the text has no placeholders, so no f-prefix is needed.
display(
    Markdown(
        """Use the dropdown menu below to retrieve information for a specific project. Hover over the bars for more detail.
        """
    )
)

Use the dropdown menu below to retrive information for a specific project. Hover over the bars for more detail.
        

In [78]:
# Two charts driven by a project dropdown, produced by a project helper; last expression,
# so the notebook renders the result.
# NOTE(review): argument meanings are inferred from their values (data, dropdown options,
# dropdown field, encodings and tooltips for each chart, then the title) — confirm in
# _utils.dual_chart_with_dropdown.
_utils.dual_chart_with_dropdown(
    score_card,
    project_dropdown,
    "Project Name",
    "Monetary:N",
    "Monetary Values:Q",
    "Monetary:N",
    project_desc_tooltip,
    "Total Category Score:Q",
    "Category:N",
    "Category:N",
    score_card_tooltip,
    f"View Individual Projects in {county}",
)