In [None]:
# Turn off warnings
import warnings

warnings.filterwarnings("ignore")

# Normal packages
import geopandas as gpd
import numpy as np
import pandas as pd

# Format
from babel.numbers import format_currency

# Display
from IPython.display import HTML, Image, Markdown, display, display_html

# Pandas display settings: show up to 100 columns, never truncate rows or
# cell contents, and print floats with thousands separators and two decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.float_format", "{:,.2f}".format)

# GCS location of the input workbook.
# TODO: delete later, since this will presumably be read from a script that
# cleans up the data.
FILE = "fake_data.xlsx"
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/project_prioritization/"

# My utilities
import _utils

In [None]:
# Load the "fake" sheet of the project workbook straight from GCS.
df = pd.read_excel(io=GCS_FILE_PATH + FILE, sheet_name="fake")

In [None]:
# Projects without a county name are grouped under the label "Various".
df = df.assign(full_county_name=df["full_county_name"].fillna("Various"))

In [None]:
# Statewide objects. These are computed here, before `df` is narrowed to a
# single county further down, so they describe every project in the workbook.

# Number of distinct projects.
total_projects = df.project_name.nunique()

# Number of distinct values in the `county` column.
unique_counties = df.county.nunique()

# Project summary keyed on `county`; this is the frame merged with the county
# geometry for the map.
counties_gdf = _utils.summarize_by_project_names(df, "county")

# Project summary keyed on `full_county_name`; this is the frame the ranking
# text is read from.
counties_df = _utils.summarize_by_project_names(df, "full_county_name")

# Rank counties, 1 = highest. method="min" gives tied counties the same whole
# number (competition ranking). The default method="average" yields fractional
# ranks such as 1.5, which the int64 cast would silently truncate.
counties_df["Project Rank"] = (
    counties_df["Total Projects"].rank(method="min", ascending=False).astype("int64")
)
# NOTE: despite its name, "Project Cost" holds the *rank* of the county's total
# project cost. The name is kept because later cells read this column.
counties_df["Project Cost"] = (
    counties_df["Total Project Cost  $1,000"]
    .rank(method="min", ascending=False)
    .astype("int64")
)

# Median benefit score across all projects, truncated to an integer.
county_median_benefit_score = int(df.fake_benefit_score.median())

In [None]:
# df_full["full_county_name"].unique()

In [None]:
# Parameter cell: the county this report is generated for.
parameter_county = "Riverside"

In [None]:
# Narrow df to the projects of the county set in the parameter cell.
county_mask = df["full_county_name"] == parameter_county

# Fail fast with a clear message. Without this check an unknown or misspelled
# county yields an empty frame and the notebook only breaks further down with
# unrelated-looking errors. The check runs before `df` is overwritten, so the
# cell can be re-run after correcting the parameter.
if not county_mask.any():
    raise ValueError(
        f"No projects found for county {parameter_county!r}; "
        "check the value set in the parameter cell."
    )

df = df.loc[county_mask].reset_index(drop=True)

In [None]:
# Summary tables for the selected county, one per grouping column:
# current phase, urban/rural, primary mode (project type) and lead agency.
phases_df, rural_urban_df, projects_df, agency_df = (
    _utils.summarize_by_project_names(df, group_col)
    for group_col in ("current_phase", "urban_rural", "primary_mode", "lead_agency")
)

# Single-row frame holding the selected county's two rank columns.
county_rank = counties_df.loc[
    counties_df["Full County Name"] == parameter_county,
    ["Full County Name", "Project Rank", "Project Cost"],
].reset_index(drop=True)

In [None]:
# County-level objects (df holds only the selected county's projects here).
# The locale is pinned to en_US, as in the project table formatting further
# down, so the rendered amounts do not depend on the machine's locale settings.
# NOTE(review): the cost column name suggests values are in units of $1,000;
# they are formatted as plain dollars here — confirm this is intended.

# Median benefit score, truncated to an integer.
median_benefit_score = int(df.fake_benefit_score.median())

# Total project cost
total_cost = format_currency(
    df["total_project_cost__$1,000_"].sum(),
    currency="USD",
    locale="en_US",
)

# Median project cost
median_cost = format_currency(
    df["total_project_cost__$1,000_"].median(),
    currency="USD",
    locale="en_US",
)

# Total requested funds
total_req = format_currency(
    df["current_fake_fund_requested"].sum(),
    currency="USD",
    locale="en_US",
)

# Median requested funds
median_req = format_currency(
    df["current_fake_fund_requested"].median(),
    currency="USD",
    locale="en_US",
)

In [None]:
# Number of distinct projects in the selected county. df is already filtered to
# that county, and this mirrors how `total_projects` is computed statewide.
# Reading row 0 of the statewide `counties_df` instead would report whichever
# county happens to be first in that table.
county_total_projects = df.project_name.nunique()

# Headline bullets for the county.
# NOTE(review): the `[0]` lookups assume the summary helper returns rows ordered
# by project count with a 0-based index — confirm against _utils.
display(
    Markdown(
        f"""<h1>Overview of <b>{parameter_county}</b> County</h1>
        <li><b>{total_req}</b> in total funds requested across all the agencies.
        <li>The total cost of all the projects is <b>{total_cost}</b>.
        <li><b>{county_total_projects}</b> out of {total_projects} projects are in {parameter_county} County.
        <li>The most common project category is <b>{projects_df['Primary Mode'][0]}</b>.
        <li>Most projects are in the <b>{phases_df['Current Phase'][0]}</b> phase.
        <li><b>{median_benefit_score}</b> is the median benefit score. 
        
        """
    )
)

In [None]:
# Narrative for the rural/urban split, taken from row 0 of the summary table.
rural_urban_text = f"""<h2>Rural versus Urban</h2>
        Most projects are in a(n) <b>{rural_urban_df['Urban Rural'][0]}</b> area, 
        totaling to <b>{rural_urban_df['Total Project ($1000) Formatted'][0]}</b>. 
        """
display(Markdown(rural_urban_text))

In [None]:
# Bar chart of the urban/rural summary, built by a project helper.
# NOTE(review): the arguments are positional and the helper's signature is not
# visible here. The strings look like column names of rural_urban_df followed
# by a chart title ("Cost of Projects") — confirm the order against
# _utils.basic_bar_chart_custom_tooltip before changing any of them.
total_urban_rural_bar = _utils.basic_bar_chart_custom_tooltip(
    rural_urban_df,
    "Total Project Cost  $1,000",
    "Urban Rural",
    "Total Project ($1000) Formatted",
    "Urban Rural",
    "Cost of Projects",
)

In [None]:
# Pie chart of the urban/rural summary, built by a project helper.
# NOTE(review): the arguments are positional; the ":Q" / ":N" suffixes look like
# Altair-style encoding shorthands and the last string like a title — confirm
# against _utils.basic_pie_chart.
total_urban_rural_pie = _utils.basic_pie_chart(
    rural_urban_df,
    "Total Projects:Q",
    "Urban Rural:N",
    "Total Projects",
    "Total Projects",
)

In [None]:
# Last expression of the cell, so the combined object is rendered as its output.
# NOTE(review): `|` presumably places the pie and bar charts side by side
# (Altair-style horizontal concatenation) — confirm what the _utils helpers return.
total_urban_rural_pie | total_urban_rural_bar

In [None]:
# Narrative placing the selected county among all counties. Both figures are
# read from the single-row county_rank frame.
county_comparison_text = f"""<h2>County Map Comparison</h2>
        There are {counties_df['Full County Name'].nunique()} different counties, including various. 
        {parameter_county} ranks {county_rank['Project Rank'][0]} in total projects and 
        {county_rank['Project Cost'][0]} in project costs. 
        """
display(Markdown(county_comparison_text))

In [None]:
# Build the county map frame with the project helper, then expose the
# COUNTY_NAME column under the name "County" in place of the original
# "County" column.
county_map = (
    _utils.create_county_map(counties_gdf, "COUNTY_ABBREV", "County")
    .drop(columns="County")
    .rename(columns={"COUNTY_NAME": "County"})
)

In [None]:
# Download the county GeoJSON layer, then reproject it to EPSG:4326.
county_geojson = gpd.read_file(
    "https://opendata.arcgis.com/datasets/8713ced9b78a4abb97dc130a691a8695_0.geojson"
)
county_geojson = county_geojson.to_crs(epsg=4326)

In [None]:
 # Keep only the columns we want
# `.copy()` makes the trimmed frame independent of the downloaded one, so the
# column assignment below is a plain write rather than a write into a subset
# (the SettingWithCopy pattern, which the global warning filter would hide).
county_geojson = county_geojson[["COUNTY_NAME", "COUNTY_ABBREV", "geometry"]].copy()

# Recode abbreviations; COUNTY_ABBREV is the key that gets merged against the
# "County" column of counties_gdf.
# Replace: where's monterey??
# NOTE(review): "MON" is recoded to "MNO" and no entry produces a Monterey code.
# Verify against the layer's actual COUNTY_ABBREV values that Monterey still
# matches the project data after this mapping.
county_geojson["COUNTY_ABBREV"] = county_geojson["COUNTY_ABBREV"].replace(
    {
        "LOS": "LA",
        "DEL": "DN",
        "SFO": "SF",
        "SMT": "SM",
        "MON": "MNO",
        "SDG": "SD",
        "CON": "CC",
        "SCZ": "SCR",
        "SJQ": "SJ",
        "SBA": "SB",
    }
)

In [None]:
# Inner join of geometry and project summary on the county abbreviation: only
# counties present in both frames are kept. indicator=True adds a `_merge`
# column to the result.
county_test = county_geojson.merge(
    counties_gdf,
    left_on="COUNTY_ABBREV",
    right_on="County",
    how="inner",
    indicator=True,
)

In [None]:
# Interactive map of the merged county frame, coloured by "Total Projects"
# using the "Set3" colormap, at 800x400 with fully opaque fills.
# The explicit tooltip column list is commented out, so explore() falls back
# to its default tooltip.
county_test.explore(
    "Total Projects",
    cmap="Set3",
    width=800,
    height=400,
    #tooltip=["County", "Total Projects", "Total Project ($1000) Formatted"],
    highlight=True,
    style_kwds={"fillOpacity": 1},
)

In [None]:
# Row of the project-type summary with the largest total cost: sort ascending
# once and take the last row, then read both values from it.
costliest_category_row = projects_df.sort_values("Total Project Cost  $1,000").iloc[-1]

project_cat_most_money = costliest_category_row["Primary Mode"]
project_most_money = costliest_category_row["Total Project ($1000) Formatted"]

In [None]:
# Runner-up clause. A county whose projects all share one primary mode has no
# second row, and indexing it would raise, so the clause is only added when a
# second category exists. With two or more categories the rendered text is the
# same as before.
second_category_clause = (
    f",\n        followed by <b>{projects_df['Primary Mode'][1]}</b>. "
    if len(projects_df) > 1
    else ". "
)

# Narrative for the project-category section.
# NOTE(review): the `[0]` / `[1]` lookups assume projects_df is ordered by
# project count with a 0-based index — confirm against the _utils helper.
display(
    Markdown(
        f"""<h2>Project Categories</h2>
        Most projects ({projects_df['Total Projects'][0]}) are in the <b>{projects_df['Primary Mode'][0]}</b> category{second_category_clause}
        <b>{project_cat_most_money}</b> received the most money ({project_most_money}). 
        """
    )
)

In [None]:
# Paired bar charts of the project-type summary; being the last expression, the
# helper's return value is rendered as the cell output.
# NOTE(review): the arguments are positional and the helper's signature is not
# visible here. They look like a dropdown/selection column, then x/y encodings
# for the cost chart and the count chart, then one tooltip list per chart —
# confirm against _utils.dual_bar_chart.
_utils.dual_bar_chart(
    projects_df,
    "Primary Mode",
    "Primary Mode:N",
    "Total Project Cost  $1,000:Q",
    "Primary Mode:N",
    "Total Projects:Q",
    ["Total Project ($1000) Formatted"],
    ["Total Projects"],
)

In [None]:
# Intro text for the project table. The county name comes from the parameter
# cell, which is the value every row of the filtered df carries.
project_details_text = f"""<h2>Project Details</h2>
       Below is a list of all the projects in {parameter_county} County ranked by benefit score. 
       The median benefit score is <b>{median_benefit_score}</b>, compared with {county_median_benefit_score} for projects across the state.
        """
display(Markdown(project_details_text))

In [None]:
def _format_usd(value):
    """Format a number as US-dollar currency text using the en_US locale."""
    return format_currency(value, currency="USD", locale="en_US")


# Round the benefit score to two decimals BEFORE taking the subset, so the
# table below shows the rounded values. The rounding is still written to df
# itself because later cells read df.
df["fake_benefit_score"] = df["fake_benefit_score"].apply(lambda x: round(x, 2))

# Columns shown in the project table. `.copy()` makes the subset an independent
# frame, so the formatting assignments below do not write into a view of df.
df_subset = df[
    [
        "fake_benefit_score",
        "lead_agency",
        "primary_mode",
        "project_name",
        "current_fake_fund_requested",
        "total_project_cost__$1,000_",
    ]
].copy()

# Render the two money columns as currency text.
df_subset["total_project_cost__$1,000_"] = df_subset[
    "total_project_cost__$1,000_"
].apply(_format_usd)
df_subset["current_fake_fund_requested"] = df_subset[
    "current_fake_fund_requested"
].apply(_format_usd)

# Clean up column names for display.
df_subset = _utils.clean_up_columns(df_subset)

# Highest benefit score first.
df_subset = df_subset.sort_values("Fake Benefit Score", ascending=False)

In [None]:
# Render the project table through the project's styling helper; being the last
# expression, its return value becomes the cell output.
_utils.styled_df(df_subset)

In [None]:
# Build the score card for the selected county's projects and tidy its column
# names in one step.
score_card = _utils.clean_up_columns(_utils.create_fake_score_card(df))

In [None]:
# Distinct project names, in order of first appearance, for the chart dropdown.
project_dropdown = score_card["Project Name"].drop_duplicates().tolist()

In [None]:
# Score-card columns passed as the tooltip list of the dropdown chart, in
# display order.
score_card_tooltip = [
    "Project Name", "Total Category Score",
    "Factor Weight", "Weighted Factor Value",
    "Category Description",
]

In [None]:
# Bar chart of score-card categories with a per-project dropdown; being the
# last expression, the helper's return value is rendered as the cell output.
# NOTE(review): the arguments are positional and the helper's signature is not
# visible here. They look like the dropdown values, the dropdown column, x/y/
# colour encodings, the tooltip columns and the chart title — confirm against
# _utils.bar_chart_with_dropdown.
_utils.bar_chart_with_dropdown(
    score_card,
    project_dropdown,
    "Project Name",
    "Total Category Score:Q",
    "Category:N",
    "Category:N",
    score_card_tooltip,
    f"View Benefit Scores for Individual Projects in {parameter_county} County",
)

In [None]:
# Lead agency with the largest total project cost: sort ascending and read the
# agency name from the last row.
agency_most_money = agency_df.sort_values("Total Project Cost  $1,000").iloc[-1][
    "Lead Agency"
]

In [None]:
# Narrative for the lead-agency section: agency count, the agency in row 0 of
# the summary, and the agency with the highest total project cost.
lead_agencies_text = f"""<h2>Lead Agencies</h2>
        There are <b>{len(agency_df)}</b> unique agencies with projects located in this county. 
        <b>{agency_df['Lead Agency'][0]}</b> is the agency with the most projects and  
       <b>{agency_most_money}</b> is the agency with highest project costs. 
        """
display(Markdown(lead_agencies_text))

In [None]:
# Agency table without the raw numeric cost column, rendered through the
# styling helper as the cell output.
_utils.styled_df(
    agency_df.drop(columns=["Total Project Cost  $1,000"]),
)