In [47]:
import numpy as np
import pandas as pd

# import warnings
# warnings.filterwarnings("ignore")
from babel.numbers import format_currency
from calitp import to_snakecase

# Display
from IPython.display import HTML, Image, Markdown, display, display_html
from shared_utils import altair_utils, styleguide
PURPLE = "#9487C0"

# Settings
pd.options.display.max_columns = 100
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

# Charts
import altair as alt
from shared_utils import calitp_color_palette as cp

# GCS, del later since this will presumbly be read from a script that cleans up the data
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/project_prioritization/"
FILE = "fake_data.xlsx"

# My utilities
import _utils

In [9]:
df = pd.read_excel(f"{GCS_FILE_PATH}{FILE}", sheet_name="fake")

In [10]:
display(
    Markdown(
        f"""<h1>Transportation Planning (DoTP)</h1>
        Caltrans' Division of Transportation Planning articulates a long-term vision for California's transportation system and implements
        statewide transportation policy through partnerships with State, regional, and local agencies. 
        The Division provides quality Planning Products, Services, and Information to support and guide transportation investment decisions.
        <br><br> 
        DoTP manages a total of <b>{df['project_name'].nunique()}</b> different projects. 
        The following guidebook scores all projects under the jurisdiction of DoTP using California's project scoring rubric. 
        """
    )
)

<h1>Transportation Planning (DoTP)</h1>
        Caltrans' Division of Transportation Planning articulates a long-term vision for California's transportation system and implements
        statewide transportation policy through partnerships with State, regional, and local agencies. 
        The Division provides quality Planning Products, Services, and Information to support and guide transportation investment decisions.
        <br><br> 
        DoTP manages a total of <b>716</b> different projects. 
        The following guidebook scores all projects under the jurisdiction of DoTP using California's project scoring rubric. 
        

In [120]:
# Get summary tables 
def summarize_by_project_names(df, col_wanted:str):
    """
    df: original dataframe to summarize
    col_wanted: to column to groupby
    """
    df = (
        df.groupby([col_wanted])
        .agg({"project_name": "count", 
              "total_project_cost__$1,000_": "sum"})
        .reset_index()
        .sort_values('project_name', ascending = False)
        .rename(columns={"project_name": "Total Projects"})
    )
    
    df = df.reset_index(drop = True)
    
    # Create a formatted monetary col
    df["Total Project ($1000) Formatted"] = df["total_project_cost__$1,000_"].apply(
    lambda x: format_currency(x, currency="USD", locale="en_US"))
    
    # Clean up column names, remove snakecase
    df = _utils.clean_up_columns(df)
    
    return df

In [121]:
# Count of projects by phases it is in
phases_df = summarize_by_project_names(df, 'current_phase')

# Count of projects whether it is rural or urban
rural_urban_df =summarize_by_project_names(df, 'urban_rural')

# Count of projects whether it is rural or urban
districts_df =summarize_by_project_names(df, 'district')

In [122]:
# Median benefit score  across the state
median_benefit_score = int(df.fake_benefit_score.median())

# Total Requested Funds
total_cost = format_currency(
    (df["total_project_cost__$1,000_"].sum()),
    currency="USD",
)

# Median project cost
median_cost = format_currency(
    (df["total_project_cost__$1,000_"].median()),
    currency="USD",
)

# Total Requested Funds
total_req = format_currency(
    (df["fake_fund_requested"].sum()),
    currency="USD",
)

# Median Requested Funds
median_req = format_currency(
    (df["fake_fund_requested"].median()),
    currency="USD",
)

In [123]:
display(
    Markdown(
        f"""<h2>Basic Statistics</h2>
        <li>Total requested funds is <b>{total_req}</b>.</li>
        <li>Most projects are in District{districts_df['Total Projects'][0]}.</li>
        <li>The total cost of all the projects is <b>{total_cost}</b>.</li>
        <li><b>{median_benefit_score}</b> is the median benefit score</li>.
        """
    )
)

<h2>Basic Statistics</h2>
        <li>Total requested funds is <b>$219,380,723.84</b>.</li>
        <li>Most projects are in District161.</li>
        <li>The total cost of all the projects is <b>$19,121,443,441.00</b>.</li>
        <li><b>17</b> is the median benefit score</li>.
        

In [131]:
display(
    Markdown(
        f"""<h2>Rural versus Urban</h2>
        Most projects are <b>{rural_urban_df['Urban Rural'][0]}</b>, 
        totaling to <b>{rural_urban_df['Total Project ($1000) Formatted'][0]}</b>. 
        """
    )
)

<h2>Rural versus Urban</h2>
        Most projects are <b>Urban</b>, 
        totaling to <b>$16,533,258,381.00</b>. 
        

In [125]:
rural_urban_df.drop(columns = ['Total Project Cost  $1,000']).style.hide(axis="index")

Urban Rural,Total Projects,Total Project ($1000) Formatted
Urban,528,"$16,533,258,381.00"
Rural,103,"$1,694,527,339.00"
,86,"$893,657,721.00"


In [126]:
_utils.basic_pie_chart(
    rural_urban_df,
    "Total Projects:Q",
    "Urban Rural:N",
    "Total Projects",
    "Rural vs. Urban Projects",
)

In [127]:
districts_df

Unnamed: 0,District,Total Projects,"Total Project Cost $1,000",Total Project ($1000) Formatted
0,75,161,1121858028,"$1,121,858,028.00"
1,7,99,2770589537,"$2,770,589,537.00"
2,4,88,2497502945,"$2,497,502,945.00"
3,6,77,1603696659,"$1,603,696,659.00"
4,8,68,2671390166,"$2,671,390,166.00"
5,5,39,1334733544,"$1,334,733,544.00"
6,12,38,1958084593,"$1,958,084,593.00"
7,10,37,1869127790,"$1,869,127,790.00"
8,11,35,2489643564,"$2,489,643,564.00"
9,1,25,89563944,"$89,563,944.00"


In [128]:
display(
    Markdown(
        f"""<h2>Districts</h2>
        Most projects take place in <b>District {districts_df.District[0]}</b>.
        """
    )
)

<h2>Districts</h2>
        Most projects take place in <b>75</b>.
        

## Funding

In [129]:
cost_primary_mode = (
    df.groupby("primary_mode")
    .agg({"total_project_cost__$1,000_": "sum"})
    .sort_values("total_project_cost__$1,000_", ascending=False)
    .reset_index()
    .rename(
        columns={
            "primary_mode": "Primary Mode",
            "total_project_cost__$1,000_": "Total Project Cost ($1000)",
        }
    )
)

In [130]:
cost_primary_mode

Unnamed: 0,Primary Mode,Total Project Cost ($1000)
0,Highway,7137898024
1,Interchange (Modification),2224130497
2,Bike/Pedestrian,1958099767
3,Interchange (New),1867993947
4,Interchange (Widening),1778680280
5,Transit,713068354
6,Rail (Passenger),672635753
7,Rail (Freight),628358471
8,Grade Crossing,624379080
9,Port,534272566
