# SB125 Fund Split Analysis

## Question:
- How did RTAs split SB125 funds between operations and capital?

## Methodology:
- upload all avilable `SB125 fund request template` files to gcs
- examine all files for consistencies:
    - come with cleaning plan for inconsistent examples (files withot capital/operating columns)
- concat all rows across all files


## Notes:
- some RTPAs did not submit a `SB125 fund request template.xlsx` file, but instead included an quivilent file their allocation package

In [1]:
import pandas as pd
import os
import altair as alt

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

GCS_PATH = "gs://calitp-analytics-data/data-analyses/sb125/fund_split/"

In [2]:
file_list = [
    "sierra_fund_request.xlsx",
    "slocog_fund_request.xlsx",
    "tehema_fund_request.xlsx",
    "tuolumne_fund_request.xlsx",
    "ventura_fund_request.xlsx",
    "alpine_fund_request.xlsx",
    "amador_fund_request.xlsx",
    "butte_fund_request.xlsx",
    "calaveras_fund_request.xlsx",
    "del_norte_fund_request.xlsx",
    "el_dorado_fund_request.xlsx",
    "humboldt_fund_request.xlsx",
    "kern_fund_request.xlsx",
    "kings_fund_request.xlsx",
    "la_metro_fund_request.xlsx",
    "lake_fund_request.xlsx",
    "lassen_fund_request.xlsx",
    "madera_fund_request.xlsx",
    "mariposa_fund_request.xlsx",
    "mendocino_fund_request.xlsx",
    "merced_fund_request.xlsx",
    "mtc_fund_request.xlsx",
    "nevada_fund_request.xlsx",
    "orange_fund_request.xlsx",
    "placer_fund_request.xlsx",
    "plumas_fund_request.xlsx",
    "riverside_fund_request.xlsx",
    "san_benito_fund_request.xlsx",
    "san_diego_mts_fund_request.xlsx",
    "santa_cruz_fund_request.xlsx",
    "shasta_fund_request.xlsx",
]

file_list.sort()

In [3]:
def clean_fund_request(file:str) -> pd.DataFrame:
    """
    reads in the file from GCS, maps col_names list to df columns, drops all the blank rows.
    returns df.
    """
    col_names = [
    "rtpa",
    "implementing agenc-y/-ies",
    "project",
    "fund source",
    "capital_FY23-24",
    "capital_FY24-25",
    "capital_FY25-26",
    "capital_FY26-27",
    "operating_FY23-24",
    "operating_FY24-25",
    "operating_FY25-26",
    "operating_FY26-27",
    "total",
]
    
    df = pd.read_excel(f"{GCS_PATH}{file}", header=2, nrows=40, names=col_names).drop(columns="total")
    row_drop = df["rtpa"].isin(["Grand Total", "RTPA"])
    df = df.drop(df[row_drop].index)
    df = df.dropna(how= "all")
    df[["rtpa", "implementing agenc-y/-ies","project"]] = df[["rtpa", "implementing agenc-y/-ies","project"]].ffill()
    
    return df


first iteration of fund_request_checker func.
>def read_in(file:str) -> pd.DataFrame:
>    df = pd.read_excel(f"{GCS_PATH}{file}", nrows=40)
>    df = df.dropna(how= "all")
>   
>    if len(df.columns) == 13:
>        print(f"{file} can use clean_fund_request func.")
>    else: 
>        print(f"needs manual check, {file}")
       

old iteration
>def fund_request_checker(file_list:list) -> pd.DataFrame:
>
>    for file in file_list:
>    
>        df = pd.read_excel(f"{GCS_PATH}{file}", nrows=40)
>        df = df.dropna(how= "all")
>    
>        if len(df.columns) == 13:
>            print(f"{file} can use clean_fund_request func.")
>        else: 
>            print(f"needs manual check, {file}")
       

In [4]:
#fund_request_checker(file_list)

old iteration
>def fund_request_checker_v2(file_list:list):
>    gtg_files = []
>    manual_review = []
>    for file in file_list:
>    
>        df = pd.read_excel(f"{GCS_PATH}{file}", nrows=40)
>        df = df.dropna(how= "all")
>    
>        if len(df.columns) == 13:
>            gtg_files.append(f"{file}")
>        else: 
>            manual_review.append(f"{file}")
>    return display(
>        "good to go files",
>        list(gtg_files), 
>        "needs manual check", 
>        list(manual_review)
>    )

In [5]:
#fund_request_checker_v2(file_list)

In [6]:
def fund_request_checker_v3(file_list:list) -> tuple:
    """takes in list of fund_request excel file name. reads in each file, checks if DF has 13 columns.
    if yes, appends do good-to-go list. else, appends to needs-manual-review.
    output is a tuple of the 2 list.
    assign 2 variables to use this func.
    """
    gtg_files = []
    manual_review = []
    for file in file_list:
    
        df = pd.read_excel(f"{GCS_PATH}{file}", nrows=40)
        df = df.dropna(how= "all")
    
        if len(df.columns) == 13:
            gtg_files.append(f"{file}")
        else: 
            manual_review.append(f"{file}")
    return gtg_files, manual_review

In [7]:
good_list, review_list = fund_request_checker_v3(file_list)

In [8]:
display(
    len(good_list),
    len(review_list)
)

24

7

In [9]:
def cleaner_loop(gtg_list:list) -> dict: 
    """
    takes in good-to-go list from fund_request_checker.
    applies the clean_fund_request function to each item on the list, then appends to dictionary.
    key is name of the file, value is the cleaned dataframe.
    output is dictionary. 
    """
    cleaned_df ={}

    for name in gtg_list:
        cleaned_df[name] = clean_fund_request(name)
    return cleaned_df

In [10]:
cleaned_fund_request = cleaner_loop(good_list)

In [11]:
display(
    type(cleaned_fund_request),
    len(cleaned_fund_request),
    list(cleaned_fund_request.keys()),
)

dict

24

['alpine_fund_request.xlsx',
 'amador_fund_request.xlsx',
 'calaveras_fund_request.xlsx',
 'del_norte_fund_request.xlsx',
 'el_dorado_fund_request.xlsx',
 'humboldt_fund_request.xlsx',
 'kings_fund_request.xlsx',
 'la_metro_fund_request.xlsx',
 'lake_fund_request.xlsx',
 'madera_fund_request.xlsx',
 'mariposa_fund_request.xlsx',
 'mendocino_fund_request.xlsx',
 'merced_fund_request.xlsx',
 'nevada_fund_request.xlsx',
 'placer_fund_request.xlsx',
 'plumas_fund_request.xlsx',
 'riverside_fund_request.xlsx',
 'san_benito_fund_request.xlsx',
 'san_diego_mts_fund_request.xlsx',
 'shasta_fund_request.xlsx',
 'sierra_fund_request.xlsx',
 'slocog_fund_request.xlsx',
 'tehema_fund_request.xlsx',
 'tuolumne_fund_request.xlsx']

In [12]:
# view all the good-to-go df

from IPython.display import display

# .items() creates tuples of each element in the dict. key:value maps to key:df

for key, df in cleaned_fund_request.items():
    print(f"DataFrame: {key}")
    #display(df)


DataFrame: alpine_fund_request.xlsx
DataFrame: amador_fund_request.xlsx
DataFrame: calaveras_fund_request.xlsx
DataFrame: del_norte_fund_request.xlsx
DataFrame: el_dorado_fund_request.xlsx
DataFrame: humboldt_fund_request.xlsx
DataFrame: kings_fund_request.xlsx
DataFrame: la_metro_fund_request.xlsx
DataFrame: lake_fund_request.xlsx
DataFrame: madera_fund_request.xlsx
DataFrame: mariposa_fund_request.xlsx
DataFrame: mendocino_fund_request.xlsx
DataFrame: merced_fund_request.xlsx
DataFrame: nevada_fund_request.xlsx
DataFrame: placer_fund_request.xlsx
DataFrame: plumas_fund_request.xlsx
DataFrame: riverside_fund_request.xlsx
DataFrame: san_benito_fund_request.xlsx
DataFrame: san_diego_mts_fund_request.xlsx
DataFrame: shasta_fund_request.xlsx
DataFrame: sierra_fund_request.xlsx
DataFrame: slocog_fund_request.xlsx
DataFrame: tehema_fund_request.xlsx
DataFrame: tuolumne_fund_request.xlsx


# Cleaning individual good DFs

In [13]:
cleaned_fund_request["amador_fund_request.xlsx"] = cleaned_fund_request["amador_fund_request.xlsx"].drop([2])

In [14]:
cleaned_fund_request["merced_fund_request.xlsx"] = cleaned_fund_request["merced_fund_request.xlsx"].drop([1,2,34,36,37])

In [15]:
cleaned_fund_request["san_benito_fund_request.xlsx"] = cleaned_fund_request["san_benito_fund_request.xlsx"].drop([6,9])

In [16]:
row_drops = [
    1,
    2,
    3,
    4,
    9,
    10,
    11,
]
cleaned_fund_request["san_diego_mts_fund_request.xlsx"] = cleaned_fund_request["san_diego_mts_fund_request.xlsx"].drop(row_drops)

In [17]:
cleaned_fund_request["sierra_fund_request.xlsx"] = cleaned_fund_request["sierra_fund_request.xlsx"].drop(list(range(24,32)))

In [18]:
cleaned_fund_request["nevada_fund_request.xlsx"] = cleaned_fund_request["nevada_fund_request.xlsx"].drop([8,9])

In [19]:
cleaned_fund_request["plumas_fund_request.xlsx"]= cleaned_fund_request["plumas_fund_request.xlsx"].drop(list(range(10,14)))

# Attempt to clean bad DFs

In [91]:
    col_names = [
    "rtpa",
    "implementing agenc-y/-ies",
    "project",
    "fund source",
    "capital_FY23-24",
    "capital_FY24-25",
    "capital_FY25-26",
    "capital_FY26-27",
    "operating_FY23-24",
    "operating_FY24-25",
    "operating_FY25-26",
    "operating_FY26-27",
    "total",
    ]

In [50]:
# list of bad DFs that didnt match the other layouts
review_list

['butte_fund_request.xlsx',
 'kern_fund_request.xlsx',
 'lassen_fund_request.xlsx',
 'mtc_fund_request.xlsx',
 'orange_fund_request.xlsx',
 'santa_cruz_fund_request.xlsx',
 'ventura_fund_request.xlsx']

In [105]:
#CLEAN COMPLETED

lassen = pd.read_excel(f"{GCS_PATH}lassen_fund_request.xlsx", 
                       sheet_name = "D.2. Detailed Fund Request",
                       skiprows = 6,
                       header=0, 
                       skipfooter=12,
                      ).drop(columns=["Unnamed: 0", "Project Type", "Operator"])

# can work with this. may be able to remove the top rows then use cleaner loop
lassen.columns=col_names
lassen_cleaned = lassen.drop(columns="total")

In [106]:
lassen_cleaned

Unnamed: 0,rtpa,implementing agenc-y/-ies,project,fund source,capital_FY23-24,capital_FY24-25,capital_FY25-26,capital_FY26-27,operating_FY23-24,operating_FY24-25,operating_FY25-26,operating_FY26-27
0,LCTC,LCTC / LTSA,LTSA Operating Deficit,TIRCP,,,,,334937.0,665063.0,,
1,LCTC,LCTC / LTSA,NEMT Service to Reno,TIRCP,200000.0,,,,,,,
2,LCTC,LCTC / LTSA,Vehicle Replacement (ZEB Plan),TIRCP,885000.0,958737.0,,,,,,
3,LCTC,LCTC / LTSA,ZEV Charging Infrastrucutre,TIRCP,400000.0,200000.0,,,,,,
4,LCTC,LCTC / LTSA,ZEV Charging Infrastrucutre,ZETCP (PTA),65103.0,,,,,,,
5,LCTC,LCTC / LTSA,ZEV Charging Infrastrucutre,ZETCP (GGRF),,81552.0,87619.0,87619.0,,,,
6,LCTC,LCTC,Countywide ZEV Study,ZETCP (PTA),11339.69,,,,,,,
7,LCTC,LCTC,Countywide ZEV Study,ZETCP (GGRF),88660.31,,,,,,,
8,LCTC,LCTC,RTPA Administrative Costs,TIRCP,,,,,10000.0,10000.0,,
9,LCTC,LCTC,RTPA Administrative Costs,ZETCP (GGRF),,,,,,11067.0,5000.0,5000.0


In [148]:
# CLEANED COMPLETE
butte = pd.read_excel(f"{GCS_PATH}butte_fund_request.xlsx",
                      skiprows= 2,
                      header= 0,
                      skipfooter= 17,
                     )

butte_cleaned = butte.copy()

butte_cleaned[["RTPA","Implementing Agenc-y/-ies", "Project"]] = butte_cleaned[["RTPA","Implementing Agenc-y/-ies", "Project"]].ffill()

butte_cleaned.insert(6, "operations FY25-26",0)
butte_cleaned.insert(7, "operations FY26-27",0)
butte_cleaned.columns = [
    "rtpa",
    "implementing agenc-y/-ies",
    "project",
    "fund source",
    "operating_FY23-24",
    "operating_FY24-25",
    "operating_FY25-26",
    "operating_FY26-27",
    "capital_FY23-24",
    "capital_FY24-25",
    "capital_FY25-26",
    "capital_FY26-27",
    "total",
    
]
butte_cleaned = butte_cleaned.drop(columns="total")
# missing the operations columns. may not need if we melt just the capital columns
display(
    butte.shape,
    butte_cleaned.shape,
    butte_cleaned,
)

(5, 11)

(5, 12)

Unnamed: 0,rtpa,implementing agenc-y/-ies,project,fund source,operating_FY23-24,operating_FY24-25,operating_FY25-26,operating_FY26-27,capital_FY23-24,capital_FY24-25,capital_FY25-26,capital_FY26-27
0,Butte County Association of Governments,Butte County Association of Governments and th...,North Valley Rail,TIRCP,10488545.0,4511455.0,0,0,,,,
1,Butte County Association of Governments,Butte Regional Transit,Butte Regional Transit Zero-Emission Deployment,TIRCP,,200000.0,0,0,,5802814.0,,
2,Butte County Association of Governments,Butte Regional Transit,Butte Regional Transit Zero-Emission Deployment,ZETCP (PTA),200000.0,,0,0,315836.0,,,
3,Butte County Association of Governments,Butte Regional Transit,Butte Regional Transit Zero-Emission Deployment,ZETCP (GGRF),,,0,0,597319.51,,,
4,Butte County Association of Governments,Butte Regional Transit,Butte Regional Transit Zero-Emission Deployment,ZETCP (GGRF),,200000.0,0,0,,1673362.0,,


In [149]:
# CLEANED COMPLETE

mtc = pd.read_excel(f"{GCS_PATH}mtc_fund_request.xlsx",
                    skiprows = 2,
                    header= 0,
                    skipfooter= 21
                   ).drop(columns= ["Unnamed: 13","Unnamed: 14","Unnamed: 15"])

mtc_cleaned = mtc.copy()

mtc_cleaned.columns = col_names
mtc_cleaned = mtc_cleaned.drop(columns="total")
# can work with this. may be able to remove the top rows then use cleaner loop
display(
    mtc_cleaned.shape,
    mtc_cleaned.columns,
    mtc_cleaned
)

(16, 12)

Index(['rtpa', 'implementing agenc-y/-ies', 'project', 'fund source',
       'capital_FY23-24', 'capital_FY24-25', 'capital_FY25-26',
       'capital_FY26-27', 'operating_FY23-24', 'operating_FY24-25',
       'operating_FY25-26', 'operating_FY26-27'],
      dtype='object')

Unnamed: 0,rtpa,implementing agenc-y/-ies,project,fund source,capital_FY23-24,capital_FY24-25,capital_FY25-26,capital_FY26-27,operating_FY23-24,operating_FY24-25,operating_FY25-26,operating_FY26-27
0,MTC,Santa Clara Valley Transportation Authority (VTA),BART to Silicon Valley Phase II,TIRCP,199173500.0,175826500.0,,,,,,
1,MTC,Bay Area Rapid Transit Authority (BART),BART Core Capacity,TIRCP,185895200.0,164104800.0,,,,,,
2,MTC,MTC,Planning/Admin: Regional Network Management,ZETCP (PTA),,,,,2000000.0,,,
3,MTC,Bay Area Rapid Transit Authority (BART),Operating Assistance,TIRCP,,,,,,46108880.0,,
4,MTC,Bay Area Rapid Transit Authority (BART),Operating Assistance,ZETCP (GGRF),,,,,36522320.0,38181390.0,38181390.0,38181390.0
5,MTC,San Francisco Municipal Transportation Authority,Operating Assistance,ZETCP (PTA),,,,,67361490.0,,,
6,MTC,San Francisco Municipal Transportation Authority,Operating Assistance,ZETCP (GGRF),,,,,32036190.0,33491470.0,33491470.0,33491470.0
7,MTC,Alameda-Contra Costa Transit District,Operating Assistance,ZETCP (GGRF),,,,,3378788.0,3532273.0,3532273.0,3532273.0
8,MTC,"Golden Gate Bridge, Highway and Transportation...",Operating Assistance,ZETCP (GGRF),,,,,4263919.0,4457611.0,4457611.0,4457611.0
9,MTC,Peninsula Corridor Joint Powers Board (Caltrain),Operating Assistance,ZETCP (GGRF),,,,,2640142.0,2760073.0,2760073.0,2760073.0


In [161]:
#CLEANED COMPLETE 
orange = pd.read_excel(f"{GCS_PATH}orange_fund_request.xlsx",
                       skiprows=3,
                       header=0,
                       skipfooter=1
                      )

orange_cleaned = orange.copy()

orange_cleaned.rename(columns={"Unnamed: 0":"RTPA"}, inplace = True)
orange_cleaned["RTPA"] = "OCTA"
orange_cleaned = orange_cleaned.drop(columns= [
    "FY27-28",
    "FY28-29",
    "FY27-28.1",
    "FY28-29.1"
])

orange_cleaned.columns = col_names
orange_cleaned = orange_cleaned.drop(columns="total")
# FY goes to 28-29. dont think i need those columns, there are no numbers in them anyways
# can drop
orange_cleaned

Unnamed: 0,rtpa,implementing agenc-y/-ies,project,fund source,capital_FY23-24,capital_FY24-25,capital_FY25-26,capital_FY26-27,operating_FY23-24,operating_FY24-25,operating_FY25-26,operating_FY26-27
0,OCTA,OCTA & SCRRA,Stabilize Rail and Bus Operations: Metrolink f...,TIRCP,,,,,116608526.0,19136022.0,,
1,OCTA,OCTA,Stabilize Rail and Bus Operations: OC Streetca...,TIRCP,,,,,,37900000.0,,
2,OCTA,OCTA,Stabilize Rail and Bus Operations: OC Streetca...,ZETCP (GGRF),,,,,11059106.0,,,
3,OCTA,OCTA,Stabilize Rail and Bus Operations: OC Streetca...,ZETCP (PTA),,,,,10414682.0,,,
4,OCTA,OCTA,Stabilize Rail and Bus Operations: OC Bus oper...,TIRCP,,,,,,6072000.0,,
5,OCTA,OCTA,Stabilize Rail and Bus Operations: Open paymen...,TIRCP,,,,,26500000.0,,,
6,OCTA,OCTA,Stabilize Rail and Bus Operations: Harbor Blvd...,TIRCP,,,,,,5100000.0,,
7,OCTA,OCTA & SCRRA,Construction Critical Rail Infrastructure: Rai...,TIRCP,,86454000.0,,,,,,
8,OCTA,OCTA & SCRRA,Construction Critical Rail Infrastructure: Ora...,TIRCP,,5850000.0,,,,,,
9,OCTA,OCTA,Construction Critical Rail Infrastructure: San...,TIRCP,17000000.0,,,,,,,


In [169]:
# CLEANED COMPLETE
santa_cruz = pd.read_excel(f"{GCS_PATH}santa_cruz_fund_request.xlsx",
                           skiprows= 4,
                           header= 0,
                           skipfooter= 5
                          ).iloc[:,0:13]

santa_cruz_cleaned = santa_cruz.copy()

santa_cruz_cleaned.columns = col_names
santa_cruz_cleaned.drop(columns="total", inplace=True)
# can remove the extra columns past unnamed 12, then remove top and bottom rows, then use cleaner loop
display(
    santa_cruz_cleaned.shape,
    santa_cruz_cleaned
)

(8, 12)

Unnamed: 0,rtpa,implementing agenc-y/-ies,project,fund source,capital_FY23-24,capital_FY24-25,capital_FY25-26,capital_FY26-27,operating_FY23-24,operating_FY24-25,operating_FY25-26,operating_FY26-27
0,SCCRTC,SCCRTC,Zero Emission Passenger Rail & Trail,TIRCP,,2000000.0,,,,,,
1,SCCRTC,SCMTD,Rapid Corridors,TIRCP,,4000000.0,,,,,,
2,SCCRTC,SCMTD,Reimagine METRO,TIRCP,,,,,13633480.0,7667149.0,,
3,SCCRTC,SCMTD,Reimagine METRO,ZETCP (PTA),,,,,1215715.0,,,
4,SCCRTC,SCMTD,Reimagine METRO,ZETCP (GGRF),,,,,1407753.0,1471702.0,1471702.0,1471702.0
5,SCCRTC,SCCRTC,RTPA administration,TIRCP,137711.88,138052.0,,,,,,
6,SCCRTC,SCCRTC,RTPA administration,ZETCP (PTA),14219.730703,,,,,,,
7,SCCRTC,SCCRTC,RTPA administration,ZETCP (GGRF),12279.953797,14865.67,14865.6745,14865.6745,,,,


In [197]:
ventura = pd.read_excel(f"{GCS_PATH}ventura_fund_request.xlsx")

# needs a lot of work, may have to do in excel
ventura

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8
0,TIRCP Capital,,,,Capital,,,,
1,Implementing Agenc-y/-ies,Project Category,Project,Year Requested,FY23-24,FY24-25,FY25-26,FY26-27,FY27-28 +
2,Simi Valley,Infrastructure Improvements,EV Charging,Year 1,2024-02-01 00:00:00,2025-06-30 00:00:00,,,
3,Thousand Oaks,Infrastructure Improvements,EV Charging,Year 1,,2024-07-01 00:00:00,2026-06-30 00:00:00,,
4,Thousand Oaks,Infrastructure Improvements,EV Support,Year 1,,2024-07-01 00:00:00,2026-06-30 00:00:00,,
5,Gold Coast Transit District,Infrastructure Improvements,Energy Storage - Hydrogen Station,Year 1,,2024-07-01 00:00:00,2026-06-30 00:00:00,,
6,Ojai,Infrastructure Improvements,Facility Upgrades & EV Charging,Year 1 & 2,,2024-07-01 00:00:00,2026-06-30 00:00:00,,
7,County of Ventura,Infrastructure Improvements,Bus Stop Improvements,Year 1,,2024-10-01 00:00:00,2026-06-30 00:00:00,,
8,Gold Coast Transit District,Infrastructure Improvements,Energy Storage - Hydrogen Station,Year 2,,2024-07-01 00:00:00,2026-06-30 00:00:00,,
9,Valley Express,Infrastructure Improvements,Bus Stop Improvements,Year 2,,2024-10-01 00:00:00,2026-06-30 00:00:00,,


In [None]:
kern = pd.read_excel(f"{GCS_PATH}kern_fund_request.xlsx")

# this is so bad, may need to do this in excel to make it work. 
# the agency name and fund soruce are on the same column
kern

# Test of concat all the dictionary dataframe vales

In [173]:
fund_request_df = cleaned_fund_request.values()

type(fund_request_df)

dict_values

In [175]:
all_fund_request = pd.concat(cleaned_fund_request.values(), ignore_index=True)

In [178]:
all_fund_request = pd.concat([all_fund_request,
                              santa_cruz_cleaned,
                              orange_cleaned,
                              butte_cleaned,
                              lassen_cleaned
                             ], ignore_index=True)

In [179]:
display(
    all_fund_request.shape,
    type(all_fund_request),
    all_fund_request.columns
)

(218, 12)

pandas.core.frame.DataFrame

Index(['rtpa', 'implementing agenc-y/-ies', 'project', 'fund source',
       'capital_FY23-24', 'capital_FY24-25', 'capital_FY25-26',
       'capital_FY26-27', 'operating_FY23-24', 'operating_FY24-25',
       'operating_FY25-26', 'operating_FY26-27'],
      dtype='object')

In [22]:
# sum all the "capital columns"
#all_fund_request[["capital_FY23-24":"capital_FY26-27"]].sum()

In [23]:
#sum all the "operations columns"

# TEST of Melting the dataframe

In [182]:
id_vars= [
    'rtpa',
 'implementing agenc-y/-ies',
 'project',
 'fund source',
]
val_vars = [
    'capital_FY23-24',
 'capital_FY24-25',
 'capital_FY25-26',
 'capital_FY26-27',
 'operating_FY23-24',
 'operating_FY24-25',
 'operating_FY25-26',
 'operating_FY26-27'
]

melt = all_fund_request.melt(
    id_vars = id_vars,
    value_vars = val_vars,
    var_name = "capital/operation fy",
    value_name = "fund amount",
    ignore_index = True)

In [183]:
display(
    type(melt),
    melt.shape,
    melt.dtypes,
    melt.head()
)

pandas.core.frame.DataFrame

(1744, 6)

rtpa                         object
implementing agenc-y/-ies    object
project                      object
fund source                  object
capital/operation fy         object
fund amount                  object
dtype: object

Unnamed: 0,rtpa,implementing agenc-y/-ies,project,fund source,capital/operation fy,fund amount
0,Alpine County Transportation Commission,Alpine County Transportation Commission,Transit Facility Conversion Project,TIRCP,capital_FY23-24,360641.0
1,Alpine County Transportation Commission,Alpine County Transportation Commission,Transit Facility Conversion Project,TIRCP,capital_FY23-24,
2,Alpine County Transportation Commission,Alpine County Transportation Commission,Transit Facility Conversion Project,ZETCP (GGRF),capital_FY23-24,3616.684
3,Alpine County Transportation Commission,Alpine County Transportation Commission,Transit Facility Conversion Project,ZETCP (PTA),capital_FY23-24,3123.316
4,Amador County Transportation Commission,Amador Transit,,TIRCP,capital_FY23-24,100000.0


In [184]:
# splitting the cap/operations columns

melt[["project type", "fiscal year"]] = melt["capital/operation fy"].str.split('_FY', expand = True)

In [185]:
# updates the `-` string to zero.
melt.loc[(melt["rtpa"] == "Humboldt County Association of Governments") & (melt["fund amount"] == "-"),"fund amount"] = 0

In [186]:
# check for NaNs
melt["fund amount"].isna().sum()

1296

In [187]:
melt["fund amount"] = melt["fund amount"].fillna(0).astype("int64")
melt["fund source"] = melt["fund source"].astype("str")

In [188]:
# ensure no more NaNs
melt["fund amount"].isna().sum()

0

In [189]:
display(
    melt.columns,
    melt.sample(3)
)

Index(['rtpa', 'implementing agenc-y/-ies', 'project', 'fund source',
       'capital/operation fy', 'fund amount', 'project type', 'fiscal year'],
      dtype='object')

Unnamed: 0,rtpa,implementing agenc-y/-ies,project,fund source,capital/operation fy,fund amount,project type,fiscal year
1076,Butte County Association of Governments,Butte Regional Transit,Butte Regional Transit Zero-Emission Deployment,TIRCP,operating_FY23-24,0,operating,23-24
979,San Benito Council of Governments,San Benito Council of Governments,Transit Modernization and Next Gen Infrastruct...,TIRCP,operating_FY23-24,0,operating,23-24
41,Madera County Transportation Commission,Madera County Transportation Commission,Program Administration,ZETCP (PTA),capital_FY23-24,0,capital,23-24


In [190]:
# checking fund amounts for any non-int values 
melt["fund amount"].unique()

array([   360641,         0,      3616,      3123,    100000,   2576611,
          238532,    500000,   1175501,     69395,     80357,   9085857,
          242312,    521458,   6849293,    329561,    295818,   1133408,
          101796, 498650905, 119494973,   3706510,     87574,    198860,
           84159,   8254231,    108201,     42539,     49259,   4848229,
          123518,    271142,    110636,    104561,    696393,    722403,
          836515,   1000000,   2080000,   6400000,    200000,    370000,
         1486685,    297576,    133646,  10000000,    400000,     92109,
          501942,   1019544,   3354086,   2510740,    614200,    250000,
          100520,     61421,     53042,  63382700,   4003053,   7956643,
         2868594,  60000000,   2328990,    348002,  10126000,   5434000,
         1705263,  26000000,    943316,   2050000,      9408,      8124,
          124000,    280000,    395000,   1778000,   1400000,   4000000,
          387000,    375000,    600000,    422000, 

In [191]:
display(
    melt.dtypes,
    melt["fund source"].value_counts()
)

rtpa                         object
implementing agenc-y/-ies    object
project                      object
fund source                  object
capital/operation fy         object
fund amount                   int64
project type                 object
fiscal year                  object
dtype: object

TIRCP           968
ZETCP (GGRF)    368
ZETCP (PTA)     312
ZETCP            40
CMAQ             16
Measure V         8
5339              8
SGR               8
Farebox           8
5307              8
Name: fund source, dtype: int64

# Draft Aggregations

In [192]:
def make_bar(data, x_axis, y_axis):
    chart = alt.Chart(data).mark_bar().encode(
        x = x_axis,
        y = y_axis,
    )
    return chart

In [193]:
by_type = melt.groupby(["project type"]).agg({
    "fund amount": "sum",
}).reset_index()

make_bar(by_type, y_axis = "project type", x_axis = "fund amount")

In [194]:
by_year = melt.groupby(["fiscal year","project type"]).agg({
    "fund amount": "sum",
}).reset_index()

make_bar(by_year, y_axis = "fiscal year", x_axis = "fund amount",)

In [195]:
by_source = melt.groupby(["fund source"]).agg({
    "fund amount": "sum",
        "rtpa": "nunique"
}).reset_index()

make_bar(by_source, y_axis = "fund source", x_axis = "fund amount")

In [196]:
by_rtpa = melt.groupby(["rtpa"]).agg({
    "fund amount": "sum",
    "project": "nunique"
}).reset_index()

make_bar(by_rtpa, "rtpa", "fund amount")