# SB125 Fund Split Analysis

## Question:
- How did RTAs split SB125 funds between operations and capital?

## Methodology:
- upload all avilable `SB125 fund request template` files to gcs
- examine all files for consistencies:
    - come with cleaning plan for inconsistent examples (files withot capital/operating columns)
- concat all rows across all files


## Notes:
- some RTPAs did not submit a `SB125 fund request template.xlsx` file, but instead included an quivilent file their allocation package

In [1]:
import pandas as pd
import os
import altair as alt

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

GCS_PATH = "gs://calitp-analytics-data/data-analyses/sb125/fund_split/"

In [2]:
file_list = [
    "sierra_fund_request.xlsx",
    "slocog_fund_request.xlsx",
    "tehema_fund_request.xlsx",
    "tuolumne_fund_request.xlsx",
    "ventura_fund_request.xlsx",
    "alpine_fund_request.xlsx",
    "amador_fund_request.xlsx",
    "butte_fund_request.xlsx",
    "calaveras_fund_request.xlsx",
    "del_norte_fund_request.xlsx",
    "el_dorado_fund_request.xlsx",
    "humboldt_fund_request.xlsx",
    "kern_fund_request.xlsx",
    "kings_fund_request.xlsx",
    "la_metro_fund_request.xlsx",
    "lake_fund_request.xlsx",
    "lassen_fund_request.xlsx",
    "madera_fund_request.xlsx",
    "mariposa_fund_request.xlsx",
    "mendocino_fund_request.xlsx",
    "merced_fund_request.xlsx",
    "mtc_fund_request.xlsx",
    "nevada_fund_request.xlsx",
    "orange_fund_request.xlsx",
    "placer_fund_request.xlsx",
    "plumas_fund_request.xlsx",
    "riverside_fund_request.xlsx",
    "san_benito_fund_request.xlsx",
    "san_diego_mts_fund_request.xlsx",
    "santa_cruz_fund_request.xlsx",
    "shasta_fund_request.xlsx",
]

file_list.sort()

In [3]:
def clean_fund_request(file:str) -> pd.DataFrame:
    """
    reads in the file from GCS, maps col_names list to df columns, drops all the blank rows.
    returns df.
    """
    col_names = [
    "rtpa",
    "implementing agenc-y/-ies",
    "project",
    "fund source",
    "capital_FY23-24",
    "capital_FY24-25",
    "capital_FY25-26",
    "capital_FY26-27",
    "operating_FY23-24",
    "operating_FY24-25",
    "operating_FY25-26",
    "operating_FY26-27",
    "total",
]
    
    df = pd.read_excel(f"{GCS_PATH}{file}", header=2, nrows=40, names=col_names).drop(columns="total")
    row_drop = df["rtpa"].isin(["Grand Total", "RTPA"])
    df = df.drop(df[row_drop].index)
    df = df.dropna(how= "all")
    df[["rtpa", "implementing agenc-y/-ies","project"]] = df[["rtpa", "implementing agenc-y/-ies","project"]].ffill()
    
    return df


first iteration of fund_request_checker func.
>def read_in(file:str) -> pd.DataFrame:
>    df = pd.read_excel(f"{GCS_PATH}{file}", nrows=40)
>    df = df.dropna(how= "all")
>   
>    if len(df.columns) == 13:
>        print(f"{file} can use clean_fund_request func.")
>    else: 
>        print(f"needs manual check, {file}")
       

old iteration
>def fund_request_checker(file_list:list) -> pd.DataFrame:
>
>    for file in file_list:
>    
>        df = pd.read_excel(f"{GCS_PATH}{file}", nrows=40)
>        df = df.dropna(how= "all")
>    
>        if len(df.columns) == 13:
>            print(f"{file} can use clean_fund_request func.")
>        else: 
>            print(f"needs manual check, {file}")
       

In [4]:
#fund_request_checker(file_list)

old iteration
>def fund_request_checker_v2(file_list:list):
>    gtg_files = []
>    manual_review = []
>    for file in file_list:
>    
>        df = pd.read_excel(f"{GCS_PATH}{file}", nrows=40)
>        df = df.dropna(how= "all")
>    
>        if len(df.columns) == 13:
>            gtg_files.append(f"{file}")
>        else: 
>            manual_review.append(f"{file}")
>    return display(
>        "good to go files",
>        list(gtg_files), 
>        "needs manual check", 
>        list(manual_review)
>    )

In [5]:
#fund_request_checker_v2(file_list)

In [6]:
def fund_request_checker_v3(file_list:list) -> tuple:
    """takes in list of fund_request excel file name. reads in each file, checks if DF has 13 columns.
    if yes, appends do good-to-go list. else, appends to needs-manual-review.
    output is a tuple of the 2 list.
    assign 2 variables to use this func.
    """
    gtg_files = []
    manual_review = []
    for file in file_list:
    
        df = pd.read_excel(f"{GCS_PATH}{file}", nrows=40)
        df = df.dropna(how= "all")
    
        if len(df.columns) == 13:
            gtg_files.append(f"{file}")
        else: 
            manual_review.append(f"{file}")
    return gtg_files, manual_review

In [7]:
good_list, review_list = fund_request_checker_v3(file_list)

In [8]:
display(
    len(good_list),
    len(review_list)
)

24

7

In [9]:
def cleaner_loop(gtg_list:list) -> dict: 
    """
    takes in good-to-go list from fund_request_checker.
    applies the clean_fund_request function to each item on the list, then appends to dictionary.
    key is name of the file, value is the cleaned dataframe.
    output is dictionary. 
    """
    cleaned_df ={}

    for name in gtg_list:
        cleaned_df[name] = clean_fund_request(name)
    return cleaned_df

In [10]:
cleaned_fund_request = cleaner_loop(good_list)

In [11]:
display(
    type(cleaned_fund_request),
    len(cleaned_fund_request),
    list(cleaned_fund_request.keys()),
)

dict

24

['alpine_fund_request.xlsx',
 'amador_fund_request.xlsx',
 'calaveras_fund_request.xlsx',
 'del_norte_fund_request.xlsx',
 'el_dorado_fund_request.xlsx',
 'humboldt_fund_request.xlsx',
 'kings_fund_request.xlsx',
 'la_metro_fund_request.xlsx',
 'lake_fund_request.xlsx',
 'madera_fund_request.xlsx',
 'mariposa_fund_request.xlsx',
 'mendocino_fund_request.xlsx',
 'merced_fund_request.xlsx',
 'nevada_fund_request.xlsx',
 'placer_fund_request.xlsx',
 'plumas_fund_request.xlsx',
 'riverside_fund_request.xlsx',
 'san_benito_fund_request.xlsx',
 'san_diego_mts_fund_request.xlsx',
 'shasta_fund_request.xlsx',
 'sierra_fund_request.xlsx',
 'slocog_fund_request.xlsx',
 'tehema_fund_request.xlsx',
 'tuolumne_fund_request.xlsx']

In [12]:
# view all the good-to-go df

from IPython.display import display

# .items() creates tuples of each element in the dict. key:value maps to key:df

for key, df in cleaned_fund_request.items():
    print(f"DataFrame: {key}")
    #display(df)


DataFrame: alpine_fund_request.xlsx
DataFrame: amador_fund_request.xlsx
DataFrame: calaveras_fund_request.xlsx
DataFrame: del_norte_fund_request.xlsx
DataFrame: el_dorado_fund_request.xlsx
DataFrame: humboldt_fund_request.xlsx
DataFrame: kings_fund_request.xlsx
DataFrame: la_metro_fund_request.xlsx
DataFrame: lake_fund_request.xlsx
DataFrame: madera_fund_request.xlsx
DataFrame: mariposa_fund_request.xlsx
DataFrame: mendocino_fund_request.xlsx
DataFrame: merced_fund_request.xlsx
DataFrame: nevada_fund_request.xlsx
DataFrame: placer_fund_request.xlsx
DataFrame: plumas_fund_request.xlsx
DataFrame: riverside_fund_request.xlsx
DataFrame: san_benito_fund_request.xlsx
DataFrame: san_diego_mts_fund_request.xlsx
DataFrame: shasta_fund_request.xlsx
DataFrame: sierra_fund_request.xlsx
DataFrame: slocog_fund_request.xlsx
DataFrame: tehema_fund_request.xlsx
DataFrame: tuolumne_fund_request.xlsx


# Cleaning individual good DFs

In [13]:
cleaned_fund_request["amador_fund_request.xlsx"] = cleaned_fund_request["amador_fund_request.xlsx"].drop([2])

In [14]:
cleaned_fund_request["merced_fund_request.xlsx"] = cleaned_fund_request["merced_fund_request.xlsx"].drop([1,2,34,36,37])

In [15]:
cleaned_fund_request["san_benito_fund_request.xlsx"] = cleaned_fund_request["san_benito_fund_request.xlsx"].drop([6,9])

In [16]:
row_drops = [
    1,
    2,
    3,
    4,
    9,
    10,
    11,
]
cleaned_fund_request["san_diego_mts_fund_request.xlsx"] = cleaned_fund_request["san_diego_mts_fund_request.xlsx"].drop(row_drops)

In [17]:
cleaned_fund_request["sierra_fund_request.xlsx"] = cleaned_fund_request["sierra_fund_request.xlsx"].drop(list(range(24,32)))

In [18]:
cleaned_fund_request["nevada_fund_request.xlsx"] = cleaned_fund_request["nevada_fund_request.xlsx"].drop([8,9])

In [19]:
cleaned_fund_request["plumas_fund_request.xlsx"]= cleaned_fund_request["plumas_fund_request.xlsx"].drop(list(range(10,14)))

# Attempt to clean bad DFs

In [91]:
    col_names = [
    "rtpa",
    "implementing agenc-y/-ies",
    "project",
    "fund source",
    "capital_FY23-24",
    "capital_FY24-25",
    "capital_FY25-26",
    "capital_FY26-27",
    "operating_FY23-24",
    "operating_FY24-25",
    "operating_FY25-26",
    "operating_FY26-27",
    "total",
    ]

In [50]:
# list of bad DFs that didnt match the other layouts
review_list

['butte_fund_request.xlsx',
 'kern_fund_request.xlsx',
 'lassen_fund_request.xlsx',
 'mtc_fund_request.xlsx',
 'orange_fund_request.xlsx',
 'santa_cruz_fund_request.xlsx',
 'ventura_fund_request.xlsx']

In [105]:
#CLEAN COMPLETED

lassen = pd.read_excel(f"{GCS_PATH}lassen_fund_request.xlsx", 
                       sheet_name = "D.2. Detailed Fund Request",
                       skiprows = 6,
                       header=0, 
                       skipfooter=12,
                      ).drop(columns=["Unnamed: 0", "Project Type", "Operator"])

# can work with this. may be able to remove the top rows then use cleaner loop
lassen.columns=col_names
lassen_cleaned = lassen.drop(columns="total")

In [106]:
lassen_cleaned

Unnamed: 0,rtpa,implementing agenc-y/-ies,project,fund source,capital_FY23-24,capital_FY24-25,capital_FY25-26,capital_FY26-27,operating_FY23-24,operating_FY24-25,operating_FY25-26,operating_FY26-27
0,LCTC,LCTC / LTSA,LTSA Operating Deficit,TIRCP,,,,,334937.0,665063.0,,
1,LCTC,LCTC / LTSA,NEMT Service to Reno,TIRCP,200000.0,,,,,,,
2,LCTC,LCTC / LTSA,Vehicle Replacement (ZEB Plan),TIRCP,885000.0,958737.0,,,,,,
3,LCTC,LCTC / LTSA,ZEV Charging Infrastrucutre,TIRCP,400000.0,200000.0,,,,,,
4,LCTC,LCTC / LTSA,ZEV Charging Infrastrucutre,ZETCP (PTA),65103.0,,,,,,,
5,LCTC,LCTC / LTSA,ZEV Charging Infrastrucutre,ZETCP (GGRF),,81552.0,87619.0,87619.0,,,,
6,LCTC,LCTC,Countywide ZEV Study,ZETCP (PTA),11339.69,,,,,,,
7,LCTC,LCTC,Countywide ZEV Study,ZETCP (GGRF),88660.31,,,,,,,
8,LCTC,LCTC,RTPA Administrative Costs,TIRCP,,,,,10000.0,10000.0,,
9,LCTC,LCTC,RTPA Administrative Costs,ZETCP (GGRF),,,,,,11067.0,5000.0,5000.0


In [134]:
# CLEANED COMPLETE
butte = pd.read_excel(f"{GCS_PATH}butte_fund_request.xlsx",
                      skiprows= 2,
                      header= 0,
                      skipfooter= 17,
                     )

butte_cleaned = butte.copy()

butte_cleaned[["RTPA","Implementing Agenc-y/-ies", "Project"]] = butte_cleaned[["RTPA","Implementing Agenc-y/-ies", "Project"]].ffill()

butte_cleaned.insert(6, "operations FY25-26",0)
butte_cleaned.insert(7, "operations FY26-27",0)
butte_cleaned.columns = [
    "rtpa",
    "implementing agenc-y/-ies",
    "project",
    "fund source",
    "operating_FY23-24",
    "operating_FY24-25",
    "operating_FY25-26",
    "operating_FY26-27",
    "capital_FY23-24",
    "capital_FY24-25",
    "capital_FY25-26",
    "capital_FY26-27",
    "total",
    
]
# missing the operations columns. may not need if we melt just the capital columns
display(
    butte.shape,
    butte_cleaned.shape,
    butte_cleaned,
)

In [None]:
mtc = pd.read_excel(f"{GCS_PATH}mtc_fund_request.xlsx")

# can work with this. may be able to remove the top rows then use cleaner loop
mtc

In [None]:
orange = pd.read_excel(f"{GCS_PATH}orange_fund_request.xlsx")

# FY goes to 28-29. dont think i need those columns, there are no numbers in them anyways
# can drop
orange

In [None]:
santa_cruz = pd.read_excel(f"{GCS_PATH}santa_cruz_fund_request.xlsx")

# can remove the extra columns past unnamed 12, then remove top and bottom rows, then use cleaner loop
santa_cruz

In [None]:
kern = pd.read_excel(f"{GCS_PATH}kern_fund_request.xlsx")

# this is so bad, may need to do this in excel to make it work. 
# the agency name and fund soruce are on the same column
kern

In [None]:
ventura = pd.read_excel(f"{GCS_PATH}ventura_fund_request.xlsx")

# needs a lot of work, may have to do in excel
ventura

# Test of concat all the dictionary dataframe vales

In [20]:
all_fund_request = pd.concat(cleaned_fund_request.values(), ignore_index=True)

In [21]:
display(
    all_fund_request.shape,
    type(all_fund_request),
    all_fund_request.columns
)

(172, 12)

pandas.core.frame.DataFrame

Index(['rtpa', 'implementing agenc-y/-ies', 'project', 'fund source',
       'capital_FY23-24', 'capital_FY24-25', 'capital_FY25-26',
       'capital_FY26-27', 'operating_FY23-24', 'operating_FY24-25',
       'operating_FY25-26', 'operating_FY26-27'],
      dtype='object')

In [22]:
# sum all the "capital columns"
#all_fund_request[["capital_FY23-24":"capital_FY26-27"]].sum()

In [23]:
#sum all the "operations columns"

In [24]:
all_fund_request.groupby("rtpa").agg({
    "capital_FY23-24": "sum",
    "operating_FY23-24": "sum"
})

Unnamed: 0_level_0,capital_FY23-24,operating_FY23-24
rtpa,Unnamed: 1_level_1,Unnamed: 2_level_1
Alpine County Transportation Commission,367381.0,0.0
Amador County Transportation Commission,100000.0,0.0
Calaveras County Council of Goverments,2815143.0,0.0
DNLTC,1825253.0,0.0
El Dorado County Transportation Commission,9849627.0,208011.0
Humboldt County Association of Governments,0.0,1952069.06
Kings County Association of Governments,8709876.0,100000.0
Lake County/City Council of Governments,4077103.0,0.0
Los Angeles County Metropolitan Transportation Authority,618145878.0,0.0
Madera County Transportation Commission,8254231.0,188112.0


In [25]:
all_fund_request.pivot_table(
    values = ["capital_FY23-24", "operating_FY23-24"],
    #columns= ["capital_FY23-24", "operating_FY23-24"],
    index = "rtpa",
    aggfunc = "sum",
    margins = True,
    margins_name = "Grand Total"
)

Unnamed: 0_level_0,capital_FY23-24,operating_FY23-24
rtpa,Unnamed: 1_level_1,Unnamed: 2_level_1
Alpine County Transportation Commission,367381.0,0.0
Amador County Transportation Commission,100000.0,0.0
Calaveras County Council of Goverments,2815143.0,0.0
DNLTC,1825253.0,0.0
El Dorado County Transportation Commission,9849627.0,208011.0
Humboldt County Association of Governments,0.0,1952069.06
Kings County Association of Governments,8709876.0,100000.0
Lake County/City Council of Governments,4077103.0,0.0
Los Angeles County Metropolitan Transportation Authority,618145878.0,0.0
Madera County Transportation Commission,8254231.0,188112.0


# TEST of Melting the dataframe

In [26]:
id_vars= [
    'rtpa',
 'implementing agenc-y/-ies',
 'project',
 'fund source',
]
val_vars = [
    'capital_FY23-24',
 'capital_FY24-25',
 'capital_FY25-26',
 'capital_FY26-27',
 'operating_FY23-24',
 'operating_FY24-25',
 'operating_FY25-26',
 'operating_FY26-27'
]

melt = all_fund_request.melt(
    id_vars = id_vars,
    value_vars = val_vars,
    var_name = "capital/operation fy",
    value_name = "fund amount",
    ignore_index = True)

In [27]:
display(
    type(melt),
    melt.shape,
    melt.dtypes,
    melt.head()
)

pandas.core.frame.DataFrame

(1376, 6)

rtpa                         object
implementing agenc-y/-ies    object
project                      object
fund source                  object
capital/operation fy         object
fund amount                  object
dtype: object

Unnamed: 0,rtpa,implementing agenc-y/-ies,project,fund source,capital/operation fy,fund amount
0,Alpine County Transportation Commission,Alpine County Transportation Commission,Transit Facility Conversion Project,TIRCP,capital_FY23-24,360641.0
1,Alpine County Transportation Commission,Alpine County Transportation Commission,Transit Facility Conversion Project,TIRCP,capital_FY23-24,
2,Alpine County Transportation Commission,Alpine County Transportation Commission,Transit Facility Conversion Project,ZETCP (GGRF),capital_FY23-24,3616.684
3,Alpine County Transportation Commission,Alpine County Transportation Commission,Transit Facility Conversion Project,ZETCP (PTA),capital_FY23-24,3123.316
4,Amador County Transportation Commission,Amador Transit,,TIRCP,capital_FY23-24,100000.0


In [28]:
# splitting the cap/operations columns

melt[["project type", "fiscal year"]] = melt["capital/operation fy"].str.split('_FY', expand = True)

In [29]:
# updates the `-` string to zero.
melt.loc[(melt["rtpa"] == "Humboldt County Association of Governments") & (melt["fund amount"] == "-"),"fund amount"] = 0

In [30]:
# check for NaNs
melt["fund amount"].isna().sum()

1021

In [31]:
melt["fund amount"] = melt["fund amount"].fillna(0).astype("int64")
melt["fund source"] = melt["fund source"].astype("str")

In [32]:
# ensure no more NaNs
melt["fund amount"].isna().sum()

0

In [45]:
display(
    melt.columns,
    melt.sample(3)
)

Index(['rtpa', 'implementing agenc-y/-ies', 'project', 'fund source',
       'capital/operation fy', 'fund amount', 'project type', 'fiscal year'],
      dtype='object')

Unnamed: 0,rtpa,implementing agenc-y/-ies,project,fund source,capital/operation fy,fund amount,project type,fiscal year
966,RCTC,"Cities of Banning and Beaumont, County of Rive...",Passenger Rail Project Development,TIRCP,operating_FY24-25,0,operating,24-25
51,Mariposa County Local Transportation Commission,Yes,Mariposa County Zero Emission Transit Yard,TIRCP,capital_FY23-24,108201,capital,23-24
567,Mariposa County Local Transportation Commission,Yes,Mariposa County Zero Emission Transit Yard,TIRCP,capital_FY26-27,0,capital,26-27


In [34]:
# checking fund amounts for any non-int values 
melt["fund amount"].unique()

array([   360641,         0,      3616,      3123,    100000,   2576611,
          238532,    500000,   1175501,     69395,     80357,   9085857,
          242312,    521458,   6849293,    329561,    295818,   1133408,
          101796, 498650905, 119494973,   3706510,     87574,    198860,
           84159,   8254231,    108201,     42539,     49259,   4848229,
          123518,    271142,    110636,    104561,    696393,    722403,
          836515,   1000000,   2080000,   6400000,    200000,    370000,
         1486685,    297576,    133646,  10000000,    400000,     92109,
          501942,   1019544,   3354086,   2510740,    614200,    250000,
          100520,     61421,     53042,  63382700,   4003053,   7956643,
         2868594,  60000000,   2328990,    348002,  10126000,   5434000,
         1705263,  26000000,    943316,   2050000,      9408,      8124,
          124000,    280000,    395000,   1778000,   1400000,   4000000,
          387000,    375000,    600000,    422000, 

In [35]:
display(
    melt.dtypes,
    melt["fund source"].value_counts()
)

rtpa                         object
implementing agenc-y/-ies    object
project                      object
fund source                  object
capital/operation fy         object
fund amount                   int64
project type                 object
fiscal year                  object
dtype: object

TIRCP           784
ZETCP (GGRF)    248
ZETCP (PTA)     248
ZETCP            40
CMAQ             16
Measure V         8
5339              8
SGR               8
Farebox           8
5307              8
Name: fund source, dtype: int64

# Draft Aggregations

In [36]:
def make_bar(data, x_axis, y_axis):
    chart = alt.Chart(data).mark_bar().encode(
        x = x_axis,
        y = y_axis,
    )
    return chart

In [42]:
by_type = melt.groupby(["project type"]).agg({
    "fund amount": "sum",
}).reset_index()

make_bar(by_type, y_axis = "project type", x_axis = "fund amount")

In [41]:
by_year = melt.groupby(["fiscal year","project type"]).agg({
    "fund amount": "sum",
}).reset_index()

make_bar(by_year, y_axis = "fiscal year", x_axis = "fund amount",)

In [44]:
by_source = melt.groupby(["fund source"]).agg({
    "fund amount": "sum",
        "rtpa": "nunique"
}).reset_index()

make_bar(by_source, y_axis = "fund source", x_axis = "fund amount")

In [40]:
by_rtpa = melt.groupby(["rtpa"]).agg({
    "fund amount": "sum",
    "project": "nunique"
}).reset_index()

make_bar(by_rtpa, "rtpa", "fund amount")