# SB125 Fund Split Analysis

## Question:
- How did RTAs split SB125 funds between operations and capital?

## Methodology:
- upload all avilable `SB125 fund request template` files to gcs
- examine all files for consistencies:
    - come with cleaning plan for inconsistent examples (files withot capital/operating columns)
- concat all rows across all files


## Notes:
- some RTPAs did not submit a `SB125 fund request template.xlsx` file, but instead included an quivilent file their allocation package

In [1]:
import altair as alt
import pandas as pd

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

GCS_PATH = "gs://calitp-analytics-data/data-analyses/sb125/fund_split/"

In [2]:
file_list = [
    "sierra_fund_request.xlsx",
    "slocog_fund_request.xlsx",
    "tehema_fund_request.xlsx",
    "tuolumne_fund_request.xlsx",
    "ventura_fund_request.xlsx",
    "alpine_fund_request.xlsx",
    "amador_fund_request.xlsx",
    "butte_fund_request.xlsx",
    "calaveras_fund_request.xlsx",
    "del_norte_fund_request.xlsx",
    "el_dorado_fund_request.xlsx",
    "humboldt_fund_request.xlsx",
    "kern_fund_request.xlsx",
    "kings_fund_request.xlsx",
    "la_metro_fund_request.xlsx",
    "lake_fund_request.xlsx",
    "lassen_fund_request.xlsx",
    "madera_fund_request.xlsx",
    "mariposa_fund_request.xlsx",
    "mendocino_fund_request.xlsx",
    "merced_fund_request.xlsx",
    "mtc_fund_request.xlsx",
    "nevada_fund_request.xlsx",
    "orange_fund_request.xlsx",
    "placer_fund_request.xlsx",
    "plumas_fund_request.xlsx",
    "riverside_fund_request.xlsx",
    "san_benito_fund_request.xlsx",
    "san_diego_mts_fund_request.xlsx",
    "santa_cruz_fund_request.xlsx",
    "shasta_fund_request.xlsx",
]

file_list.sort()

In [3]:
def clean_fund_request(file: str) -> pd.DataFrame:
    """
    reads in the file from GCS, maps col_names list to df columns, drops all the blank rows.
    returns df.
    """
    col_names = [
        "rtpa",
        "implementing agenc-y/-ies",
        "project",
        "fund source",
        "capital_FY23-24",
        "capital_FY24-25",
        "capital_FY25-26",
        "capital_FY26-27",
        "operating_FY23-24",
        "operating_FY24-25",
        "operating_FY25-26",
        "operating_FY26-27",
        "total",
    ]

    df = pd.read_excel(f"{GCS_PATH}{file}", header=2, nrows=40, names=col_names).drop(
        columns="total"
    )
    row_drop = df["rtpa"].isin(["Grand Total", "RTPA"])
    df = df.drop(df[row_drop].index)
    df = df.dropna(how="all")
    df[["rtpa", "implementing agenc-y/-ies", "project"]] = df[
        ["rtpa", "implementing agenc-y/-ies", "project"]
    ].ffill()

    return df

first iteration of fund_request_checker func.
>def read_in(file:str) -> pd.DataFrame:
>    df = pd.read_excel(f"{GCS_PATH}{file}", nrows=40)
>    df = df.dropna(how= "all")
>   
>    if len(df.columns) == 13:
>        print(f"{file} can use clean_fund_request func.")
>    else: 
>        print(f"needs manual check, {file}")
       

old iteration
>def fund_request_checker(file_list:list) -> pd.DataFrame:
>
>    for file in file_list:
>    
>        df = pd.read_excel(f"{GCS_PATH}{file}", nrows=40)
>        df = df.dropna(how= "all")
>    
>        if len(df.columns) == 13:
>            print(f"{file} can use clean_fund_request func.")
>        else: 
>            print(f"needs manual check, {file}")
       

In [4]:
# fund_request_checker(file_list)

old iteration
>def fund_request_checker_v2(file_list:list):
>    gtg_files = []
>    manual_review = []
>    for file in file_list:
>    
>        df = pd.read_excel(f"{GCS_PATH}{file}", nrows=40)
>        df = df.dropna(how= "all")
>    
>        if len(df.columns) == 13:
>            gtg_files.append(f"{file}")
>        else: 
>            manual_review.append(f"{file}")
>    return display(
>        "good to go files",
>        list(gtg_files), 
>        "needs manual check", 
>        list(manual_review)
>    )

In [5]:
# fund_request_checker_v2(file_list)

# Cleaning the easy DFs

In [6]:
def fund_request_checker_v3(file_list: list) -> tuple:
    """takes in list of fund_request excel file name. reads in each file, checks if DF has 13 columns.
    if yes, appends do good-to-go list. else, appends to needs-manual-review.
    output is a tuple of the 2 list.
    assign 2 variables to use this func.
    """
    gtg_files = []
    manual_review = []
    for file in file_list:

        df = pd.read_excel(f"{GCS_PATH}{file}", nrows=40)
        df = df.dropna(how="all")

        if len(df.columns) == 13:
            gtg_files.append(f"{file}")
        else:
            manual_review.append(f"{file}")
    return gtg_files, manual_review

In [7]:
good_list, review_list = fund_request_checker_v3(file_list)

In [8]:
display(len(good_list), len(review_list))

24

7

In [9]:
def cleaner_loop(gtg_list: list) -> dict:
    """
    takes in good-to-go list from fund_request_checker.
    applies the clean_fund_request function to each item on the list, then appends to dictionary.
    key is name of the file, value is the cleaned dataframe.
    output is dictionary.
    """
    cleaned_df = {}

    for name in gtg_list:
        cleaned_df[name] = clean_fund_request(name)
    return cleaned_df

In [10]:
cleaned_fund_request = cleaner_loop(good_list)

In [11]:
display(
    type(cleaned_fund_request),
    len(cleaned_fund_request),
    list(cleaned_fund_request.keys()),
)

dict

24

['alpine_fund_request.xlsx',
 'amador_fund_request.xlsx',
 'calaveras_fund_request.xlsx',
 'del_norte_fund_request.xlsx',
 'el_dorado_fund_request.xlsx',
 'humboldt_fund_request.xlsx',
 'kings_fund_request.xlsx',
 'la_metro_fund_request.xlsx',
 'lake_fund_request.xlsx',
 'madera_fund_request.xlsx',
 'mariposa_fund_request.xlsx',
 'mendocino_fund_request.xlsx',
 'merced_fund_request.xlsx',
 'nevada_fund_request.xlsx',
 'placer_fund_request.xlsx',
 'plumas_fund_request.xlsx',
 'riverside_fund_request.xlsx',
 'san_benito_fund_request.xlsx',
 'san_diego_mts_fund_request.xlsx',
 'shasta_fund_request.xlsx',
 'sierra_fund_request.xlsx',
 'slocog_fund_request.xlsx',
 'tehema_fund_request.xlsx',
 'tuolumne_fund_request.xlsx']

In [12]:
# view all the good-to-go df

from IPython.display import display

# .items() creates tuples of each element in the dict. key:value maps to key:df

for key, df in cleaned_fund_request.items():
    print(f"DataFrame: {key}")
    # display(df)

DataFrame: alpine_fund_request.xlsx
DataFrame: amador_fund_request.xlsx
DataFrame: calaveras_fund_request.xlsx
DataFrame: del_norte_fund_request.xlsx
DataFrame: el_dorado_fund_request.xlsx
DataFrame: humboldt_fund_request.xlsx
DataFrame: kings_fund_request.xlsx
DataFrame: la_metro_fund_request.xlsx
DataFrame: lake_fund_request.xlsx
DataFrame: madera_fund_request.xlsx
DataFrame: mariposa_fund_request.xlsx
DataFrame: mendocino_fund_request.xlsx
DataFrame: merced_fund_request.xlsx
DataFrame: nevada_fund_request.xlsx
DataFrame: placer_fund_request.xlsx
DataFrame: plumas_fund_request.xlsx
DataFrame: riverside_fund_request.xlsx
DataFrame: san_benito_fund_request.xlsx
DataFrame: san_diego_mts_fund_request.xlsx
DataFrame: shasta_fund_request.xlsx
DataFrame: sierra_fund_request.xlsx
DataFrame: slocog_fund_request.xlsx
DataFrame: tehema_fund_request.xlsx
DataFrame: tuolumne_fund_request.xlsx


# Cleaning individual good DFs

## DONE Humboldt 

In [13]:
def clean_humboldt():
    cleaned_fund_request["humboldt_fund_request.xlsx"][
    ["operating_FY24-25", "operating_FY25-26", "operating_FY26-27"]
] = cleaned_fund_request["humboldt_fund_request.xlsx"][
    ["operating_FY24-25", "operating_FY25-26", "operating_FY26-27"]
].replace(
    "-", 0
)
    return

In [14]:
clean_humboldt()

## DONE Amador

In [15]:
def clean_amador():
    cleaned_fund_request["amador_fund_request.xlsx"] = cleaned_fund_request[
    "amador_fund_request.xlsx"
    ][cleaned_fund_request["amador_fund_request.xlsx"]["fund source"].notna()
     ]
    return

In [16]:
clean_amador()

## DONE Merced

In [17]:
def clean_merced():
    cleaned_fund_request["merced_fund_request.xlsx"]= cleaned_fund_request[
    "merced_fund_request.xlsx"][cleaned_fund_request["merced_fund_request.xlsx"]["project"].notna()].drop([34, 36, 37])
    
    # is there another way to update these values that doesnt include using the number index
    cleaned_fund_request["merced_fund_request.xlsx"].at[13, "fund source"] = "`5339"
    cleaned_fund_request["merced_fund_request.xlsx"].at[19, "fund source"] = "`5307"
    
    return

In [18]:
clean_merced()

## DONE San Benito

In [19]:
def clean_san_benito():
    cleaned_fund_request["san_benito_fund_request.xlsx"] = cleaned_fund_request["san_benito_fund_request.xlsx"][
    cleaned_fund_request["san_benito_fund_request.xlsx"]["fund source"].notna()]
    
    return

In [20]:
clean_san_benito()

## DONE San Diego MTS

In [21]:
def clean_mts():
    cleaned_fund_request["san_diego_mts_fund_request.xlsx"] = cleaned_fund_request["san_diego_mts_fund_request.xlsx"].iloc[7:]
    
    return

In [22]:
clean_mts()

## DONE Sierra

In [23]:
def clean_sierra():
    cleaned_fund_request["sierra_fund_request.xlsx"] = cleaned_fund_request["sierra_fund_request.xlsx"].iloc[:-8]
    
    return

In [24]:
clean_sierra()

## DONE Nevada

In [25]:
def clean_nevada():
    cleaned_fund_request["nevada_fund_request.xlsx"]=cleaned_fund_request["nevada_fund_request.xlsx"][
    cleaned_fund_request["nevada_fund_request.xlsx"]["fund source"].notna()]
    
    return

In [26]:
clean_nevada()

## DONE Plumas

In [27]:
def clean_plumas():
    cleaned_fund_request["plumas_fund_request.xlsx"]=cleaned_fund_request["plumas_fund_request.xlsx"][cleaned_fund_request["plumas_fund_request.xlsx"][
    "fund source"].notna()]
    
    return

In [28]:
clean_plumas()

# Attempt to clean bad DFs

In [29]:
col_names = [
    "rtpa",
    "implementing agenc-y/-ies",
    "project",
    "fund source",
    "capital_FY23-24",
    "capital_FY24-25",
    "capital_FY25-26",
    "capital_FY26-27",
    "operating_FY23-24",
    "operating_FY24-25",
    "operating_FY25-26",
    "operating_FY26-27",
    "total",
]

## DONE Lassen

In [30]:
def clean_lassen():
    lassen = pd.read_excel(
        f"{GCS_PATH}lassen_fund_request.xlsx",
        sheet_name="D.2. Detailed Fund Request",
        skiprows=6,
        header=0,
        skipfooter=12,
    ).drop(columns=["Unnamed: 0", "Project Type", "Operator"])

    # can work with this. may be able to remove the top rows then use cleaner loop
    lassen.columns = col_names
    lassen_cleaned = lassen.drop(columns="total")
    
    return lassen_cleaned


In [31]:
lassen_cleaned = clean_lassen()

## DONE Butte

In [32]:
# CLEANED COMPLETE
def clean_butte():
    butte = pd.read_excel(
        f"{GCS_PATH}butte_fund_request.xlsx",
        skiprows=2,
        header=0,
        skipfooter=17,
    )

    butte_cleaned = butte.copy()

    butte_cleaned[["RTPA", "Implementing Agenc-y/-ies", "Project"]] = butte_cleaned[
        ["RTPA", "Implementing Agenc-y/-ies", "Project"]
    ].ffill()

    butte_cleaned.insert(6, "operations FY25-26", 0)
    butte_cleaned.insert(7, "operations FY26-27", 0)
    butte_cleaned.columns = [
        "rtpa",
        "implementing agenc-y/-ies",
        "project",
        "fund source",
        "operating_FY23-24",
        "operating_FY24-25",
        "operating_FY25-26",
        "operating_FY26-27",
        "capital_FY23-24",
        "capital_FY24-25",
        "capital_FY25-26",
        "capital_FY26-27",
        "total",
    ]
    butte_cleaned = butte_cleaned.drop(columns="total")

    return butte_cleaned

In [33]:
butte_cleaned = clean_butte()

## DONE MTC

In [71]:
# CLEANED COMPLETE

def clean_mtc():
    mtc = pd.read_excel(
        f"{GCS_PATH}mtc_fund_request.xlsx", skiprows=2, header=0, skipfooter=21
    ).drop(columns=["Unnamed: 13", "Unnamed: 14", "Unnamed: 15"])

    mtc_cleaned = mtc.copy()
    mtc_cleaned.columns = col_names
    mtc_cleaned = mtc_cleaned.drop(columns="total")
    
    return mtc_cleaned

In [74]:
mtc_cleaned = clean_mtc()

display(
    mtc_cleaned.shape,
    mtc_cleaned.columns,
    #mtc_cleaned
)

(16, 12)

Index(['rtpa', 'implementing agenc-y/-ies', 'project', 'fund source',
       'capital_FY23-24', 'capital_FY24-25', 'capital_FY25-26',
       'capital_FY26-27', 'operating_FY23-24', 'operating_FY24-25',
       'operating_FY25-26', 'operating_FY26-27'],
      dtype='object')

## DONE Orange

In [75]:
# CLEANED COMPLETE
def clean_orange():
    orange = pd.read_excel(
        f"{GCS_PATH}orange_fund_request.xlsx", skiprows=3, header=0, skipfooter=1
    )

    orange_cleaned = orange.copy()

    orange_cleaned.rename(columns={"Unnamed: 0": "RTPA"}, inplace=True)
    orange_cleaned["RTPA"] = "OCTA"
    orange_cleaned = orange_cleaned.drop(
        columns=["FY27-28", "FY28-29", "FY27-28.1", "FY28-29.1"]
    )

    orange_cleaned.columns = col_names
    orange_cleaned = orange_cleaned.drop(columns="total")
    
    return orange_cleaned

In [78]:
orange_cleaned = clean_orange()

display(
    orange_cleaned.shape,
    #orange_cleaned
)

(15, 12)

## DONE Santa Cruz

In [79]:
# CLEANED COMPLETE
def clean_santa_cruz():
    santa_cruz = pd.read_excel(
        f"{GCS_PATH}santa_cruz_fund_request.xlsx", skiprows=4, header=0, skipfooter=5
    ).iloc[:, 0:13]

    santa_cruz_cleaned = santa_cruz.copy()

    santa_cruz_cleaned.columns = col_names
    santa_cruz_cleaned.drop(columns="total", inplace=True)
    
    return santa_cruz_cleaned

In [82]:
santa_cruz_cleaned = clean_santa_cruz()

display(
    santa_cruz_cleaned.shape,
    #santa_cruz_cleaned
)

(8, 12)

## DONE Ventura

In [92]:
def clean_ventura():
    #clean TIRCP sections
    ventura_tircp_capital = pd.read_excel(
    f"{GCS_PATH}ventura_fund_request.xlsx",
    sheet_name="Project Breakdown",
    skiprows=2,
    header=0,
    skipfooter=40,
    )
    
    ventura_tircp_operating = pd.read_excel(
        f"{GCS_PATH}ventura_fund_request.xlsx",
        sheet_name="Project Breakdown",
        skiprows=51,
        header=0,
        skipfooter=1,
    )
    
    ventura_tircp_merge = ventura_tircp_capital.merge(
        ventura_tircp_operating,
        how="outer",
        on=["Implementing Agenc-y/-ies", "Project Category", "Project"],
        suffixes=["_capital", "_operating"],
    ).drop(
        columns=[
            "Year Requested",
            "Unnamed: 8_capital",
            "Unnamed: 8_operating",
            "Project Category",
        ]
    )
    
    #merging TIRCP sections
    ventura_tircp_merge["rtpa"] = "VCTC"
    ventura_tircp_merge["Fund Source"] = "TIRCP"

    ventura_col_dict = {
        "Implementing Agenc-y/-ies": "implementing agenc-y/-ies",
        "Project": "project",
        "Fund Source": "fund source",
        "FY23-24_capital": "capital_FY23-24",
        "FY24-25_capital": "capital_FY24-25",
        "FY25-26_capital": "capital_FY25-26",
        "FY26-27_capital": "capital_FY26-27",
        "FY23-24_operating": "operating_FY23-24",
        "FY24-25_operating": "operating_FY24-25",
        "FY25-26_operating": "operating_FY25-26",
        "FY26-27_operating": "operating_FY26-27",
    }

    col_order = [
        "rtpa",
        "implementing agenc-y/-ies",
        "project",
        "fund source",
        "capital_FY23-24",
        "capital_FY24-25",
        "capital_FY25-26",
        "capital_FY26-27",
        "operating_FY23-24",
        "operating_FY24-25",
        "operating_FY25-26",
        "operating_FY26-27",
    ]

    ventura_tircp_merge.rename(columns=ventura_col_dict, inplace=True)

    ventura_tircp_merge = ventura_tircp_merge[col_order]
    
    # clean zetcp sections
    ventura_zetcp_capital = pd.read_excel(
        f"{GCS_PATH}ventura_fund_request.xlsx",
        sheet_name="Project Breakdown",
        skiprows=32,
        header=0,
        skipfooter=21,
    ).drop(columns=["Unnamed: 7", "Unnamed: 8"])
    fund_dict = {
        "GGRF Y1": "ZETCP (GGRF)",
        "GGRF Y2": "ZETCP (GGRF)",
        "GGRF Y3": "ZETCP (GGRF)",
        "GGRF Y4": "ZETCP (GGRF)",
        "PTA": "ZETCP (PTA)",
    }

    ven_col = {
        "Implementing Agenc-y/-ies": "implementing agenc-y/-ies",
        "Project": "project",
        "Fund Source": "fund source",
        "FY23-24": "capital_FY23-24",
        "FY24-25": "capital_FY24-25",
        "FY25-26": "capital_FY25-26",
        "FY26-27": "capital_FY26-27",
    }

    ventura_zetcp_capital["Fund Source"] = ventura_zetcp_capital["Fund Source"].replace(
        fund_dict
    )

    ventura_zetcp_capital["rtpa"] = "VCTC"

    ventura_zetcp_capital.rename(columns=ven_col, inplace=True)

    ventura_zetcp_capital = ventura_zetcp_capital[col_order[0:8]]
    
    #final merge
    ventura_big_merge = ventura_tircp_merge.merge(
        ventura_zetcp_capital,
        how="outer",
        on=[
            "implementing agenc-y/-ies",
            "project",
            "fund source",
            "capital_FY23-24",
            "capital_FY24-25",
            "capital_FY25-26",
            "capital_FY26-27",
            "rtpa",
        ],
        suffixes=("_zetcp_cap", "_tircp"),
    )
    return ventura_big_merge
    

In [95]:
# should see 57 rows total
ventura_cleaned = clean_ventura()

display(
    ventura_cleaned.shape,
    #ventura_cleaned.sort_values(by="implementing agenc-y/-ies"),
)

(57, 12)

## DONE Kern

In [44]:
def get_kern_data(head_count: int, foot_count: int, agency: str, proj_title: str):

    kern_cols = {
        "Unnamed: 0": "fund source",
        "Unnamed: 1": "capital_FY23-24",
        "Unnamed: 2": "operating_FY23-24",
        "Unnamed: 3": "capital_FY24-25",
        "Unnamed: 4": "operating_FY24-25",
        "Unnamed: 5": "capital_FY25-26",
        "Unnamed: 6": "operating_FY25-26",
        "Unnamed: 7": "capital_FY26-27",
        "Unnamed: 8": "operating_FY26-27",
    }
    df = (
        pd.read_excel(
            f"{GCS_PATH}kern_fund_request.xlsx",
            header=head_count,
            skipfooter=foot_count,
        )
        .rename(columns=kern_cols)
        .assign(implementing_agency=agency, rtpa="Kern COG", project=proj_title)
    )
    return df

In [98]:
#kern = pd.read_excel(f"{GCS_PATH}kern_fund_request.xlsx")

# this is so bad, may need to do this in excel to make it work.
# the agency name and fund soruce are on the same column

def clean_kern():
    
    col_order = [
        "rtpa",
        "implementing agenc-y/-ies",
        "project",
        "fund source",
        "capital_FY23-24",
        "capital_FY24-25",
        "capital_FY25-26",
        "capital_FY26-27",
        "operating_FY23-24",
        "operating_FY24-25",
        "operating_FY25-26",
        "operating_FY26-27",
    ]
    
    kern_cols = {
        "Unnamed: 0": "fund source",
        "Unnamed: 1": "capital_FY23-24",
        "Unnamed: 2": "operating_FY23-24",
        "Unnamed: 3": "capital_FY24-25",
        "Unnamed: 4": "operating_FY24-25",
        "Unnamed: 5": "capital_FY25-26",
        "Unnamed: 6": "operating_FY25-26",
        "Unnamed: 7": "capital_FY26-27",
        "Unnamed: 8": "operating_FY26-27",
    }

    arvin = get_kern_data(
        3,
        43,
        "arvin",
        "Purcahse and install EV mirco-grid Purcahse and install EV mirco-grid",
    )

    california_city = get_kern_data(
        7,
        39,
        "california city",
        "Purchase and construct transit building to house EV vans and solar charging stations",
    )

    delano = get_kern_data(11, 35, "delano", "Construct transit Facility")

    get = get_kern_data(
        15,
        31,
        "get",
        "Golden Empire Transit Free or near free transit fares Back up hydrogent fuel plant",
    )

    kern_transit = get_kern_data(
        19,
        27,
        "kern transit",
        "Transition to Zero-Emission Vehicles and supporting infrastructure",
    )

    mcfarland = get_kern_data(
        23,
        23,
        "mcfarland",
        "Design and construct a transit station providing a transit office , waitning area, restrooms and EV charging stations",
    )

    ridgecrest = get_kern_data(
        27,
        19,
        "ridgecrest",
        "Replacement of Cutaway Buses with Electric Vans and Construction of Bus Stop at North Norma Street and West Felspar Avenue",
    )

    shafter = get_kern_data(
        31, 15, "shafter", "bus storage, new transit vehicles, free ridership fare program"
    )

    taft = get_kern_data(
        35,
        11,
        "taft",
        "convert the City’s gasoline powered fleet of on-demand transit vehicles to plug-in electric vans compatible with the solar-powered charging infrastructure being completed now.",
    )

    tehahapi = get_kern_data(
        39,
        9,
        "tehahapi",
        "Improvements to Downtown Transit Center and Installation of EV Charging Infrastructure",
    )

    wasco = get_kern_data(
        43, 
        3, 
        "wasco", "New Transit Operating and Maintenance Facility"
    )

    kern_concat = pd.concat(
        [
            wasco,
            tehahapi,
            taft,
            shafter,
            ridgecrest,
            mcfarland,
            kern_transit,
            get,
            arvin,
            california_city,
            delano,
        ],
        ignore_index=True,
    )

    kern_clean = kern_concat[kern_concat["fund source"].isin(["TIRCP", "ZETCP"])].rename(
        columns={"implementing_agency": "implementing agenc-y/-ies"}
    )

    kern_clean = kern_clean[col_order]
    
    return kern_clean


In [100]:
# CLEANING COMPLETE
kern_cleaned = clean_kern()

display(
    
    kern_cleaned.shape,
    kern_cleaned
)

(20, 12)

Unnamed: 0,rtpa,implementing agenc-y/-ies,project,fund source,capital_FY23-24,capital_FY24-25,capital_FY25-26,capital_FY26-27,operating_FY23-24,operating_FY24-25,operating_FY25-26,operating_FY26-27
1,Kern COG,wasco,New Transit Operating and Maintenance Facility,TIRCP,1350328.0,1362548.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Kern COG,wasco,New Transit Operating and Maintenance Facility,ZETCP,146871.0,83215.0,83215.0,83215.0,0.0,0.0,0.0,0.0
5,Kern COG,taft,convert the City’s gasoline powered fleet of o...,TIRCP,359651.0,362907.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Kern COG,taft,convert the City’s gasoline powered fleet of o...,ZETCP,39119.0,22164.0,22164.0,22164.0,0.0,0.0,0.0,0.0
8,Kern COG,shafter,"bus storage, new transit vehicles, free riders...",TIRCP,1029704.0,1503605.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Kern COG,shafter,"bus storage, new transit vehicles, free riders...",ZETCP,111998.0,0.0,0.0,63456.0,0.0,63456.0,63456.0,0.0
11,Kern COG,ridgecrest,Replacement of Cutaway Buses with Electric Van...,TIRCP,1490120.0,1503605.0,0.0,0.0,0.0,0.0,0.0,0.0
12,Kern COG,ridgecrest,Replacement of Cutaway Buses with Electric Van...,ZETCP,162076.0,91830.0,91830.0,91830.0,0.0,0.0,0.0,0.0
14,Kern COG,mcfarland,Design and construct a transit station providi...,TIRCP,707216.0,713606.0,0.0,0.0,0.0,0.0,0.0,0.0
15,Kern COG,mcfarland,Design and construct a transit station providi...,ZETCP,76922.0,43583.0,43583.0,43583.0,0.0,0.0,0.0,0.0


# DONE concat_everything 
Test of concat all the dictionary dataframe vales

In [105]:
def concat_everything():
    all_fund_requests = pd.concat(cleaned_fund_request.values(), ignore_index=True)
    
    all_fund_requests = pd.concat(
    [
        all_fund_requests,
        santa_cruz_cleaned,
        orange_cleaned,
        butte_cleaned,
        lassen_cleaned,
        ventura_big_merge,
        kern_clean,
    ],
    ignore_index=True,
    )
    
    all_fund_requests["fund source"] = all_fund_requests["fund source"].astype(str)
    
    return all_fund_requests

In [106]:
test_all_df = concat_everything()

display(
    test_all_df.shape
)

(287, 12)

In [102]:
fund_request_df = cleaned_fund_request.values()

type(fund_request_df)

dict_values

In [112]:
all_fund_requests = pd.concat(cleaned_fund_request.values(), ignore_index=True)

In [113]:
all_fund_requests = pd.concat(
    [
        all_fund_requests,
        santa_cruz_cleaned,
        orange_cleaned,
        butte_cleaned,
        lassen_cleaned,
        ventura_big_merge,
        kern_clean,
    ],
    ignore_index=True,
)

In [114]:
all_fund_requests["fund source"] = all_fund_requests["fund source"].astype(str)

In [115]:
display(
    all_fund_requests.shape,
    type(all_fund_requests),
    all_fund_requests.dtypes,
)

(287, 12)

pandas.core.frame.DataFrame

rtpa                         object
implementing agenc-y/-ies    object
project                      object
fund source                  object
capital_FY23-24              object
capital_FY24-25              object
capital_FY25-26              object
capital_FY26-27              object
operating_FY23-24            object
operating_FY24-25            object
operating_FY25-26            object
operating_FY26-27            object
dtype: object

# Save Concat DF Parquet to GCS

In [53]:
# SAVING TO GCS!
all_fund_requests.to_parquet(f"{GCS_PATH}all_fund_requests_concat.parquet")

In [108]:
test_all_df.to_parquet(f"{GCS_PATH}test_DELETE_LATER.parquet")

# DONE fund_requet_melt

In [126]:

def fund_request_melt(df):
    id_vars = [
        "rtpa",
        "implementing agenc-y/-ies",
        "project",
        "fund source",
    ]
    val_vars = [
        "capital_FY23-24",
        "capital_FY24-25",
        "capital_FY25-26",
        "capital_FY26-27",
        "operating_FY23-24",
        "operating_FY24-25",
        "operating_FY25-26",
        "operating_FY26-27",
    ]

    melt = df.melt(
        id_vars=id_vars,
        value_vars=val_vars,
        var_name="capital/operation fy",
        value_name="fund amount",
        ignore_index=True,
    )
    
    # splitting the cap/operations columns

    melt[["project type", "fiscal year"]] = melt["capital/operation fy"].str.split(
        "_FY", expand=True
    )

    melt["fund amount"] = melt["fund amount"].fillna(0).astype("int64")
    melt["fund source"] = melt["fund source"].astype("str")
    
    return melt

In [130]:
all_melt = fund_request_melt(all_fund_requests)

In [123]:
display(type(all_melt), all_melt.shape, all_melt.dtypes, all_melt.head())

pandas.core.frame.DataFrame

(2296, 8)

rtpa                         object
implementing agenc-y/-ies    object
project                      object
fund source                  object
capital/operation fy         object
fund amount                   int64
project type                 object
fiscal year                  object
dtype: object

Unnamed: 0,rtpa,implementing agenc-y/-ies,project,fund source,capital/operation fy,fund amount,project type,fiscal year
0,Alpine County Transportation Commission,Alpine County Transportation Commission,Transit Facility Conversion Project,TIRCP,capital_FY23-24,360641,capital,23-24
1,Alpine County Transportation Commission,Alpine County Transportation Commission,Transit Facility Conversion Project,TIRCP,capital_FY23-24,0,capital,23-24
2,Alpine County Transportation Commission,Alpine County Transportation Commission,Transit Facility Conversion Project,ZETCP (GGRF),capital_FY23-24,3616,capital,23-24
3,Alpine County Transportation Commission,Alpine County Transportation Commission,Transit Facility Conversion Project,ZETCP (PTA),capital_FY23-24,3123,capital,23-24
4,Amador County Transportation Commission,Amador Transit,,TIRCP,capital_FY23-24,100000,capital,23-24


In [131]:
# check for NaNs
all_melt["fund amount"].isna().sum()

0

In [132]:
# checking fund amounts for any non-int values
all_melt["fund amount"].unique()

array([   360641,         0,      3616,      3123,    100000,   2576611,
          238532,    500000,   1175501,     69395,     80357,   9085857,
          242312,    521458,   6849293,    329561,    295818,   1133408,
          101796, 498650905, 119494973,   3706510,     87574,    198860,
           84159,   8254231,    108201,     42539,     49259,   4848229,
          123518,    271142,    110636,    104561,    696393,    722403,
          836515,   1000000,   2080000,   6400000,    200000,    370000,
         1486685,    297576,    133646,  10000000,    400000,     92109,
          501942,   1019544,   3354086,   2510740,    614200,    250000,
          100520,     61421,     53042,  63382700,   4003053,   7956643,
         2868594,  60000000,   2328990,    348002,  10126000,   5434000,
         1705263,  26000000,    943316,   2050000,      9408,      8124,
          124000,    280000,    395000,   1778000,   1400000,   4000000,
          387000,    375000,    600000,    422000, 

In [134]:
display(all_melt["fund source"].value_counts())

TIRCP           1352
ZETCP (GGRF)     464
ZETCP (PTA)      304
ZETCP            120
CMAQ              16
Measure V          8
`5339              8
SGR                8
Farebox            8
`5307              8
Name: fund source, dtype: int64

# Save Melt DF Parquet to GCS

In [63]:
all_melt.to_parquet(f"{GCS_PATH}all_fund_requests_melt.parquet")

# Draft Aggregations

In [135]:
def make_bar(data, x_axis, y_axis):
    chart = (
        alt.Chart(data)
        .mark_bar()
        .encode(
            x=x_axis,
            y=y_axis,
        )
    )
    return chart

In [136]:
by_type = (
    all_melt.groupby(["project type"])
    .agg(
        {
            "fund amount": "sum",
        }
    )
    .reset_index()
)

display(by_type)
make_bar(by_type, y_axis="project type", x_axis="fund amount")

Unnamed: 0,project type,fund amount
0,capital,1855348721
1,operating,625654847


In [137]:
by_year = (
    all_melt.groupby(["fiscal year", "project type"])
    .agg(
        {
            "fund amount": "sum",
        }
    )
    .reset_index()
)

display(by_year)
make_bar(
    by_year,
    y_axis="fiscal year",
    x_axis="fund amount",
)

Unnamed: 0,fiscal year,project type,fund amount
0,23-24,capital,945242943
1,23-24,operating,230807266
2,24-25,capital,812480147
3,24-25,operating,169494114
4,25-26,capital,49092309
5,25-26,operating,96177774
6,26-27,capital,48533322
7,26-27,operating,129175693


In [138]:
alt.Chart(by_year).mark_line(point=True).encode(
    x="fiscal year", y="fund amount", color="project type"
)

In [139]:
alt.Chart(by_year).mark_bar(point=True).encode(
    x="fiscal year", y="fund amount", color="project type"
)

In [140]:
by_source = (
    all_melt.groupby(["fund source"])
    .agg({"fund amount": "sum", "rtpa": "nunique"})
    .reset_index()
)

make_bar(by_source, y_axis="fund source", x_axis="fund amount")

In [141]:
by_rtpa = (
    all_melt.groupby(["rtpa"])
    .agg({"fund amount": "sum", "project": "nunique"})
    .reset_index()
)

make_bar(by_rtpa, "rtpa", "fund amount")