## Requested updates submitted January 2025
* Using the additional lists provided, can you please update your script to include additional program codes
* Updates to program descriptions as highlighted in column C 
* Adding the funding type from column F in the script output. 
* Use the “RK Locode” column K in the Project list as the Primary Locode, and if blank, use your current data source to populate the implementing agency.

In [1]:
import _data_utils
import _script_utils
import numpy as np
import pandas as pd
from calitp_data_analysis.sql import to_snakecase

In [2]:
# Notebook display settings: show up to 100 columns, every row, and the full
# contents of each cell; render floats with two decimal places.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.float_format", "{:.2f}".format)

In [3]:
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/dla/dla-iija"

In [4]:
project_list = "IIJA Project List 01_2025.xlsx"

In [5]:
project_df = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}/{project_list}"))

In [6]:
program_codes = "program_codes/Ycodes_01.2025.xlsx"

In [7]:
program_codes_df = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}/{program_codes}"))

In [8]:
program_codes_df.columns

Index(['program_code', 'short_name', 'program_code_description',
       'funding_type', 'funding_type_code', 'iija_code', 'notes_resources'],
      dtype='object')

### Testing

In [9]:
new_codes = _data_utils.update_program_code_list_2025()

In [10]:
iija_code_map = dict(new_codes[['iija_program_code', 'program_name']].values)

In [11]:
iija_code_map

{'Y44A': 'Advanced Transportation Technologies Deployment Program',
 'Y110': 'Bridge Formula Program',
 'Y113': 'Bridge Formula Program',
 'Y114': 'Hip Bridge Formula Program-24',
 'Y115': 'Hip Bridge Formula Program-25',
 'Y120': 'Bridge Formula Program',
 'Y123': 'Hip Bridge Formula Pgm Off-Sys Program',
 'Y124': 'Hip Bridge Form Prm-Off Sys-24 Program',
 'Y125': 'Hip Bridge Form Prm-Off Sys-25 Program',
 'Y908': 'Bridge Replacement and Rehabilitation Program',
 'Y909': 'Bridge Replacement and Rehabilitation Program',
 'Y600': 'Carbon Reduction Program',
 'Y601': 'Carbon Reduction Program',
 'Y606': 'Carbon Reduction Program',
 'Y607': 'Carbon Reduction Program',
 'Y608': 'Carbon Reduction Program',
 'Y407': 'Charging & Fueling Infra-Iija Program',
 'Y603': 'Hip Comm Proj Cong-Dir 24 Hif Program',
 'Y926': 'Community Project Funding Congressionally Directed Spending Program',
 'Y928': 'Community Project Funding Congressionally Directed Spending Program',
 'Y400': 'Congestion Mitigati

In [12]:
project_df.columns

Index(['fmis_transaction_date', 'program_code', 'program_code_description',
       'pid_district', 'project_number', 'recipient_project_number',
       'pid_check1', 'efis_id', 'pid_check2', 'project_title', 'rk_locode',
       'county_code', 'congressional_district', 'project_status_description',
       'project_description', 'improvement_type',
       'improvement_type_description', 'total_cost_amount',
       'obligations_amount', 'summary_recipient_defined_text_field_1_value',
       'comp'],
      dtype='object')

In [14]:
test_df = _data_utils.add_new_codes(project_df)

In [15]:
test_df.columns

Index(['fmis_transaction_date', 'program_code', 'program_code_description',
       'pid_district', 'project_number', 'recipient_project_number',
       'pid_check1', 'efis_id', 'pid_check2', 'project_title', 'rk_locode',
       'county_code', 'congressional_district', 'project_status_description',
       'project_description', 'improvement_type',
       'improvement_type_description', 'total_cost_amount',
       'obligations_amount', 'summary_recipient_defined_text_field_1_value',
       'comp', 'iija_program_code', 'funding_type_code'],
      dtype='object')

In [None]:
test_df.info()

In [None]:
test_df[
    ["program_code_description", "program_code", "funding_type_code"]
].drop_duplicates().sample(10)

### Breakout `_data_utils.update_program_code_list2()`

In [None]:
def add_program_to_row(row):
    """
    Return the row's ``program_name`` with a trailing " Program" when needed.

    Intended for ``DataFrame.apply(..., axis=1)``; any mapping with a
    ``"program_name"`` key works.

    * A string that already contains "Program" (case-sensitive) is returned
      unchanged.
    * Any other string gets " Program" appended.
    * A non-string value (NaN/None left behind by an outer merge) is returned
      unchanged instead of raising ``TypeError`` on the ``in`` test.
    """
    name = row["program_name"]

    # Missing names cannot be searched or concatenated; pass them through.
    if not isinstance(name, str):
        return name

    if "Program" in name:
        return name
    return name + " Program"

In [None]:
def load_program_codes_og() -> pd.DataFrame:
    """
    Read the original IIJA code workbook from GCS.

    Column names are passed through ``to_snakecase`` and only the code,
    its description and its program name are kept.
    """
    keep_cols = ["iija_program_code", "description", "program_name"]
    workbook = pd.read_excel(
        f"{GCS_FILE_PATH}/program_codes/Copy of lst_IIJA_Code_20230908.xlsx"
    )
    return to_snakecase(workbook)[keep_cols]

In [None]:
original_codes_df = load_program_codes_og()

In [None]:
original_codes_df.head()

In [None]:
def load_program_codes_sept_2023() -> pd.DataFrame:
    """
    Read the September 2023 program-code workbook from GCS.

    Column names are passed through ``to_snakecase`` and only the code and
    its updated description (``new_description``) are kept.
    """
    keep_cols = ["iija_program_code", "new_description"]
    workbook = pd.read_excel(
        f"{GCS_FILE_PATH}/program_codes/FY21-22ProgramCodesAsOf5-25-2022.v2_expanded090823.xlsx"
    )
    return to_snakecase(workbook)[keep_cols]

In [None]:
program_codes_sept_2023 = load_program_codes_sept_2023()

In [None]:
program_codes_sept_2023.head(2)

In [None]:
program_codes = pd.merge(
    program_codes_sept_2023,
    original_codes_df,
    on="iija_program_code",
    how="outer",
    indicator=True,
)

In [None]:
program_codes["new_description"] = (
    program_codes["new_description"].str.strip().fillna(program_codes.description)
)

In [None]:
program_codes._merge.value_counts()

In [None]:
program_codes = program_codes.drop(columns={"description", "_merge"})

In [None]:
program_codes["program_name"] = program_codes.apply(add_program_to_row, axis=1)

In [None]:
program_codes.head(2)

In [None]:
def load_program_codes_jan_2025() -> pd.DataFrame:
    """
    Read the January 2025 Y-codes workbook from GCS.

    Keeps the program code, short name, description and funding type code.
    ``program_code`` is renamed to ``iija_program_code`` so the frame shares
    a key with the earlier code lists, and ``short_name`` is title-cased.
    """
    keep_cols = [
        "program_code",
        "short_name",
        "program_code_description",
        "funding_type_code",
    ]
    workbook = pd.read_excel(f"{GCS_FILE_PATH}/program_codes/Ycodes_01.2025.xlsx")

    codes = to_snakecase(workbook)[keep_cols].rename(
        columns={"program_code": "iija_program_code"}
    )
    codes["short_name"] = codes["short_name"].str.title()
    return codes

In [None]:
program_codes_jan_2025 = load_program_codes_jan_2025()

In [None]:
program_codes_jan_2025.head(2)

In [None]:
program_codes.head(2)

In [None]:
program_codes2 = pd.merge(
    program_codes_jan_2025,
    program_codes,
    on="iija_program_code",
    how="outer",
    indicator=True,
)

In [None]:
program_codes2._merge.value_counts()

In [None]:
program_codes2["2025_description"] = (
    program_codes2["program_code_description"]
    .str.strip()
    .fillna(program_codes2.new_description)
)

In [None]:
program_codes2.head(2)

In [None]:
program_codes2["2025_program_name"] = program_codes2.program_name.fillna(
    program_codes2.short_name
)

In [None]:
program_codes2.columns

In [None]:
program_codes2.loc[program_codes2._merge == "both"][
    [
        "iija_program_code",
        "funding_type_code",
        "short_name",
        "program_name",
        "2025_program_name",
        "program_code_description",
        "2025_description",
        "new_description",
        "_merge",
    ]
]

In [None]:
program_codes3 = program_codes2.drop(
    columns=[
        "short_name",
        "program_name",
        "program_code_description",
        "new_description",
        "_merge",
    ]
)

In [None]:
program_codes3 = program_codes3.rename(
    columns={"2025_description": "new_description", "2025_program_name": "program_name"}
)

In [None]:
program_codes3.sort_values(by=["iija_program_code"])

In [None]:
program_codes3["program_name"] = program_codes3.apply(add_program_to_row, axis=1)

#### Turn this into a function

In [None]:
def update_program_code_list_2025(*, include_jan_2025: bool = False) -> pd.DataFrame:
    """
    Build the IIJA program-code lookup table.

    In January 2025 we received a new list of updated codes. This function
    always merges the originally received codes with the September 2023
    list; the January 2025 list is layered on top only when requested.

    The January 2025 steps previously sat in this function inside a bare
    string literal, i.e. they never ran. They are now opt-in so the default
    return value is unchanged.

    Parameters
    ----------
    include_jan_2025 : bool, keyword-only, default False
        False: return the original + September 2023 merge with columns
        ``iija_program_code``, ``new_description`` and ``program_name``.
        True: also merge the January 2025 list; the result additionally
        carries ``funding_type_code``.

    Returns
    -------
    pd.DataFrame
        One row per program code found in any of the merged lists
        (outer merges).
    """
    # Load original codes
    original_codes_df = load_program_codes_og()

    # Load September 2023 codes
    program_codes_sept_2023 = load_program_codes_sept_2023()

    # Merge original + September first
    m1 = pd.merge(
        program_codes_sept_2023,
        original_codes_df,
        on="iija_program_code",
        how="outer",
    )

    # Prefer the September 2023 description; fall back to the original one
    m1["new_description"] = m1["new_description"].str.strip().fillna(m1["description"])

    # Delete unnecessary columns
    m1 = m1.drop(columns=["description"])

    if not include_jan_2025:
        return m1

    # Load January 2025 codes
    program_codes_jan_2025 = load_program_codes_jan_2025()

    # Merge m1 with program codes from January 2025
    m2 = pd.merge(
        program_codes_jan_2025,
        m1,
        on="iija_program_code",
        how="outer",
    )

    # Prefer the January 2025 description; fall back to the earlier one
    m2["2025_description"] = (
        m2["program_code_description"].str.strip().fillna(m2["new_description"])
    )

    # Keep the existing program name; use the 2025 short name only for codes
    # that had no name before
    m2["2025_program_name"] = m2["program_name"].fillna(m2["short_name"])

    # Delete outdated columns, then rename to match the original sheet
    m2 = m2.drop(
        columns=[
            "short_name",
            "program_name",
            "program_code_description",
            "new_description",
        ]
    ).rename(
        columns={
            "2025_description": "new_description",
            "2025_program_name": "program_name",
        }
    )

    # Append "Program" to program names that lack the string "Program".
    # NOTE(review): a code with no name in any source list reaches
    # add_program_to_row as NaN -- confirm the helper tolerates that.
    m2["program_name"] = m2.apply(add_program_to_row, axis=1)
    return m2

In [None]:
program_codes_jan_2025 = _data_utils.load_program_codes_jan_2025()

In [None]:
program_codes_jan_2025.loc[
    program_codes_jan_2025.iija_program_code.str.contains("ER03")
]

In [None]:
program_codes_jan_2025.loc[
    program_codes_jan_2025.iija_program_code.str.contains("ER01")
]

In [None]:
og_code_list = update_program_code_list_2025()

In [None]:
og_code_list.head(2)

In [None]:
og_code_list.loc[og_code_list.iija_program_code.str.contains("ER03")]

### Project
`Use the “RK Locode” column K in the Project list as the Primary Locode, and if blank, use your current data source to populate the implementing agency.`

In [None]:
march_file = "FMIS_Projects_Universe_IIJA_Reporting_03012024_ToDLA.xlsx"

In [None]:
march_data = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}/{march_file}"))

In [None]:
march_data.info()

In [None]:
project_df.head(1)

In [None]:
project_df[
    [
        "summary_recipient_defined_text_field_1_value",
        "rk_locode",
    ]
].head()

#### Filter out for rows with a locode first?

In [None]:
def identify_agency2(df: pd.DataFrame) -> pd.DataFrame:
    """
    Fill in implementing-agency details, using the column rk_locode first
    and then the original function from Natalie.

    Rows are resolved in three tiers, and each input row ends up in exactly
    one of them:

    1. Rows with ``rk_locode`` are left-joined to the locode workbook.
    2. Rows without ``rk_locode`` are matched on
       ``summary_recipient_defined_text_field_1_value`` against a crosswalk
       built from the tier-1 rows.
    3. Rows still unmatched are passed to
       ``_data_utils.add_name_from_locode``.

    Parameters
    ----------
    df : pd.DataFrame
        Project list containing ``rk_locode`` and
        ``summary_recipient_defined_text_field_1_value``.

    Returns
    -------
    pd.DataFrame
        The three tiers concatenated with a fresh index. ``rk_locode`` is
        exposed as ``implementing_agency_locode`` for tiers 1 and 2.
    """
    text_col = "summary_recipient_defined_text_field_1_value"

    # Load dataframe with locodes
    locodes_df = to_snakecase(
        pd.read_excel(
            "gs://calitp-analytics-data/data-analyses/dla/e-76Obligated/locodes_updated7122021.xlsx"
        )
    )

    # Tier 1: rows in which rk_locode is filled
    has_locode = df["rk_locode"].notna()
    filled_locode_df = pd.merge(
        df.loc[has_locode].reset_index(drop=True),
        locodes_df,
        left_on="rk_locode",
        right_on="agency_locode",
        how="left",
        indicator=True,
    )
    display("Rows with locodes filled")
    display(filled_locode_df._merge.value_counts())

    # Clean; `_merge` is dropped so it does not leak into the final frame
    filled_locode_df = filled_locode_df.rename(
        columns={
            "agency_name": "implementing_agency",
            "rk_locode": "implementing_agency_locode",
        }
    ).drop(
        columns=[
            "active_e76s______7_12_2021_",
            "mpo_locode_fads",
            "agency_locode",
            "_merge",
        ]
    )

    # Create a crosswalk out of tier 1 so rows without a rk_locode but with
    # summary_recipient_defined_text_field_1_value filled can be merged
    crosswalk_cols = [
        text_col,
        "implementing_agency",
        "implementing_agency_locode",
        "district",
        "county_name",
        "rtpa_name",
        "mpo_name",
    ]

    # pandas matches null merge keys to each other, so a crosswalk row with a
    # null key would be attached to every project row whose text field is
    # null; drop such rows before de-duplicating.
    crosswalk_df = (
        filled_locode_df[crosswalk_cols]
        .dropna(subset=[text_col])
        .drop_duplicates(subset=[text_col])
    )

    # Tier 2: rows with missing locodes, matched through the crosswalk
    missing_locode_df = (
        df.loc[~has_locode].reset_index(drop=True).drop(columns=["rk_locode"])
    )
    missing_merged = pd.merge(
        missing_locode_df,
        crosswalk_df,
        on=text_col,
        how="left",
        indicator=True,
    )
    display("Rows with locodes filled with the crosswalk")
    display(missing_merged._merge.value_counts())

    from_crosswalk = missing_merged.loc[missing_merged._merge == "both"].drop(
        columns=["_merge"]
    )

    # Tier 3: rows the crosswalk could not resolve. Strip the (empty) columns
    # the crosswalk merge added before handing them to Natalie's function.
    added_cols = [col for col in crosswalk_cols if col != text_col] + ["_merge"]
    unresolved = (
        missing_merged.loc[missing_merged._merge == "left_only"]
        .reset_index(drop=True)
        .drop(columns=added_cols)
    )

    # Fill in summary_recipient_defined_text_field_1_value
    unresolved[text_col] = unresolved[text_col].fillna("None")

    # Try add_name_from_locode from _data_utils
    unresolved = _data_utils.add_name_from_locode(unresolved, text_col)

    # Concat the three disjoint tiers (the unresolved rows are included once,
    # via `unresolved`, not a second time through the crosswalk merge)
    final_df = pd.concat(
        [filled_locode_df, from_crosswalk, unresolved], ignore_index=True
    )
    display("Do the # of rows match?")
    display(len(final_df) == len(df))

    return final_df

In [None]:
project_df_test = identify_agency2(project_df)

In [None]:
len(project_df_test)

In [None]:
len(project_df)

In [None]:
filled_locode_df = project_df.loc[project_df.rk_locode.notna()].reset_index(drop=True)

In [None]:
# Load the locode lookup here: in the visible cells `locodes_df` is only
# assigned inside identify_agency2 / identify_agency3, so it does not exist
# at notebook level.
locodes_df = to_snakecase(
    pd.read_excel(
        "gs://calitp-analytics-data/data-analyses/dla/e-76Obligated/locodes_updated7122021.xlsx"
    )
)

# Merge the two dataframes
filled_locode_df2 = pd.merge(
    filled_locode_df,
    locodes_df,
    left_on="rk_locode",
    right_on="agency_locode",
    how="left",
    indicator=True,
)

In [None]:
filled_locode_df2 = filled_locode_df2.rename(
    columns={
        "agency_name": "implementing_agency",
        "rk_locode": "implementing_agency_locode",
    }
).drop(columns=["active_e76s______7_12_2021_", "mpo_locode_fads", "agency_locode"])

In [None]:
def identify_agency3(df: pd.DataFrame) -> pd.DataFrame:
    """
    Fill in implementing-agency details, using the column rk_locode first
    and then the original function from Natalie, and add county fields.

    Steps:

    1. Rows with ``rk_locode`` are left-joined to the locode workbook.
    2. Rows without ``rk_locode`` are passed to
       ``_data_utils.add_name_from_locode`` keyed on
       ``summary_recipient_defined_text_field_1_value``.
    3. Both parts are concatenated, then ``county_description``,
       ``recipient_name`` and ``district`` are mapped from ``county_code``.
    4. "Statewide County" is relabelled "Statewide", and missing
       ``implementing_agency`` / ``county_name`` values become "Unknown".

    NOTE(review): ``county_base`` is not defined in the visible cells. It is
    read here as a notebook-level DataFrame and must provide the columns
    ``county_code``, ``county_description`` and ``recipient_name``.

    Parameters
    ----------
    df : pd.DataFrame
        Project list containing ``rk_locode``, ``county_code`` and
        ``summary_recipient_defined_text_field_1_value``.

    Returns
    -------
    pd.DataFrame
        The enriched project list with a fresh index.
    """
    text_col = "summary_recipient_defined_text_field_1_value"

    # Load dataframe with locodes
    locodes_df = to_snakecase(
        pd.read_excel(
            "gs://calitp-analytics-data/data-analyses/dla/e-76Obligated/locodes_updated7122021.xlsx"
        )
    ).rename(
        columns={
            "agency_name": "implementing_agency",
        }
    )

    # Filter out for rows in which rk_locode is filled
    has_locode = df["rk_locode"].notna()
    filled_locode_df = df.loc[has_locode].reset_index(drop=True)

    # Merge the two dataframes
    filled_locode_df2 = pd.merge(
        filled_locode_df,
        locodes_df,
        left_on="rk_locode",
        right_on="agency_locode",
        how="left",
        indicator=True,
    )
    display("Rows with locodes filled")
    display(filled_locode_df2._merge.value_counts())

    # Clean (agency_name was already renamed when the workbook was loaded)
    filled_locode_df2 = filled_locode_df2.rename(
        columns={"rk_locode": "implementing_agency_locode"}
    ).drop(
        columns=[
            "active_e76s______7_12_2021_",
            "mpo_locode_fads",
            "agency_locode",
            "_merge",
        ]
    )

    # Filter out for rows with missing locodes
    missing_locode_df = (
        df.loc[~has_locode].reset_index(drop=True).drop(columns=["rk_locode"])
    )

    # Fill in summary_recipient_defined_text_field_1_value
    missing_locode_df[text_col] = missing_locode_df[text_col].fillna("None")

    # Try add_name_from_locode from _data_utils
    missing_locode_df2 = _data_utils.add_name_from_locode(missing_locode_df, text_col)

    # Concat all the dataframes
    final_df = pd.concat([filled_locode_df2, missing_locode_df2], ignore_index=True)
    display("Do the # of rows match?")
    display(len(final_df) == len(df))

    # More cleaning: unique (district, county_name) pairs from the locode
    # workbook, excluding the multi-county bucket and district 53. Written in
    # pandas because the siuba verbs are not imported in this notebook.
    county_district = (
        locodes_df[["district", "county_name"]]
        .dropna()
        .drop_duplicates()
        .sort_values(["district", "county_name"])
    )
    county_district = county_district.loc[
        (county_district["county_name"] != "Multi-County")
        & (county_district["district"] != 53)
    ].reset_index(drop=True)

    county_info = pd.merge(
        county_base,
        county_district,
        how="left",
        left_on="county_description",
        right_on="county_name",
    ).drop(columns=["county_name"])

    # county_code -> attribute lookups
    mapping1 = dict(county_info[["county_code", "county_description"]].values)
    mapping2 = dict(county_info[["county_code", "recipient_name"]].values)
    mapping3 = dict(county_info[["county_code", "district"]].values)

    final_df["county_description"] = final_df.county_code.map(mapping1)
    final_df["recipient_name"] = final_df.county_code.map(mapping2)
    final_df["district"] = final_df.county_code.map(mapping3)

    final_df.loc[final_df.county_name == "Statewide County", "county_name"] = (
        "Statewide"
    )

    final_df["implementing_agency"] = final_df["implementing_agency"].fillna(
        value="Unknown"
    )
    final_df["county_name"] = final_df["county_name"].fillna(value="Unknown")
    return final_df

In [None]:
project_df_test2 = identify_agency3(project_df)

In [None]:
project_df_test2.columns

In [None]:
# Attach each county's district to the county base table, then discard the
# join key that came from the right-hand frame.
# NOTE(review): `county_base` and `county_district` are not assigned in the
# visible notebook-level cells -- confirm they exist before running this.
county_info = county_base.merge(
    county_district,
    how="left",
    left_on="county_description",
    right_on="county_name",
).drop(columns=["county_name"])

In [None]:
mapping1 = dict(county_info[["county_code", "county_description"]].values)
mapping2 = dict(county_info[["county_code", "recipient_name"]].values)
mapping3 = dict(county_info[["county_code", "district"]].values)

In [None]:
mapping1

In [None]:
project_df_test2["county_description"] = project_df_test2.county_code.map(mapping1)
project_df_test2["recipient_name"] = project_df_test2.county_code.map(mapping2)
project_df_test2["district"] = project_df_test2.county_code.map(mapping3)

In [None]:
project_df_test2.loc[
    project_df_test2.county_name == "Statewide County", "county_name"
] = "Statewide"

project_df_test2["implementing_agency"] = project_df_test2[
    "implementing_agency"
].fillna(value="Unknown")
project_df_test2["county_name"] = project_df_test2["county_name"].fillna(
    value="Unknown"
)

In [None]:
project_df_test2.info()

### To-Do
* How to incorporate `Funding Type Code` from `Ycodes` file
* ER03 is still "Emergency Rel 2023 Supplement Program" instead of "Emergency Supplement Funding"