# Run Functions to Add Information to Projects

To run the data through the script, update the `my_file` path to the most recent FMIS and QMRS export uploaded to GCS, then run the function in the `Export Data` section with your dataframe and the current date. Your aggregated data will then be ready in GCS.

In [1]:
import _data_utils
import _script_utils
import pandas as pd
from calitp_data_analysis.sql import to_snakecase

In [2]:
# Show up to 100 columns when a dataframe is displayed.
pd.set_option("display.max_columns", 100)
# Do not truncate long cell values when a dataframe is displayed.
pd.set_option("display.max_colwidth", None)

In [3]:
# Locodes workbook stored in GCS, with its columns passed through to_snakecase.
# The path is a plain string: it has no placeholders, so no f-prefix is needed.
locodes = to_snakecase(
    pd.read_excel(
        "gs://calitp-analytics-data/data-analyses/dla/e-76Obligated/locodes_updated7122021.xlsx"
    )
)

## Read in Data and function development / Test Function

For the following function:
* update `my_file` to the file name of the most recent FMIS & QMRS export
* the second argument is the unique recipient identifier; it should stay the same for subsequent exports
* the third argument is the aggregation level you want for the data. Unless otherwise specified, it should be `agg`, which is one row per project

In [4]:
# Base GCS folder for the export and the program code workbooks read in this notebook.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/dla/dla-iija"

In [5]:
# File name of the export to process; update this for each new export.
my_file = "FMIS_IIJA_20250709.xlsx"

### Check data
* July 2025 Notes
    * `summary_recipient_defined_text_field_1_value` changed to `summary_recipient` in `_script_utils.run_script_original` and `_script_utils.run_script_2025`.
    * `rk_locode` is missing so I used `run_script_original` instead.
    * Updated `_script_utils.add_county_abbrev()` because the values in the counties geojson in `shared_data_catalog.yml` changed. 

In [6]:
# Load the raw export directly so the script output can be compared against it below.
check_data = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}/{my_file}"))

In [7]:
check_data.head(1)

Unnamed: 0,fmis_transaction_date,program_code,program_code_description,project_number,recipient_project_number,project_title,county_code,congressional_district,project_status_description,project_description,improvement_type,improvement_type_description,total_cost_amount,obligations_amount,summary_recipient
0,2022-03-15,Y230,STBG-URBANIZED >200K IIJA,6084275,0422000280L,"FREMONT, RICHMOND, AND MARIN AND SONOMA COUNTIES, ALONG THE SMART CORRIDOR. BIKE SHARE CAPITAL PROGRAM (TC)",13,Cong Dist 15,Active,"FREMONT, RICHMOND, AND MARIN AND SONOMA COUNTIES, ALONG THE SMART CORRIDOR. BIKE SHARE CAPITAL PROGRAM (TC)",44,Other,700000.0,700000.0,L6084MTC


### Run Script
* Choose between `run_script_original` or `run_script_2025` depending on the dataframe you receive.

In [8]:
# Arguments: export file name, unique recipient identifier, aggregation level
# ("agg" = one row per project).
# NOTE(review): the July 2025 notes say this column was changed to
# `summary_recipient` inside `_script_utils`; confirm the name passed here is
# still what `run_script_original` expects.
df = _script_utils.run_script_original(
    my_file, "summary_recipient_defined_text_field_1_value", "agg"
)

Index(['fmis_transaction_date', 'project_number', 'implementing_agency',
       'summary_recipient_defined_text_field_1_value', 'funding_type_code',
       'program_code', 'program_code_description', 'recipient_project_number',
       'improvement_type', 'improvement_type_description',
       'program_code_description_for_description', 'project_title',
       'obligations_amount', 'total_cost_amount', 'congressional_district',
       'district', 'county_code', 'county_name', 'county_name_abbrev',
       'county_name_title', 'implementing_agency_locode', 'rtpa_name',
       'mpo_name'],
      dtype='object')

  df['implementing_agency_locode'] = df['implementing_agency_locode'].str.replace('.0', '')


True

### Testing the data

In [9]:
# The aggregated output should have exactly one row per unique project number in the raw export.
assert len(df) == check_data.project_number.nunique()

In [10]:
check_data.columns

Index(['fmis_transaction_date', 'program_code', 'program_code_description',
       'project_number', 'recipient_project_number', 'project_title',
       'county_code', 'congressional_district', 'project_status_description',
       'project_description', 'improvement_type',
       'improvement_type_description', 'total_cost_amount',
       'obligations_amount', 'summary_recipient'],
      dtype='object')

In [11]:
# Spot check: all raw export rows for a single project number.
check_data.loc[check_data.project_number == "5004049"]

Unnamed: 0,fmis_transaction_date,program_code,program_code_description,project_number,recipient_project_number,project_title,county_code,congressional_district,project_status_description,project_description,improvement_type,improvement_type_description,total_cost_amount,obligations_amount,summary_recipient
1771,2024-04-15,Y001,NATIONAL HIGHWAY PERF IIJA,5004049,11955780L,"WEST MISSION BAY DRIVE OVER THE SAN DIEGO RIVER BRIDGE REPLACEMENT, BR. NO. 57C-0023",73,Cong Dist 52,Active,"WEST MISSION BAY DRIVE OVER THE SAN DIEGO RIVER BRIDGE REPLACEMENT, BR. NO. 57C-0023",10,Bridge Replacement - Added Capacity,43068358.0,38128417.81,L5004SANDAG
1772,2024-04-15,Y001,NATIONAL HIGHWAY PERF IIJA,5004049,11955780L,"WEST MISSION BAY DRIVE OVER THE SAN DIEGO RIVER BRIDGE REPLACEMENT, BR. NO. 57C-0023",73,Cong Dist 52,Active,"WEST MISSION BAY DRIVE OVER THE SAN DIEGO RIVER BRIDGE REPLACEMENT, BR. NO. 57C-0023",17,Construction Engineering,12180390.07,10321290.0,L5004SANDAG
1773,2024-04-15,Y110,HIP BRIDGE FORMULA PROGRAM,5004049,11955780L,"WEST MISSION BAY DRIVE OVER THE SAN DIEGO RIVER BRIDGE REPLACEMENT, BR. NO. 57C-0023",73,Cong Dist 52,Active,"WEST MISSION BAY DRIVE OVER THE SAN DIEGO RIVER BRIDGE REPLACEMENT, BR. NO. 57C-0023",10,Bridge Replacement - Added Capacity,13409672.0,11871582.19,L5004SANDAG
1777,2024-04-15,Y908,HWY INFRA BRDG REPL -2022 APPN,5004049,11955780L,"WEST MISSION BAY DRIVE OVER THE SAN DIEGO RIVER BRIDGE REPLACEMENT, BR. NO. 57C-0023",73,Cong Dist 52,Active,"WEST MISSION BAY DRIVE OVER THE SAN DIEGO RIVER BRIDGE REPLACEMENT, BR. NO. 57C-0023",10,Bridge Replacement - Added Capacity,10373649.58,9183791.97,L5004SANDAG
1778,2024-04-15,Y908,HWY INFRA BRDG REPL -2022 APPN,5004049,11955780L,"WEST MISSION BAY DRIVE OVER THE SAN DIEGO RIVER BRIDGE REPLACEMENT, BR. NO. 57C-0023",73,Cong Dist 52,Active,"WEST MISSION BAY DRIVE OVER THE SAN DIEGO RIVER BRIDGE REPLACEMENT, BR. NO. 57C-0023",17,Construction Engineering,595918.93,527567.03,L5004SANDAG
1779,2024-04-15,Y909,HWY INFRA BRDG REPL -2023 APPN,5004049,11955780L,"WEST MISSION BAY DRIVE OVER THE SAN DIEGO RIVER BRIDGE REPLACEMENT, BR. NO. 57C-0023",73,Cong Dist 52,Active,"WEST MISSION BAY DRIVE OVER THE SAN DIEGO RIVER BRIDGE REPLACEMENT, BR. NO. 57C-0023",10,Bridge Replacement - Added Capacity,11300338.42,10004189.6,L5004SANDAG


In [12]:
df.columns

Index(['fmis_transaction_date', 'project_number', 'implementing_agency',
       'summary_recipient_defined_text_field_1_value', 'funding_type_code',
       'program_code', 'program_code_description', 'recipient_project_number',
       'improvement_type', 'improvement_type_description',
       'old_project_title_desc', 'obligations_amount', 'total_cost_amount',
       'congressional_district', 'district', 'county_code', 'county_name',
       'county_name_abbrev', 'implementing_agency_locode', 'rtpa_name',
       'mpo_name', 'new_project_title', 'new_description_col'],
      dtype='object')

In [13]:
# Spot check: the same project number in the script output, for comparison with the raw rows.
df.loc[df["project_number"] == "5004049"]

Unnamed: 0,fmis_transaction_date,project_number,implementing_agency,summary_recipient_defined_text_field_1_value,funding_type_code,program_code,program_code_description,recipient_project_number,improvement_type,improvement_type_description,old_project_title_desc,obligations_amount,total_cost_amount,congressional_district,district,county_code,county_name,county_name_abbrev,implementing_agency_locode,rtpa_name,mpo_name,new_project_title,new_description_col
785,2024-04-15,5004049,San Diego,L5004SANDAG,IIJA-F,Y001|Y110|Y908|Y909,National Highway Performance Program (NHPP)|Bridge Formula Program|Bridge Replacement and Rehabilitation Program,11955780L,10|17,Bridge Replacement - Added Capacity|Construction Engineering,"WEST MISSION BAY DRIVE OVER THE SAN DIEGO RIVER BRIDGE REPLACEMENT, BR. NO. 57C-0023",80036838,90928327,|52|,|11|,73,San Diego County,|SD|,4,San Diego Association of Governments,San Diego Association Of Governments,Replace Bridge in San Diego,"Replace Bridge in San Diego, part of the National Highway Performance Program (NHPP), and the Bridge Formula Program, and the Bridge Replacement and Rehabilitation Program. (Federal Project ID: 5004049)."


## Export Data

In [14]:
### rename the file for export to GCS
### use date to rename

In [15]:
# _script_utils.export_to_gcs(df, "07102025_agg")

## Removing S***ba
### `data_utils`

In [16]:
def update_program_code_list2() -> pd.DataFrame:
    """
    Build the IIJA program code lookup from the two program code workbooks in GCS.

    Reads the expanded workbook (`new_description`) and the original workbook
    (`description`, `program_name`), outer-merges them on `iija_program_code`,
    and returns a dataframe with the columns `iija_program_code`,
    `new_description` and `program_name`.

    * `new_description` is stripped of surrounding whitespace and, where it is
      missing, filled from the original `description`.
    * `program_name` gets " Program" appended when it does not already contain
      the word "Program". Rows without a `program_name` (codes that appear
      only in the expanded workbook) are left missing.

    Returns:
        The merged program code lookup.
    """
    updated_codes = to_snakecase(
        pd.read_excel(
            f"{GCS_FILE_PATH}/program_codes/FY21-22ProgramCodesAsOf5-25-2022.v2_expanded090823.xlsx"
        )
    )[["iija_program_code", "new_description"]]
    original_codes = to_snakecase(
        pd.read_excel(
            f"{GCS_FILE_PATH}/program_codes/Copy of lst_IIJA_Code_20230908.xlsx"
        )
    )[["iija_program_code", "description", "program_name"]]

    # Outer merge keeps codes that appear in only one of the two workbooks.
    program_codes = pd.merge(
        updated_codes,
        original_codes,
        on="iija_program_code",
        how="outer",
    )

    # Assign the result back instead of calling fillna(inplace=True) on the
    # column: the in-place form is chained assignment, which warns on pandas 2.x
    # and does not update the dataframe under Copy-on-Write.
    program_codes["new_description"] = (
        program_codes["new_description"]
        .str.strip()
        .fillna(program_codes["description"])
    )
    program_codes = program_codes.drop(columns=["description"])

    # Only touch rows that actually have a name: after the outer merge
    # `program_name` is NaN for codes missing from the original workbook, and
    # a substring test against NaN raises TypeError.
    names = program_codes["program_name"]
    needs_suffix = names.notna() & ~names.str.contains(
        "Program", regex=False, na=False
    )
    program_codes.loc[needs_suffix, "program_name"] = (
        names[needs_suffix] + " Program"
    )

    return program_codes

### `script_utils`

In [17]:
def county_district_crosswalk() -> pd.DataFrame:
    """
    Pair each county with the district(s) recorded for it
    in the locodes dataset.

    Every county from the county table is kept; a county with
    no match in the locodes data gets a missing district.
    """
    # Unique (district, county) combinations present in the locodes data
    district_lookup = _script_utils.load_locodes()[
        ["district", "county_name"]
    ].drop_duplicates()

    # County table that the districts are joined onto
    counties = _script_utils.load_county()

    # Left join so that no county is dropped
    crosswalk = pd.merge(
        counties,
        district_lookup,
        left_on="county_description",
        right_on="county_name",
        how="left",
    )

    # The join key from the locodes side is no longer needed
    return crosswalk.drop(columns=["county_name"])

In [18]:
# Build the crosswalk so the result can be inspected below.
test1 = county_district_crosswalk()

In [19]:
test1.head()

Unnamed: 0,recipient_name,county_description,county_code,district
0,California,Alameda County,1,4.0
1,California,Alpine County,3,10.0
2,California,Amador County,5,10.0
3,California,Butte County,7,3.0
4,California,Calaveras County,9,10.0
