## Requested updates submitted January 2025
* Using the additional lists provided, can you please update your script to include additional program codes
* Updates to program descriptions as highlighted in column C 
* Adding the funding type from column F in the script output. 
* Use the “RK Locode” column K in the Project list as the Primary Locode, and if blank, use your current data source to populate the implementing agency.

In [43]:
import _data_utils
import _script_utils
import numpy as np
import pandas as pd
from calitp_data_analysis.sql import to_snakecase

In [44]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [45]:
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/dla/dla-iija"

In [46]:
project_list = "IIJA Project List 01_2025.xlsx"

In [47]:
project_df = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}/{project_list}"))

In [48]:
program_codes = "program_codes/Ycodes_01.2025.xlsx"

In [49]:
program_codes_df = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}/{program_codes}"))

In [50]:
program_codes_df.columns

Index(['program_code', 'short_name', 'program_code_description',
       'funding_type', 'funding_type_code', 'iija_code', 'notes_resources'],
      dtype='object')

### Breakout `_data_utils.update_program_code_list2()`

In [51]:
def add_program_to_row(row):
    if "Program" not in row["program_name"]:
        return row["program_name"] + " Program"
    else:
        return row["program_name"]

In [52]:
def load_program_codes_og() -> pd.DataFrame:
    df = to_snakecase(
        pd.read_excel(
            f"{GCS_FILE_PATH}/program_codes/Copy of lst_IIJA_Code_20230908.xlsx"
        )
    )[["iija_program_code", "description", "program_name"]]
    return df

In [53]:
original_codes_df = load_program_codes_og()

In [54]:
original_codes_df.head()

Unnamed: 0,iija_program_code,description,program_name
0,22MP,Metropolitan Transportation Planning (FY 22),Metropolitan Transportation Planning (FY 22)
1,22SP,Statewide and Nonmetropolitan Transportation Planning (FY 22),Statewide and Nonmetropolitan Transportation Planning (FY 22)
2,73AD,Military Construction (FMIS),Military Construction
3,ER01,Emergency Suppliment Funding,Emergency Supplement Funding
4,N003,CMAQ - Projects to Reduce PM 2.5 Emissions,Congestion Mitigation & Air Quality Improvement


In [55]:
def load_program_codes_sept_2023() -> pd.DataFrame:
    df = to_snakecase(
        pd.read_excel(
            f"{GCS_FILE_PATH}/program_codes/FY21-22ProgramCodesAsOf5-25-2022.v2_expanded090823.xlsx"
        )
    )[["iija_program_code", "new_description"]]
    return df

In [56]:
program_codes_sept_2023 = load_program_codes_sept_2023()

In [57]:
program_codes_sept_2023.head(2)

Unnamed: 0,iija_program_code,new_description
0,Y001,National Highway Performance Program (NHPP)
1,Y002,National Highway Performance Program (NHPP)


In [58]:
program_codes = pd.merge(
    program_codes_sept_2023,
    original_codes_df,
    on="iija_program_code",
    how="outer",
    indicator=True,
)

In [59]:
program_codes["new_description"] = (
    program_codes["new_description"].str.strip().fillna(program_codes.description)
)

In [60]:
program_codes._merge.value_counts()

both          133
right_only      3
left_only       0
Name: _merge, dtype: int64

In [61]:
program_codes = program_codes.drop(columns={"description", "_merge"})

In [62]:
program_codes["program_name"] = program_codes.apply(add_program_to_row, axis=1)

In [63]:
program_codes.head(2)

Unnamed: 0,iija_program_code,new_description,program_name
0,Y001,National Highway Performance Program (NHPP),National Highway Performance Program (NHPP)
1,Y002,National Highway Performance Program (NHPP),NHPP Exempt Program


In [64]:
def load_program_codes_jan_2025() -> pd.DataFrame:
    df = to_snakecase(
        pd.read_excel(f"{GCS_FILE_PATH}/program_codes/Ycodes_01.2025.xlsx")
    )[["program_code", "short_name", "program_code_description", "funding_type_code"]]

    df = df.rename(
        columns={
            "program_code": "iija_program_code",
        }
    )
    df.short_name = df.short_name.str.title()
    return df

In [65]:
program_codes_jan_2025 = load_program_codes_jan_2025()

In [66]:
program_codes_jan_2025.head(2)

Unnamed: 0,iija_program_code,short_name,program_code_description,funding_type_code
0,Y44A,Adv Tech Innv Mobility Deploy,Advanced Transportation Technologies Deployment Program,IIJA-C
1,Y110,Hip Bridge Formula Program,Bridge Formula Program,IIJA-F


In [67]:
program_codes.head(2)

Unnamed: 0,iija_program_code,new_description,program_name
0,Y001,National Highway Performance Program (NHPP),National Highway Performance Program (NHPP)
1,Y002,National Highway Performance Program (NHPP),NHPP Exempt Program


In [68]:
program_codes2 = pd.merge(
    program_codes_jan_2025,
    program_codes,
    on="iija_program_code",
    how="outer",
    indicator=True,
)

In [69]:
program_codes2._merge.value_counts()

right_only    86
both          50
left_only     22
Name: _merge, dtype: int64

In [70]:
program_codes2["2025_description"] = (
    program_codes2["program_code_description"]
    .str.strip()
    .fillna(program_codes2.new_description)
)

In [71]:
program_codes2.head(2)

Unnamed: 0,iija_program_code,short_name,program_code_description,funding_type_code,new_description,program_name,_merge,2025_description
0,Y44A,Adv Tech Innv Mobility Deploy,Advanced Transportation Technologies Deployment Program,IIJA-C,Advanced Transportation Technologies and Innovative Mobility Deployment,Advanced Transportation Technologies Deployment Program,both,Advanced Transportation Technologies Deployment Program
1,Y110,Hip Bridge Formula Program,Bridge Formula Program,IIJA-F,Bridge Formula Program,Bridge Formula Program,both,Bridge Formula Program


In [72]:
program_codes2["2025_program_name"] = program_codes2.program_name.fillna(
    program_codes2.short_name
)

In [73]:
program_codes2.columns

Index(['iija_program_code', 'short_name', 'program_code_description',
       'funding_type_code', 'new_description', 'program_name', '_merge',
       '2025_description', '2025_program_name'],
      dtype='object')

In [74]:
program_codes2.loc[program_codes2._merge == "both"][
    [
        "iija_program_code",
        "funding_type_code",
        "short_name",
        "program_name",
        "2025_program_name",
        "program_code_description",
        "2025_description",
        "new_description",
        "_merge",
    ]
]

Unnamed: 0,iija_program_code,funding_type_code,short_name,program_name,2025_program_name,program_code_description,2025_description,new_description,_merge
0,Y44A,IIJA-C,Adv Tech Innv Mobility Deploy,Advanced Transportation Technologies Deployment Program,Advanced Transportation Technologies Deployment Program,Advanced Transportation Technologies Deployment Program,Advanced Transportation Technologies Deployment Program,Advanced Transportation Technologies and Innovative Mobility Deployment,both
1,Y110,IIJA-F,Hip Bridge Formula Program,Bridge Formula Program,Bridge Formula Program,Bridge Formula Program,Bridge Formula Program,Bridge Formula Program,both
2,Y113,IIJA-F,Hip Bridge Formula Program,Bridge Formula Program,Bridge Formula Program,Bridge Formula Program,Bridge Formula Program,Bridge Formula Program,both
5,Y120,IIJA-F,Hip Bridge Formula Pgm Off-Sys,Bridge Formula Program,Bridge Formula Program,Bridge Formula Program,Bridge Formula Program,Bridge Formula Program,both
6,Y120,IIJA-F,Hip Bridge Formula Pgm Off-Sys,Bridge Formula Program,Bridge Formula Program,Bridge Formula Program,Bridge Formula Program,Bridge Formula Program,both
10,Y908,IIJA-F,Hwy Infra Brdg Repl -2022 Appn,Bridge Replacement and Rehabilitation Program,Bridge Replacement and Rehabilitation Program,Bridge Replacement and Rehabilitation Program,Bridge Replacement and Rehabilitation Program,Bridge Replacement and Rehabilitation Program,both
11,Y909,IIJA-F,Hwy Infra Brdg Repl -2023 Appn,Bridge Replacement and Rehabilitation Program,Bridge Replacement and Rehabilitation Program,Bridge Replacement and Rehabilitation Program,Bridge Replacement and Rehabilitation Program,Bridge Replacement and Rehabilitation Program,both
12,Y600,IIJA-F,Carbon Reduction Prg Flex Iija,Carbon Reduction Program,Carbon Reduction Program,Carbon Reduction Program,Carbon Reduction Program,Carbon Reduction Program (CRP),both
13,Y601,IIJA-F,Carbon Reductn Prog >200K Iija,Carbon Reduction Program,Carbon Reduction Program,Carbon Reduction Program,Carbon Reduction Program,Carbon Reduction Program (CRP),both
14,Y606,IIJA-F,Carbon Redcn Prg 50K-200K Iija,Carbon Reduction Program,Carbon Reduction Program,Carbon Reduction Program,Carbon Reduction Program,Carbon Reduction Program (CRP),both


In [75]:
program_codes3 = program_codes2.drop(
    columns=[
        "short_name",
        "program_name",
        "program_code_description",
        "new_description",
        "_merge",
    ]
)

In [76]:
program_codes3 = program_codes3.rename(
    columns={"2025_description": "new_description", "2025_program_name": "program_name"}
)

In [77]:
program_codes3.sort_values(by=["iija_program_code"])

Unnamed: 0,iija_program_code,funding_type_code,new_description,program_name
112,22MP,,Metropolitan Transportation Planning (FY 22),Metropolitan Transportation Planning (FY 22) Program
113,22SP,,Statewide and Nonmetropolitan Transportation Planning (FY 22),Statewide and Nonmetropolitan Transportation Planning (FY 22) Program
110,73AD,,Military Construction (FMIS),Military Construction Program
23,ER01,IIJA-A,Emergency Supplement Funding,Emergency Supplement Funding Program
24,ER03,IIJA-A,Emergency Supplement Funding,Emergency Rel 2023 Supplement
151,N003,,CMAQ - Projects to Reduce PM 2.5 Emissions,Congestion Mitigation & Air Quality Improvement Program
107,N916,,Highway Infrastructure Regional Infrastructure Accelerator,Highway Infrastructure Program
108,N925,,HIP-Rigional Infrastructure Accelerator Demonstration Program,Highway Infrastructure Program
152,RA01,,National Infrastractur Investment (RAISE),National Infrastructure Investment (RAISE) Program
153,RA02,,National Infrastractur Investment (RAISE),National Infrastructure Investment (RAISE) Program


In [78]:
program_codes3["program_name"] = program_codes3.apply(add_program_to_row, axis=1)

#### Turn this into a function

In [79]:
def update_program_code_list_2025():
    """
    On January 2025, we received a new list of updated codes.
    Merge this new list with codes received originally and in
    September 2023.
    """
    # Load original codes
    original_codes_df = load_program_codes_og()

    # Load September 2023 codes
    program_codes_sept_2023 = load_program_codes_sept_2023()

    # Merge original + September first
    m1 = pd.merge(
        program_codes_sept_2023,
        original_codes_df,
        on="iija_program_code",
        how="outer",
        indicator=True,
    )

    # Clean up description
    m1["new_description"] = m1["new_description"].str.strip().fillna(m1.description)

    # Delete unnecessary columns
    m1 = m1.drop(columns={"description", "_merge"})

    # Load January 2025 code
    program_codes_jan_2025 = load_program_codes_jan_2025()

    # Merge m1 with program codes from January 2025.
    m2 = pd.merge(
        program_codes_jan_2025,
        m1,
        on="iija_program_code",
        how="outer",
        indicator=True,
    )
    # Update descriptions
    m2["2025_description"] = (
        m2["program_code_description"].str.strip().fillna(m2.new_description)
    )

    # Update program names
    m2["2025_program_name"] = m2.program_name.fillna(m2.short_name)

    # Delete outdated columns
    m2 = m2.drop(
        columns=[
            "short_name",
            "program_name",
            "program_code_description",
            "new_description",
            "_merge",
        ]
    )

    # Rename to match original sheet
    m2 = m2.rename(
        columns={
            "2025_description": "new_description",
            "2025_program_name": "program_name",
        }
    )

    # Add program to another program names without the string "program"
    m2["program_name"] = m2.apply(add_program_to_row, axis=1)
    return m2

In [80]:
new_codes = update_program_code_list_2025()

### Project
`Use the “RK Locode” column K in the Project list as the Primary Locode, and if blank, use your current data source to populate the implementing agency.`

In [87]:
march_file = "FMIS_Projects_Universe_IIJA_Reporting_03012024_ToDLA.xlsx"

In [88]:
march_data = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}/{march_file}"))

In [83]:
project_df.head(1)

Unnamed: 0,fmis_transaction_date,program_code,program_code_description,pid_district,project_number,recipient_project_number,pid_check1,efis_id,pid_check2,project_title,rk_locode,county_code,congressional_district,project_status_description,project_description,improvement_type,improvement_type_description,total_cost_amount,obligations_amount,summary_recipient_defined_text_field_1_value,comp
0,2022-01-20,ER01,EMERGENCY REL 2022 SUPPLEMENT,5.0,31RA002,0518000118S,11,518000118,10,MONTEREY COUNTY NEAR BIG SUR 2.3 MILES NORTH OF CASTRO CANYON BRIDGE TO 0.8 MILE SOUTH OF BIG SUR RIVER BRIDGE. EMERGENCY PROJECT - PERMANENT RESTORA,,53,Cong Dist 20,Active,MONTEREY COUNTY NEAR BIG SUR 2.3 MILES NORTH OF CASTRO CANYON BRIDGE TO 0.8 MILE SOUTH OF BIG SUR RIVER BRIDGE. EMERGENCY PROJECT - PERMANENT RESTORATION. COMPLETE COASTAL DEVELOPMENT PERMIT REQUIREMENTS AT PFEIFFER CANYON BRIDGE.,16,Right of Way,600000.0,531100.0,S AMBAG,IIJA-A


In [86]:
project_df[
    [
        "summary_recipient_defined_text_field_1_value",
        "rk_locode",
    ]
].head()

Unnamed: 0,summary_recipient_defined_text_field_1_value,rk_locode
0,S AMBAG,
1,S AMBAG,
2,S ER NONE,
3,S SCAG,
4,S AMBAG,


#### In March 2024, there wasn't a column for locodes. 

In [89]:
march_data.columns

Index(['fmis_transaction_date', 'program_code', 'program_code_description',
       'project_number', 'recipient_project_number', 'project_title',
       'county_code', 'congressional_district', 'project_status_description',
       'project_description', 'improvement_type',
       'improvement_type_description', 'total_cost_amount',
       'obligations_amount', 'summary_recipient_defined_text_field_1_value',
       'proj_id'],
      dtype='object')

In [84]:
project_df.columns

Index(['fmis_transaction_date', 'program_code', 'program_code_description',
       'pid_district', 'project_number', 'recipient_project_number',
       'pid_check1', 'efis_id', 'pid_check2', 'project_title', 'rk_locode',
       'county_code', 'congressional_district', 'project_status_description',
       'project_description', 'improvement_type',
       'improvement_type_description', 'total_cost_amount',
       'obligations_amount', 'summary_recipient_defined_text_field_1_value',
       'comp'],
      dtype='object')

In [90]:
project_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5696 entries, 0 to 5695
Data columns (total 21 columns):
 #   Column                                        Non-Null Count  Dtype         
---  ------                                        --------------  -----         
 0   fmis_transaction_date                         5696 non-null   datetime64[ns]
 1   program_code                                  5696 non-null   object        
 2   program_code_description                      5696 non-null   object        
 3   pid_district                                  5689 non-null   float64       
 4   project_number                                5696 non-null   object        
 5   recipient_project_number                      5689 non-null   object        
 6   pid_check1                                    5696 non-null   int64         
 7   efis_id                                       5689 non-null   object        
 8   pid_check2                                    5696 non-null   int64 

In [92]:
project_df.loc[project_df.rk_locode.notna()].sample(10)[
    [
        "summary_recipient_defined_text_field_1_value",
        "rk_locode",
    ]
]

Unnamed: 0,summary_recipient_defined_text_field_1_value,rk_locode
4313,L5083SCAG,5083.0
1675,L6066SANDAG,6066.0
3826,L5060FCOG,5060.0
4354,L6049SCAG,6049.0
3676,L5916SACOG,5916.0
2715,L5002SACOG,5002.0
5466,L5060COFCG,5060.0
5009,L5450SCAG,5450.0
2487,L5927MTC,5927.0
4596,L5137MTC,5137.0


#### Filter out for rows with a locode first?

In [94]:
filled_locode_df = project_df.loc[project_df.rk_locode.notna()].reset_index(drop=True)

In [96]:
# This didn't work
# filled_locode_df2 = _data_utils.add_name_from_locode(filled_locode_df, "rk_locode")

In [97]:
locodes = to_snakecase(
    pd.read_excel(
        f"gs://calitp-analytics-data/data-analyses/dla/e-76Obligated/locodes_updated7122021.xlsx"
    )
)

In [98]:
locodes.head(3)

Unnamed: 0,agency_locode,agency_name,district,county_name,rtpa_name,mpo_name,mpo_locode_fads,active_e76s______7_12_2021_
0,6302,Humboldt Bay Harbor Recreation & Conservation District,1,Humboldt County,Humboldt County Association of Governments,NON-MPO,NON-MPO,
1,6330,Willow Creek Community Services District,1,Humboldt County,Humboldt County Association of Governments,NON-MPO,NON-MPO,
2,5036,Trinidad,1,Humboldt County,Humboldt County Association of Governments,NON-MPO,NON-MPO,


In [108]:
filled_locode_df2 = pd.merge(
    filled_locode_df,
    locodes,
    left_on="rk_locode",
    right_on="agency_locode",
    how="left",
    indicator=True,
)

#### Some locodes are missing from the original list.

In [109]:
filled_locode_df2._merge.value_counts()

both          3085
left_only        2
right_only       0
Name: _merge, dtype: int64

In [103]:
filled_locode_df2 = filled_locode_df2.rename(columns={'agency_name':'implementing_agency',
                                   'locode':'implementing_agency_locode'})

In [117]:
filled_locode_df2.columns

Index(['fmis_transaction_date', 'program_code', 'program_code_description',
       'pid_district', 'project_number', 'recipient_project_number',
       'pid_check1', 'efis_id', 'pid_check2', 'project_title', 'rk_locode',
       'county_code', 'congressional_district', 'project_status_description',
       'project_description', 'improvement_type',
       'improvement_type_description', 'total_cost_amount',
       'obligations_amount', 'summary_recipient_defined_text_field_1_value',
       'comp', 'agency_locode', 'agency_name', 'district', 'county_name',
       'rtpa_name', 'mpo_name', 'mpo_locode_fads',
       'active_e76s______7_12_2021_', '_merge'],
      dtype='object')

In [104]:
filled_locode_df2 = filled_locode_df2.drop(columns =['active_e76s______7_12_2021_', 'mpo_locode_fads', 'agency_locode'])

In [116]:
filled_locode_df2.loc[filled_locode_df2._merge == "left_only"][
        "summary_recipient_defined_text_field_1_value",
        "rk_locode",
    ]
]

KeyError: "['implementing_agency'] not in index"

#### Filter out for rows with missing locodes

In [None]:
crosswalk = 

In [112]:
missing_locode_df = project_df.loc[project_df.rk_locode.isna()].reset_index(drop=True)

In [114]:
missing_locode_df.sample(10)[
    [
        "summary_recipient_defined_text_field_1_value",
        "rk_locode",
    ]
]

Unnamed: 0,summary_recipient_defined_text_field_1_value,rk_locode
1533,S SCAG,
2518,S SCAG,
1738,S SJCOG,
1761,S SANDAG,
1463,S NON-MPO,
2447,S TCAG,
2181,S MTC,
1984,S ER NONE,
2353,S SCAG,
583,S MCTC,
