# Run Functions to Add Information to Projects

In [1]:
# ! pip install nltk

In [2]:
import numpy as np
import pandas as pd
from siuba import *

from calitp_data_analysis.sql import to_snakecase

from shared_utils import geography_utils

import dla_utils

import _script_utils
import _data_utils


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas


In [3]:
pd.set_option("display.max_columns", 100)
pd.set_option('display.max_colwidth', None)

## Read in Data and function development

In [4]:
GCS_FILE_PATH  = 'gs://calitp-analytics-data/data-analyses/dla/dla-iija'

In [5]:
### Read in data (First round)
# proj = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}/FMIS_Projects_Universe_(IIJA_Reporting)_ (3-9-23).xlsx", header=[3], skiprows=range(4,1784)))
# proj.drop(columns =['unnamed:_0'], axis=1, inplace=True)
# proj['summary_recipient_defined_text_field_1_value'] = proj['summary_recipient_defined_text_field_1_value'].fillna(value='None')


In [6]:
proj = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}/IIJA_FMIS_AllProject_20230309_ToDLA.xlsx",))


In [7]:
proj.sample()

Unnamed: 0,fmis_transaction_date,program_code,program_code_description,project_number,recipient_project_number,project_title,county_code,congressional_district,project_status_description,project_description,improvement_type,improvement_type_description,total_cost_amount,obligations_amount,summary_recipient_defined_text_field_1_value
420,44686,YS30,HIGHWAY SAFETY IMP PROG IIJA,5033056,0819000060L,TWO HUNDRED AND TWENTY-FOUR (224) SIGNALIZED INTERSECTIONS ON VARIOUS ARTERIALS WITHININ THE CITY OF SAN BERNARDINO UPGRADE TRAFFIC SIGNAL HARDWARE,71,Cong Dist 31,Active,TWO HUNDRED AND TWENTY-FOUR (224) SIGNALIZED INTERSECTIONS ON VARIOUS ARTERIALS WITHININ THE CITY OF SAN BERNARDINO UPGRADE TRAFFIC SIGNAL HARDWARE,17,Construction Engineering,80000.0,80000.0,L5033SCAG


In [8]:
# proj.congressional_district.

In [9]:
## move to _data_utils
# def add_new_codes(df):
#     new_codes = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}/FY21-22ProgramCodesAsOf5-25-2022.v2.xlsx"))
#     code_map = dict(new_codes[['iija_program_code', 'new_description']].values)
    
#     df['program_code_description'] = df.program_code.map(code_map)
#     proj['summary_recipient_defined_text_field_1_value'] = proj['summary_recipient_defined_text_field_1_value'].astype(str)
    
#     return df


In [10]:
proj2 = _data_utils.add_new_codes(proj)

In [11]:

# df = utils.read_data_all()
    
    ## function that adds known agency name to df 
df = _script_utils.identify_agency(proj2, 'summary_recipient_defined_text_field_1_value')

In [12]:
df.sample(3)

Unnamed: 0,fmis_transaction_date,program_code,program_code_description,project_number,recipient_project_number,project_title,county_code,congressional_district,project_status_description,project_description,improvement_type,improvement_type_description,total_cost_amount,obligations_amount,summary_recipient_defined_text_field_1_value,implementing_agency_locode,implementing_agency,district,county_name,rtpa_name,mpo_name
3,44594,ER01,Emergency Supplement Funding,4400046,01924731L,ALDERPOINT ROAD PM 40.21 TO 40.70 EMERGENCY RELIEF PR ONLY,23,Cong Dist 2,Active,ALDERPOINT ROAD PM 40.21 TO 40.70 EMERGENCY RELIEF PR ONLY,4,4R - No Added Capacity,124980.9,110645.59,L5904NON-MPO,5904.0,Humboldt County,1.0,Humboldt County,Humboldt County Association of Governments,NON-MPO
79,44599,ER01,Emergency Supplement Funding,37Y4002,0320000035S,"IN AND NEAR PARADISE, FROM 0.3 MILE SOUTH OF AIRPORT ROAD TO 0.2 MILE NORTH OF OLD CLARK ROAD. STABILIZE THE FIRE DAMAGED CUT SLOPES, WIDEN SHOULDERS",7,Cong Dist 1,Active,"IN AND NEAR PARADISE, FROM 0.3 MILE SOUTH OF AIRPORT ROAD TO 0.2 MILE NORTH OF OLD CLARK ROAD. STABILIZE THE FIRE DAMAGED CUT SLOPES, WIDEN SHOULDERS TO CREATE CATCHMENT AREA FOR ROCKFALL DEBRIS, AND IMPROVE DRAINAGE SYSTEMS.",16,Right of Way,1603865.14,1389712.91,S BCAG,,California,3.0,Butte County,,
689,44826,ER01,Emergency Supplement Funding,40A0050,0120000075L,BRICELAND THORNE ROAD PM 3.08 PR PERMANENT RESTORATION,23,Cong Dist 2,Active,BRICELAND THORNE ROAD PM 3.08 PR PERMANENT RESTORATION,17,Construction Engineering,238000.0,210701.4,L5904NON-MPO,5904.0,Humboldt County,1.0,Humboldt County,Humboldt County Association of Governments,NON-MPO


##### Check unknowns

In [13]:
len(df>>filter(_.implementing_agency=='Unknown'))



1

In [14]:
len(df>>filter(_.county_name=='Unknown'))

1

In [15]:
df>>filter(_.implementing_agency=="Unknown")

Unnamed: 0,fmis_transaction_date,program_code,program_code_description,project_number,recipient_project_number,project_title,county_code,congressional_district,project_status_description,project_description,improvement_type,improvement_type_description,total_cost_amount,obligations_amount,summary_recipient_defined_text_field_1_value,implementing_agency_locode,implementing_agency,district,county_name,rtpa_name,mpo_name
706,44827,YR10,Recreational Trails Program,2023000,,2023 Rec Trails Project: State Parks Incurred Administrative Expenses,999,Cong Dist 0,Active,"2023 Rec Trails Project: State Parks Incurred Administrative Expenses. State costs incurred administering the RTP such as staff time, meeting and travel costs and attendance at trail-related training sessions and conferences, Trail Advisory committee costs including meeting and travel costs for committee members, trail conference support, etc.",44,Other,274727.3,241760.0,,,Unknown,,Unknown,,


In [16]:
# df>>filter(_.project_title.str.contains("Administrative Expense"))

#### Reformat Columns

To change: 
* ~~District: 2 digit / integer~~
* ~~Congressional District number~~
* Add Progam Code Description and amount to a new project description

In [17]:
df = _data_utils.change_col_to_integer(df, "congressional_district")

In [18]:
df.sample()

Unnamed: 0,fmis_transaction_date,program_code,program_code_description,project_number,recipient_project_number,project_title,county_code,congressional_district,project_status_description,project_description,improvement_type,improvement_type_description,total_cost_amount,obligations_amount,summary_recipient_defined_text_field_1_value,implementing_agency_locode,implementing_agency,district,county_name,rtpa_name,mpo_name
902,44923,Y230,Surface Transportation Block Grant,6084293,0423000148L,"SF BAY AREA: REGIONWIDE REGIONWIDE: ADMINISTRATION OF THE PRIORITY CONSERVATION AREA (PCA) GRANT PROGRAM, WHICH FUNDS THE PLANNING, DESIGN, AND IMPLE",1,15,Active,"SF BAY AREA: REGIONWIDE REGIONWIDE: ADMINISTRATION OF THE PRIORITY CONSERVATION AREA (PCA) GRANT PROGRAM, WHICH FUNDS THE PLANNING, DESIGN, AND IMPLEMENTATION OF PROJECTS THAT IMPROVE ACCESS TO PRIORITY HABITATS, OPEN SPACES, AND RECREATIONAL OPPORTUNITIES. (TC)",44,Other,39375.0,39375.0,L6084MTC,6084.0,Metropolitan Transportation Commission,4.0,Multi-County,NON-RTPA,Metropolitan Transportation Commission


In [19]:
## move to _data_utils
# def change_col_to_integer(df, col):
    
#     df[col] = df[col].str.split(' ').str[-1]
    
#     return df

In [20]:

# (df>>select(_.congressional_district, _.congressional_district_2)).sample(20)

In [21]:
df.sample()

Unnamed: 0,fmis_transaction_date,program_code,program_code_description,project_number,recipient_project_number,project_title,county_code,congressional_district,project_status_description,project_description,improvement_type,improvement_type_description,total_cost_amount,obligations_amount,summary_recipient_defined_text_field_1_value,implementing_agency_locode,implementing_agency,district,county_name,rtpa_name,mpo_name
1592,44880,ER01,Emergency Supplement Funding,31TG004,0418000139S,"SOLANO COUNTY NEAR VACAVILLE AT 0.1 MILE NORTH OF ALLENDALE ROAD UNDERCROSSING ROCK SLOPE PROTECTION, RDWY EXCAVATION, GUARDRAILS & VEGETATION CNTL",95,3,Active,"ON STATE ROUTE: 505. SOLANO COUNTY NEAR VACAVILLE AT 0.1 MILE NORTH OF ALLENDALE ROAD UNDERCROSSING ROCK SLOPE PROTECTION, RDWY EXCAVATION, GUARDRAILS & VEGETATION CNTL",6,4R - Restoration & Rehabilitation,945000.0,850800.0,S MTC,,California,4.0,Solano County,,


In [22]:
## move to utils,
# def add_new_description_col(df):
#     df["obligations_amount_string"] = df["obligations_amount"].astype(str)
    
#     df["new_description_col"] = df["program_code_description"] + " for $" + df["obligations_amount_string"]
    
#     df.drop(columns =['obligations_amount_string'], axis=1, inplace=True)
    
#     return df

In [23]:
# (add_new_description_col(df))>>filter(_.program_code_description.notnull())

In [24]:
df.sample()

Unnamed: 0,fmis_transaction_date,program_code,program_code_description,project_number,recipient_project_number,project_title,county_code,congressional_district,project_status_description,project_description,improvement_type,improvement_type_description,total_cost_amount,obligations_amount,summary_recipient_defined_text_field_1_value,implementing_agency_locode,implementing_agency,district,county_name,rtpa_name,mpo_name
587,44812,Y240,Surface Transportation Block Grant,5924169,03929228L,DILLARD RD OVER COSUMNES RIVER 0.2 MILES SOUTH OF SR-16. BR.# 24C0004 SCOUR COUNTERMEASURES,67,7,Active,DILLARD RD OVER COSUMNES RIVER 0.2 MILES SOUTH OF SR-16. BR.# 24C0004 SCOUR COUNTERMEASURES,14,Bridge Rehabilitation - No Added Capacity,804939.0,712612.0,L5924SACOG,5924.0,Sacramento County,3.0,Sacramento County,Sacramento Area Council of Governments,Sacramento Area Council Of Governments


In [25]:
df.program_code_description.value_counts()

Emergency Supplement Funding                                          590
National Highway Performance Program (NHPP)                           493
Surface Transportation Block Grant                                    386
Highway Safety Improvement Program (HSIP)                             377
Congestion Mitigation & Air Quality Improvement                       104
Transportation Alternatives                                            48
Projects to Reduce PM 2.5 Emissions                                    42
Section 164 Penalties - Use for HSIP Activities                        37
Bridge Formula Program                                                 33
Metropolitan Planning Program                                          19
Safe and Accessible Transportation Options - Metropolitan Planning     18
NHPP Exempt                                                            12
Railway-Highway Crossings Program                                       7
National Highway Freight Program (NHFP

## Test & Export

In [26]:
# GCS_FILE_PATH  = 'gs://calitp-analytics-data/data-analyses/dla/dla-iija'

In [27]:
###test agg. PASS df with title column, concat and 552 rows
##throwingerror
agg = _script_utils.get_clean_data(df, full_or_agg = 'agg')

In [28]:
agg.sample(3)

Unnamed: 0,fmis_transaction_date,project_number,implementing_agency,summary_recipient_defined_text_field_1_value,program_code,program_code_description,recipient_project_number,improvement_type,improvement_type_description,project_title,obligations_amount,congressional_district,district,county_code,county_name,county_name_title,implementing_agency_locode,rtpa_name,mpo_name,project_title_new,new_description_col
218,44687,5929276,San Joaquin County,L5929SJCOG,Y001,National Highway Performance Program (NHPP),1015000153L,15,Preliminary Engineering,ESCALON BELLOTA ROAD OVER MORMON SLOUGH (BRIDGE 29C0051) BRIDGE REPLACEMENT,177060.0,9,10.0,77,San Joaquin County,San Joaquin County,5929.0,San Joaquin Council of Governments,San Joaquin Council Of Goverments,Replace Bridge in San Joaquin County,"This project is part of the National Highway Performance Program (NHPP) Program, and recieved $177060.0. This project will Replace Bridge in San Joaquin County"
150,44666,5456021,Lathrop,L5456SJCOG,Y230,Surface Transportation Block Grant,1021000206L,6|17,4R - Restoration & Rehabilitation|Construction Engineering,"GOLDEN VALLEY PARKWAY (TOWNE CENTRE DRIVE AND LATHROP ROAD), RIVER ISLANDS PARKWAY (GOLDEN VALLEY PARKWAY LATHROP FIRE STATION 34), LATHROP ROAD. ROA",1274000.0,9,10.0,77,San Joaquin County,San Joaquin County,5456.0,San Joaquin Council of Governments,San Joaquin Council Of Goverments,Road Restoration & Rehabilitation in Lathrop,"This project is part of the Surface Transportation Block Grant Program, and recieved $1274000.0. This project will Road Restoration & Rehabilitation in Lathrop"
1049,44977,38M0023,San Diego County,L5957SANDAG,ER01,Emergency Supplement Funding,1121000062L,15|17,Preliminary Engineering|Construction Engineering,"W. LILAC RD. 8,900' W/O OLD HWY 395 AND 22,100' W/O OF OLD HWY 395 EMERGENCY OPENING",93851.29,53,11.0,73,San Diego County,San Diego County,5957.0,San Diego Association of Governments,San Diego Association Of Governments,Preliminary Engineering Projects in San Diego County,"This project is part of the Emergency Supplement Funding Program, and recieved $93851.29000000001. This project will Preliminary Engineering Projects in San Diego County"


In [29]:
(agg>>filter(_.county_name_title=="Statewide")).sample()

Unnamed: 0,fmis_transaction_date,project_number,implementing_agency,summary_recipient_defined_text_field_1_value,program_code,program_code_description,recipient_project_number,improvement_type,improvement_type_description,project_title,obligations_amount,congressional_district,district,county_code,county_name,county_name_title,implementing_agency_locode,rtpa_name,mpo_name,project_title_new,new_description_col
686,44826,10207,California,S SBCAG,Y410,Safe and Accessible Transportation Options - Metropolitan Planning,0022000352S,18,Planning,OWP 22/23 SANTA BARBARA REGION CONSOLIDATED PLANNING GRANT (CPG) PROJECT FOR SANTA BARBARA COUNTY ASSOCIATION OF GOVERNMENTS (SBCAG) OVERALL WORK PRO,29729.0,0,,999,Statewide,Statewide,,,,Planning and Research Statewide,"This project is part of the Safe and Accessible Transportation Options - Metropolitan Planning Program, and recieved $29729.0. This project will Planning and Research Statewide"


## Program Code deep dive

In [30]:
(agg>>filter(_.program_code_description==("nan"))).program_code.value_counts()

Series([], Name: program_code, dtype: int64)

In [31]:
(agg>>filter(_.program_code_description==("nan")))

Unnamed: 0,fmis_transaction_date,project_number,implementing_agency,summary_recipient_defined_text_field_1_value,program_code,program_code_description,recipient_project_number,improvement_type,improvement_type_description,project_title,obligations_amount,congressional_district,district,county_code,county_name,county_name_title,implementing_agency_locode,rtpa_name,mpo_name,project_title_new,new_description_col


### Read in Old Codes


(currently used in script)

In [32]:
#check project codes list
codes = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}/FY21-22ProgramCodesAsOf5-25-2022.v2.xlsx"))


In [33]:
len(codes)

101

In [34]:
codes.head()

Unnamed: 0,description,iija_program_code,new_description
0,Advanced Transportation Technologies and Innovative Mobility Deployment,Y44A,Advanced Transportation Technologies and Innovative Mobility Deployment
1,Applalachian Developmen Highway System,Y140,Applalachian Developmen Highway System
2,Border State Infrastructure,Y500,Border State Infrastructure
3,Bridge Formula Program,Y110,Bridge Formula Program
4,Bridge Formula Program Off-System Bridges,Y120,Bridge Formula Program


### New Codes

In [35]:
## read in new project codes:
new_codes = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}/Copy of lst_IIJA_Code_20230510.xlsx"))

In [36]:
new_codes

Unnamed: 0,iija_code,description,program_name,iija_program_code
0,|BFP|,Bridge Formula Program,Bridge Formula Program,Y120
1,|CPF|,Community Project Funding Congressionally Directed Spending,Community Project Funding Congressionally Directed Spending,Y928
2,|RAISE|,National Infrastractur Investment (RAISE),National Infrastructure Investment (RAISE),RA02
3,|RAISE|,National Infrastractur Investment (RAISE),National Infrastructure Investment (RAISE),RA03
4,|TID|,Technology and Innovation Deployment - FMIS,Technology and Innovation Deployment,Y37H
...,...,...,...,...
108,|HRDP|,Highway Research and Deveopment Program,Highway Research and Development Program,Y448
109,|BRRP|,Bridge Replacement and Rehabilitation Program,Bridge Replacement and Rehabilitation Program,Y908
110,|TID|,Technology and Innovation Deployment - FMIS,Technology and Innovation Deployment,Y37G
111,|ER|,Emergency Suppliment Funding,Emergency Supplement Funding,ER01


#### Remove | from codes

In [37]:
# new_codes['iija_code_2'] = new_codes['iija_code'].str.split("|").str[0]
new_codes['iija_code_2'] = new_codes['iija_code'].str.replace("|","")



In [38]:
new_codes.sample(5)

Unnamed: 0,iija_code,description,program_name,iija_program_code,iija_code_2
11,|NHPPE|,NHPP Exempt,NHPP Exempt,Y002,NHPPE
58,|CRP|,CRP Program - Areas with Population Less Than 5K [NEW],Carbon Reduction Program,Y608,CRP
17,|STBG|,STBG Program - Areas with Population Less Than 5K [NEW],Surface Transportation Block Grant,Y238,STBG
26,|TA|,"Transportation Alternatives (Section 133(h)) - Areas with Population 5K to 49,999 [NEW]",Transportation Alternatives,Y307,TA
71,|HIP|,Highway Infrastructure Regional Infrastructure Accelerator,Highway Infrastructure Program,N916,HIP


#### Codes with ranges

In [39]:
new_codes>>filter(_.iija_program_code.str.contains("-"))

Unnamed: 0,iija_code,description,program_name,iija_program_code,iija_code_2
83,|IIJAPRH|,IIJA Puerto Rico Highway,IIJA Puerto Rico Highway,YP10 -YP30,IIJAPRH


In [40]:
# ## checking with the proj
# proj>>filter(_.program_code.str.contains("YP"))

In [41]:
# proj.program_code.value_counts()

#### Null codes

In [42]:
len(new_codes>>filter(_.iija_code.isna()))

0

In [43]:
null_program_codes = new_codes>>filter(_.iija_code.isna())

In [44]:
#download and send to contact! 
# null_program_codes.to_csv("lst_IIJA_Code_20230501_null.csv")

In [45]:
# null_program_codes

### Add acronym code for IIJA programs
https://stackoverflow.com/questions/4355201/creating-acronyms-in-python

In [46]:
# (proj>>filter(_.program_code==("ER01"), _.program_code_description.notnull())).sample()

In [47]:
# (proj>>filter(_.program_code==("ER01"), _.program_code_description.notnull())).sample()

In [48]:
df.head()

Unnamed: 0,fmis_transaction_date,program_code,program_code_description,project_number,recipient_project_number,project_title,county_code,congressional_district,project_status_description,project_description,improvement_type,improvement_type_description,total_cost_amount,obligations_amount,summary_recipient_defined_text_field_1_value,implementing_agency_locode,implementing_agency,district,county_name,rtpa_name,mpo_name,county_name_title
0,44594,ER01,Emergency Supplement Funding,32L0118,0117000181L,MATTOLE ROAD PM 5.00 PERMANENT RESTORATION,23,2,Active,MATTOLE ROAD PM 5.00 PERMANENT RESTORATION,4,4R - No Added Capacity,68757.0,60870.23,L5904NON-MPO,5904.0,Humboldt County,1.0,Humboldt County,Humboldt County Association of Governments,NON-MPO,Humboldt County
1,44594,ER01,Emergency Supplement Funding,32L0118,0117000181L,MATTOLE ROAD PM 5.00 PERMANENT RESTORATION,23,2,Active,MATTOLE ROAD PM 5.00 PERMANENT RESTORATION,15,Preliminary Engineering,207848.0,184008.19,L5904NON-MPO,5904.0,Humboldt County,1.0,Humboldt County,Humboldt County Association of Governments,NON-MPO,Humboldt County
2,44594,ER01,Emergency Supplement Funding,32L0118,0117000181L,MATTOLE ROAD PM 5.00 PERMANENT RESTORATION,23,2,Active,MATTOLE ROAD PM 5.00 PERMANENT RESTORATION,17,Construction Engineering,139775.0,123742.5,L5904NON-MPO,5904.0,Humboldt County,1.0,Humboldt County,Humboldt County Association of Governments,NON-MPO,Humboldt County
3,44594,ER01,Emergency Supplement Funding,4400046,01924731L,ALDERPOINT ROAD PM 40.21 TO 40.70 EMERGENCY RELIEF PR ONLY,23,2,Active,ALDERPOINT ROAD PM 40.21 TO 40.70 EMERGENCY RELIEF PR ONLY,4,4R - No Added Capacity,124980.9,110645.59,L5904NON-MPO,5904.0,Humboldt County,1.0,Humboldt County,Humboldt County Association of Governments,NON-MPO,Humboldt County
4,44594,ER01,Emergency Supplement Funding,4400046,01924731L,ALDERPOINT ROAD PM 40.21 TO 40.70 EMERGENCY RELIEF PR ONLY,23,2,Active,ALDERPOINT ROAD PM 40.21 TO 40.70 EMERGENCY RELIEF PR ONLY,15,Preliminary Engineering,831.49,736.11,L5904NON-MPO,5904.0,Humboldt County,1.0,Humboldt County,Humboldt County Association of Governments,NON-MPO,Humboldt County


##### Check titles
moved to `_script_utils`

In [49]:
# sorted(list(agg.project_title_new.unique()))

In [50]:
# agg>>filter(_.implementing_agency.str.contains("California"))

In [51]:
# agg>>filter(_.project_title_new.str.contains("Caltrans"))

In [52]:
# len(agg)

In [53]:
# agg = utils.title_column_names(agg)

In [54]:
# agg.to_csv(f"{GCS_FILE_PATH}/IIJA_FMIS_AllProject_20230315_ToDLA_agg.csv")

In [55]:
###test full. PASS title, no concat and 1241 rows
# full = utils.get_clean_data(df, full_or_agg = 'full')

In [56]:
# full = utils.title_column_names(full)

In [57]:
# full.to_csv(f"{GCS_FILE_PATH}/FMIS_Projects_Universe_IIJA_Reporting_3-9-23_all.csv")

In [58]:
# agg = to_snakecase(agg)

In [59]:
# test_df = agg>>filter(_.implementing_agency.str.contains('Unknown')
                      # | _.implementing_agency.str.contains("Caltrans")
                     # )

In [60]:
# test_df.sample()

In [61]:
# locodes = to_snakecase(pd.read_excel(f"gs://calitp-analytics-data/data-analyses/dla/e-76Obligated/locodes_updated7122021.xlsx"))

In [62]:
# locodes['agency_name'] = locodes['agency_name'].str.upper()

In [63]:
# locode_names = sorted(list(locodes['agency_name'].unique()), reverse=True)

In [64]:
# locode_names.remove('ROSS')

In [65]:
# locode_names.append("State Parks")

In [66]:
# locode_names

In [67]:
# pattern = '|'.join(locode_names)

In [68]:
# test_df.project_title.str.contains(pattern)

In [69]:
### https://stackoverflow.com/questions/68869434/create-an-pandas-column-if-a-string-from-a-list-matches-from-another-column
#test_df["agencies_found"] = test_df["project_title"].apply(lambda x: next((a for a in locode_names if a in x), ""))
# test_df["agencies_found2"] = test_df["project_title"].map(lambda s: next((name for name in locode_names if name in s), np.nan))

In [70]:
# test_df["agencies_found2"] = test_df["agencies_found2"].str.title()

In [71]:
# test_df

In [72]:
# agg>>filter(_["Project Title New"].str.contains('Unknown'))

In [73]:
# agg>>filter(_["implementing_agency"].str.contains('Unknown'))

### Next

#### Statewide 

In [74]:
len(agg>>filter(_.project_title_new.str.contains("Statewide")))

75

In [75]:
state = agg>>filter(_.project_title_new.str.contains("Statewide"))

In [76]:
state.sample(5)

Unnamed: 0,fmis_transaction_date,project_number,implementing_agency,summary_recipient_defined_text_field_1_value,program_code,program_code_description,recipient_project_number,improvement_type,improvement_type_description,project_title,obligations_amount,congressional_district,district,county_code,county_name,county_name_title,implementing_agency_locode,rtpa_name,mpo_name,project_title_new,new_description_col
431,44776,000C530,California,S SACOG,YS30,Highway Safety Improvement Program (HSIP),0320000039S,17|21,Construction Engineering|Safety,"IN SACRAMENTO, PLACER, AND BUTTE COUNTIES, ON ROUTES 50, 51, 80, AND 99 AT VARIOUS LOCATIONS. APPLY HIGH FRICTION SURFACE TREATMENT (HFST) AND OPEN G",3835900.0,0,,999,Statewide,Statewide,,,,Safety Improvements Statewide,"This project is part of the Highway Safety Improvement Program (HSIP) Program, and recieved $3835900.0. This project will Safety Improvements Statewide"
687,44826,0010207,California,S SBCAG,Y450,Metropolitan Planning Program,0022000352S,18,Planning,OWP 22/23 SANTA BARBARA REGION CONSOLIDATED PLANNING GRANT (CPG) PROJECT FOR SANTA BARBARA COUNTY ASSOCIATION OF GOVERNMENTS (SBCAG) OVERALL WORK PRO,1042203.0,0,,999,Statewide,Statewide,,,,Planning and Research Statewide,"This project is part of the Metropolitan Planning Program Program, and recieved $1042203.0. This project will Planning and Research Statewide"
572,44813,X001665,California,S MTC,YS30,Highway Safety Improvement Program (HSIP),0415000090S,17|21|42,Construction Engineering|Safety|Training,ALAMEDA COUNTY AT VARIOUS LOCATIONS INSTALL ACCESSIBLE PEDESTRIAN SIGNAL AND RE-STRIPE CROSSWALK.,3279000.0,0,,999,Statewide,Statewide,,,,Install Pedestrian Safety Improvements Statewide,"This project is part of the Highway Safety Improvement Program (HSIP) Program, and recieved $3279000.0. This project will Install Pedestrian Safety Improvements Statewide"
671,44826,0010199,California,S NON-MPO,Y410,Safe and Accessible Transportation Options - Metropolitan Planning,0022000344S,18,Planning,OWP 22/23 -SAN JOAQUIN REGION CONSOLIDATED PLANNING GRANT (CPG) PROJECT FOR SAN JOAQUIN COUNCIL OF GOVERNMENTS (SJCOG) OVERALL WORK PROGRAM (OWP) FOR,46585.0,0,,999,Statewide,Statewide,,,,Planning and Research Statewide,"This project is part of the Safe and Accessible Transportation Options - Metropolitan Planning Program, and recieved $46585.0. This project will Planning and Research Statewide"
705,44827,0010206,California,S AMBAG,Y410,Safe and Accessible Transportation Options - Metropolitan Planning,0022000351S,18,Planning,OWP 22/23 MONTEREY BAY REGION CONSOLIDATED PLANNING GRANT (CPG) PROJECT FOR ASSOCIATION OF MONTEREY BAY AREA GOVERNMENTS (AMBAG) OVERALL WORK PROGRAM,41695.0,0,,999,Statewide,Statewide,,,,Planning and Research Statewide,"This project is part of the Safe and Accessible Transportation Options - Metropolitan Planning Program, and recieved $41695.0. This project will Planning and Research Statewide"


In [77]:
# #put into function

In [78]:
def read_in_locodes():
    locodes = to_snakecase(pd.read_excel(f"gs://calitp-analytics-data/data-analyses/dla/e-76Obligated/locodes_updated7122021.xlsx"))
    locodes['agency_name'] = locodes['agency_name'].str.upper()
    locode_names = sorted(list(locodes['agency_name'].unique()), reverse=True)    
    
    ## append list so that "Crosswalk" does not select "Ross"
    locode_names.remove('ROSS')
    locode_names.append("STATE PARKS")
    locode_names.append("SACOG - Sacramento Area Council of Governments")
    locode_names.append("SACOG")
    locode_names.append("MTC")
    locode_names.append("MTC - Metropolitan Transportation Commission")
    
    return locode_names

In [79]:
locode_names = read_in_locodes()

In [80]:
locode_names

['YUCCA VALLEY',
 'YUCAIPA',
 'YUBA-SUTTER TRANSIT AUTHORITY',
 'YUBA COUNTY',
 'YUBA CITY',
 'YREKA CITY',
 'YOUNTVILLE',
 'YOSEMITE NATIONAL PARK',
 'YOSEMITE AREA REGIONAL TRANSPORTATION SYSTEM JPA',
 'YORBA LINDA',
 'YOLO COUNTY TRANSPORTATION DISTRICT',
 'YOLO COUNTY',
 'WOODSIDE',
 'WOODLAND',
 'WOODLAKE',
 'WINTERS',
 'WINDSOR',
 'WILLOWS',
 'WILLOW CREEK COMMUNITY SERVICES DISTRICT',
 'WILLITS',
 'WILLIAMS',
 'WILDOMAR',
 'WHITTIER',
 'WHEATLAND',
 'WESTMORLAND',
 'WESTMINSTER',
 'WESTLAKE VILLAGE',
 'WESTERN SHASTA RESOURCE CONSERVATION DISTRICT',
 'WESTERN RIVERSIDE COUNCIL OF GOVERNMENTS',
 'WESTERN PACIFIC RAILROAD',
 'WESTERN CONTRA COSTA TRANSIT AUTHORITY',
 'WEST SACRAMENTO',
 'WEST HOLLYWOOD',
 'WEST COVINA',
 'WEED',
 'WATSONVILLE',
 'WATERSHED CONSERVATION AUTHORITY',
 'WATERFORD',
 'WASCO',
 'WALNUT CREEK',
 'WALNUT',
 'VOLCAN MOUNTAIN PRESERVE FOUNDATION',
 'VISTA',
 'VISALIA UNIFIED SCHOOL DISTRICT',
 'VISALIA ELECTRIC RAILROAD COMPANY',
 'VISALIA',
 'VILLA PARK',


In [81]:
def find_alternative_name(df, desc_col):
    
    #read in locode names to get list to find in description column
    locode_names = read_in_locodes()
    
    ### https://stackoverflow.com/questions/68869434/create-an-pandas-column-if-a-string-from-a-list-matches-from-another-column
    df["alt_geo_name_projdesc2"] = df[desc_col].map(lambda s: next((name for name in locode_names if name in s), ""))
    df["alt_geo_name_projdesc2"] = df["alt_geo_name_projdesc2"].str.title()
    
    return df

In [82]:
#test

In [83]:
state2 = find_alternative_name(state, 'project_title')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [84]:
state2.sample(5)

Unnamed: 0,fmis_transaction_date,project_number,implementing_agency,summary_recipient_defined_text_field_1_value,program_code,program_code_description,recipient_project_number,improvement_type,improvement_type_description,project_title,obligations_amount,congressional_district,district,county_code,county_name,county_name_title,implementing_agency_locode,rtpa_name,mpo_name,project_title_new,new_description_col,alt_geo_name_projdesc2
575,44816,000C539,California,S AMBAG,YS30,Highway Safety Improvement Program (HSIP),0519000115S,17|21,Construction Engineering|Safety,SANTA CRUZ AND SAN BENITO COUNTIES AT VARIOUS LOCATIONS CONSTRUCT CENTERLINE AND EDGE LINE RUMBLE STRIPS AND UPGRADE STRIPING.,2876200.0,0,,999,Statewide,Statewide,,,,Safety Improvements Statewide,"This project is part of the Highway Safety Improvement Program (HSIP) Program, and recieved $2876200.0. This project will Safety Improvements Statewide",Santa Cruz
634,44824,15C3001,California,S ER NONE,ER01,Emergency Supplement Funding,0520000046S,6|16|17,4R - Restoration & Rehabilitation|Right of Way|Construction Engineering,"IN SANTA CRUZ COUNTY, ON ROUTES 1, 9 AND 236 AT VARIOUS LOCATIONS. EMERGENCY OPENING WITH PERMANENT REPAIRS - REMOVE FIRE DEBRIS, BURNED TREES, REPLA",2526447.25,0,,999,Statewide,Statewide,,,,Road Restoration & Rehabilitation Statewide,"This project is part of the Emergency Supplement Funding Program, and recieved $2526447.25. This project will Road Restoration & Rehabilitation Statewide",Santa Cruz County
1027,44966,X075048,California,S MTC,YS32,Section 164 Penalties - Use for HSIP Activities,0415000221S,17|21,Construction Engineering|Safety,"IN THE CITY AND COUNTY OF SAN FRANCISCO, ON ROUTES 1, 35, 80, 101 AND 280 AT VARIOUS LOCATIONS. INSTALL ACCESSIBLE PEDESTRIAN SIGNALS AND RESTRIPE CR",2380000.0,0,,999,Statewide,Statewide,,,,Install Pedestrian Safety Improvements Statewide,"This project is part of the Section 164 Penalties - Use for HSIP Activities Program, and recieved $2380000.0. This project will Install Pedestrian Safety Improvements Statewide",San Francisco
656,44826,0010045,California,S NON-MPO,Y570,Safe and Accessible Transportation Options - SPR,7400010045S,18,Planning,"STATEWIDE STATE PLANNING AND RESEARCH PROGRAM, PART I FY 21/22",2275891.0,0,,999,Statewide,Statewide,,,,Planning and Research Statewide,"This project is part of the Safe and Accessible Transportation Options - SPR Program, and recieved $2275891.0. This project will Planning and Research Statewide",
682,44826,0010204,California,S TAHOE,Y410,Safe and Accessible Transportation Options - Metropolitan Planning,0022000349S,18,Planning,OWP 22/23 TAHOE REGION CONSOLIDATED PLANNING GRANT (CPG) PROJECT FOR TAHOE REGIONAL PLANNING AGENCY (TRPA) OVERALL WORK PROGRAM (OWP) FOR FISCAL YEAR,21287.0,0,,999,Statewide,Statewide,,,,Planning and Research Statewide,"This project is part of the Safe and Accessible Transportation Options - Metropolitan Planning Program, and recieved $21287.0. This project will Planning and Research Statewide",Tahoe Regional Planning Agency


In [85]:
state2>>select(_.summary_recipient_defined_text_field_1_value, _.implementing_agency, _.program_code_description, _.improvement_type_description,
                                                      _.project_title, _.obligations_amount, _.county_name, _.county_name_title,
                                                      _.project_title_new, _.new_description_col, _.alt_geo_name_projdesc)

KeyError: "['alt_geo_name_projdesc'] not in index"

In [None]:
state2>>filter(_.alt_geo_name_projdesc==(""))>>select(_.summary_recipient_defined_text_field_1_value, _.implementing_agency, _.program_code_description, _.improvement_type_description,
                                                      _.project_title, _.obligations_amount, _.county_name, _.county_name_title,
                                                      _.project_title_new, _.new_description_col, _.alt_geo_name_projdesc)

In [None]:
(find_alternative_name((state2>>filter(_.alt_geo_name_projdesc=="")), 'summary_recipient_defined_text_field_1_value'))>>select(_.summary_recipient_defined_text_field_1_value, _.implementing_agency, _.program_code_description, _.improvement_type_description,
                                                      _.project_title, _.obligations_amount, _.county_name, _.county_name_title,
                                                      _.project_title_new, _.new_description_col, _.alt_geo_name_projdesc, _.alt_geo_name_projdesc2)