# Run Functions to Add Information to Projects

In [1]:
# ! pip install nltk

In [1]:
import numpy as np
import pandas as pd
from siuba import *

from calitp_data_analysis.sql import to_snakecase

from shared_utils import geography_utils

import dla_utils

import _script_utils
import _data_utils


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas


In [2]:
# Notebook display settings: show up to 100 columns and never truncate the
# text inside a cell when a frame is rendered.
pd.options.display.max_columns = 100
pd.options.display.max_colwidth = None

## Read in Data and function development

In [3]:
# GCS folder that the project workbook below is read from.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/dla/dla-iija"

In [5]:
### Read in data (First round)
# proj = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}/FMIS_Projects_Universe_(IIJA_Reporting)_ (3-9-23).xlsx", header=[3], skiprows=range(4,1784)))
# proj.drop(columns =['unnamed:_0'], axis=1, inplace=True)
# proj['summary_recipient_defined_text_field_1_value'] = proj['summary_recipient_defined_text_field_1_value'].fillna(value='None')


In [4]:
# Read the FMIS IIJA project workbook from GCS, then run it through
# `to_snakecase` so the column names come out snake_cased.
proj = pd.read_excel(
    f"{GCS_FILE_PATH}/IIJA_FMIS_AllProject_20230309_ToDLA.xlsx"
).pipe(to_snakecase)


In [5]:
# Spot-check one random row of the freshly loaded projects.
proj.sample(n=1)

Unnamed: 0,fmis_transaction_date,program_code,program_code_description,project_number,recipient_project_number,project_title,county_code,congressional_district,project_status_description,project_description,improvement_type,improvement_type_description,total_cost_amount,obligations_amount,summary_recipient_defined_text_field_1_value
1103,44805,YS30,HIGHWAY SAFETY IMP PROG IIJA,X071074,0820000042S,"IN SAN BERNARDINO COUNTY AT VARIOUS LOCATIONS THERMOPLASITC TRAFFIC STRIPING, PAVEMENT MARKINGS, AND ROADSIDE SIGNS AT 42 RAMPS",71,Cong Dist 39,Active,"ON STATE ROUTE: 10. IN SAN BERNARDINO COUNTY AT VARIOUS LOCATIONS THERMOPLASITC TRAFFIC STRIPING, PAVEMENT MARKINGS, AND ROADSIDE SIGNS AT 42 RAMPS",17,Construction Engineering,397547.36,397547.36,S SCAG


In [6]:
# proj.congressional_district.

In [7]:
## move to _data_utils
# def add_new_codes(df):
#     new_codes = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}/FY21-22ProgramCodesAsOf5-25-2022.v2.xlsx"))
#     code_map = dict(new_codes[['iija_program_code', 'new_description']].values)
    
#     df['program_code_description'] = df.program_code.map(code_map)
#     proj['summary_recipient_defined_text_field_1_value'] = proj['summary_recipient_defined_text_field_1_value'].astype(str)
    
#     return df


In [8]:
# Run the projects through `_data_utils.add_new_codes`. Going by the draft kept in
# the cell just above, this presumably refreshes `program_code_description` from
# the program-code lookup workbook — TODO confirm against `_data_utils`.
# NOTE(review): `proj` is rebound to the result, so re-running this cell feeds the
# helper its own output.
proj = _data_utils.add_new_codes(proj)

In [9]:

# Add the known implementing-agency name to each project, using the
# recipient-defined text field as the lookup column.
df = _script_utils.identify_agency(
    proj,
    "summary_recipient_defined_text_field_1_value",
)

In [10]:
# Spot-check three random rows now that the agency columns are attached.
df.sample(n=3)

Unnamed: 0,fmis_transaction_date,program_code,program_code_description,project_number,recipient_project_number,project_title,county_code,congressional_district,project_status_description,project_description,improvement_type,improvement_type_description,total_cost_amount,obligations_amount,summary_recipient_defined_text_field_1_value,implementing_agency_locode,implementing_agency,district,county_name,rtpa_name,mpo_name
45,44588,ER01,,31SU004,0518000114S,MONTEREY COUNTY NEAR LITTLE SUR RIVER FROM 0.4 MILE SOUTH OF LITTLE SUR RIVER BRIDGE TO 0.2 MILE SOUTH OF LITTLE SUR RIVER BRIDGE PERMANENT RESTORATI,53,Cong Dist 20,Active,"ON STATE ROUTE: 1. MONTEREY COUNTY NEAR LITTLE SUR RIVER FROM 0.4 MILE SOUTH OF LITTLE SUR RIVER BRIDGE TO 0.2 MILE SOUTH OF LITTLE SUR RIVER BRIDGE PERMANENT RESTORATION: CONSTRUCT EARTH RETAINING SYSTEM, RESTORE ROADWAY AND FACILITIES",6,4R - Restoration & Rehabilitation,1865000.0,1578500.0,S AMBAG,,California,5.0,Monterey County,,
592,44812,YS30,Highway Safety Improvement Program (HSIP),5953756,0717000287L,OLYMPIC BLVD / GARFIELD AVE & WHITTIER BLVD / EASTERN AVE INTERSECTIONS IN EAST LOS ANGELES EAST LA TRAFFIC SIGNAL SAFETY PROJECT PHASE I: TRAFFIC S,37,Cong Dist 40,Active,"OLYMPIC BLVD / GARFIELD AVE & WHITTIER BLVD / EASTERN AVE INTERSECTIONS IN EAST LOS ANGELES EAST LA TRAFFIC SIGNAL SAFETY PROJECT PHASE I: TRAFFIC SIGNAL IMPROVEMENTS INCL. UPGRADING STANDARDS, MASTARMS, VEHICLE HEADS, PROTECTED LEFT TURN PHASING, ADA RAMPS, COMMUNICATION & ASSOCIATED EQUIPMENTS.",21,Safety,1077104.6,545043.48,L5953SCAG,5953.0,Los Angeles County,7.0,Los Angeles County,Los Angeles County Metropolitan Transportation Auth.,Southern California Association Of Governments
1432,44826,Y450,Metropolitan Planning Program,0010200,0022000345S,OWP 22/23 -MERCED COUNTY REGION CONSOLIDATED PLANNING GRANT (CPG) PROJECT FOR MERCED COUNTY ASSOCIATION OF GOVERNMENTS (MCAG) OVERALL WORK PROGRAM (O,999,Cong Dist 0,Active,OWP 22/23 -MERCED COUNTY REGION CONSOLIDATED PLANNING GRANT (CPG) PROJECT FOR MERCED COUNTY ASSOCIATION OF GOVERNMENTS (MCAG) OVERALL WORK PROGRAM (OWP) FOR FISCAL YEAR 2022/2023 (TC),18,Planning,956173.0,956173.0,S MCAG,,California,,Statewide,,


##### Check unknowns

In [None]:
# Number of projects whose implementing agency is still "Unknown".
len(df.loc[df["implementing_agency"] == "Unknown"])

In [None]:
# Number of projects whose county name is still "Unknown".
len(df.loc[df["county_name"] == "Unknown"])

In [None]:
# List the projects whose implementing agency is still "Unknown".
df.loc[df["implementing_agency"] == "Unknown"]

In [None]:
# Look for administrative-expense projects by title.
# The titles in the sampled rows above are all upper case, so a case-sensitive
# search for "Administrative Expense" would silently return nothing; match
# case-insensitively instead. `na=False` treats a missing title as "no match"
# rather than putting NaN into the boolean mask, and `regex=False` keeps the
# search a plain substring test.
df >> filter(
    _.project_title.str.contains(
        "Administrative Expense", case=False, na=False, regex=False
    )
)

#### Reformat Columns

To change: 
* ~~District: 2 digit / integer~~
* ~~Congressional District number~~
* Add Program Code Description and amount to a new project description

In [11]:
# Reduce `congressional_district` to its trailing token (the sampled rows show
# "Cong Dist 39" before this step and a bare number such as "2" after it).
# NOTE(review): the draft of this helper kept in a later cell only splits the
# string, so the values may still be text rather than integers — confirm.
df = _data_utils.change_col_to_integer(df, "congressional_district")

In [12]:
# Spot-check one random row after the congressional-district cleanup.
df.sample(n=1)

Unnamed: 0,fmis_transaction_date,program_code,program_code_description,project_number,recipient_project_number,project_title,county_code,congressional_district,project_status_description,project_description,improvement_type,improvement_type_description,total_cost_amount,obligations_amount,summary_recipient_defined_text_field_1_value,implementing_agency_locode,implementing_agency,district,county_name,rtpa_name,mpo_name
996,44951,Y001,National Highway Performance Program (NHPP),5904156,0116000098L,VARIOUS LOCATIONS IN HUMBOLDT COUNTY BRIDGE PREVENTIVE MAINTENANCE: MINOR CONCRETE REPAIR ON 8 BRIDGES.,23,2,Active,VARIOUS LOCATIONS IN HUMBOLDT COUNTY BRIDGE PREVENTIVE MAINTENANCE: MINOR CONCRETE REPAIR ON 8 BRIDGES.,47,Bridge Preventive Maintenance,367243.48,325120.66,L5904NON-MPO,5904.0,Humboldt County,1.0,Humboldt County,Humboldt County Association of Governments,NON-MPO


In [13]:
## move to _data_utils
# def change_col_to_integer(df, col):
    
#     df[col] = df[col].str.split(' ').str[-1]
    
#     return df

In [14]:

# (df>>select(_.congressional_district, _.congressional_district_2)).sample(20)

In [15]:
# Another single-row spot check of the working frame.
df.sample(n=1)

Unnamed: 0,fmis_transaction_date,program_code,program_code_description,project_number,recipient_project_number,project_title,county_code,congressional_district,project_status_description,project_description,improvement_type,improvement_type_description,total_cost_amount,obligations_amount,summary_recipient_defined_text_field_1_value,implementing_agency_locode,implementing_agency,district,county_name,rtpa_name,mpo_name
1765,44915,YS30,Highway Safety Improvement Program (HSIP),P055064,1219000083S,ORANGE AND ANAHEIM AT VARIOUS LOCATIONS FROM 0.1 MILE SOUTH OF KATELLA AVENUE UNDERCROSSING TO 0.5 MILE NORTH OF LINCOLN AVENUE UNDERCROSSING UPGRADE,59,46,Closed,ON STATE ROUTE: 55. ORANGE AND ANAHEIM AT VARIOUS LOCATIONS FROM 0.1 MILE SOUTH OF KATELLA AVENUE UNDERCROSSING TO 0.5 MILE NORTH OF LINCOLN AVENUE UNDERCROSSING UPGRADE TRAFFIC SAFETY DEVICES,17,Construction Engineering,292402.48,263162.54,S SCAG,,California,12.0,Orange County,,


In [16]:
## move to utils,
# def add_new_description_col(df):
#     df["obligations_amount_string"] = df["obligations_amount"].astype(str)
    
#     df["new_description_col"] = df["program_code_description"] + " for $" + df["obligations_amount_string"]
    
#     df.drop(columns =['obligations_amount_string'], axis=1, inplace=True)
    
#     return df

In [18]:
# (add_new_description_col(df))>>filter(_.program_code_description.notnull())

In [19]:
# Single-row spot check before tallying the program descriptions.
df.sample(n=1)

Unnamed: 0,fmis_transaction_date,program_code,program_code_description,project_number,recipient_project_number,project_title,county_code,congressional_district,project_status_description,project_description,improvement_type,improvement_type_description,total_cost_amount,obligations_amount,summary_recipient_defined_text_field_1_value,implementing_agency_locode,implementing_agency,district,county_name,rtpa_name,mpo_name
1045,44963,Y120,Bridge Formula Program,5930059,1000020522L,MONGE RANCH ROAD OVER COYOTE CREEK (BRIDGE 30C0021) BRIDGE REPLACEMENT (TC),9,4,Active,MONGE RANCH ROAD OVER COYOTE CREEK (BRIDGE 30C0021) BRIDGE REPLACEMENT (TC),11,Bridge Replacement - No Added Capacity,2365594.0,2365594.0,L5930NON-MPO,5930.0,Calaveras County,10.0,Calaveras County,Calaveras Council of Governments,NON-MPO


In [20]:
# Tally of projects per program description (rows with no description are not
# counted by `value_counts` by default).
df["program_code_description"].value_counts()

National Highway Performance Program (NHPP)                           505
Surface Transportation Block Grant (STBG) Program                     386
Highway Safety Improvement Program (HSIP)                             377
Congestion Mitigation & Air Quality Improvement (CMAQ)                104
Transportation Alternatives (Section 133(h))                           48
Projects to Reduce PM 2.5 Emissions                                    42
Section 164 Penalties - Use for HSIP Activities                        37
Bridge Formula Program                                                 33
Metropolitan Planning Program                                          19
Safe and Accessible Transportation Options - Metropolitan Planning     18
Railway-Highway Crossings Program (RHCP)                                7
National Highway Freight Program (NHFP)                                 6
Vulnerable Road User Safety Special Rule                                5
State Planning and Research (SPR)     

## Test & Export

In [None]:
# GCS_FILE_PATH  = 'gs://calitp-analytics-data/data-analyses/dla/dla-iija'

In [21]:
# Test of the aggregated output. The original note recorded a PASS: frame with
# the title column, concatenated values, and 552 rows.
# NOTE(review): a later note on this cell said it was throwing an error —
# confirm whether that is still the case.
agg = _script_utils.get_clean_data(df, full_or_agg="agg")

In [22]:
# Spot-check one random row of the aggregated output.
agg.sample(n=1)

Unnamed: 0,fmis_transaction_date,project_number,implementing_agency,summary_recipient_defined_text_field_1_value,program_code,program_code_description,recipient_project_number,improvement_type,improvement_type_description,project_title,obligations_amount,congressional_district,district,county_code,county_name,county_name_title,implementing_agency_locode,rtpa_name,mpo_name,project_title_new,new_description_col
832,44910,5060362,Fresno,L5060COFCG,Y230,Surface Transportation Block Grant (STBG) Program,0619000149L,5|17,4R - Maintenance Resurfacing|Construction Engineering,"CHESTNUT AVENUE FROM KINGS CANYON ROAD TO BUTLER AVENUE AC OVERLAY AND INSTALLATION OF CURB RAMPS, SIGNAL LOOP DETECTORS, CLASS II BIKE LANES, SIGNAG",1381666.0,16,6.0,19,Fresno County,Fresno County,5060.0,Council of Fresno County Governments,Council Of Fresno County Goverments,Install Bike Lanes in Fresno,Surface Transportation Block Grant (STBG) Program for $1381666.0


In [None]:
# First five rows of the working frame, for comparison with the aggregate.
df.head(n=5)

##### Check titles
moved to `_script_utils`

In [None]:
# sorted(list(agg.project_title_new.unique()))

In [60]:
# agg>>filter(_.implementing_agency.str.contains("California"))

In [61]:
# agg>>filter(_.project_title_new.str.contains("Caltrans"))

In [62]:
# len(agg)

In [29]:
# agg = utils.title_column_names(agg)

In [30]:
# agg.to_csv(f"{GCS_FILE_PATH}/IIJA_FMIS_AllProject_20230315_ToDLA_agg.csv")

In [63]:
###test full. PASS title, no concat and 1241 rows
# full = utils.get_clean_data(df, full_or_agg = 'full')

In [32]:
# full = utils.title_column_names(full)

In [33]:
# full.to_csv(f"{GCS_FILE_PATH}/FMIS_Projects_Universe_IIJA_Reporting_3-9-23_all.csv")

In [34]:
# agg = to_snakecase(agg)

In [66]:
# test_df = agg>>filter(_.implementing_agency.str.contains('Unknown')
                      # | _.implementing_agency.str.contains("Caltrans")
                     # )

In [67]:
# test_df.sample()

In [68]:
# locodes = to_snakecase(pd.read_excel(f"gs://calitp-analytics-data/data-analyses/dla/e-76Obligated/locodes_updated7122021.xlsx"))

In [69]:
# locodes['agency_name'] = locodes['agency_name'].str.upper()

In [70]:
# locode_names = sorted(list(locodes['agency_name'].unique()), reverse=True)

In [71]:
# locode_names.remove('ROSS')

In [72]:
# locode_names.append("State Parks")

In [73]:
# locode_names

In [74]:
# pattern = '|'.join(locode_names)

In [75]:
# test_df.project_title.str.contains(pattern)

In [76]:
### https://stackoverflow.com/questions/68869434/create-an-pandas-column-if-a-string-from-a-list-matches-from-another-column
#test_df["agencies_found"] = test_df["project_title"].apply(lambda x: next((a for a in locode_names if a in x), ""))
# test_df["agencies_found2"] = test_df["project_title"].map(lambda s: next((name for name in locode_names if name in s), np.nan))

In [77]:
# test_df["agencies_found2"] = test_df["agencies_found2"].str.title()

In [78]:
# test_df

In [44]:
# agg>>filter(_["Project Title New"].str.contains('Unknown'))

In [79]:
# agg>>filter(_["implementing_agency"].str.contains('Unknown'))

### Next