# IIJA All Projects 

In [1]:
# ! pip install nltk
# ! pip install textblob

In [2]:
import numpy as np
import pandas as pd
from siuba import *

from shared_utils import geography_utils
from dla_utils import _dla_utils

from calitp import to_snakecase

import utils



In [3]:
# Notebook display settings: show up to 100 columns and never truncate cell text.
pd.options.display.max_columns = 100
pd.options.display.max_colwidth = None


## Read Data

In [4]:
proj = utils.read_data_all()

In [5]:
proj.head(1)

Unnamed: 0,fmis_transaction_date,program_code,program_code_description,project_number,recipient_project_number,project_title,county_code,congressional_district,improvement_type,improvement_type_description,obligations_amount,summary_recipient_defined_text_field_1_value
0,2022-03-03,Y230,STBG-URBANIZED >200K IIJA,5002174,0314000222L,BELL AVE. FROM BOLLENBACHER AVE. TO ASTORIA ST ROAD REHABILITATION (TC),67.0,Cong Dist 6,4.0,4R - No Added Capacity,2841397.0,L5002SACOG


In [6]:
len(proj)

1241

In [7]:
## The agency identifier column doesn't always include a locode

In [8]:
df = utils.identify_agency(proj, 'summary_recipient_defined_text_field_1_value')

In [9]:
# Count the distinct improvement_type_description values per project
# (recipient/project number, agency, title) and keep the projects with more than one.
(df
 >>group_by(_.recipient_project_number, _.project_number, _.implementing_agency, _.project_title)
 >>summarize(n_descriptions = _.improvement_type_description.nunique())
 >>filter(_.n_descriptions>1)
 # NOTE(review): the same sort key is passed twice — presumably one is enough; confirm.
 >>arrange(_.n_descriptions, _.n_descriptions)
)

Unnamed: 0,recipient_project_number,project_number,implementing_agency,project_title,n_descriptions
23,0100020391L,5904112,Humboldt County,PINE HILL ROAD BRIDGE OVER SWAIN SLOUGH 04C0173 BRIDGE REPLACEMENT WITH TOLL CREDITS (TC),2
26,0113000090S,P036118,California,IN HUMBOLDT COUNTY NEAR ALTON FROM 0.10 MILE EAST OF ROUTE 36/101 SEPARATION TO 1.65 MILE EAST OF ROUTE 36/101 SEPARATION SHOULDER WIDENING,2
27,0113000123S,P020198,California,IN MENDOCINO COUNTY NEAR UKIAH FROM 0.3 MILE WEST OF RUSSIAN RIVER BRIDGE AND OVERHEAD TO 0.4 MILE EAST OF COUNTY ROAD 144 REPLACE TWO BRIDGES,2
32,0116000098L,5904156,Humboldt County,VARIOUS LOCATIONS IN HUMBOLDT COUNTY BRIDGE PREVENTIVE MAINTENANCE: MINOR CONCRETE REPAIR ON 8 BRIDGES.,2
34,0117000018S,P020199,California,"IN LAKE COUNTY NEAR CLEAR LAKE OAKS AT BERYL WAY THIS PROJECT PROPOSES TO INSTALL A COMBINATION OF TWO ALTERNATIVE IN-LINE TERMINAL SYSTEMS, A RAIL E",2
...,...,...,...,...,...
514,1121000195S,0051993,California,"CONSTRUCT HOV LANES, AUXILARY LANES, AND ITS (CMGC) IN SAN DIEGO COUNTY AT VARIOUS LOCATIONS FROM LOMAS SANTA FE DRIVE UNDERCROSSING TO 0.2 MILES NOR",4
533,1218000034S,X059084,California,"CONST BRIDGES, RW, SW, JPCP, HMA PVMT, DRAINAGE AND ELECT SYSTEMS. ORANGE COUNTY IN IRVINE, SANTA ANA, AND TUSTIN ON ROUTE 5 AT NEWPORT AVENUE UNDERC",4
254,0513000017S,Q101401,California,MONTEREY COUNTY NEAR SOLEDAD FROM 0.5 MILE SOUTH TO 0.2 MILE NORTH OF THE NORTH SOLEDAD OVERHEAD BRIDGE REHABILITATION,5
288,0616000207S,P099671,California,IN MADERA COUNTY NEAR MADERA FROM 0.4 MILE SOUTH OF COTTONWOOD CREEK BRIDGE TO 0.1 MILE NORTH OF AVENUE 12 OVERCROSSING. REPLACE BRIDGES AND CONSTRUC,5


#### Get one row for each project

In [10]:
# Keep the largest obligations_amount for every combination of the keys below.
# Because fmis_transaction_date and program_code are part of the key, a project
# with several transaction dates or program codes still yields several rows.
proj_unique = (
    df
    .groupby([
        'fmis_transaction_date',
        'program_code',
        'project_number',
        'recipient_project_number',
        'project_title',
        'implementing_agency',
        'district',
        'county_name',
    ])['obligations_amount']
    .max()
    .reset_index()
)

In [11]:
# Columns used as join keys when re-attaching the remaining df columns.
merge_cols = [
    'fmis_transaction_date',
    'program_code',
    'project_number',
    'recipient_project_number',
    'project_title',
    'implementing_agency',
    'county_name',
    'district',
    'obligations_amount',
]

In [12]:
# Left-join the remaining df columns onto the aggregated rows. Every df row that
# equals an aggregated row on all merge_cols is matched, so ties produce several rows.
# This rebinds proj_unique: re-run the groupby cell first before re-running this one.
proj_unique = proj_unique.merge(df, how='left', on=merge_cols)

In [13]:
proj_unique.sample()

Unnamed: 0,fmis_transaction_date,program_code,project_number,recipient_project_number,project_title,implementing_agency,district,county_name,obligations_amount,program_code_description,county_code,congressional_district,improvement_type,improvement_type_description,summary_recipient_defined_text_field_1_value,implementing_agency_locode,rtpa_name,mpo_name
472,2022-09-09,Y001,Q101349,0518000131S,"SANTA BARBARA COUNTY, IN AND NEAR THE CITIES OF MONTECITO AND SANTA BARBARA, FROM SHELFFIELD DRIVE TO SYCAMORE CREEK. CONSTRUCT HOV LANES",California,5.0,Santa Barbara County,642000.0,NATIONAL HIGHWAY PERF IIJA,83.0,Cong Dist 24,15.0,Preliminary Engineering,S SBCAG,,,


In [14]:
proj_unique_cat = utils.add_description(proj_unique, 'project_title')

In [15]:
proj_unique_cat.sample()

Unnamed: 0,fmis_transaction_date,program_code,project_number,recipient_project_number,project_title,implementing_agency,district,county_name,obligations_amount,program_code_description,county_code,congressional_district,improvement_type,improvement_type_description,summary_recipient_defined_text_field_1_value,implementing_agency_locode,rtpa_name,mpo_name,project_method,project_type
169,2022-05-12,Y001,6507003,0817000123L,"MOUNT VERNON AVENUE OVERHEAD AT THE BURLINGTON NORTHERN SANTA FE RAILROAD YARD FROM KING STREET TO 5TH STREET, BR. NO. 54C-0066 BRIDGE REPLACEMENT",San Bernardino County Transportation Authority,8.0,San Bernardino County,25813142.0,NATIONAL HIGHWAY PERF IIJA,71.0,Cong Dist 31,16.0,Right of Way,L6507SCAG,6507.0,NON-RTPA,Southern California Association Of Governments,Replace,Bridge


In [16]:
len(proj_unique_cat>>filter(_.project_type=="Project"))

222

In [17]:
proj_unique_cat_title = utils.add_new_title(proj_unique_cat, "project_method", 'project_type', 'implementing_agency')

In [18]:
proj_unique_cat_title.sample()

Unnamed: 0,fmis_transaction_date,program_code,project_number,recipient_project_number,project_title,implementing_agency,district,county_name,obligations_amount,program_code_description,county_code,congressional_district,improvement_type,improvement_type_description,summary_recipient_defined_text_field_1_value,implementing_agency_locode,rtpa_name,mpo_name,project_method,project_type,project_name_new
320,2022-07-20,Y233,5937205,0416000438L,"ALDER CROFT HEIGHTS ROAD OVER HOOKER CREEK, 2.12 MILES SOUTH OF LOS GATOS SC ROAD. BRIDGE NO. 37C0506 REPLACE EXISTING ONE LANE BRIDGE WITH A NEW T",Santa Clara County,4.0,Santa Clara County,300000.0,STBG IIJA OFF-SYSTEM BRIDGE,85.0,Cong Dist 18,16.0,Right of Way,L5937MTC,5937.0,Metropolitan Transportation Commission,Metropolitan Transportation Commission,Replace,Project,Replace Project in Santa Clara County


## Add New Title Function

In [19]:
# To be run after the first description and title functions

def add_new_title(df, first_col_method, second_col_type, third_col_name, alt_col_name):
    """
    Add a `project_name_new` column holding a short, readable project title.

    Expected output example: "New Bike Lane in Eureka"

    The title is "<method> <type>" followed by a location chosen as follows:
      * agency is "California" and the alternate value is "Statewide":
        "<method> <type> Statewide"
      * agency is "California" otherwise: "<method> <type> in <alternate value>"
      * any other agency: "<method> <type> in <agency>"

    Missing (None/NaN) or blank pieces are left out instead of raising or
    producing stray spaces, so an empty method gives "Project in Eureka"
    rather than " Project in Eureka".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate. It is modified in place.
    first_col_method : str
        Column holding the project method (e.g. "Replace").
    second_col_type : str
        Column holding the project type (e.g. "Bridge").
    third_col_name : str
        Column holding the implementing agency name.
    alt_col_name : str
        Column used as the location when the agency is "California".

    Returns
    -------
    pandas.DataFrame
        The same `df` object with `project_name_new` added or overwritten.
    """
    def clean(value):
        """Return `value` as stripped text, or "" when it is missing."""
        if isinstance(value, str):
            return value.strip()
        if value is None or pd.isna(value):
            return ""
        return str(value).strip()

    def build_title(method, kind, agency, alt_location):
        """Compose the title for one row from its four raw cell values."""
        agency = clean(agency)
        alt_location = clean(alt_location)
        description = " ".join(part for part in (clean(method), clean(kind)) if part)

        if agency == "California" and alt_location == "Statewide":
            # Statewide projects read better without the "in".
            suffix = alt_location
        elif agency == "California":
            suffix = "in " + alt_location if alt_location else ""
        else:
            suffix = "in " + agency if agency else ""

        return " ".join(part for part in (description, suffix) if part)

    # A plain comprehension (rather than df.apply(axis=1)) also works on an
    # empty frame, where apply would not return a Series.
    df['project_name_new'] = [
        build_title(method, kind, agency, alt_location)
        for method, kind, agency, alt_location in zip(
            df[first_col_method], df[second_col_type], df[third_col_name], df[alt_col_name]
        )
    ]

    return df

In [20]:
proj_unique_cat_title = add_new_title(proj_unique_cat_title, "project_method", "project_type", "implementing_agency", "county_name")

In [21]:
proj_unique_cat_title.sample(3)

Unnamed: 0,fmis_transaction_date,program_code,project_number,recipient_project_number,project_title,implementing_agency,district,county_name,obligations_amount,program_code_description,county_code,congressional_district,improvement_type,improvement_type_description,summary_recipient_defined_text_field_1_value,implementing_agency_locode,rtpa_name,mpo_name,project_method,project_type,project_name_new
110,2022-05-02,Y001,P395348,0900000030S,IN INYO COUNTY NEAR OLANCHA AND CARTAGO FROM 1.4 MILES SOUTH OF LOS ANGELES AQUEDUCT BRIDGE TO 0.2 MILE SOUTH OF ASH CREEK BRIDGE. CONSTRUCT 4 LANE E,California,9.0,Inyo County,64378000.0,NATIONAL HIGHWAY PERF IIJA,27.0,Cong Dist 8,3.0,4R - Added Capacity,S NON-MPO,,,,Construct,Project,Construct Project in Inyo County
247,2022-06-16,Y001,Q101401,0513000017S,MONTEREY COUNTY NEAR SOLEDAD FROM 0.5 MILE SOUTH TO 0.2 MILE NORTH OF THE NORTH SOLEDAD OVERHEAD BRIDGE REHABILITATION,California,5.0,Monterey County,2118100.0,NATIONAL HIGHWAY PERF IIJA,53.0,Cong Dist 20,6.0,4R - Restoration & Rehabilitation,S AMBAG,,,,,Bridge Rehabilitation,Bridge Rehabilitation in Monterey County
392,2022-08-23,Y003,6211149,1119000022L,IN SAN DIEGO COUNTY IN SAN DIEGO FROM 0.6 MILE NORTH FROM DEL MAR HEIGHTS ROAD OVERCROSSING TO VILLA DE LA VALLE OVERCROSSING. RESTORE WETLANDS AT SA,Caltrans,11.0,Multi-County,33824168.0,PROJ TO REDUCE PM 2.5 EMI IIJA,73.0,Cong Dist 52,44.0,Other,S6211SANDAG,6211.0,CT-ADMIN,CT-ADMIN,,Restore Wetlands,Restore Wetlands in Caltrans


In [30]:
proj_unique_cat_title = utils.update_no_matched(proj_unique_cat_title, 'project_type', 'improvement_type_description', 'implementing_agency')

In [44]:
(proj_unique_cat_title>>filter(_.project_type=="Project")).sample(3)

Unnamed: 0,fmis_transaction_date,program_code,project_number,recipient_project_number,project_title,implementing_agency,district,county_name,obligations_amount,program_code_description,county_code,congressional_district,improvement_type,improvement_type_description,summary_recipient_defined_text_field_1_value,implementing_agency_locode,rtpa_name,mpo_name,project_method,project_type,project_name_new,project_name_new2
230,2022-06-15,Y001,804212,0321000109S,NEVADA AND SIERRA COUNTIES FROM 1.9 MILES EAST OF FARAD UNDERCROSSING TO THE SIERRACOUNTY LINE AND FROM 1.3 MILES EAST OF THE NEVADA COUNTY LINE TO T,California,3.0,Nevada County,3749733.07,NATIONAL HIGHWAY PERF IIJA,57.0,Cong Dist 1,6.0,4R - Restoration & Rehabilitation,S NON-MPO,,,,,Project,Project in Nevada County,Road Restoration & Rehabilitation in California
268,2022-06-24,Y003,6084282,0422000469L,SF BAY AREA: VARIOUS BRIDGE CORRIDORS AND CORRIDOR APPROACHES CONDUCT PRELIMINARY ENGINEERING AND PLANNING STUDIES TO ADVANCE BAY BRIDGE FORWARD PROJ,Metropolitan Transportation Commission,4.0,Multi-County,6000000.0,PROJ TO REDUCE PM 2.5 EMI IIJA,1.0,Cong Dist 13,15.0,Preliminary Engineering,L6084MTC,6084.0,NON-RTPA,Metropolitan Transportation Commission,,Project,Project in Metropolitan Transportation Commission,Preliminary Engineering Projects in Metropolitan Transportation Commission
220,2022-06-13,Y001,802374,0417000031S,SOLANO COUNTY IN VALLEJO ON ROUTE 80 FROM 0.2 MILE NORH OF CARQUINEZ BRIDGE TO PLAZA TO 0.3 MILES SOUTH OF MAGAZINE STREET OVERCROSSING AND AT ROUTE,California,4.0,Solano County,8055600.0,NATIONAL HIGHWAY PERF IIJA,95.0,Cong Dist 5,11.0,Bridge Replacement - No Added Capacity,S MTC,,,,,Project,Project in Solano County,Bridge Replacement in California


In [52]:
len(proj_unique_cat_title>>filter(_.project_type=='Project', _.project_name_new2==""))

19

### Attempt #2

In [49]:
no_match= proj_unique_cat_title>>filter(_.project_type=='Project', _.project_name_new2=="")

In [58]:
no_match.improvement_type_description.value_counts()

Right of Way                                     10
Other                                             6
Safety and Education for Peds/Bicyclists          1
Bridge Inspection and Bridge Related Training     1
Landscaping and Other Scenic Beautification       1
Name: improvement_type_description, dtype: int64

In [57]:
(no_match>>filter(_.improvement_type_description=="Right of Way")).sample(3)

Unnamed: 0,fmis_transaction_date,program_code,project_number,recipient_project_number,project_title,implementing_agency,district,county_name,obligations_amount,program_code_description,county_code,congressional_district,improvement_type,improvement_type_description,summary_recipient_defined_text_field_1_value,implementing_agency_locode,rtpa_name,mpo_name,project_method,project_type,project_name_new,project_name_new2
317,2022-07-18,Y233,5937207,0417000077L,ALDERCROFT HEIGHTS ROAD OVER LOS GATOS CREEK 2.38 MILES SOUTH OF LOS GATOS SC ROAD. BRIDGE 37C0508 REPLACE EXISTING ONE-LANE BRIDGE WITH A NEW 2-LANE,Santa Clara County,4.0,Santa Clara County,300000.0,STBG IIJA OFF-SYSTEM BRIDGE,85.0,Cong Dist 18,16.0,Right of Way,L5937MTC,5937.0,Metropolitan Transportation Commission,Metropolitan Transportation Commission,Replace,Project,Replace Project in Santa Clara County,
284,2022-07-01,Y233,5946142,0614000082L,BRIDGE 46C0340 ON AVENUE 428 OVER SAND CREEK REPLACE 2 LANE BRIDGE WITH 2 LANE BRIDGE (TC),Tulare County,6.0,Tulare County,5206.0,STBG IIJA OFF-SYSTEM BRIDGE,107.0,Cong Dist 22,16.0,Right of Way,L5946TCAG,5946.0,Tulare County Association of Governments,Tulare County Association Of Governments,Replace,Project,Replace Project in Tulare County,
151,2022-05-09,Y001,P001676,0716000335S9,"PROJECT ID 0716000335 (EA 07-4X970) LA-001-42.5/42.5 (PPNO-5059)ROUTE 001: IN MALIBU, SOUTH OF BIG ROCK DRIVE. SHORELINE EMBANKMENT RESTORATION",California,7.0,Los Angeles County,500000.0,NATIONAL HIGHWAY PERF IIJA,37.0,Cong Dist 33,16.0,Right of Way,S SCAG,,,,,Project,Project in Los Angeles County,


In [None]:
def update_no_matched(df, flag_col, desc_col, name_col):
    """
    Build a fallback title for projects that did not match the first time,
    using an existing project's short description of project type.

    Only rows whose `flag_col` value is "Project" are considered. For those
    rows the `desc_col` value is looked up in a fixed table and the result is
    "<label> in <name>". Every other row, and every description that is not
    in the table (e.g. "Other", "Right of Way"), gets an empty string.

    Descriptions are compared after collapsing runs of whitespace and
    stripping the ends, so "Planning " and "4R - Maintenance  Resurfacing"
    match the same entries as their tidy spellings.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate. It is modified in place.
    flag_col : str
        Column whose value "Project" marks a row as still unmatched.
    desc_col : str
        Column holding the improvement type description.
    name_col : str
        Column holding the name appended after " in ".

    Returns
    -------
    pandas.DataFrame
        The same `df` object with `project_name_new2` added or overwritten.
    """
    # Normalized description -> label used at the start of the fallback title.
    labels_by_description = {
        "Bridge Rehabilitation": "Bridge Rehabilitation",
        "Bridge Rehabilitation - No Added Capacity": "Bridge Rehabilitation",
        "Bridge Rehabilitation - Added Capacity": "Bridge Rehabilitation",
        "Facilities for Pedestrians and Bicycles": "Facilities for Pedestrians and Bicycles",
        "Safety": "Safety Improvements",
        "Planning": "Project Planning",
        "Preliminary Engineering": "Preliminary Engineering Projects",
        "Construction Engineering": "Construction Engineering Projects",
        "4R - Restoration & Rehabilitation": "Road Restoration & Rehabilitation",
        "4R - Maintenance Resurfacing": "Maintenance Resurfacing",
        "Bridge Replacement - Added Capacity": "Bridge Replacement",
        "Bridge Replacement - No Added Capacity": "Bridge Replacement",
        "Bridge New Construction": "Bridge Replacement",
        "Special Bridge": "Bridge Replacement",
        "Mitigation of Water Pollution due to Highway Runoff": "Mitigation of Water Pollution due to Highway Runoff",
        "4R - Added Capacity": "Added Roadway Capacity",
        "4R - No Added Capacity": "Road Construction",
        "New Construction Roadway": "New Construction Roadway",
        "Traffic Management/Engineering - HOV": "Traffic Management Project",
    }

    def clean(value):
        """Return `value` as text with whitespace collapsed, or "" when missing."""
        if not isinstance(value, str):
            if value is None or pd.isna(value):
                return ""
            value = str(value)
        return " ".join(value.split())

    def fallback_title(flag, description, name):
        """Compose the fallback title for one row, or "" when nothing applies."""
        # The flag check guards every description. The original wrote
        # `flag & desc_a | desc_b | desc_c`, which let desc_b/desc_c match
        # rows that were not flagged, because & binds tighter than |.
        if clean(flag) != "Project":
            return ""

        label = labels_by_description.get(clean(description))
        if label is None:
            return ""

        name = clean(name)
        return label + " in " + name if name else label

    # A plain comprehension (rather than df.apply(axis=1)) also works on an
    # empty frame, where apply would not return a Series.
    df['project_name_new2'] = [
        fallback_title(flag, description, name)
        for flag, description, name in zip(df[flag_col], df[desc_col], df[name_col])
    ]

    return df


### Try with one part of the project

- Projects can have multiple entries based on the phases they have. Currently, if a project has multiple entries, the `improvement_type_description` has a different description phrase for each, leading the title function to create a title that does not accurately reflect the project.
- What we want to do is apply the function to one entry and then use that title across the project's multiple entries.

In [None]:
## How many projects have multiple funding components?
## A sample of those under the no_match flag with at least two entries in the system

(no_match
 >>group_by(_.recipient_project_number, _.project_number, _.implementing_agency, _.project_title)
 >>summarize(n_descriptions = _.improvement_type_description.nunique())
 >>filter(_.n_descriptions>1)
 # NOTE(review): the same sort key is passed twice — presumably one is enough; confirm.
 >>arrange(_.n_descriptions, _.n_descriptions)
)

* of the 133 projects with unique project IDs, 60 have more than 1 entry
* using all unique regardless of numbers 

In [None]:
# Keep only the rows whose project_type is the generic "Project".
# NOTE(review): df is assigned from utils.identify_agency(...), while project_type first
# shows up in the output of utils.add_description(...) — confirm df really has this
# column, or whether proj_unique_cat_title was intended here.
df_copy = (df>>filter(_.project_type==('Project')))

In [None]:
# Number of "Project"-typed groups (keyed on every identifying column listed below)
# that carry more than one distinct improvement_type_description.
# NOTE(review): relies on df having a project_type column — confirm.
len(df>>filter(_.project_type==('Project'))
 >>group_by(_.fmis_transaction_date, _.program_code, _.program_code_description, _.project_number,
                                                   _.recipient_project_number, _.project_title, _.county_code, _.congressional_district,
                                                   _.summary_recipient_defined_text_field_1_value, _.implementing_agency_locode,
                                                   _.implementing_agency, _.district, _.county_name, _.rtpa_name, _.mpo_name)
 >>summarize(n_descriptions = _.improvement_type_description.nunique())>>filter(_.n_descriptions>1)>>arrange(_.n_descriptions, _.n_descriptions))

In [None]:
# Total number of "Project"-typed groups on the same keys, with no filter on the
# number of distinct descriptions.
# NOTE(review): relies on df having a project_type column — confirm.
len(df>>filter(_.project_type==('Project'))
 >>group_by(_.fmis_transaction_date, _.program_code, _.program_code_description, _.project_number,
                                                   _.recipient_project_number, _.project_title, _.county_code, _.congressional_district,
                                                   _.summary_recipient_defined_text_field_1_value, _.implementing_agency_locode,
                                                   _.implementing_agency, _.district, _.county_name, _.rtpa_name, _.mpo_name)
 >>summarize(n_descriptions = _.improvement_type_description.nunique()))

In [None]:
# Rebinds proj_unique to the "Project"-typed rows of df. The group_by/arrange steps
# are commented out, so no grouping or ordering is applied here.
# NOTE(review): relies on df having a project_type column — confirm.
proj_unique = (df>>filter(_.project_type==('Project'))
#  >>group_by(_.fmis_transaction_date, _.program_code, _.program_code_description, _.project_number,
#                                                    _.recipient_project_number, _.project_title, _.county_code, _.congressional_district,
#                                                    _.summary_recipient_defined_text_field_1_value, _.implementing_agency_locode,
#                                                    _.implementing_agency, _.district, _.county_name, _.rtpa_name, _.mpo_name)
# >>arrange(_.project_number, _.obligations_amount)
              )

#### Merge to get full details

In [40]:
# proj_unique = (pd.merge(proj_unique, df_copy, how='left', on=merge_cols))

In [41]:
# proj_unique.sample(4)

#### run with update_no_matched function

In [34]:
# proj_unique_new = utils.update_no_matched(proj_unique, 'project_type', 'improvement_type_description', 'implementing_agency')

In [35]:
# proj_unique_new.sample(5)

In [36]:
# len(proj_unique_new>>filter(_.project_name_new2==""))

In [37]:
# (proj_unique_new>>filter(_.project_name_new2=="")>>select(_.project_number, _.recipient_project_number, _.project_title, _.improvement_type_description, _.implementing_agency, _.county_name,
#                                                             _.project_method, _.project_type, _.project_name_new, _.project_name_new2))

In [38]:
# remaining = (proj_unique_new>>filter(_.project_name_new2=='')).improvement_type_description.value_counts()
# remaining

In [39]:
# (proj_unique_new>>filter(_.improvement_type_description=='Other'))>>select(_.project_title, _.implementing_agency,
#                                                                           _.project_method,_.project_type, _.project_name_new,_.project_name_new2)



## Export progress

In [None]:
#test_df = utils.title_column_names(df)

In [None]:
#test_df.to_csv(f"{GCS_FILE_PATH}/FMIS_projects_wip_all.csv")