# Add Agency and Agency Information to FMIS IIJA

In [1]:
import numpy as np
import pandas as pd
from siuba import *

from shared_utils import geography_utils
from dla_utils import _dla_utils

from calitp import to_snakecase



In [2]:
pd.set_option("display.max_columns", 100)
pd.set_option('display.max_colwidth', None)


In [3]:
GCS_FILE_PATH  = 'gs://calitp-analytics-data/data-analyses/dla/dla-iija'

## Read In Data

In [4]:
locodes = to_snakecase(pd.read_excel(f"gs://calitp-analytics-data/data-analyses/dla/e-76Obligated/locodes_updated7122021.xlsx"))

In [5]:
county = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}/Copy of County.xlsx",
                                   sheet_name='locode1',
                                   header=[0]))

In [6]:
proj = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}/CopyofFMIS_Projects_Universe_IIJA_Reporting_4.xls", 
                           # sheet_name='FMIS 5 Projects  ', header=[3]
                           sheet_name='IIJA-combined',
                           # sheet_name='FMIS 5 Projects  ',
                           ))

In [7]:
# proj2 = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}/Copy of FMIS_Projects_Universe_(IIJA_Reporting)_ (5).xls", 
#                            sheet_name='FMIS 5 Projects  ', header=[3]))

In [8]:
# number of entries in the locodes list
len(locodes)

1041

In [9]:
# number of entries in the county locode list
len(county)

1072

In [10]:
county.sample(4)

Unnamed: 0,agency_locode,agency_name,district,mpo,county
317,5319,La Palma,12.0,SCAG,Orange County
820,6273,Alameda County Congestion Management Agency,4.0,MTC,Alameda County
742,6195,Yolo County Transportation District,3.0,SACOG,Yolo County
762,6215,Simi Recreation and Parks District,7.0,SCAG,Ventura County


In [11]:
locodes.head()

Unnamed: 0,agency_locode,agency_name,district,county_name,rtpa_name,mpo_name,mpo_locode_fads,active_e76s______7_12_2021_
0,6302,Humboldt Bay Harbor Recreation & Conservation District,1,Humboldt County,Humboldt County Association of Governments,NON-MPO,NON-MPO,
1,6330,Willow Creek Community Services District,1,Humboldt County,Humboldt County Association of Governments,NON-MPO,NON-MPO,
2,5036,Trinidad,1,Humboldt County,Humboldt County Association of Governments,NON-MPO,NON-MPO,
3,5049,Ukiah,1,Mendocino County,Mendocino Council of Governments,NON-MPO,NON-MPO,Yes
4,5082,Willits,1,Mendocino County,Mendocino Council of Governments,NON-MPO,NON-MPO,


In [12]:
proj.drop(columns =['unnamed:_0', 'unnamed:_13', 'total'], axis=1, inplace=True)

In [13]:
proj.sample()

Unnamed: 0,fmis_transaction_date,program_code,program_code_description,project_number,recipient_project_number,project_title,county_code,congressional_district,improvement_type,improvement_type_description,obligations_amount,summary_recipient_defined_text_field_1_value
257,2022-09-08,Y240//Y240//Y240//Y240//Y240//Y240//Y240//Y240//Y240,SURFAC TRNSP BLK GRTS-FLX IIJA,6084284,0422000493L,"SAN FRANCISCO BAY AREA REGIONAL PLANNING ACTIVITIES AND PLANNING, PROGRAMMING, AND MONITORING (PPM). IN COORDINATION WITH MTC, COUNTY TRANSPORTATION",1,Cong Dist 17,44//44//44//44//44//44//44//44//44,Other//Other//Other//Other//Other//Other//Other//Other//Other,34404496.0,L6084MTC


## Get Locode Substring

In [14]:
string = proj['summary_recipient_defined_text_field_1_value'].iloc[0]

In [15]:
string

'L5253SCAG'

In [16]:
print(string.find('5'))
print(string.find('3'))

1
4


In [17]:
# need to extract string from position 1-4. 

In [18]:
proj['locode'] = proj.summary_recipient_defined_text_field_1_value.apply(lambda x: x[1:5])

In [19]:
proj.head()

Unnamed: 0,fmis_transaction_date,program_code,program_code_description,project_number,recipient_project_number,project_title,county_code,congressional_district,improvement_type,improvement_type_description,obligations_amount,summary_recipient_defined_text_field_1_value,locode
0,2022-06-01,YS30//YS30,HIGHWAY SAFETY IMP PROG IIJA,5253021,0720000168L,120TH STREET FROM PRAIRIE AVENUE TO FELTON AVENUE. TRAFFIC SIGNAL UPGRADES AT NINE SIGNALIZED INTERSECTIONS AND IMPROVE CROSSINGS AND SIGNAGE.,37,Cong Dist 43,17//21,Construction Engineering//Safety,1305800.0,L5253SCAG,5253
1,2022-09-15,Y001,NATIONAL HIGHWAY PERF IIJA,NBIL539,0815000215L,"13TH STREET OVER WILSON CREEK FROM OAK GLEN ROAD TO KENTUCKY STREET, LWC 00L0017 REPLACE LOW WATER CROSSING WITH 2-LANE BRIDGE",71,Cong Dist 8,44,Other,301838.32,L5457SCAG,5457
2,2022-07-25,YS30//YS30,HIGHWAY SAFETY IMP PROG IIJA,5132050,0421000374L,"16 LOCATIONS IN THE CITY OF FAIRFIELD INSTALL ADAPTIVE SIGNAL TIMING AND ADVANCED DILEMMA-ZONE DETECTION, AND ENHANCE THE VISIBILITY OF SIGNALS.",95,Cong Dist 3,17//21,Construction Engineering//Safety,935150.0,L5132MTC,5132
3,2022-03-07,Y233//Y233//Y233//Y233,STBG IIJA OFF-SYSTEM BRIDGE,5914078,0100020461L,2.2 MILES NORTH OF STATE ROUTE 20 ON WITTER SPRINGS ROAD OVER COOPER CREEK. BR.# 14C0119 BRIDGE REPLACEMENT (TC),33,Cong Dist 3,11//15//16//17,Bridge Replacement - No Added Capacity//Preliminary Engineering//Right of Way//Construction Engineering,1615000.0,L5914NON-MPO,5914
4,2022-07-18,YS30//YS30,HIGHWAY SAFETY IMP PROG IIJA,5094071,0419000558L,21 INTERSECTIONS; ON ALAMO DRIVE FROM MERCHANT STREET TO NUT TREE ROAD AND ON PEABODY ROAD FROM ELMIRA ROAD TO FOXBORO PARKWAY IMPROVE SIGNAL HARDWAR,95,Cong Dist 3,17//21,Construction Engineering//Safety,2027700.0,L5094MTC,5094


In [20]:
county.sample()

Unnamed: 0,agency_locode,agency_name,district,mpo,county
34,5035,Colusa,3.0,NON-MPO,Colusa County


## Merge data on Locode

In [21]:
proj['locode'] = pd.to_numeric(proj['locode'], errors='coerce')

In [22]:
proj_all = (pd.merge(proj, locodes, left_on='locode', right_on='agency_locode', how='left'))

In [23]:
proj_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 330 entries, 0 to 329
Data columns (total 21 columns):
 #   Column                                        Non-Null Count  Dtype         
---  ------                                        --------------  -----         
 0   fmis_transaction_date                         330 non-null    datetime64[ns]
 1   program_code                                  330 non-null    object        
 2   program_code_description                      330 non-null    object        
 3   project_number                                330 non-null    object        
 4   recipient_project_number                      330 non-null    object        
 5   project_title                                 330 non-null    object        
 6   county_code                                   330 non-null    int64         
 7   congressional_district                        330 non-null    object        
 8   improvement_type                              330 non-null    object  

In [24]:
proj_all = proj_all.rename(columns={'agency_name':'implementing_agency',
                                   'locode':'implementing_agency_locode'})

In [25]:
#if we use other locode list then drop these columns
proj_all.drop(columns =['active_e76s______7_12_2021_', 'mpo_locode_fads', 'agency_locode'], axis=1, inplace=True)

In [26]:
proj_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 330 entries, 0 to 329
Data columns (total 18 columns):
 #   Column                                        Non-Null Count  Dtype         
---  ------                                        --------------  -----         
 0   fmis_transaction_date                         330 non-null    datetime64[ns]
 1   program_code                                  330 non-null    object        
 2   program_code_description                      330 non-null    object        
 3   project_number                                330 non-null    object        
 4   recipient_project_number                      330 non-null    object        
 5   project_title                                 330 non-null    object        
 6   county_code                                   330 non-null    int64         
 7   congressional_district                        330 non-null    object        
 8   improvement_type                              330 non-null    object  

In [27]:
# one locode did not match
proj_all>>filter(_.implementing_agency_locode.isnull())

Unnamed: 0,fmis_transaction_date,program_code,program_code_description,project_number,recipient_project_number,project_title,county_code,congressional_district,improvement_type,improvement_type_description,obligations_amount,summary_recipient_defined_text_field_1_value,implementing_agency_locode,implementing_agency,district,county_name,rtpa_name,mpo_name
284,2022-04-26,Y001,NATIONAL HIGHWAY PERF IIJA,NBIS522,0000001453L5,STATEWIDE - IN SERVICE BRIDGES OWNED BY LOCAL AGENCIES FEDERALLY MANDATED BRIDGE INSPECTION PROGRAM,67,Cong Dist 3,49,Bridge Inspection and Bridge Related Training,14164800.0,S NON-MPO,,,,,,


In [28]:
proj_all.head()

Unnamed: 0,fmis_transaction_date,program_code,program_code_description,project_number,recipient_project_number,project_title,county_code,congressional_district,improvement_type,improvement_type_description,obligations_amount,summary_recipient_defined_text_field_1_value,implementing_agency_locode,implementing_agency,district,county_name,rtpa_name,mpo_name
0,2022-06-01,YS30//YS30,HIGHWAY SAFETY IMP PROG IIJA,5253021,0720000168L,120TH STREET FROM PRAIRIE AVENUE TO FELTON AVENUE. TRAFFIC SIGNAL UPGRADES AT NINE SIGNALIZED INTERSECTIONS AND IMPROVE CROSSINGS AND SIGNAGE.,37,Cong Dist 43,17//21,Construction Engineering//Safety,1305800.0,L5253SCAG,5253.0,Hawthorne,7.0,Los Angeles County,Los Angeles County Metropolitan Transportation Auth.,Southern California Association Of Governments
1,2022-09-15,Y001,NATIONAL HIGHWAY PERF IIJA,NBIL539,0815000215L,"13TH STREET OVER WILSON CREEK FROM OAK GLEN ROAD TO KENTUCKY STREET, LWC 00L0017 REPLACE LOW WATER CROSSING WITH 2-LANE BRIDGE",71,Cong Dist 8,44,Other,301838.32,L5457SCAG,5457.0,Yucaipa,8.0,San Bernardino County,San Bernardino Associated Governments,Southern California Association Of Governments
2,2022-07-25,YS30//YS30,HIGHWAY SAFETY IMP PROG IIJA,5132050,0421000374L,"16 LOCATIONS IN THE CITY OF FAIRFIELD INSTALL ADAPTIVE SIGNAL TIMING AND ADVANCED DILEMMA-ZONE DETECTION, AND ENHANCE THE VISIBILITY OF SIGNALS.",95,Cong Dist 3,17//21,Construction Engineering//Safety,935150.0,L5132MTC,5132.0,Fairfield,4.0,Solano County,Metropolitan Transportation Commission,Metropolitan Transportation Commission
3,2022-03-07,Y233//Y233//Y233//Y233,STBG IIJA OFF-SYSTEM BRIDGE,5914078,0100020461L,2.2 MILES NORTH OF STATE ROUTE 20 ON WITTER SPRINGS ROAD OVER COOPER CREEK. BR.# 14C0119 BRIDGE REPLACEMENT (TC),33,Cong Dist 3,11//15//16//17,Bridge Replacement - No Added Capacity//Preliminary Engineering//Right of Way//Construction Engineering,1615000.0,L5914NON-MPO,5914.0,Lake County,1.0,Lake County,Lake County/City Area Planning Council,NON-MPO
4,2022-07-18,YS30//YS30,HIGHWAY SAFETY IMP PROG IIJA,5094071,0419000558L,21 INTERSECTIONS; ON ALAMO DRIVE FROM MERCHANT STREET TO NUT TREE ROAD AND ON PEABODY ROAD FROM ELMIRA ROAD TO FOXBORO PARKWAY IMPROVE SIGNAL HARDWAR,95,Cong Dist 3,17//21,Construction Engineering//Safety,2027700.0,L5094MTC,5094.0,Vacaville,4.0,Solano County,Metropolitan Transportation Commission,Metropolitan Transportation Commission


In [29]:
list(proj_all.implementing_agency.unique())

['Hawthorne',
 'Yucaipa',
 'Fairfield',
 'Lake County',
 'Vacaville',
 'Benicia',
 'Pico Rivera',
 'Marin County',
 'Los Banos',
 'Santa Clara County',
 'Santa Barbara County',
 'Fresno County',
 'Santa Ana',
 'Stockton',
 'Caltrans',
 'Sacramento',
 'San Benito County',
 'Pinole',
 'Sacramento County',
 'Merced County',
 'Shasta County',
 'Livingston',
 'El Dorado County',
 'Sanger',
 'Palmdale',
 'Tulare County',
 'Coachella',
 'Bakersfield',
 'Colton',
 'Visalia',
 'Rialto',
 'Metropolitan Transportation Commission',
 'Bellflower',
 'Belmont',
 'Larkspur',
 'Monterey County',
 'Madera County',
 'Moraga',
 'Orinda',
 'Redding',
 'Cathedral City',
 'Alameda',
 'Clovis',
 'Gardena',
 'Mission Viejo',
 'Pleasanton',
 'Fresno',
 'Butte County',
 'Rohnert Park',
 'Alameda County Transportation Commission',
 'Yolo County',
 'Stanislaus County',
 'Placer County',
 'Contra Costa County',
 'Mariposa County',
 'Santa Barbara',
 'Nevada County',
 'Calaveras County',
 'Long Beach',
 'Los Angeles

## Writing to GCS

In [30]:
#proj_all.to_csv(f"{GCS_FILE_PATH}/FMIS_projects_wip.csv")

## Adding Place Names (can do)
* we have a list of city names to county from [Caltrans PlaceNames](https://dot.ca.gov/-/media/dot-media/programs/research-innovation-system-information/documents/place-names/2019-place-names-in-california-a11y.pdf)


In [31]:
# city_place_names = (to_snakecase(pd.read_excel('gs://calitp-analytics-data/data-analyses/dla/e-76Obligated/2020-place-names-locode.xlsx', sheet_name=0)))

In [32]:
# city_place_names.sample()

In [33]:
# city_place_names.drop(columns =['unnamed:_1', 'unnamed:_3', 'unnamed:_4','unnamed:_6','unnamed:_7', 'date_of_incorporation',
#                                'city_name_abbr_','name','dist_', 'co_'], axis=1, inplace=True)

In [34]:
# (pd.merge(proj_all, city_place_names, left_on='agency_locode', right_on='ct_city_code', how='left', indicator=True))._merge.value_counts()

In [35]:
# proj_all1 = (pd.merge(proj_all, city_place_names, left_on='agency_locode', right_on='ct_city_code', how='left', indicator='City'))

In [36]:
# proj_all1.sample(2)

In [37]:
# proj_all1>>filter(_.City!='both')

In [38]:
# county_place_names = (to_snakecase(pd.read_excel('gs://calitp-analytics-data/data-analyses/dla/e-76Obligated/2020-place-names-locode.xlsx', sheet_name=1)))

In [39]:
# county_place_names

## Project Location

In [40]:
location = (proj_all>>select(_.implementing_agency, _.county_name, _.project_title))

In [41]:
location.sample()

Unnamed: 0,implementing_agency,county_name,project_title
22,Pinole,Contra Costa County,"APPIAN WAY AND MARLESTA ROAD INTERSECTION SAFETY IMPROVEMENTS: INSTALLATION OF A TRAFFIC SIGNAL AND TRAFFIC SIGNAGE, INCLUDING ADVANCED WARNING SIGNS"


In [42]:
len(location)

330

## Project Title

In [43]:
#phrase extraction maybe: https://stackoverflow.com/questions/70995812/extract-keyword-from-sentences-in-a-pandas-text-column-using-nltk-and-or-regex

In [44]:
# ! pip install nltk
# ! pip install textblob
# ! pip install wordcloud

In [45]:
import nltk
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

In [46]:
import matplotlib.pyplot as plt
import matplotlib as mpl
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [47]:

import re

In [48]:
#subset df to work on title
title = proj_all>>select(_.implementing_agency, _.program_code_description, _.project_title, _.program_code)

In [49]:
title

Unnamed: 0,implementing_agency,program_code_description,project_title,program_code
0,Hawthorne,HIGHWAY SAFETY IMP PROG IIJA,120TH STREET FROM PRAIRIE AVENUE TO FELTON AVENUE. TRAFFIC SIGNAL UPGRADES AT NINE SIGNALIZED INTERSECTIONS AND IMPROVE CROSSINGS AND SIGNAGE.,YS30//YS30
1,Yucaipa,NATIONAL HIGHWAY PERF IIJA,"13TH STREET OVER WILSON CREEK FROM OAK GLEN ROAD TO KENTUCKY STREET, LWC 00L0017 REPLACE LOW WATER CROSSING WITH 2-LANE BRIDGE",Y001
2,Fairfield,HIGHWAY SAFETY IMP PROG IIJA,"16 LOCATIONS IN THE CITY OF FAIRFIELD INSTALL ADAPTIVE SIGNAL TIMING AND ADVANCED DILEMMA-ZONE DETECTION, AND ENHANCE THE VISIBILITY OF SIGNALS.",YS30//YS30
3,Lake County,STBG IIJA OFF-SYSTEM BRIDGE,2.2 MILES NORTH OF STATE ROUTE 20 ON WITTER SPRINGS ROAD OVER COOPER CREEK. BR.# 14C0119 BRIDGE REPLACEMENT (TC),Y233//Y233//Y233//Y233
4,Vacaville,HIGHWAY SAFETY IMP PROG IIJA,21 INTERSECTIONS; ON ALAMO DRIVE FROM MERCHANT STREET TO NUT TREE ROAD AND ON PEABODY ROAD FROM ELMIRA ROAD TO FOXBORO PARKWAY IMPROVE SIGNAL HARDWAR,YS30//YS30
...,...,...,...,...
325,Indio,NATIONAL HIGHWAY PERF IIJA,"WESTBOUND INDIO BOULEVARD OVER WHITEWATER RIVER, BR. NO. 56C-0292 SEISMIC RETROFIT AND SCOUR COUNTERMEASURES",Y001//Y001
326,Kingsburg,CONGESTION MITIGATION IIJA,WESTSIDE OF 18TH AVE FROM STROUD AVE TO KLEPPER ST CONSTRUCT NEW SIDEWALKS,Y400//Y400
327,Santa Ana,TRANS ALTERNATIVES >200K IIJA,"WILLITS STREET FROM FAIRVIEW STREET TO E/S OF RAITT STREET INSTALL MEDIAN, PARKING-PROTECTED BICYCLE LANES, AND DEDICATED BICYCLE SIGNAL HEADS (TC)",Y301//Y301
328,San Joaquin County,STBG IIJA OFF-SYSTEM BRIDGE,WIMER ROAD OVER INDIAN CREEK NORTH BRANCH (BRIDGE 29C0303) BRIDGE REPLACEMENT (TC),Y233//Y233


In [50]:
title.program_code_description.value_counts()

HIGHWAY SAFETY IMP PROG IIJA      75
STBG-URBANIZED >200K IIJA         61
NATIONAL HIGHWAY PERF IIJA        59
STBG IIJA OFF-SYSTEM BRIDGE       49
CONGESTION MITIGATION IIJA        37
SURFAC TRNSP BLK GRTS-FLX IIJA    18
TRANS ALTERNATIVES >200K IIJA     14
PROJ TO REDUCE PM 2.5 EMI IIJA    12
TRANSP ALTERNATIVES FLEX IIJA      4
TRANS ALTERN 50K-200K POP IIJA     1
Name: program_code_description, dtype: int64

In [51]:
def get_list_of_words(df, col):
    nltk.download('stopwords')
    nltk.download('punkt')
    
    #get just the one col
    column = df[[col]]
    #remove single-dimensional entries from the shape of an array
    col_text = column.squeeze()
    # get list of words
    text_list = col_text.tolist()
    #join list of words 
    text_list = ' '.join(text_list).lower()
    
    # remove punctuation 
    text_list = re.sub(r'[^\w\s]','',text_list)
    swords = [re.sub(r"[^A-z\s]", "", sword) for sword in stopwords.words('english')]
    # remove stopwords
    clean_text_list = [word for word in word_tokenize(text_list.lower()) if word not in swords] 
    # turn into a dataframe
    clean_text_list = pd.DataFrame(np.array(clean_text_list))

    return clean_text_list

In [52]:
clean_title_list_df = get_list_of_words(title, "project_title")

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [53]:
clean_title_list_df.value_counts()

bridge     178
road       142
avenue     115
ave         77
street      75
          ... 
jamacha      1
bellota      1
belmont      1
j            1
00           1
Length: 1729, dtype: int64

In [54]:
clean_title_list_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4910 entries, 0 to 4909
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       4910 non-null   object
dtypes: object(1)
memory usage: 38.5+ KB


In [55]:
#splitting up the types of words based on the most common words appearing in title name

fix_type = ['REPLACEMENT', 'INSTALL', 'CONSTRUCT', 'REPLACE', 'SIGNAL', 'TRAFFIC',
           'IMPROVEMENT', 'PEDESTRIAN', 'LANES', 'NEW', 'REHABILITATION',
           'UPGRADE', 'CLASS', 'BIKE', 'WIDEN', 'LANDSCAPING', 'SAFETY', 'RAISED', 
            'SEISMIC', 'SIGNAGE', 'RETROFIT', 'ADD', 'PLANNING', 'PAVE'
           'PREVENTIVE','MAINTENANCE', 'REHAB', 'RESURFACE', 'REPAIR', 'ROUNDABOUT']

area_type = ['BRIDGE', 'ROAD', 'RD', 'AVENUE', 'AVE', 'STREET' , 'ST',
             'FRACTURED', 'LANE', 'DRIVE', 'boulevard', 'BLVD',
             'INTERSECTION', 'intersections', 'WAY', 'DR', 'CURB', 'ROADWAY',
             'TRAIL', 'PATH', 'CREEK', 'RIVER', 'SIDEWALK', 'CORRIDOR', 'PARKWAY',
            'RAMPS', 'GUARDRAIL']

jurisdiction = ['CITY', 'COUNTY', 'STATE', 'UNINCORPORATED']

other = ['TC', 'EXISTING']

In [56]:
def tokenize(texts):
    return [nltk.tokenize.word_tokenize(t) for t in texts]

### using  np.concatenate

In [57]:
#! pip install more-itertools

In [58]:
#from more_itertools import split_after

In [59]:
## code help: https://stackoverflow.com/questions/70995812/extract-keyword-from-sentences-in-a-pandas-text-column-using-nltk-and-or-regex
def key_word_intersection(df, text_col):
    summaries = []
    for x in tokenize(df[text_col].to_numpy()):
        keywords = np.concatenate([
                                np.intersect1d(x, ['BRIDGE REPLACEMENT', 'BRIDGE', 'INSTALL', 'CONSTRUCT', 'REPLACE',
                                                   'SIGNAL', 'SIGNALS', 'TRAFFIC', 'IMPROVEMENT', 'PEDESTRIAN', 
                                                   'LANES', 'NEW', 'REHABILITATION','UPGRADE', 'CLASS',
                                                   'BIKE', 'WIDEN', 'LANDSCAPING', 'SAFETY', 'RAISED', 
                                                   'SEISMIC', 'SIGNAGE', 'RETROFIT', 'ADD', 'PLANNING', 'PAVE',
                                                   'PREVENTIVE','MAINTENANCE', 'REHAB', 'RESURFACE', 'REPAIR', 'ROUNDABOUT'
                                                  'COMPLETE STREET', 'VIDEO DETECTION EQUIPMENT', 'SYNCHRONIZE CORRIDOR', 'ROADWAY REALIGNMENTS']),
                                np.intersect1d(x, [
                                    # 'BRIDGE', 'ROAD', 'RD', 'AVENUE', 'AVE', 'STREET' , 'ST',
                                                   # 'FRACTURED', 'LANE', 'DRIVE', 'BOULEVARD', 'BLVD',
                                                   'INTERSECTION', 'INTERSECTIONS', 'SIDEWALK', 
                                    # 'WAY', 'DR', 'CURB', 'ROADWAY',
                                                   # 'TRAIL', 'PATH', 'CREEK', 'RIVER', 
                                    # 'CORRIDOR', 'CROSSING','PARKWAY','RAMPS', 'GUARDRAIL'
                                ]), 
                                np.intersect1d(x, ['CITY', 'COUNTY', 'STATE', 'UNINCORPORATED'])])
    
        summaries.append(np.array(x)[[i for i, keyword in enumerate(x) if keyword in keywords]])
    return summaries 

In [60]:
title.sample()

Unnamed: 0,implementing_agency,program_code_description,project_title,program_code
107,Vallejo,HIGHWAY SAFETY IMP PROG IIJA,"FIVE INTERSECTIONS: BROADWAY & HAMPSHIRE ST., BROADWAY & ILLINOIS ST., SPRINGS RD. & LASSEN ST., SPRINGS RD. & TREGASKIS AVE.,SPRINGS RD. & HEARTWOOD",YS30//YS30


In [61]:
text = title>>select (_.project_title)

In [62]:
text['summary'] = key_word_intersection(text, 'project_title')

In [63]:
text.sample(10)

Unnamed: 0,project_title,summary
37,"BAY AREA REGIONAL: REGIONAL PLANNING ACTIVITIES AND PLANNING, PROGRAMMING AND MONITORING (PPM). PRIOR YEAR FUNDING PROGRAMMED ON REG170001 (TC)","[PLANNING, PLANNING]"
109,"FOOTHILL RD (1.5 MI W OF SR33) @ CUYAMA RIVER, SANTA BARBARA COUNTY. REPLACE LWC #00L0047 WITH 2-LANE BRIDGE (TC).","[COUNTY, REPLACE, BRIDGE]"
11,ALAMO PINTADO BRIDGE #51C0081 ON ALAMO PINTADO RD @ ALAMO PINTADO CRK LOS OLIVOS REPAIR & CONSTRUCTION OF SCOUR COUNTERMEASURE (TC),"[BRIDGE, REPAIR]"
115,FOURTEEN (14) SELECT INTERSECTIONS CITYWIDE INSTALL FOURTEEN (14) PEDESTRIAN WARNING FLASHING BEACONS CITYWIDE,"[INTERSECTIONS, INSTALL, PEDESTRIAN]"
317,"VARIOUS SEGMENTS ALONG FLORIN ROAD INSTALL RAISED MEDIAN, INSTALL PEDESTRIAN HYBRID BEACONS, AND ADD PEDESTRIAN FENCING.","[INSTALL, RAISED, INSTALL, PEDESTRIAN, ADD, PEDESTRIAN]"
82,"DTLA ARTS DISTRICT AREA FROM 2ND STREET ON THE NORTH, 7TH STREET ON THE SOUTH, ALAMEDA STREET ON THE WEST AND THE LA RIVER ON THE EAST WITH A FOCUS O",[]
298,"US-50 FROM STATELINE AVENUE TO PIONEER TRAIL REALIGN ROADWAY, REDUCE LANES AND TRANSIT-BIKE-PEDESTRIAN LANE (TC)",[LANES]
126,"GOLDEN VALLEY PARKWAY (TOWNE CENTRE DRIVE AND LATHROP ROAD), RIVER ISLANDS PARKWAY (GOLDEN VALLEY PARKWAY LATHROP FIRE STATION 34), LATHROP ROAD. ROA",[]
5,21 LOCATIONS IN CITY OF BENICIA UPGRADE EXISTING GUARDRAILS AND END TREATMENTS.,"[CITY, UPGRADE]"
64,"COLEMAN CREEK PATH @ SNYDER LANE, HINEBAUGH CREEK PATH @ COUNTRY CLUB DRIVE, HINEBAUGH CREEK PATH @ STATE FARM DRIVE, HINEBAUGH CREEK PATH @ COMMERCE",[STATE]


In [64]:
text>>filter(_.project_title.str.contains('COMPLETE STREET'))

Unnamed: 0,project_title,summary
23,"ARDEN WAY FROM ETHAN WAY TO MORSE AVENUE. ARDEN WAY, FROM FULTON AVENUE TO MORSE AVENUE COMPLETE STREET IMPROVEMENTS INCLUDING SEPARATED SIDEWALK, C",[SIDEWALK]
29,"AVE R BETWEEN SIERRA HIGHWAY AND 25TH STREET. AVE R COMPLETE STREET WITH SIDEWALKS GAP CLOSURES, BIKE LANES, ADA RAMPS AND ENHANCED CROSSWALKS","[BIKE, LANES]"
99,EL CAMINO REAL (SR82) FROM ARROYO TO KAISER WAY COMPLETE STREET IMPROVEMENTS,[]
129,HEALDSBURG AVENUE BETWEEN POWELL AVENUE AND PASSALAQUA ROAD COMPLETE STREET IMPROVEMENTS,[]
290,THE TOWNSITE DRIVE COMPLETE STREETS PROJECT IS LOCATED IN THE CITY OF VISTA ALONG THE SEGMENT OF NORTH DRIVE FROM JUST NORTHWEST OF THE INTERSECTION,"[CITY, INTERSECTION]"


In [65]:

text>>filter(_.project_title.str.contains('BRIDGE REPLACEMENT'))

Unnamed: 0,project_title,summary
3,2.2 MILES NORTH OF STATE ROUTE 20 ON WITTER SPRINGS ROAD OVER COOPER CREEK. BR.# 14C0119 BRIDGE REPLACEMENT (TC),"[STATE, BRIDGE]"
6,3/8 MI WEST OF PARAMOUNT BLVDWASHINGTON BLVD OVER RIO HONDO.LOCAL BRIDGE # 53C0471 BRIDGE REPLACEMENT (DEMOLISH THE EXISTING 6 LANNE SUPERSTRUCTURE,"[BRIDGE, BRIDGE]"
10,ALAMITOS RD. BRIDGE @ ALAMITOS CK (37C0159) BRIDGE REPLACEMENT/SEISMIC RET (TC),"[BRIDGE, BRIDGE]"
24,ARROYA AVENUE OVER WEST DELTA CANAL (BRIDGE 39C0275) BRIDGE REPLACEMENT (TC),"[BRIDGE, BRIDGE]"
25,"ASH CREEK ROAD AT SACRAMENTO RIVER OVERFLOW, BRIDGE 06C0233 BRIDGE REPLACEMENT","[BRIDGE, BRIDGE]"
41,BON AIR BRIDGE (BON AIR RD OVER CORTE MADERA CREEK). BR.# 27C0028 BRIDGE REPLACEMENT,"[BRIDGE, BRIDGE]"
53,BUCK AVENUE OVER ALAMO CREEKBR NO. 23C0011 BRIDGE REPLACEMENT,[BRIDGE]
54,CANYON RD @ ACID CANAL. BR. # 06C0307 BRIDGE REPLACEMENT,[BRIDGE]
55,CASSEL FALL RIVER ROAD BRIDGE AT PIT RIVER. BR. # 06C0039 BRIDGE REPLACEMENT,"[BRIDGE, BRIDGE]"
68,"CR 40 OVER CACHE CREEK, 0.12 MILES SOUTH OF SR 16 BRIDGE REPLACEMENT. REPLACE EXISTING 1 LANE BRIDGE WITH A NEW 1 LANE BRIDGE. BR#22C0091 (TC)","[BRIDGE, REPLACE, BRIDGE, NEW, BRIDGE]"


In [66]:
text>>filter(_.project_title.str.contains('WIDEN'))

Unnamed: 0,project_title,summary
81,"DOWNTOWN LOS ANGELES, BROADWAY BETWEEN 4TH AND 6TH STREETS PEDESTRIAN SAFETY IMPROVEMENTS INCLUDING CURB EXTENSIONS, WIDENED SIDEWALK, CROSSWALK AND","[PEDESTRIAN, SAFETY, SIDEWALK]"
104,"FIRST STREET BETWEEN FLOWER ST AND STANDARD AVE WIDEN EXISTING SIDEWALKS BY 3FT, NARROW VEHICLE LANES, AND CONSTRUCT ADA IMPROVEMENTS ON SIDEWALKS AN","[WIDEN, LANES, CONSTRUCT]"
156,"IN STOCKTON, PARALLEL TO MARCH LANE IN THE EAST BAY MUD CORRIDOR BETWEEN BROOKSIDE ROAD AND HILLSBORO WAY. RECONSTRUCT, WIDEN AND IMPROVE EXISTING PA",[WIDEN]
158,"IN THE CITY OF GALT ON KOST ROAD AT UNION PACIFIC RAILROAD CROSSING, WEST OF JOY DR., EAST OF MARIA WAY. WIDEN 400 FEET OF KOST RD. ON EACH SIDE OF T","[CITY, WIDEN]"
181,LA PAZ ROAD: MUIRLANDS BLVD. TO CRISANTA DR. WIDEN TWO OVERHEADS OVER BNSF,[WIDEN]
194,MILLERTON ROAD FROM FRIANT ROAD TO MARINA DRIVE WIDEN ROADWAY FROM 2LU TO 4LD,[WIDEN]
204,"NEES AVENUE FROM MINNEWAWA AVENUE TO CLOVIS AVENUE ROAD WIDENING AND RECONSTRUCTION, INSTALLATION OF CURB AND GUTTERS, SIDEWALK, BICYCLE LANES, MODIF","[SIDEWALK, LANES]"
220,ON OLD OREGON TRAIL BETWEEN PASO ROBLES AVENUE AND BEAR MOUNTAIN ROAD WIDEN SHOULDERS AND UPGRADE DRAINAGE (TC),"[WIDEN, UPGRADE]"
233,PARK ROAD FROM 250 FT SOUTH OF OAK ROAD TO BAYSHORE ROAD REHAB./RESURFACE ROADWAY AND WIDEN ROAD TO INSTALL ASPHALT CONCRETE PAVED CLASS II/V BICYC,"[WIDEN, INSTALL, CLASS]"
241,"POWER INN ROAD FROM ELSIE AVENUE TO ABOUT 400 FEET NORTH OF MACFADDEN DRIVE. INSTALL CURB, GUTTER, SIDEWALK INFILL AND CURB RAMPS; WIDEN SUBSTANDARD","[INSTALL, SIDEWALK, WIDEN]"


In [67]:
type(text['summary'])

pandas.core.series.Series

### using if statement

In [68]:
title.sample()

Unnamed: 0,implementing_agency,program_code_description,project_title,program_code
310,Riverside,HIGHWAY SAFETY IMP PROG IIJA,"VARIOUS LOCATIONS THROUGHOUT THE CITY OF RIVERSIDE INSTALL HIGH FRICTION SURFACE TREATMENT AT FIVE LOCATIONS AND 2 HAWK SIGNALS, AND DEPLOY NEW SIGNA",YS30


In [69]:
type_list = ['BRIDGE REPLACEMENT', 'COMPLETE STREET', 'VIDEO DETECTION EQUIPMENT', 'SYNCHRONIZE CORRIDOR', 'ROADWAY REALIGNMENTS']

In [70]:
text>>filter(
            _.project_title.str.contains('UPGRADE')| _.project_title.str.contains('IMPROVE')
            )

Unnamed: 0,project_title,summary
0,120TH STREET FROM PRAIRIE AVENUE TO FELTON AVENUE. TRAFFIC SIGNAL UPGRADES AT NINE SIGNALIZED INTERSECTIONS AND IMPROVE CROSSINGS AND SIGNAGE.,"[TRAFFIC, SIGNAL, INTERSECTIONS, SIGNAGE]"
4,21 INTERSECTIONS; ON ALAMO DRIVE FROM MERCHANT STREET TO NUT TREE ROAD AND ON PEABODY ROAD FROM ELMIRA ROAD TO FOXBORO PARKWAY IMPROVE SIGNAL HARDWAR,"[INTERSECTIONS, SIGNAL]"
5,21 LOCATIONS IN CITY OF BENICIA UPGRADE EXISTING GUARDRAILS AND END TREATMENTS.,"[CITY, UPGRADE]"
20,AMERICAN AVENUE FROM MADERA AVENUE TO PLACER AVENUE SHOULDER IMPROVEMENTS,[]
22,"APPIAN WAY AND MARLESTA ROAD INTERSECTION SAFETY IMPROVEMENTS: INSTALLATION OF A TRAFFIC SIGNAL AND TRAFFIC SIGNAGE, INCLUDING ADVANCED WARNING SIGNS","[INTERSECTION, SAFETY, TRAFFIC, SIGNAL, TRAFFIC, SIGNAGE]"
23,"ARDEN WAY FROM ETHAN WAY TO MORSE AVENUE. ARDEN WAY, FROM FULTON AVENUE TO MORSE AVENUE COMPLETE STREET IMPROVEMENTS INCLUDING SEPARATED SIDEWALK, C",[SIDEWALK]
26,AT MAX FOSTER SPORTS COMPLEX AREA IN LIVINGSTON CONSTRUCT MULTI-USE PATH IMPROVEMENTS,[CONSTRUCT]
81,"DOWNTOWN LOS ANGELES, BROADWAY BETWEEN 4TH AND 6TH STREETS PEDESTRIAN SAFETY IMPROVEMENTS INCLUDING CURB EXTENSIONS, WIDENED SIDEWALK, CROSSWALK AND","[PEDESTRIAN, SAFETY, SIDEWALK]"
85,E. 20TH STREET FROM THE MALL TO THE END OF BUSINESS LANE (ADJACENT TO SR99) BIKEWAY IMPROVEMENTS,[]
99,EL CAMINO REAL (SR82) FROM ARROYO TO KAISER WAY COMPLETE STREET IMPROVEMENTS,[]


In [71]:
text.sample(10)

Unnamed: 0,project_title,summary
193,MIDWAY RD @ BUTTE CREEK 3.9 MILES SOUTH OF NELSON RD. BRIDGE NO. 12C0052 & 12C0053 BRIDGE REPLACEMENT,"[BRIDGE, BRIDGE]"
270,"SIXTH STREET VIADUCT OVER LA RIVER, US 101, AND UPRR, BNSF, AMTRACK, LACMTA AND METROLINK TRACKS BRIDGE # 53C-1880 AND STATE BRIDGE # 53-0595THIS PRO","[BRIDGE, STATE, BRIDGE]"
275,"SOUTHWORTH RD, GOLD STRIKE RD, POOL STATION RD, GARABALDI ST, MURPHYS GRADE RD, COPPER COVE DR, AND O'BYRNES FERRY RD. UPGRADE EXISTING GUARDRAILS,",[UPGRADE]
257,"SAN FRANCISCO BAY AREA REGIONAL PLANNING ACTIVITIES AND PLANNING, PROGRAMMING, AND MONITORING (PPM). IN COORDINATION WITH MTC, COUNTY TRANSPORTATION","[PLANNING, PLANNING, COUNTY]"
165,"INTERSECTION OF PELANDALE AVENUE AND SR 99 IN MODESTO, CA TO INSTALL A SECOND LEFT TURN LANE FROM EASTBOUND PELANDALE AVENUE TO NORTHBOUND SISK ROAD","[INTERSECTION, INSTALL]"
102,FERNALD PT. LN. BRIDGE #51C0137 @ ROMERO CRK IN SANTA BARBARA COUNTY BRIDGE REPLACEMENT (TC),"[BRIDGE, COUNTY, BRIDGE]"
79,DOGTOWN ROAD OVER MAXWELL CREEK (BRIDGE 40C0039) BRIDGE REPLACEMENT (TC),"[BRIDGE, BRIDGE]"
309,VARIOUS LOCATIONS THROUGHOUT THE CITY OF REDLANDS INSTALL EMERGENCY VEHICLE PREEMPTION DEVICES AT SIGNALIZED INTERSECTIONS,"[CITY, INSTALL, INTERSECTIONS]"
315,"VARIOUS ROAD CORRIDORS IN WESTERN NEVADA COUNTY. CONDUCT GUARDRAIL SAFETY AUDIT, IDENTIFY DEFICIENCIES, AND REPAIR OR UPGRADE GUARDRAIL AS NECESSARY","[COUNTY, SAFETY, REPAIR, UPGRADE]"
72,DANVILLE BOULEVARD BETWEEN STONE VALLEY ROAD AND JACKSON WAY IN UNINCORPORATED ALAMO. CONSTRUCT A ROUNDABOUT AT THE INTERSECTION OF DANVILLE BOULEVAR,"[UNINCORPORATED, CONSTRUCT, INTERSECTION]"


In [72]:
#  def project_cat(row): 
#         if (row.project_title.str.contains("BRIDGE REPLACEMENT")):
#             return "Bridge Replacement"
        
#         elif (row.project_title.str.contains(" WIDEN SHOULDERS ")):
#             return "Widen Shoulders"
        
#         elif (row.project_title.str.contains("SYNCHRONIZE CORRIDOR")):
#             return "Synchronize Corridor"
        
#         elif (row.project_title.str.contains("COMPLETE STREET")):
#             return "Complete Streets"
        
#         elif (row.project_title.str.contains('TRAFFIC SIGNAL') and row.project_title.str.contains('INSTALL')):
#             return "Install Traffic Signal"
        
#         elif (row.project_title.str.contains('BRIDGE PREVENTIVE MAINTENANCE')):
#             return "Bridge Preventive Maintenance"
        
#         elif (row.project_title.str.contains("UPGRADE") and row.project_title_str.contains('GUARDRAIL')):
#             return "Upgrade Guardrail"
#         elif (row.project_title.str.contains("UPGRADE") and row.project_title_str.contains('TRAFFIC SIG')):
#             return "Upgrade Traffic Signal"
    
#         ## return string to identify which cases do not fit.
#         else:
#             return ""
        
#     # df['project_type'] = df.apply(lambda x: project_cat(x), axis=1)

### Function for getting project types using np.where
(str format)

In [150]:
def add_description(df):
    ##using np.where. code help: https://stackoverflow.com/questions/43905930/conditional-if-statement-if-value-in-row-contains-string-set-another-column
    ## method for project in first column
    df['project_method'] = (np.where(df.project_title.str.contains("INSTALL"), "Install",
                        np.where(df.project_title.str.contains("CONSTRUCT"), "Construct",
                        np.where(df.project_title.str.contains("UPGRADE"), "Upgrade",
                        np.where(df.project_title.str.contains("IMPROVE"), "Improve",
                        np.where(df.project_title.str.contains("REPAIR"), "Repair",
                        np.where(df.project_title.str.contains("REPLACE"), "Replace",
                        np.where(df.project_title.str.contains("REPLACE ")& df.project_title.str.contains("BRIDGE"), "",
                        np.where(df.project_title.str.contains("REPLACE")& df.project_title.str.contains("GUARDRAIL"), "Replace",
                        np.where(df.project_title.str.contains("PAVE")| df.project_title.str.contains("PAVING"), "Pave",
                        
                                    ""))))))))))
    
    ## types of projects in second column
    df['project_type'] = (
                        #np.where(df.project_title.str.contains("BRIDGE REPLACEMENT") , "Bridge Replacement",
                        np.where(df.project_title.str.contains("SHOULDER"), "Shoulders",
                        np.where(df.project_title.str.contains("SYNCHRONIZE CORRIDOR"), "Synchronize Corridor",
                        np.where(df.project_title.str.contains("COMPLETE STREET"), "Complete Streets",
                        np.where(df.project_title.str.contains("BRIDGE PREVENTIVE MAINTENANCE"), "Bridge Preventive Maintenance",
                        np.where(df.project_title.str.contains("SIDEWALK"), "Sidewalk",
                        np.where(df.project_title.str.contains("SCOUR"), "Erosion Countermeasures",
                        np.where(df.project_title.str.contains("ROUNDABOUT"), "Roundabout",
                        np.where(df.project_title.str.contains("GUARDRAIL"), "Guardrails",
                        np.where(df.project_title.str.contains("VIDEO DETECTION EQUIPMENT"), "Video Detection Equipment",
                        np.where(df.project_title.str.contains("PEDESTRIAN") & df.project_title.str.contains("BIKE") , "Pedestrian  & Bike Safety Improvements",
                        np.where(df.project_title.str.contains("BRIDGE") & df.project_title.str.contains("REHAB") , "Bridge Rehabilitation",
                        np.where(df.project_title.str.contains("PAVEMENT") & df.project_title.str.contains("REHAB") , "Pavement Rehabilitation",
                        np.where(df.project_title.str.contains("PEDESTRIAN"), "Pedestrian Safety Improvements",
                        np.where(df.project_title.str.contains("TRAFFIC SIG"), "Traffic Signals",
                        np.where(df.project_title.str.contains("BIKE SHARE"), "Bike Share Program",
                        np.where(df.project_title.str.contains("BIKE"), "Bike Lanes",                  
                        np.where(df.project_title.str.contains("SIGNAL"), "Signals",
                        np.where(df.project_title.str.contains("SIGN"), "Signage",
                        np.where(df.project_title.str.contains("BRIDGE"), "Bridge",
                        np.where(df.project_title.str.contains("SAFETY ") & df.project_title.str.contains("IMPROVE") , "Safety Improvemnts",
                                 'Project')
                                   ))))))))))))))))))))#)
    
    ## need to expand this to include more. maybe try a list. but capture entries with multiple projects
    df['other'] = (np.where(df.project_title.str.contains("CURB") & df.project_title.str.contains("SIDEWALK") | df.project_title.str.contains("BIKE"), "Multiple Road",
                                 "Other Projects"))
    
    return df

In [151]:
#sample of function
((add_description(title))).head(10)

Unnamed: 0,implementing_agency,program_code_description,project_title,program_code,project_method,project_type,other,project_name_new
0,Hawthorne,HIGHWAY SAFETY IMP PROG IIJA,120TH STREET FROM PRAIRIE AVENUE TO FELTON AVENUE. TRAFFIC SIGNAL UPGRADES AT NINE SIGNALIZED INTERSECTIONS AND IMPROVE CROSSINGS AND SIGNAGE.,YS30//YS30,Upgrade,Traffic Signals,Other Projects,Upgrade Traffic Signals
1,Yucaipa,NATIONAL HIGHWAY PERF IIJA,"13TH STREET OVER WILSON CREEK FROM OAK GLEN ROAD TO KENTUCKY STREET, LWC 00L0017 REPLACE LOW WATER CROSSING WITH 2-LANE BRIDGE",Y001,Replace,Bridge,Other Projects,Replace Bridge
2,Fairfield,HIGHWAY SAFETY IMP PROG IIJA,"16 LOCATIONS IN THE CITY OF FAIRFIELD INSTALL ADAPTIVE SIGNAL TIMING AND ADVANCED DILEMMA-ZONE DETECTION, AND ENHANCE THE VISIBILITY OF SIGNALS.",YS30//YS30,Install,Signals,Other Projects,Install Signals
3,Lake County,STBG IIJA OFF-SYSTEM BRIDGE,2.2 MILES NORTH OF STATE ROUTE 20 ON WITTER SPRINGS ROAD OVER COOPER CREEK. BR.# 14C0119 BRIDGE REPLACEMENT (TC),Y233//Y233//Y233//Y233,Replace,Bridge,Other Projects,Replace Bridge Replacement
4,Vacaville,HIGHWAY SAFETY IMP PROG IIJA,21 INTERSECTIONS; ON ALAMO DRIVE FROM MERCHANT STREET TO NUT TREE ROAD AND ON PEABODY ROAD FROM ELMIRA ROAD TO FOXBORO PARKWAY IMPROVE SIGNAL HARDWAR,YS30//YS30,Improve,Signals,Other Projects,Improve Signals
5,Benicia,HIGHWAY SAFETY IMP PROG IIJA,21 LOCATIONS IN CITY OF BENICIA UPGRADE EXISTING GUARDRAILS AND END TREATMENTS.,YS30//YS30,Upgrade,Guardrails,Other Projects,Upgrade Guardrails
6,Pico Rivera,NATIONAL HIGHWAY PERF IIJA,3/8 MI WEST OF PARAMOUNT BLVDWASHINGTON BLVD OVER RIO HONDO.LOCAL BRIDGE # 53C0471 BRIDGE REPLACEMENT (DEMOLISH THE EXISTING 6 LANNE SUPERSTRUCTURE,Y001,Replace,Bridge,Other Projects,Replace Bridge Replacement
7,Marin County,STBG-URBANIZED >200K IIJA,"4 AT TERNERS DRIVE @ DRIVEWAY ENTRANCES TO MULTI-FAMILY HOUSING CLOSEST TO DONAHUE, 4 AT TERNERS DRIVE @ TERRACE WAY, 4 AT TERNERS DRIVE @ TERRACE DR",Y230,,Project,Other Projects,Project
8,Marin County,HIGHWAY SAFETY IMP PROG IIJA,"59 TRAFFIC SIGNAL COUNTYWIDE. JURISDICTIONS INCLUDE CORTE MADERA, FAIRFAX, MILL VALLEY, NOVATO, SAN ANSELMO, SAN RAFAEL, SAUSALITO, MARIN COUNTY, LA",YS30//YS30,,Traffic Signals,Other Projects,Traffic Signals
9,Los Banos,CONGESTION MITIGATION IIJA,5-MILES STRETCH OF SR-152 SYNCHRONIZATION OF 14 TRAFFIC SIGNALS ON A FIVE MILE STRETCH OF SR-152 WITH INSTALLATION OF CCTV CAMERAS. (TC),Y400//Y400,Install,Traffic Signals,Other Projects,Install Traffic Signals


In [152]:
#how many entries with no tags
len((add_description(title))>>filter(_.project_type==('Project')))

98

In [153]:
#entries with no tag
((add_description(title))>>filter(_.project_type==('Project'))>>arrange(-_.project_method)).head(40)

Unnamed: 0,implementing_agency,program_code_description,project_title,program_code,project_method,project_type,other,project_name_new
34,Colton,NATIONAL HIGHWAY PERF IIJA,"BARTON ROAD OVERHEAD AT UNION PACIFIC RAILROAD, 0.25 MILE WEST OF I-215, BR. NO. 54C-0379 REPLACE OVERHEAD WITH TWO-LANE ROAD ACROSS DISCONTINUED RAI",Y001//Y001,Replace,Project,Other Projects,Replace Project
269,Los Angeles,NATIONAL HIGHWAY PERF IIJA,"SIXTH STREET VIADUCT OVER LA RIVER AND EAST SANTA ANNA FREEWAY, 53C1880 REPLACE STRUCT DEFICIENT VIADUCT WITH NEW (TC)",Y001,Replace,Project,Other Projects,Replace Project
281,Orange County Transportation Authority,STBG-URBANIZED >200K IIJA,STATE ROUTE 55 FROM INTERSTATE 405 TO INTERSTATE 5 ADD ONE MIXED-FLOW AND HIGH-OCCUPANCY LANE IN EACH DIRECTION AND REPAIR CHOKEPOINTS (TC),Y230//Y230//Y400,Repair,Project,Other Projects,Repair Project
312,Pasadena,HIGHWAY SAFETY IMP PROG IIJA,VARIOUS LOCATIONS THROUGHOUT THE WEST SIDE OF THE CITY. REPAIR EXISTING DAMAGED GUARDRAI,YS30//YS30,Repair,Project,Other Projects,Repair Project
93,Fresno,TRANSP ALTERNATIVES FLEX IIJA,"EASTBOUND MCKINLEY FROM MILLBROOK AVENUE TO CLOVIS AVENUE ALONG THE NORTH BANK OF THE MILL DITCH CANAL. PAVED PATH, LIGHTING, BENCHES,FENCING, IRRIGA",Y300//Y300,Pave,Project,Other Projects,Pave Project
213,Oakland,TRANS ALTERNATIVES >200K IIJA,"OAKLAND - 14TH STREET SAFE ROUTES IN THE CITY. ON 14TH ST BETWEEN BRUSH ST AND OAK ST REDUCE TRAVEL LANES FROM 4 TO 2, ADD PAVED CLASS IV PROTECTED B",Y301,Pave,Project,Other Projects,Pave Project
15,Fresno County,TRANS ALTERNATIVES >200K IIJA,"ALONG GROVE AVENUE FROM PROSPECT AVENUE TO VALENTINE AVENUE, AND ALONG VALENTINE AVENUE FROM GROVE AVENUE TO NORTH AVENUE INSTALL AN ASPHALT CONCRETE",Y301//Y301,Install,Project,Other Projects,Install Project
17,Stockton,HIGHWAY SAFETY IMP PROG IIJA,ALONG PACIFIC AVE. FROM CALAVERAS RIVER TRAIL TO WEST HAMMER LANE. INSTALL RAISED MEDIAN CURB,YS30,Install,Project,Other Projects,Install Project
96,La Puente,HIGHWAY SAFETY IMP PROG IIJA,"EIGHT (8) CROSSWALK LOCATIONS THROUGHOUT THE CITY OF LA PUENTE. INSTALL RECTANGULAR RAPID FLASHING BEACONS (RRFBS), CURB EXTENSIONS, ADVANCED PAVEMEN",YS30,Install,Project,Other Projects,Install Project
108,Sacramento,STBG-URBANIZED >200K IIJA,FLORIN ROAD BETWEEN GREENHAVEN DRIVE AND LUTHER DRIVE. CONVERT FROM PEDESTAL MOUNTED TO MAST ARMS AT 7 INTERSECTIONS; INSTALL ADVANCED DETECTION AT F,Y230//Y230//Y240//YS30//YS30,Install,Project,Other Projects,Install Project


In [154]:
#checking remaining common words for no project tags
((get_list_of_words(((add_description(title))>>filter(_.project_type=='Project')), "project_title")).value_counts()).head(60)

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


avenue           51
st               44
ave              39
road             36
city             24
blvd             20
street           18
east             17
construct        17
north            15
rd               15
county           14
south            13
intersections    13
drive            12
along            12
west             12
san              11
area             10
way              10
dr                9
tc                9
lane              9
bay               8
path              8
parkway           7
river             7
redwood           6
various           6
project           6
class             6
creek             6
regional          6
streets           6
corridor          6
widen             6
park              6
trail             6
2                 6
install           6
including         6
4                 6
state             6
mile              6
interstate        5
la                5
side              5
intersection      5
alameda           5
planning          5


In [155]:
#sample of some descriptions that got cut off
((add_description(title)))>>filter(_.project_type=='Project')>>filter(_.project_title.str.contains('AVENUE'))

Unnamed: 0,implementing_agency,program_code_description,project_title,program_code,project_method,project_type,other,project_name_new
15,Fresno County,TRANS ALTERNATIVES >200K IIJA,"ALONG GROVE AVENUE FROM PROSPECT AVENUE TO VALENTINE AVENUE, AND ALONG VALENTINE AVENUE FROM GROVE AVENUE TO NORTH AVENUE INSTALL AN ASPHALT CONCRETE",Y301//Y301,Install,Project,Other Projects,Install Project
32,Palmdale,HIGHWAY SAFETY IMP PROG IIJA,AVENUE S-8 AND 40TH STREET EAST. CONVERT AN EXISTING FOUR WAY STOP TO A ROUDABOUT AT THE INTERSECTION OF AVENUE S-8 AND 40TH STREET EAST.. ORIGIO,YS30//YS30,,Project,Other Projects,Project
36,Rialto,HIGHWAY SAFETY IMP PROG IIJA,BASELINE ROAD - WILLOW AVENUE TO MERIDIAN AVENUE CONSTRUCT A 12' WIDE RAISED CENTER MEDIAN AND ADA RAMP.,YS30//YS30,Construct,Project,Other Projects,Construct Project
40,Belmont,STBG-URBANIZED >200K IIJA,"BELMONT: CHULA VISTA FROM ALAMEDA DE LAS PULGAS TO RALSTON AVE, 6TH AVENUE FROM RALSTON AVENUE TO HILL STREET, 6TH AVENUE FROM EMMETT AVENUE TO HARBO",Y230,,Project,Other Projects,Project
67,Sacramento County,HIGHWAY SAFETY IMP PROG IIJA,"COYLE AVENUE FROM WOODLEIGH TO DEWEY, HILLSDALE BOULEVARD FROM FRIZELL AVENUE TO MCCLOUD DRIVE, AND WATT AVENUE FROM ELKHORN BOULEVARD TO 700' NORTH",YS30//YS30,,Project,Other Projects,Project
87,Morgan Hill,STBG-URBANIZED >200K IIJA,EAST DUNNE AVENUE BETWEEN LOWER THOMAS GRADE AND THE EASTERN CITY LIMIT. FEDERAL PARTICIPATING LOCATION SEGMENTS: 1) LOWER THOMAS GRADE TO FLAMING OA,Y230//Y230,,Project,Other Projects,Project
92,Fairfield,PROJ TO REDUCE PM 2.5 EMI IIJA,EAST TABOR AVENUE ( BETWEEN DOVER AVENUE AND CLAY BANK ROAD) AND SUNSET AVENUE ( BETWEEN EAST TABOR AVENUE AND TRAVIS BOULEVARD) IMPLEMENT PEDESTRIA,Y003//Y003//Y240,,Project,Other Projects,Project
93,Fresno,TRANSP ALTERNATIVES FLEX IIJA,"EASTBOUND MCKINLEY FROM MILLBROOK AVENUE TO CLOVIS AVENUE ALONG THE NORTH BANK OF THE MILL DITCH CANAL. PAVED PATH, LIGHTING, BENCHES,FENCING, IRRIGA",Y300//Y300,Pave,Project,Other Projects,Pave Project
122,Fresno,HIGHWAY SAFETY IMP PROG IIJA,"FRESNO STREET AND THOMAS AVENUE INTERSECTIONS, FRESNO STREET AND SAN JOSE AVENUE INTERSECTION, FRESNO AND R STREETS INTERSECTION, FRESNO STREET AND C",YS30//YS30,,Project,Other Projects,Project
130,Santa Rosa,STBG-URBANIZED >200K IIJA,"HEARN AVENUE BETWEEN STONY POINT ROAD, DUTTON AVENUE AND RANGE AVENUE BETWEEN GUERNEVILLE ROAD AND JENNINGS AVENUE, AND RANGE AVENUE BETWEEN RUSSELL",Y230,,Project,Other Projects,Project


In [156]:
## sample of what third colum, `other` captures
((add_description(title)))>>filter(_.other!='Other Projects')


Unnamed: 0,implementing_agency,program_code_description,project_title,program_code,project_method,project_type,other,project_name_new
27,El Dorado County,STBG-URBANIZED >200K IIJA,"AT MISSOURI FLAT RD 0.8 MILES SOUTH OF GOLDEN CENTER DRIVE CONSTRUCT BIKE TRAIL CONNECTION, EXTEND CURB, GUTTER, AND SIDEWALK. TC",Y230//Y230,Construct,Sidewalk,Multiple Road,Construct Sidewalk
29,Palmdale,STBG-URBANIZED >200K IIJA,"AVE R BETWEEN SIERRA HIGHWAY AND 25TH STREET. AVE R COMPLETE STREET WITH SIDEWALKS GAP CLOSURES, BIKE LANES, ADA RAMPS AND ENHANCED CROSSWALKS",Y230,,Complete Streets,Multiple Road,Complete Streets
39,Bellflower,HIGHWAY SAFETY IMP PROG IIJA,"BELLFLOWER BOULEVARD FROM ARTESIA BOULEVARD TO SOUTH CITY LIMITS (350' SOUTH OF ROSE STREET). INSTALLING CLASS II BIKE LANES, RAISED MEDIAN, BULB-OUT",YS30//YS30,Install,Bike Lanes,Multiple Road,Install Bike Lanes
57,Alameda,TRANS ALTERNATIVES >200K IIJA,"CENTRAL BETWEEN PACIFIC AVENUE/MAIN STREET AND SHERMAN STREET/ENCINAL AVENUE. REDUCE ROADWAY FROM 4 TO 3 LANES FOR BIKE LANES AND SEPARATED BIKEWAY,",Y301,,Bike Lanes,Multiple Road,Bike Lanes
81,Los Angeles,TRANSP ALTERNATIVES FLEX IIJA,"DOWNTOWN LOS ANGELES, BROADWAY BETWEEN 4TH AND 6TH STREETS PEDESTRIAN SAFETY IMPROVEMENTS INCLUDING CURB EXTENSIONS, WIDENED SIDEWALK, CROSSWALK AND",Y300//Y300,Improve,Sidewalk,Multiple Road,Improve Sidewalk
85,Chico,CONGESTION MITIGATION IIJA,E. 20TH STREET FROM THE MALL TO THE END OF BUSINESS LANE (ADJACENT TO SR99) BIKEWAY IMPROVEMENTS,Y400,Improve,Bike Lanes,Multiple Road,Improve Bike Lanes
95,Fresno,TRANS ALTERNATIVES >200K IIJA,"EASTSIDE OF BARTON AVENUE FROM CHURCH TO FLORENCE AND FLORENCE AVENUE FROM BARTON TO APPROXIMATELY 105 FT WEST OF JACKSON. INSTALL SIDEWALK, CURB RAM",Y301//Y301,Install,Sidewalk,Multiple Road,Install Sidewalk
119,Fremont,SURFAC TRNSP BLK GRTS-FLX IIJA,FREMONT BLVD. AND WALNUT AVE. INTERSECTION CONNECT TO EXISTING ELEVATED/SEPARATED CLASS IV BIKEWAYS AND SIDEWALKS,Y240//Y240,,Sidewalk,Multiple Road,Sidewalk
120,Metropolitan Transportation Commission,STBG-URBANIZED >200K IIJA,"FREMONT, RICHMOND, AND MARIN AND SONOMA COUNTIES, ALONG THE SMART CORRIDOR. BIKE SHARE CAPITAL PROGRAM (TC)",Y230,,Bike Share Program,Multiple Road,Bike Share Program
142,El Dorado County,CONGESTION MITIGATION IIJA,"IN EL DORADO COUNTY, IN THE COMMUNITY OF POLLOCK PINES, PONY EXPRESS TRAIL, FROM SANDERS DRIVE TO SLY PARK ROAD. INSTALL 1.7 MILES OF CLASS 2 BIKE LA",Y400//Y400,Install,Bike Lanes,Multiple Road,Install Bike Lanes


## Combine for Public friendly title

In [157]:
title_new = (add_description(title))

In [158]:
title_new.head()

Unnamed: 0,implementing_agency,program_code_description,project_title,program_code,project_method,project_type,other,project_name_new
0,Hawthorne,HIGHWAY SAFETY IMP PROG IIJA,120TH STREET FROM PRAIRIE AVENUE TO FELTON AVENUE. TRAFFIC SIGNAL UPGRADES AT NINE SIGNALIZED INTERSECTIONS AND IMPROVE CROSSINGS AND SIGNAGE.,YS30//YS30,Upgrade,Traffic Signals,Other Projects,Upgrade Traffic Signals
1,Yucaipa,NATIONAL HIGHWAY PERF IIJA,"13TH STREET OVER WILSON CREEK FROM OAK GLEN ROAD TO KENTUCKY STREET, LWC 00L0017 REPLACE LOW WATER CROSSING WITH 2-LANE BRIDGE",Y001,Replace,Bridge,Other Projects,Replace Bridge
2,Fairfield,HIGHWAY SAFETY IMP PROG IIJA,"16 LOCATIONS IN THE CITY OF FAIRFIELD INSTALL ADAPTIVE SIGNAL TIMING AND ADVANCED DILEMMA-ZONE DETECTION, AND ENHANCE THE VISIBILITY OF SIGNALS.",YS30//YS30,Install,Signals,Other Projects,Install Signals
3,Lake County,STBG IIJA OFF-SYSTEM BRIDGE,2.2 MILES NORTH OF STATE ROUTE 20 ON WITTER SPRINGS ROAD OVER COOPER CREEK. BR.# 14C0119 BRIDGE REPLACEMENT (TC),Y233//Y233//Y233//Y233,Replace,Bridge,Other Projects,Replace Bridge Replacement
4,Vacaville,HIGHWAY SAFETY IMP PROG IIJA,21 INTERSECTIONS; ON ALAMO DRIVE FROM MERCHANT STREET TO NUT TREE ROAD AND ON PEABODY ROAD FROM ELMIRA ROAD TO FOXBORO PARKWAY IMPROVE SIGNAL HARDWAR,YS30//YS30,Improve,Signals,Other Projects,Improve Signals


In [159]:
title_new['project_name_new'] = title_new["project_method"] + " " + title_new["project_type"]

In [160]:
title_new

Unnamed: 0,implementing_agency,program_code_description,project_title,program_code,project_method,project_type,other,project_name_new
0,Hawthorne,HIGHWAY SAFETY IMP PROG IIJA,120TH STREET FROM PRAIRIE AVENUE TO FELTON AVENUE. TRAFFIC SIGNAL UPGRADES AT NINE SIGNALIZED INTERSECTIONS AND IMPROVE CROSSINGS AND SIGNAGE.,YS30//YS30,Upgrade,Traffic Signals,Other Projects,Upgrade Traffic Signals
1,Yucaipa,NATIONAL HIGHWAY PERF IIJA,"13TH STREET OVER WILSON CREEK FROM OAK GLEN ROAD TO KENTUCKY STREET, LWC 00L0017 REPLACE LOW WATER CROSSING WITH 2-LANE BRIDGE",Y001,Replace,Bridge,Other Projects,Replace Bridge
2,Fairfield,HIGHWAY SAFETY IMP PROG IIJA,"16 LOCATIONS IN THE CITY OF FAIRFIELD INSTALL ADAPTIVE SIGNAL TIMING AND ADVANCED DILEMMA-ZONE DETECTION, AND ENHANCE THE VISIBILITY OF SIGNALS.",YS30//YS30,Install,Signals,Other Projects,Install Signals
3,Lake County,STBG IIJA OFF-SYSTEM BRIDGE,2.2 MILES NORTH OF STATE ROUTE 20 ON WITTER SPRINGS ROAD OVER COOPER CREEK. BR.# 14C0119 BRIDGE REPLACEMENT (TC),Y233//Y233//Y233//Y233,Replace,Bridge,Other Projects,Replace Bridge
4,Vacaville,HIGHWAY SAFETY IMP PROG IIJA,21 INTERSECTIONS; ON ALAMO DRIVE FROM MERCHANT STREET TO NUT TREE ROAD AND ON PEABODY ROAD FROM ELMIRA ROAD TO FOXBORO PARKWAY IMPROVE SIGNAL HARDWAR,YS30//YS30,Improve,Signals,Other Projects,Improve Signals
...,...,...,...,...,...,...,...,...
325,Indio,NATIONAL HIGHWAY PERF IIJA,"WESTBOUND INDIO BOULEVARD OVER WHITEWATER RIVER, BR. NO. 56C-0292 SEISMIC RETROFIT AND SCOUR COUNTERMEASURES",Y001//Y001,,Erosion Countermeasures,Other Projects,Erosion Countermeasures
326,Kingsburg,CONGESTION MITIGATION IIJA,WESTSIDE OF 18TH AVE FROM STROUD AVE TO KLEPPER ST CONSTRUCT NEW SIDEWALKS,Y400//Y400,Construct,Sidewalk,Other Projects,Construct Sidewalk
327,Santa Ana,TRANS ALTERNATIVES >200K IIJA,"WILLITS STREET FROM FAIRVIEW STREET TO E/S OF RAITT STREET INSTALL MEDIAN, PARKING-PROTECTED BICYCLE LANES, AND DEDICATED BICYCLE SIGNAL HEADS (TC)",Y301//Y301,Install,Signals,Other Projects,Install Signals
328,San Joaquin County,STBG IIJA OFF-SYSTEM BRIDGE,WIMER ROAD OVER INDIAN CREEK NORTH BRANCH (BRIDGE 29C0303) BRIDGE REPLACEMENT (TC),Y233//Y233,Replace,Bridge,Other Projects,Replace Bridge
