# Add Agency and Agency Information to FMIS IIJA

In [1]:
import numpy as np
import pandas as pd
from siuba import *

from shared_utils import geography_utils
from dla_utils import _dla_utils

from calitp import to_snakecase



In [2]:
pd.set_option("display.max_columns", 100)
pd.set_option('display.max_colwidth', None)


In [3]:
GCS_FILE_PATH  = 'gs://calitp-analytics-data/data-analyses/dla/dla-iija'

## Read In Data

In [4]:
locodes = to_snakecase(pd.read_excel(f"gs://calitp-analytics-data/data-analyses/dla/e-76Obligated/locodes_updated7122021.xlsx"))

In [5]:
county = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}/Copy of County.xlsx",
                                   sheet_name='locode1',
                                   header=[0]))

In [6]:
proj = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}/CopyofFMIS_Projects_Universe_IIJA_Reporting_4.xls", 
                           # sheet_name='FMIS 5 Projects  ', header=[3]
                           sheet_name='IIJA-combined',
                           # sheet_name='FMIS 5 Projects  ',
                           ))

In [7]:
# proj2 = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}/Copy of FMIS_Projects_Universe_(IIJA_Reporting)_ (5).xls", 
#                            sheet_name='FMIS 5 Projects  ', header=[3]))

In [8]:
# number of entries in the locodes list
len(locodes)

1041

In [9]:
# number of entries in the county locode list
len(county)

1072

In [10]:
county.sample(4)

Unnamed: 0,agency_locode,agency_name,district,mpo,county
280,5282,Palm Springs,8.0,SCAG,Riverside County
46,5047,Hollister,5.0,AMBAG,San Benito County
882,6335,San Joaquin Valley Air Pollution Control District,53.0,NON-MPO,
405,5408,Adelanto,8.0,SCAG,San Bernardino County


In [11]:
locodes.head()

Unnamed: 0,agency_locode,agency_name,district,county_name,rtpa_name,mpo_name,mpo_locode_fads,active_e76s______7_12_2021_
0,6302,Humboldt Bay Harbor Recreation & Conservation District,1,Humboldt County,Humboldt County Association of Governments,NON-MPO,NON-MPO,
1,6330,Willow Creek Community Services District,1,Humboldt County,Humboldt County Association of Governments,NON-MPO,NON-MPO,
2,5036,Trinidad,1,Humboldt County,Humboldt County Association of Governments,NON-MPO,NON-MPO,
3,5049,Ukiah,1,Mendocino County,Mendocino Council of Governments,NON-MPO,NON-MPO,Yes
4,5082,Willits,1,Mendocino County,Mendocino Council of Governments,NON-MPO,NON-MPO,


In [12]:
proj.drop(columns =['unnamed:_0', 'unnamed:_13', 'total'], axis=1, inplace=True)

In [13]:
proj.sample()

Unnamed: 0,fmis_transaction_date,program_code,program_code_description,project_number,recipient_project_number,project_title,county_code,congressional_district,improvement_type,improvement_type_description,obligations_amount,summary_recipient_defined_text_field_1_value
139,2022-08-29,Y400,CONGESTION MITIGATION IIJA,5475042,0318000188L,"IN CITRUS HEIGHTS, BETWEEN ARCADE CREEK PARK PRESERVE AND WACHTEL WAY. CONSTRUCT A 2.9 MILE LONG CLASS 1 MULTI-USE TRAIL. TC",67,Cong Dist 3,28,Facilities for Pedestrians and Bicycles,2000000.0,L5475SACOG


## Get Locode Substring

In [14]:
string = proj['summary_recipient_defined_text_field_1_value'].iloc[0]

In [15]:
string

'L5253SCAG'

In [16]:
print(string.find('5'))
print(string.find('3'))

1
4


In [17]:
# need to extract string from position 1-4. 

In [18]:
proj['locode'] = proj.summary_recipient_defined_text_field_1_value.apply(lambda x: x[1:5])

In [19]:
proj.head()

Unnamed: 0,fmis_transaction_date,program_code,program_code_description,project_number,recipient_project_number,project_title,county_code,congressional_district,improvement_type,improvement_type_description,obligations_amount,summary_recipient_defined_text_field_1_value,locode
0,2022-06-01,YS30//YS30,HIGHWAY SAFETY IMP PROG IIJA,5253021,0720000168L,120TH STREET FROM PRAIRIE AVENUE TO FELTON AVENUE. TRAFFIC SIGNAL UPGRADES AT NINE SIGNALIZED INTERSECTIONS AND IMPROVE CROSSINGS AND SIGNAGE.,37,Cong Dist 43,17//21,Construction Engineering//Safety,1305800.0,L5253SCAG,5253
1,2022-09-15,Y001,NATIONAL HIGHWAY PERF IIJA,NBIL539,0815000215L,"13TH STREET OVER WILSON CREEK FROM OAK GLEN ROAD TO KENTUCKY STREET, LWC 00L0017 REPLACE LOW WATER CROSSING WITH 2-LANE BRIDGE",71,Cong Dist 8,44,Other,301838.32,L5457SCAG,5457
2,2022-07-25,YS30//YS30,HIGHWAY SAFETY IMP PROG IIJA,5132050,0421000374L,"16 LOCATIONS IN THE CITY OF FAIRFIELD INSTALL ADAPTIVE SIGNAL TIMING AND ADVANCED DILEMMA-ZONE DETECTION, AND ENHANCE THE VISIBILITY OF SIGNALS.",95,Cong Dist 3,17//21,Construction Engineering//Safety,935150.0,L5132MTC,5132
3,2022-03-07,Y233//Y233//Y233//Y233,STBG IIJA OFF-SYSTEM BRIDGE,5914078,0100020461L,2.2 MILES NORTH OF STATE ROUTE 20 ON WITTER SPRINGS ROAD OVER COOPER CREEK. BR.# 14C0119 BRIDGE REPLACEMENT (TC),33,Cong Dist 3,11//15//16//17,Bridge Replacement - No Added Capacity//Preliminary Engineering//Right of Way//Construction Engineering,1615000.0,L5914NON-MPO,5914
4,2022-07-18,YS30//YS30,HIGHWAY SAFETY IMP PROG IIJA,5094071,0419000558L,21 INTERSECTIONS; ON ALAMO DRIVE FROM MERCHANT STREET TO NUT TREE ROAD AND ON PEABODY ROAD FROM ELMIRA ROAD TO FOXBORO PARKWAY IMPROVE SIGNAL HARDWAR,95,Cong Dist 3,17//21,Construction Engineering//Safety,2027700.0,L5094MTC,5094


In [20]:
county.sample()

Unnamed: 0,agency_locode,agency_name,district,mpo,county
282,5284,Woodlake,6.0,TCAG,Tulare County


## Merge data on Locode

In [21]:
proj['locode'] = pd.to_numeric(proj['locode'], errors='coerce')

In [22]:
proj_all = (pd.merge(proj, locodes, left_on='locode', right_on='agency_locode', how='left'))

In [23]:
proj_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 330 entries, 0 to 329
Data columns (total 21 columns):
 #   Column                                        Non-Null Count  Dtype         
---  ------                                        --------------  -----         
 0   fmis_transaction_date                         330 non-null    datetime64[ns]
 1   program_code                                  330 non-null    object        
 2   program_code_description                      330 non-null    object        
 3   project_number                                330 non-null    object        
 4   recipient_project_number                      330 non-null    object        
 5   project_title                                 330 non-null    object        
 6   county_code                                   330 non-null    int64         
 7   congressional_district                        330 non-null    object        
 8   improvement_type                              330 non-null    object  

In [24]:
proj_all = proj_all.rename(columns={'agency_name':'implementing_agency',
                                   'locode':'implementing_agency_locode'})

In [25]:
#if we use other locode list then drop these columns
proj_all.drop(columns =['active_e76s______7_12_2021_', 'mpo_locode_fads', 'agency_locode'], axis=1, inplace=True)

In [26]:
proj_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 330 entries, 0 to 329
Data columns (total 18 columns):
 #   Column                                        Non-Null Count  Dtype         
---  ------                                        --------------  -----         
 0   fmis_transaction_date                         330 non-null    datetime64[ns]
 1   program_code                                  330 non-null    object        
 2   program_code_description                      330 non-null    object        
 3   project_number                                330 non-null    object        
 4   recipient_project_number                      330 non-null    object        
 5   project_title                                 330 non-null    object        
 6   county_code                                   330 non-null    int64         
 7   congressional_district                        330 non-null    object        
 8   improvement_type                              330 non-null    object  

In [27]:
# one locode did not match
proj_all>>filter(_.implementing_agency_locode.isnull())

Unnamed: 0,fmis_transaction_date,program_code,program_code_description,project_number,recipient_project_number,project_title,county_code,congressional_district,improvement_type,improvement_type_description,obligations_amount,summary_recipient_defined_text_field_1_value,implementing_agency_locode,implementing_agency,district,county_name,rtpa_name,mpo_name
284,2022-04-26,Y001,NATIONAL HIGHWAY PERF IIJA,NBIS522,0000001453L5,STATEWIDE - IN SERVICE BRIDGES OWNED BY LOCAL AGENCIES FEDERALLY MANDATED BRIDGE INSPECTION PROGRAM,67,Cong Dist 3,49,Bridge Inspection and Bridge Related Training,14164800.0,S NON-MPO,,,,,,


In [28]:
proj_all.head()

Unnamed: 0,fmis_transaction_date,program_code,program_code_description,project_number,recipient_project_number,project_title,county_code,congressional_district,improvement_type,improvement_type_description,obligations_amount,summary_recipient_defined_text_field_1_value,implementing_agency_locode,implementing_agency,district,county_name,rtpa_name,mpo_name
0,2022-06-01,YS30//YS30,HIGHWAY SAFETY IMP PROG IIJA,5253021,0720000168L,120TH STREET FROM PRAIRIE AVENUE TO FELTON AVENUE. TRAFFIC SIGNAL UPGRADES AT NINE SIGNALIZED INTERSECTIONS AND IMPROVE CROSSINGS AND SIGNAGE.,37,Cong Dist 43,17//21,Construction Engineering//Safety,1305800.0,L5253SCAG,5253.0,Hawthorne,7.0,Los Angeles County,Los Angeles County Metropolitan Transportation Auth.,Southern California Association Of Governments
1,2022-09-15,Y001,NATIONAL HIGHWAY PERF IIJA,NBIL539,0815000215L,"13TH STREET OVER WILSON CREEK FROM OAK GLEN ROAD TO KENTUCKY STREET, LWC 00L0017 REPLACE LOW WATER CROSSING WITH 2-LANE BRIDGE",71,Cong Dist 8,44,Other,301838.32,L5457SCAG,5457.0,Yucaipa,8.0,San Bernardino County,San Bernardino Associated Governments,Southern California Association Of Governments
2,2022-07-25,YS30//YS30,HIGHWAY SAFETY IMP PROG IIJA,5132050,0421000374L,"16 LOCATIONS IN THE CITY OF FAIRFIELD INSTALL ADAPTIVE SIGNAL TIMING AND ADVANCED DILEMMA-ZONE DETECTION, AND ENHANCE THE VISIBILITY OF SIGNALS.",95,Cong Dist 3,17//21,Construction Engineering//Safety,935150.0,L5132MTC,5132.0,Fairfield,4.0,Solano County,Metropolitan Transportation Commission,Metropolitan Transportation Commission
3,2022-03-07,Y233//Y233//Y233//Y233,STBG IIJA OFF-SYSTEM BRIDGE,5914078,0100020461L,2.2 MILES NORTH OF STATE ROUTE 20 ON WITTER SPRINGS ROAD OVER COOPER CREEK. BR.# 14C0119 BRIDGE REPLACEMENT (TC),33,Cong Dist 3,11//15//16//17,Bridge Replacement - No Added Capacity//Preliminary Engineering//Right of Way//Construction Engineering,1615000.0,L5914NON-MPO,5914.0,Lake County,1.0,Lake County,Lake County/City Area Planning Council,NON-MPO
4,2022-07-18,YS30//YS30,HIGHWAY SAFETY IMP PROG IIJA,5094071,0419000558L,21 INTERSECTIONS; ON ALAMO DRIVE FROM MERCHANT STREET TO NUT TREE ROAD AND ON PEABODY ROAD FROM ELMIRA ROAD TO FOXBORO PARKWAY IMPROVE SIGNAL HARDWAR,95,Cong Dist 3,17//21,Construction Engineering//Safety,2027700.0,L5094MTC,5094.0,Vacaville,4.0,Solano County,Metropolitan Transportation Commission,Metropolitan Transportation Commission


In [29]:
list(proj_all.implementing_agency.unique())

['Hawthorne',
 'Yucaipa',
 'Fairfield',
 'Lake County',
 'Vacaville',
 'Benicia',
 'Pico Rivera',
 'Marin County',
 'Los Banos',
 'Santa Clara County',
 'Santa Barbara County',
 'Fresno County',
 'Santa Ana',
 'Stockton',
 'Caltrans',
 'Sacramento',
 'San Benito County',
 'Pinole',
 'Sacramento County',
 'Merced County',
 'Shasta County',
 'Livingston',
 'El Dorado County',
 'Sanger',
 'Palmdale',
 'Tulare County',
 'Coachella',
 'Bakersfield',
 'Colton',
 'Visalia',
 'Rialto',
 'Metropolitan Transportation Commission',
 'Bellflower',
 'Belmont',
 'Larkspur',
 'Monterey County',
 'Madera County',
 'Moraga',
 'Orinda',
 'Redding',
 'Cathedral City',
 'Alameda',
 'Clovis',
 'Gardena',
 'Mission Viejo',
 'Pleasanton',
 'Fresno',
 'Butte County',
 'Rohnert Park',
 'Alameda County Transportation Commission',
 'Yolo County',
 'Stanislaus County',
 'Placer County',
 'Contra Costa County',
 'Mariposa County',
 'Santa Barbara',
 'Nevada County',
 'Calaveras County',
 'Long Beach',
 'Los Angeles

## Writing to GCS

In [30]:
#proj_all.to_csv(f"{GCS_FILE_PATH}/FMIS_projects_wip.csv")

## Adding Place Names (can do)
* we have a list of city names to county from [Caltrans PlaceNames](https://dot.ca.gov/-/media/dot-media/programs/research-innovation-system-information/documents/place-names/2019-place-names-in-california-a11y.pdf)


In [31]:
# city_place_names = (to_snakecase(pd.read_excel('gs://calitp-analytics-data/data-analyses/dla/e-76Obligated/2020-place-names-locode.xlsx', sheet_name=0)))

In [32]:
# city_place_names.sample()

In [33]:
# city_place_names.drop(columns =['unnamed:_1', 'unnamed:_3', 'unnamed:_4','unnamed:_6','unnamed:_7', 'date_of_incorporation',
#                                'city_name_abbr_','name','dist_', 'co_'], axis=1, inplace=True)

In [34]:
# (pd.merge(proj_all, city_place_names, left_on='agency_locode', right_on='ct_city_code', how='left', indicator=True))._merge.value_counts()

In [35]:
# proj_all1 = (pd.merge(proj_all, city_place_names, left_on='agency_locode', right_on='ct_city_code', how='left', indicator='City'))

In [36]:
# proj_all1.sample(2)

In [37]:
# proj_all1>>filter(_.City!='both')

In [38]:
# county_place_names = (to_snakecase(pd.read_excel('gs://calitp-analytics-data/data-analyses/dla/e-76Obligated/2020-place-names-locode.xlsx', sheet_name=1)))

In [39]:
# county_place_names

## Project Location

In [40]:
location = (proj_all>>select(_.implementing_agency, _.county_name, _.project_title))

In [41]:
location.sample()

Unnamed: 0,implementing_agency,county_name,project_title
160,Los Angeles,Los Angeles County,"IN THE CITY OF LOS ANGELES, 15 INTERSECTIONS - 42ND PLACE/CENTRAL AVE, SUPERIOR ST/ZEIZAH AVE, PARTHENIA ST/WILLIS AVE, ROMAINE ST/VINE ST, 111TH ST"


## Project Title

In [42]:
#phrase extraction maybe: https://stackoverflow.com/questions/70995812/extract-keyword-from-sentences-in-a-pandas-text-column-using-nltk-and-or-regex

In [47]:
# ! pip install nltk
# ! pip install textblob
# ! pip install wordcloud

In [48]:
import nltk
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

In [49]:
import matplotlib.pyplot as plt
import matplotlib as mpl
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [50]:

import re

In [51]:
#subset df to work on title
title = proj_all>>select(_.implementing_agency, _.program_code_description, _.project_title, _.program_code)

In [52]:
title

Unnamed: 0,implementing_agency,program_code_description,project_title,program_code
0,Hawthorne,HIGHWAY SAFETY IMP PROG IIJA,120TH STREET FROM PRAIRIE AVENUE TO FELTON AVENUE. TRAFFIC SIGNAL UPGRADES AT NINE SIGNALIZED INTERSECTIONS AND IMPROVE CROSSINGS AND SIGNAGE.,YS30//YS30
1,Yucaipa,NATIONAL HIGHWAY PERF IIJA,"13TH STREET OVER WILSON CREEK FROM OAK GLEN ROAD TO KENTUCKY STREET, LWC 00L0017 REPLACE LOW WATER CROSSING WITH 2-LANE BRIDGE",Y001
2,Fairfield,HIGHWAY SAFETY IMP PROG IIJA,"16 LOCATIONS IN THE CITY OF FAIRFIELD INSTALL ADAPTIVE SIGNAL TIMING AND ADVANCED DILEMMA-ZONE DETECTION, AND ENHANCE THE VISIBILITY OF SIGNALS.",YS30//YS30
3,Lake County,STBG IIJA OFF-SYSTEM BRIDGE,2.2 MILES NORTH OF STATE ROUTE 20 ON WITTER SPRINGS ROAD OVER COOPER CREEK. BR.# 14C0119 BRIDGE REPLACEMENT (TC),Y233//Y233//Y233//Y233
4,Vacaville,HIGHWAY SAFETY IMP PROG IIJA,21 INTERSECTIONS; ON ALAMO DRIVE FROM MERCHANT STREET TO NUT TREE ROAD AND ON PEABODY ROAD FROM ELMIRA ROAD TO FOXBORO PARKWAY IMPROVE SIGNAL HARDWAR,YS30//YS30
...,...,...,...,...
325,Indio,NATIONAL HIGHWAY PERF IIJA,"WESTBOUND INDIO BOULEVARD OVER WHITEWATER RIVER, BR. NO. 56C-0292 SEISMIC RETROFIT AND SCOUR COUNTERMEASURES",Y001//Y001
326,Kingsburg,CONGESTION MITIGATION IIJA,WESTSIDE OF 18TH AVE FROM STROUD AVE TO KLEPPER ST CONSTRUCT NEW SIDEWALKS,Y400//Y400
327,Santa Ana,TRANS ALTERNATIVES >200K IIJA,"WILLITS STREET FROM FAIRVIEW STREET TO E/S OF RAITT STREET INSTALL MEDIAN, PARKING-PROTECTED BICYCLE LANES, AND DEDICATED BICYCLE SIGNAL HEADS (TC)",Y301//Y301
328,San Joaquin County,STBG IIJA OFF-SYSTEM BRIDGE,WIMER ROAD OVER INDIAN CREEK NORTH BRANCH (BRIDGE 29C0303) BRIDGE REPLACEMENT (TC),Y233//Y233


In [53]:
title.program_code_description.value_counts()

HIGHWAY SAFETY IMP PROG IIJA      75
STBG-URBANIZED >200K IIJA         61
NATIONAL HIGHWAY PERF IIJA        59
STBG IIJA OFF-SYSTEM BRIDGE       49
CONGESTION MITIGATION IIJA        37
SURFAC TRNSP BLK GRTS-FLX IIJA    18
TRANS ALTERNATIVES >200K IIJA     14
PROJ TO REDUCE PM 2.5 EMI IIJA    12
TRANSP ALTERNATIVES FLEX IIJA      4
TRANS ALTERN 50K-200K POP IIJA     1
Name: program_code_description, dtype: int64

In [54]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [55]:
text = title[['project_title']]

In [59]:
title_text = text.squeeze()

In [60]:
title_list = title_text.tolist() 


In [61]:
title_list = ' '.join(title_list).lower()

In [62]:
title_list = re.sub(r'[^\w\s]','',title_list)

In [63]:
swords = [re.sub(r"[^A-z\s]", "", sword) for sword in stopwords.words('english')]


In [64]:
clean_title_list = [word for word in word_tokenize(title_list.lower()) if word not in swords] 

In [65]:
len(set(clean_title_list))


1729

In [66]:
#turn list back into df

In [67]:
clean_title_list_df = pd.DataFrame(np.array(clean_title_list))


In [68]:
clean_title_list_df.value_counts()

bridge     178
road       142
avenue     115
ave         77
street      75
          ... 
jamacha      1
bellota      1
belmont      1
j            1
00           1
Length: 1729, dtype: int64

In [69]:
clean_title_list_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4910 entries, 0 to 4909
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       4910 non-null   object
dtypes: object(1)
memory usage: 38.5+ KB


In [70]:
#splitting up the types of words based on the most common words appearing in title name

fix_type = ['REPLACEMENT', 'INSTALL', 'CONSTRUCT', 'REPLACE', 'SIGNAL', 'TRAFFIC',
           'IMPROVEMENT', 'PEDESTRIAN', 'LANES', 'NEW', 'REHABILITATION',
           'UPGRADE', 'CLASS', 'BIKE', 'WIDEN', 'LANDSCAPING', 'SAFETY', 'RAISED', 
            'SEISMIC', 'SIGNAGE', 'RETROFIT', 'ADD', 'PLANNING', 'PAVE'
           'PREVENTIVE','MAINTENANCE', 'REHAB', 'RESURFACE', 'REPAIR', 'ROUNDABOUT']

area_type = ['BRIDGE', 'ROAD', 'RD', 'AVENUE', 'AVE', 'STREET' , 'ST',
             'FRACTURED', 'LANE', 'DRIVE', 'boulevard', 'BLVD',
             'INTERSECTION', 'intersections', 'WAY', 'DR', 'CURB', 'ROADWAY',
             'TRAIL', 'PATH', 'CREEK', 'RIVER', 'SIDEWALK', 'CORRIDOR', 'PARKWAY',
            'RAMPS', 'GUARDRAIL']

jurisdiction = ['CITY', 'COUNTY', 'STATE', 'UNINCORPORATED']

other = ['TC', 'EXISTING']

In [71]:
def tokenize(texts):
    return [nltk.tokenize.word_tokenize(t) for t in texts]

#### using more-intertools

In [72]:
#! pip install more-itertools

In [73]:
#from more_itertools import split_after

In [124]:
## code help: https://stackoverflow.com/questions/70995812/extract-keyword-from-sentences-in-a-pandas-text-column-using-nltk-and-or-regex
def key_word_intersection(df, text_col):
    summaries = []
    for x in tokenize(df[text_col].to_numpy()):
        keywords = np.concatenate([
                                np.intersect1d(x, ['BRIDGE REPLACEMENT', 'INSTALL', 'CONSTRUCT', 'REPLACE',
                                                   'SIGNAL', 'SIGNALS', 'TRAFFIC', 'IMPROVEMENT', 'PEDESTRIAN', 
                                                   'LANES', 'NEW', 'REHABILITATION','UPGRADE', 'CLASS',
                                                   'BIKE', 'WIDEN', 'LANDSCAPING', 'SAFETY', 'RAISED', 
                                                   'SEISMIC', 'SIGNAGE', 'RETROFIT', 'ADD', 'PLANNING', 'PAVE',
                                                   'PREVENTIVE','MAINTENANCE', 'REHAB', 'RESURFACE', 'REPAIR', 'ROUNDABOUT'
                                                  'COMPLETE STREET', 'VIDEO DETECTION EQUIPMENT', 'SYNCHRONIZE CORRIDOR', 'ROADWAY REALIGNMENTS']),
                                np.intersect1d(x, [
                                    # 'BRIDGE', 'ROAD', 'RD', 'AVENUE', 'AVE', 'STREET' , 'ST',
                                                   # 'FRACTURED', 'LANE', 'DRIVE', 'BOULEVARD', 'BLVD',
                                                   'INTERSECTION', 'INTERSECTIONS', 'SIDEWALK', 
                                    # 'WAY', 'DR', 'CURB', 'ROADWAY',
                                                   # 'TRAIL', 'PATH', 'CREEK', 'RIVER', 
                                    # 'CORRIDOR', 'CROSSING','PARKWAY','RAMPS', 'GUARDRAIL'
                                ]), 
                                np.intersect1d(x, ['CITY', 'COUNTY', 'STATE', 'UNINCORPORATED'])])
    
        summaries.append(np.array(x)[[i for i, keyword in enumerate(x) if keyword in keywords]])
    return summaries 

In [125]:
text['summary'] = key_word_intersection(text, 'project_title')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [135]:
text.sample(10)

Unnamed: 0,project_title,summary
153,"IN SAN DIEGO, JUST NORTH OF SAN YSIDRO LAND PORT OF ENTRY TO INGRID AVENUE AND SATURN BOULEVARD IN IMPERIAL BEACH. ALONG PALM AVENUE, 7TH STREET, ELM",[]
179,JENSEN AVENUE FROM DICKERSEN TO MADERA AVENUES SHOULDER IMPROVEMENTS,[]
89,EAST RIO BONITO RD AT SUTTER BUTTES CANAL 0.8 MI EAST OF SR99 - BR. #12C-0165 BRIDGE REPLACEMENT (TC),[]
52,"BRIDGE NO. 28C0331, BEAR CREEK RD, OVER SAN PABLO CREEK, 0.2 MI EAST OF CAMINO PABLO SEISMIC RETROFIT","[SEISMIC, RETROFIT]"
205,"NINE INTERSECTIONS WITHIN THE CITY OF REDDING. ENHANCED PEDESTRIAN CROSSINGS INCLUDING INSTALL RECTANGULAR RAPID FLASHING BEACONS AND TRAFFIC SIGNS,","[INTERSECTIONS, CITY, PEDESTRIAN, INSTALL, TRAFFIC]"
60,CITY OF MISSION VIEJO - THE PROJECT IS LOCATED ON THE WESTERN EDGE OF MISSION VIEJO. THE MISSION VIEJO N/S CORRIDOR TRAIL IS BOUNDED BY LOS ALISOS BL,[CITY]
279,SR-116 INTERSECTIONS: @ HURLBUT AVE; @ CLEVELAND AVE; @ N MAIN ST; @ WALLACE ST.BODEGA AVENUE INTERSECTIONS: @ FLORENCE AVE; @ ROBINSON RD THE CITY O,"[INTERSECTIONS, INTERSECTIONS, CITY]"
15,"ALONG GROVE AVENUE FROM PROSPECT AVENUE TO VALENTINE AVENUE, AND ALONG VALENTINE AVENUE FROM GROVE AVENUE TO NORTH AVENUE INSTALL AN ASPHALT CONCRETE",[INSTALL]
170,"INTERSECTIONS OF 65TH ST EXPWY AT 21ST AVE, 65TH ST EXPWY AT FRUITRIDGE RD, ARDEN WAY AT HERITAGE LN, ARDEN WAY AT CHALLENGE WAY, HOWE AVE. AT SWARTH",[INTERSECTIONS]
216,ON GARFIELD AVE. BETWEEN SOUTHERN AVE. AND HOWERY ST. IN THE CITY OF SOUTH GATE CONSTRUCT RAISED CENTER MEDIAN,"[CITY, CONSTRUCT, RAISED]"


In [139]:
text>>filter(_.project_title.str.contains('COMPLETE STREET'))

Unnamed: 0,project_title,summary
23,"ARDEN WAY FROM ETHAN WAY TO MORSE AVENUE. ARDEN WAY, FROM FULTON AVENUE TO MORSE AVENUE COMPLETE STREET IMPROVEMENTS INCLUDING SEPARATED SIDEWALK, C",[SIDEWALK]
29,"AVE R BETWEEN SIERRA HIGHWAY AND 25TH STREET. AVE R COMPLETE STREET WITH SIDEWALKS GAP CLOSURES, BIKE LANES, ADA RAMPS AND ENHANCED CROSSWALKS","[BIKE, LANES]"
99,EL CAMINO REAL (SR82) FROM ARROYO TO KAISER WAY COMPLETE STREET IMPROVEMENTS,[]
129,HEALDSBURG AVENUE BETWEEN POWELL AVENUE AND PASSALAQUA ROAD COMPLETE STREET IMPROVEMENTS,[]
290,THE TOWNSITE DRIVE COMPLETE STREETS PROJECT IS LOCATED IN THE CITY OF VISTA ALONG THE SEGMENT OF NORTH DRIVE FROM JUST NORTHWEST OF THE INTERSECTION,"[CITY, INTERSECTION]"


In [127]:

text>>filter(_.project_title.str.contains('BRIDGE REPLACEMENT'))

Unnamed: 0,project_title,summary
3,2.2 MILES NORTH OF STATE ROUTE 20 ON WITTER SPRINGS ROAD OVER COOPER CREEK. BR.# 14C0119 BRIDGE REPLACEMENT (TC),[STATE]
6,3/8 MI WEST OF PARAMOUNT BLVDWASHINGTON BLVD OVER RIO HONDO.LOCAL BRIDGE # 53C0471 BRIDGE REPLACEMENT (DEMOLISH THE EXISTING 6 LANNE SUPERSTRUCTURE,[]
10,ALAMITOS RD. BRIDGE @ ALAMITOS CK (37C0159) BRIDGE REPLACEMENT/SEISMIC RET (TC),[]
24,ARROYA AVENUE OVER WEST DELTA CANAL (BRIDGE 39C0275) BRIDGE REPLACEMENT (TC),[]
25,"ASH CREEK ROAD AT SACRAMENTO RIVER OVERFLOW, BRIDGE 06C0233 BRIDGE REPLACEMENT",[]
41,BON AIR BRIDGE (BON AIR RD OVER CORTE MADERA CREEK). BR.# 27C0028 BRIDGE REPLACEMENT,[]
53,BUCK AVENUE OVER ALAMO CREEKBR NO. 23C0011 BRIDGE REPLACEMENT,[]
54,CANYON RD @ ACID CANAL. BR. # 06C0307 BRIDGE REPLACEMENT,[]
55,CASSEL FALL RIVER ROAD BRIDGE AT PIT RIVER. BR. # 06C0039 BRIDGE REPLACEMENT,[]
68,"CR 40 OVER CACHE CREEK, 0.12 MILES SOUTH OF SR 16 BRIDGE REPLACEMENT. REPLACE EXISTING 1 LANE BRIDGE WITH A NEW 1 LANE BRIDGE. BR#22C0091 (TC)","[REPLACE, NEW]"


In [120]:
text>>filter(_.project_title.str.contains('WIDEN'))

Unnamed: 0,project_title,summary
81,"DOWNTOWN LOS ANGELES, BROADWAY BETWEEN 4TH AND 6TH STREETS PEDESTRIAN SAFETY IMPROVEMENTS INCLUDING CURB EXTENSIONS, WIDENED SIDEWALK, CROSSWALK AND","[PEDESTRIAN, SAFETY, SIDEWALK]"
104,"FIRST STREET BETWEEN FLOWER ST AND STANDARD AVE WIDEN EXISTING SIDEWALKS BY 3FT, NARROW VEHICLE LANES, AND CONSTRUCT ADA IMPROVEMENTS ON SIDEWALKS AN","[WIDEN, LANES, CONSTRUCT]"
156,"IN STOCKTON, PARALLEL TO MARCH LANE IN THE EAST BAY MUD CORRIDOR BETWEEN BROOKSIDE ROAD AND HILLSBORO WAY. RECONSTRUCT, WIDEN AND IMPROVE EXISTING PA",[WIDEN]
158,"IN THE CITY OF GALT ON KOST ROAD AT UNION PACIFIC RAILROAD CROSSING, WEST OF JOY DR., EAST OF MARIA WAY. WIDEN 400 FEET OF KOST RD. ON EACH SIDE OF T","[CITY, WIDEN]"
181,LA PAZ ROAD: MUIRLANDS BLVD. TO CRISANTA DR. WIDEN TWO OVERHEADS OVER BNSF,[WIDEN]
194,MILLERTON ROAD FROM FRIANT ROAD TO MARINA DRIVE WIDEN ROADWAY FROM 2LU TO 4LD,[WIDEN]
204,"NEES AVENUE FROM MINNEWAWA AVENUE TO CLOVIS AVENUE ROAD WIDENING AND RECONSTRUCTION, INSTALLATION OF CURB AND GUTTERS, SIDEWALK, BICYCLE LANES, MODIF","[SIDEWALK, LANES]"
220,ON OLD OREGON TRAIL BETWEEN PASO ROBLES AVENUE AND BEAR MOUNTAIN ROAD WIDEN SHOULDERS AND UPGRADE DRAINAGE (TC),"[WIDEN, UPGRADE]"
233,PARK ROAD FROM 250 FT SOUTH OF OAK ROAD TO BAYSHORE ROAD REHAB./RESURFACE ROADWAY AND WIDEN ROAD TO INSTALL ASPHALT CONCRETE PAVED CLASS II/V BICYC,"[WIDEN, INSTALL, CLASS]"
241,"POWER INN ROAD FROM ELSIE AVENUE TO ABOUT 400 FEET NORTH OF MACFADDEN DRIVE. INSTALL CURB, GUTTER, SIDEWALK INFILL AND CURB RAMPS; WIDEN SUBSTANDARD","[INSTALL, SIDEWALK, WIDEN]"


In [77]:
type(text['summary'])

pandas.core.series.Series