
# NLP of the `project_location` and `type_of_work` text columns


In [1]:
import pandas as pd
import numpy as np

In [2]:
# Notebook-wide display settings: never truncate the column list of a wide
# frame, and render every float with exactly two decimal places.
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', "{:.2f}".format)

In [3]:
# Source CSV lives in a Google Cloud Storage bucket (gs:// path).
obligated_csv_path = 'gs://calitp-analytics-data/data-analyses/dla/e-76Obligated/clean_waiting_obligated.csv'

# low_memory=False makes pandas read the file in one pass instead of in chunks.
df = pd.read_csv(obligated_csv_path, low_memory=False)




In [4]:
# Preview the first five rows to check that the file loaded with the expected columns.
df.head()

Unnamed: 0.1,Unnamed: 0,location,prefix,project_no,agency,prepared_date,submit_to_hq_date,hq_review_date,submit_to_fhwa_date,to_fmis_date,fed_requested,ac_requested,total_requested,status_comment,locode,dist,status,dist_processing_days,hq_processing_days,fhwa_processing_days,ftip_no,project_location,type_of_work,seq,date_request_initiated,date_completed_request,mpo,warning,ProjectID,ProjectNO,projectID,projectNO,compare_id_locode
0,0,Obligated,BPMP,5904(121),Humboldt County,2018-12-18,2018-12-18,2018-12-18,2018-12-18,2018-12-27,0.0,0.0,0.0,Authorized,5904,1,E-76 approved on,,0.0,9.0,HBPLOCAL,14 Bridges In Humboldt County,Bridge Preventive Maintenance - Deck Joints,3,,,NONMPO,,5904,121,5904,121,True
1,1,Obligated,ER,32D0(008),Mendocino County,2018-12-17,2018-12-19,2018-12-20,2018-12-20,2018-12-27,11508.0,0.0,13000.0,Authorized,5910,1,E-76 approved on,1.0,1.0,7.0,,"Comptche Ukiah Road, Cr 223 Pm 17.25",Permanent Restoration,3,2018-12-17,2018-12-18,NONMPO,,32D0,8,32D0,8,False
2,2,Obligated,ER,4820(004),Humboldt County,2018-12-07,2018-12-21,2018-12-21,2018-12-21,2018-12-27,45499.64,0.0,51394.58,Authorized,5904,1,E-76 approved on,14.0,0.0,6.0,,Mattole Rd Pm 43.17,Permanent Restoration,5,2018-12-06,2018-12-07,NONMPO,,4820,4,4820,4,False
3,3,Obligated,CML,5924(244),Sacramento County,2018-12-11,2018-12-11,2018-12-21,2018-12-27,2018-12-27,207002.0,0.0,247002.0,Authorized,5924,3,E-76 approved on,4.0,16.0,0.0,SAC25086,Fair Oaks Blvd. Between Howe Ave And Munroe St,Create A Smart Growth Corridor With Barrier Se...,1,2018-12-07,2018-12-07,SACOG,,5924,244,5924,244,True
4,4,Obligated,CML,5924(214),Sacramento County,2018-12-05,2018-12-11,2018-12-21,2018-12-27,2018-12-27,0.0,5680921.0,5702041.0,Authorized,5924,3,E-76 approved on,7.0,16.0,0.0,SAC24753,Florin Rd Between Power Inn Rd. And Florin Per...,Streetscape (tc),3,2018-11-28,2018-12-04,SACOG,,5924,214,5924,214,True


In [5]:
# Work on an independent copy holding only the agency name and the two
# free-text columns, so later edits cannot touch the full frame.
text_columns = ['agency', 'project_location', 'type_of_work']
df_text = df.loc[:, text_columns].copy()

In [7]:
# Eyeball five random rows. The fixed random_state makes the same rows come
# back on every Restart & Run All, so the displayed sample is reproducible.
df_text.sample(5, random_state=42)

Unnamed: 0,agency,project_location,type_of_work
11470,Lafayette,Mt. Diablo Blvd. Oak Hill -mt. View Dr.,Pedestrian Walkway Improvement
16458,San Diego County,"Lawson Valley Road Over Lawson Creek, 57c0374",Bridge Replacement (tc)
6659,Santa Barbara County,Jalama Road Bridge 51c0013 At Sr 1 In Santa Ba...,Bridge Rehabilitation
18217,Selma,On Thompson Ave From Dinuba Ave To Rose Ave,"Install Rectangular Rapid Flashing Beacons, Cr..."
12986,Kerman,California Ave : Vineyard To California; May A...,Road Reconstruction (tc)


In [8]:
# Check the dtype of each selected column before doing any text processing.
df_text.dtypes

agency              object
project_location    object
type_of_work        object
dtype: object

In [9]:
# Number of rows in the text subset.
len(df_text)

20445

## Installing text analysis tools

In [14]:
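# One-time setup: uncomment to install the text-analysis packages into the
# kernel's environment (%pip targets the running kernel, unlike !pip).
# %pip install nltk
# %pip install textblob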

In [15]:
import nltk
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

In [26]:
import re

In [16]:
# Fetch the NLTK resources used below: the English stopword corpus and the
# Punkt tokenizer models that word_tokenize relies on.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from 'punkt_tab' rather than the
# pickled 'punkt' package; download it too so word_tokenize does not raise
# LookupError on a fresh environment.
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [18]:
# One-column frame holding just the free-text work description.
df_type_of_work_only = df_text.loc[:, ['type_of_work']]

In [19]:
# Pull the column out as a Series. Selecting by name always yields a Series,
# whereas a bare .squeeze() would collapse a one-row frame to a scalar and
# break the .apply() call further down.
work_series = df_type_of_work_only['type_of_work']

In [23]:
# Display the Series as the cell's output value instead of printing it.
work_series

0              Bridge Preventive Maintenance - Deck Joints
1                                    Permanent Restoration
2                                    Permanent Restoration
3        Create A Smart Growth Corridor With Barrier Se...
4                                         Streetscape (tc)
                               ...                        
20440                                Bus Stops Improvement
20441            Electronic Locker Upgrade And Replacement
20442              Quick Builds And Tempo Lane Delineation
20443          Fhwa Transfer: Corridor Management Planning
20444                             Guardrail Var. Locations
Name: type_of_work, Length: 20445, dtype: object


In [27]:
# Matches every character that is not an ASCII letter or whitespace.
# The range must be spelled A-Za-z: "A-z" also covers the six characters
# between "Z" and "a" ([, \, ], ^, _ and the backtick), which would let
# that punctuation slip through the cleaning step.
_NON_LETTER_PATTERN = re.compile(r"[^A-Za-z\s]")

# Cleaned stopword sets keyed by language, filled on first use so the corpus
# is read once rather than once per row.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the English NLTK stopwords, stripped of non-letters, as a frozenset."""
    if 'english' not in _STOPWORD_CACHE:
        # Stopwords get the same stripping as the text (e.g. "don't" -> "dont")
        # so they still match tokens produced from the cleaned input.
        _STOPWORD_CACHE['english'] = frozenset(
            _NON_LETTER_PATTERN.sub("", sword) for sword in stopwords.words('english')
        )
    return _STOPWORD_CACHE['english']


def clean_string(text):
    """Tokenize a free-text description into lower-case, stopword-free words.

    Every character other than an ASCII letter or whitespace is deleted (not
    replaced by a space), the remainder is lower-cased and tokenized with
    ``word_tokenize``, and English stopwords are dropped.

    Parameters
    ----------
    text : str
        Raw description. Any non-string value (for example the float NaN
        that pandas uses for a missing cell) is treated as empty.

    Returns
    -------
    list of str
        The remaining tokens in their original order; ``[]`` when the input
        is empty, missing, or contains only stopwords.
    """
    # Missing cells arrive as float NaN; re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return []

    letters_only = _NON_LETTER_PATTERN.sub("", text)
    swords = _english_stopwords()

    return [word for word in word_tokenize(letters_only.lower()) if word not in swords]

In [28]:
# Run the cleaner over every description; each element becomes a list of tokens.
work_series_clean = work_series.map(clean_string)

In [29]:
# Display the tokenized Series as the cell's output value instead of printing it.
work_series_clean

0          [bridge, preventive, maintenance, deck, joints]
1                                 [permanent, restoration]
2                                 [permanent, restoration]
3        [create, smart, growth, corridor, barrier, sep...
4                                        [streetscape, tc]
                               ...                        
20440                            [bus, stops, improvement]
20441           [electronic, locker, upgrade, replacement]
20442            [quick, builds, tempo, lane, delineation]
20443     [fhwa, transfer, corridor, management, planning]
20444                          [guardrail, var, locations]
Name: type_of_work, Length: 20445, dtype: object


In [33]:
# TODO: next, collect a single flat list of all words from work_series_clean.
