In [1]:
# system tools
import warnings
import json
import sys
import string
import ast

# data cleaning + analysis tools
import pandas as pd
import datetime as dt
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns

#nltk tools
import spacy
import lda #Latent Dirichlet Allocation (create topics)
import gensim
from gensim import corpora, models #for constructing document term matrix
#from stop_words import get_stop_words
from gensim.models import Phrases
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.util import ngrams

#set notebook preferences
pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.width', 1000)
warnings.filterwarnings('ignore')

%pylab inline
pylab.rcParams['figure.figsize'] = (10, 6)

Populating the interactive namespace from numpy and matplotlib


### Import JSON file with city metadata - including which cities have sufficient Public Record Request (PRR) data for analysis

In [2]:
json_file = '../data/cities.json'

with open(json_file, 'r') as f:
    md = json.load(f)

###  Create dataframe of PRR data for all relevant cities

In [3]:
data_raw = pd.DataFrame(columns = ['city', 'month_year', 'Summary'])
city_list = []
for key, value in md.items():
    city = value['name']
    filepath = '/Users/alenastern/Google Drive File Stream/My Drive/Alena_Project/PR_Data/{}.csv'.format(city)
    if value["desc"] == "Y":
        try:
            df = pd.read_csv(filepath)
        except:
            try:
                df = pd.read_csv(filepath, encoding='mac_roman')
            except:
                continue
        print(key)
        name = key.split(' ')
        # remove lower for new version
        city_list.extend([x for x in name[:-1]])
    else:
        continue
    
    try:
        df['Create Date'] = pd.to_datetime(df['Create Date'])
    except:
        df['New'] = pd.to_datetime(df['Create Date'].apply(lambda x: re.findall('^\S*', x)[0]))
        df.drop(columns=['Create Date'], inplace = True)
        df.rename(index=str, columns={"New": "Create Date"}, inplace = True)

    df['month_year'] = df['Create Date'].dt.to_period('M')
    
    mc = df[['month_year', 'Summary']]
    mc['city'] = city
    
    data_raw = pd.concat([data_raw, mc])
    
print(city_list)

Arlington city
Asheville city
Bainbridge Island city
Boulder County
Cathedral City city
Clearwater city
Dayton city
Denton city
Everett city
Fort Collins city
Greensboro city
Hayward city
Kirkland city
Las Cruces city
Lynnwood city
Mercer Island city
Miami city
Middleborough town
New Orleans city
Oakland city
Oklahoma City city
Olympia city
Palo Alto city
Peoria city
Pullman city
Rancho Cucamonga city
Redmond city
Renton city
Sacramento city
San Francisco city
Tukwila city
Vallejo city
West Sacramento city
Winchester city
['Arlington', 'Asheville', 'Bainbridge', 'Island', 'Boulder', 'Cathedral', 'City', 'Clearwater', 'Dayton', 'Denton', 'Everett', 'Fort', 'Collins', 'Greensboro', 'Hayward', 'Kirkland', 'Las', 'Cruces', 'Lynnwood', 'Mercer', 'Island', 'Miami', 'Middleborough', 'New', 'Orleans', 'Oakland', 'Oklahoma', 'City', 'Olympia', 'Palo', 'Alto', 'Peoria', 'Pullman', 'Rancho', 'Cucamonga', 'Redmond', 'Renton', 'Sacramento', 'San', 'Francisco', 'Tukwila', 'Vallejo', 'West', 'Sacrame

#### We can see the raw data below. Our raw dataset includes 86,416 PRRs from 33 different cities

In [4]:
data_raw.to_csv('data_raw.csv')

In [5]:
data_raw = pd.read_csv('data_raw.csv')

In [6]:
data_raw.shape

(110138, 4)

In [7]:
len(data_raw.city.unique())

34

In [8]:
data_raw.index = pd.RangeIndex(len(data_raw.index))
data_raw.reset_index(inplace=True)

### Create dataframe for cleaning by removing null summaries

In [9]:
data = data_raw.dropna(subset=['Summary'])

In [10]:
data.head()

Unnamed: 0.1,index,Unnamed: 0,Summary,city,month_year
0,0,0,"We are working with an engineering firm on an upcoming project. They have asked us to gather maps for this project. Would you be able to assist me in gathering maps/records (as builds) for any underground water facilities you may have? Something just showing the route of the water lines would do.\n\n207th ST NE to 92nd Ave NE, Arlington, Cascade Surveying & Engineering \n\nI have attached the scope for your convenience. Please let me know if you have questions.",Arlington,2018-06
1,1,1,"Need copies of contracts and all related documents pertaining to Topcub Aircraft property located at 17922 59th DR NE Arlington WA 98223 between Arlington Airport, Topcub Aircraft, City of Arlington, HCI Steel Buildings and PUD.",Arlington,2018-06
2,2,2,"Copies of Building Permits of $5,000 valuation and up ($20,000 min for Re-Roofs), ($50,000 min. for Cell Tower upgrades), (Electrical, Mechanical & Plumbing at $100,000 min.) and (Solar Panels, Swimming Pools & Foundations at any valuation)",Arlington,2018-06
3,3,3,"police report filed to an officer against Wayne Parris (DOB 08-03-1957) from Brittany J. Parris. The paperwork I have has a case number D18-39 it is also stamped at the bottom with 18-1294, Iím not sure which number you will need. If there is any other information needed please let me know.",Arlington,2018-06
4,4,4,"Email Communications between Stephanie Shook, Dave Kraski, Bruce Stedman and Chad Schmidt in regards to Fire Protection District 21 billing and passage of contract for ALS Services. \n\nAlso any copies of Agenda Bills, D21 Contract and materials presented for review in Nov/Dec time frame in regards to the contract.",Arlington,2018-06


#### Function to convert nltk part of speech tags to wordnet tags (we use this to stem the words in data cleaning below):

In [11]:
def get_wordnet_pos(tag):

    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

## Clean PRR data to prepare for LDA analysis

In [12]:
# Replace common acronyms in Summary
data.Summary = data.Summary.str.replace('NOPD', 'police department')
data.Summary = data.Summary.str.replace('OPD' , 'police department')
data.Summary = data.Summary.str.replace('SFPD', 'police department')
data.Summary = data.Summary.str.replace('CPD', 'police department')
data.Summary = data.Summary.str.replace('APD', 'police department')
data.Summary = data.Summary.str.replace('GPD', 'police department')
data.Summary = data.Summary.str.replace('KPD', 'police department')
data.Summary = data.Summary.str.replace('TPD', 'police department')
data.Summary = data.Summary.str.replace('DPD', 'police department')
data.Summary = data.Summary.str.replace('EPD', 'police department')
data.Summary = data.Summary.str.replace('HPD', 'police department')
data.Summary = data.Summary.str.replace('LPD', 'police department')
data.Summary = data.Summary.str.replace('MDPD', 'police department')
data.Summary = data.Summary.str.replace('PPD', 'police department')
data.Summary = data.Summary.str.replace('SPD', 'police department')
data.Summary = data.Summary.str.replace('VPD', 'police department')
data.Summary = data.Summary.str.replace('CCPD', 'police department')
data.Summary = data.Summary.str.replace('FCPD', 'police department')
data.Summary = data.Summary.str.replace('TPD', 'police department')
data.Summary = data.Summary.str.replace('LCPD', 'police department')
data.Summary = data.Summary.str.replace('OKCPD', 'police department')
data.Summary = data.Summary.str.replace('PAPD', 'police department')
data.Summary = data.Summary.str.replace('RCPD', 'police department')
data.Summary = data.Summary.str.replace('WSPD', 'police department')

# PDRD = portable digital recording device (body cam) worn by police
data.Summary = data.Summary.str.replace('PDRD', 'police body camera')
data.Summary = data.Summary.str.replace('CPS', 'child protective services')


# Replace key numbers with strings
data.Summary = data.Summary.str.replace(' 911 ', ' nineoneone ')
data.Summary = data.Summary.str.replace(' 311 ', ' threeoneone ')
data.Summary = data.Summary.str.replace(' 9-11 ', ' nineoneone ')
data.Summary = data.Summary.str.replace(' 3-11 ', ' threeoneone ')

# Remove all punctuation
translator = str.maketrans('','', string.punctuation)
data.Summary = data.Summary.str.translate(translator)

# Remove all city names
for city in city_list:
    data.Summary = data.Summary.str.replace(city, '')

# Replace hyphen and slash with space
hyphen_translator = str.maketrans('-/','  ')
data.Summary = data.Summary.str.translate(hyphen_translator)

# Remove all state names
state_list = ['Washington', 'North Carolina', 'Carolina', 'Colorado', 'California',
             'Ohio', 'Texas', 'New Mexico', 'Florida', 'Massachusetts',
             'Louisiana', 'Oklahoma', 'Arizona', 'Rhode Island','Rhode', 'Virginia']

for state in state_list:
     data.Summary = data.Summary.str.replace(state, '')
            
# Remove digits
dig_translator = str.maketrans('','', string.digits)
data.Summary = data.Summary.str.translate(dig_translator)

#https://www.sfdph.org/dph/EH/HMUPA/HMUPAFormsMenu.asp - hazardous materials
#https://www.waterboards.ca.gov/ust/contacts/docs/lop_guide.pdf - water resources local oversight program
data.Summary = data.Summary.str.replace('LOP', 'water')
data.Summary = data.Summary.str.replace('HMUPA', 'hazardous materials')


## ID Proper Nouns ##

nlp = spacy.load('en_core_web_sm')
data['token_sp'] = data['Summary'].apply(lambda x: nlp(x))
data['pn'] = data['token_sp'].apply(lambda x: [ i.lemma_ for i in x if i.tag_ == 'NNP'])

# Turn to lowercase
data['sum_ed'] = data.Summary.str.lower()

#remove public record request phrases
data.sum_ed = data.sum_ed.str.replace('public record request', '')
data.sum_ed = data.sum_ed.str.replace('open record request', '')
data.sum_ed = data.sum_ed.str.replace('public records request', '')
data.sum_ed = data.sum_ed.str.replace('open records request', '')
data.sum_ed = data.sum_ed.str.replace('foia request', '')
data.sum_ed = data.sum_ed.str.replace('see attached', '')
data.sum_ed = data.sum_ed.str.replace('see attachment', '')
data.sum_ed = data.sum_ed.str.replace('to whom it may concern', '')
data.sum_ed = data.sum_ed.str.replace('public records act', '')
data.sum_ed = data.sum_ed.str.replace('electronic copy', '')
data.sum_ed = data.sum_ed.str.replace('electronic copies', '')
data.sum_ed = data.sum_ed.str.replace('freedom of information act', '')

data.sum_ed = data.sum_ed.str.replace('good afternoon', '')
data.sum_ed = data.sum_ed.str.replace('good morning', '')
data.sum_ed = data.sum_ed.str.replace('good day', '')

data.sum_ed = data.sum_ed.str.replace('final will', 'final final_will')


#remove empty strings, stopwords and stem
stop_words = set(stopwords.words('english'))
lmtzr = WordNetLemmatizer()
data['token'] = data['sum_ed'].apply(lambda x: nltk.word_tokenize(x))
data['lemma'] = data['token'].apply(lambda x: nltk.pos_tag(x))
data['mash'] = data['lemma'].apply(lambda x: [lmtzr.lemmatize(i[0], get_wordnet_pos(i[1])) for i in x if len(i[0]) > 0 and i[0] not in stop_words])

# Remove whitespace
wsp_translator = str.maketrans('','', string.whitespace)
data['mash'] = data['mash'].apply(lambda x: [i.translate(wsp_translator) for i in x])

# Remove empty lists
data['mash_len'] = data['mash'].apply(lambda x: len(x))
data = data[data['mash_len'] > 0]

# Create bigrams
data['bigrams'] = data['mash'].apply(lambda x: ["_".join(w) for w in ngrams(x, 2)])


In [13]:
data.head()

Unnamed: 0.1,index,Unnamed: 0,Summary,city,month_year,token_sp,pn,sum_ed,token,lemma,mash,mash_len,bigrams
0,0,0,We are working with an engineering firm on an upcoming project They have asked us to gather maps for this project Would you be able to assist me in gathering mapsrecords as builds for any underground water facilities you may have Something just showing the route of the water lines would do\n\nth ST NE to nd Ave NE Cascade Surveying Engineering \n\nI have attached the scope for your convenience Please let me know if you have questions,Arlington,2018-06,"(We, are, working, with, an, engineering, firm, on, an, upcoming, project, , They, have, asked, us, to, gather, maps, for, this, project, , Would, you, be, able, to, assist, me, in, gathering, mapsrecords, as, builds, for, any, underground, water, facilities, you, may, have, , Something, just, showing, the, route, of, the, water, lines, would, do, \n\n, th, ST, NE, to, nd, Ave, NE, , Cascade, Surveying, , Engineering, \n\n, I, have, attached, the, scope, for, your, convenience, , Please, let, me, know, if, you, have, questions)","[ave, ne, cascade, surveying, engineering]",we are working with an engineering firm on an upcoming project they have asked us to gather maps for this project would you be able to assist me in gathering mapsrecords as builds for any underground water facilities you may have something just showing the route of the water lines would do\n\nth st ne to nd ave ne cascade surveying engineering \n\ni have attached the scope for your convenience please let me know if you have questions,"[we, are, working, with, an, engineering, firm, on, an, upcoming, project, they, have, asked, us, to, gather, maps, for, this, project, would, you, be, able, to, assist, me, in, gathering, mapsrecords, as, builds, for, any, underground, water, facilities, you, may, have, something, just, showing, the, route, of, the, water, lines, would, do, th, st, ne, to, nd, ave, ne, cascade, surveying, engineering, i, have, attached, the, scope, for, your, convenience, please, let, me, know, if, you, have, questions]","[(we, PRP), (are, VBP), (working, VBG), (with, IN), (an, DT), (engineering, NN), (firm, NN), (on, IN), (an, DT), (upcoming, JJ), (project, NN), (they, PRP), (have, VBP), (asked, VBN), (us, PRP), (to, TO), (gather, VB), (maps, NNS), (for, IN), (this, DT), (project, NN), (would, MD), (you, PRP), (be, VB), (able, JJ), (to, TO), (assist, VB), (me, PRP), (in, IN), (gathering, VBG), (mapsrecords, NNS), (as, IN), (builds, NNS), (for, IN), (any, DT), (underground, JJ), (water, NN), (facilities, NNS), (you, PRP), (may, MD), (have, VB), (something, NN), (just, RB), (showing, VBG), (the, DT), (route, NN), (of, IN), (the, DT), (water, NN), (lines, NNS), (would, MD), (do, VB), (th, VB), (st, VB), (ne, JJ), (to, TO), (nd, VB), (ave, VB), (ne, JJ), (cascade, NN), (surveying, VBG), (engineering, NN), (i, NN), (have, VBP), (attached, VBN), (the, DT), (scope, NN), (for, IN), (your, PRP$), (convenience, NN), (please, NN), (let, VB), (me, PRP), (know, VB), (if, IN), (you, PRP), (have, VBP), (questions...","[work, engineering, firm, upcoming, project, ask, u, gather, map, project, would, able, assist, gather, mapsrecords, build, underground, water, facility, may, something, show, route, water, line, would, th, st, ne, nd, ave, ne, cascade, survey, engineering, attach, scope, convenience, please, let, know, question]",42,"[work_engineering, engineering_firm, firm_upcoming, upcoming_project, project_ask, ask_u, u_gather, gather_map, map_project, project_would, would_able, able_assist, assist_gather, gather_mapsrecords, mapsrecords_build, build_underground, underground_water, water_facility, facility_may, may_something, something_show, show_route, route_water, water_line, line_would, would_th, th_st, st_ne, ne_nd, nd_ave, ave_ne, ne_cascade, cascade_survey, survey_engineering, engineering_attach, attach_scope, scope_convenience, convenience_please, please_let, let_know, know_question]"
1,1,1,Need copies of contracts and all related documents pertaining to Topcub Aircraft property located at th DR NE WA between Airport Topcub Aircraft of HCI Steel Buildings and PUD,Arlington,2018-06,"(Need, copies, of, contracts, and, all, related, documents, pertaining, to, Topcub, Aircraft, property, located, at, , th, DR, NE, , WA, , between, , Airport, Topcub, Aircraft, , of, , HCI, Steel, Buildings, and, PUD)","[topcub, aircraft, dr, ne, wa, airport, topcub, aircraft, hci, steel, pud]",need copies of contracts and all related documents pertaining to topcub aircraft property located at th dr ne wa between airport topcub aircraft of hci steel buildings and pud,"[need, copies, of, contracts, and, all, related, documents, pertaining, to, topcub, aircraft, property, located, at, th, dr, ne, wa, between, airport, topcub, aircraft, of, hci, steel, buildings, and, pud]","[(need, NN), (copies, NNS), (of, IN), (contracts, NNS), (and, CC), (all, DT), (related, JJ), (documents, NNS), (pertaining, VBG), (to, TO), (topcub, VB), (aircraft, NN), (property, NN), (located, VBN), (at, IN), (th, NN), (dr, NN), (ne, JJ), (wa, NN), (between, IN), (airport, NN), (topcub, NN), (aircraft, NN), (of, IN), (hci, NN), (steel, NN), (buildings, NNS), (and, CC), (pud, NN)]","[need, copy, contract, related, document, pertain, topcub, aircraft, property, locate, th, dr, ne, wa, airport, topcub, aircraft, hci, steel, building, pud]",21,"[need_copy, copy_contract, contract_related, related_document, document_pertain, pertain_topcub, topcub_aircraft, aircraft_property, property_locate, locate_th, th_dr, dr_ne, ne_wa, wa_airport, airport_topcub, topcub_aircraft, aircraft_hci, hci_steel, steel_building, building_pud]"
2,2,2,Copies of Building Permits of valuation and up min for ReRoofs min for Cell Tower upgrades Electrical Mechanical Plumbing at min and Solar Panels Swimming Pools Foundations at any valuation,Arlington,2018-06,"(Copies, of, Building, Permits, of, , valuation, and, up, , min, for, ReRoofs, , min, for, Cell, Tower, upgrades, Electrical, Mechanical, , Plumbing, at, , min, and, Solar, Panels, Swimming, Pools, , Foundations, at, any, valuation)","[building, reroofs, cell, tower, electrical, mechanical, plumbing, solar, swimming, pools]",copies of building permits of valuation and up min for reroofs min for cell tower upgrades electrical mechanical plumbing at min and solar panels swimming pools foundations at any valuation,"[copies, of, building, permits, of, valuation, and, up, min, for, reroofs, min, for, cell, tower, upgrades, electrical, mechanical, plumbing, at, min, and, solar, panels, swimming, pools, foundations, at, any, valuation]","[(copies, NNS), (of, IN), (building, VBG), (permits, NNS), (of, IN), (valuation, NN), (and, CC), (up, RB), (min, NN), (for, IN), (reroofs, NN), (min, NN), (for, IN), (cell, NN), (tower, NN), (upgrades, JJ), (electrical, JJ), (mechanical, JJ), (plumbing, NN), (at, IN), (min, NN), (and, CC), (solar, JJ), (panels, NNS), (swimming, VBG), (pools, JJ), (foundations, NNS), (at, IN), (any, DT), (valuation, NN)]","[copy, build, permit, valuation, min, reroofs, min, cell, tower, upgrades, electrical, mechanical, plumbing, min, solar, panel, swim, pools, foundation, valuation]",20,"[copy_build, build_permit, permit_valuation, valuation_min, min_reroofs, reroofs_min, min_cell, cell_tower, tower_upgrades, upgrades_electrical, electrical_mechanical, mechanical_plumbing, plumbing_min, min_solar, solar_panel, panel_swim, swim_pools, pools_foundation, foundation_valuation]"
3,3,3,police report filed to an officer against Wayne Parris DOB from Brittany J Parris The paperwork I have has a case number D it is also stamped at the bottom with Iím not sure which number you will need If there is any other information needed please let me know,Arlington,2018-06,"(police, report, filed, to, an, officer, against, Wayne, Parris, DOB, , from, Brittany, J, Parris, The, paperwork, I, have, has, a, case, number, D, it, is, also, stamped, at, the, bottom, with, , Iím, not, sure, which, number, you, will, need, If, there, is, any, other, information, needed, please, let, me, know)","[wayne, parris, dob, brittany, j, parris]",police report filed to an officer against wayne parris dob from brittany j parris the paperwork i have has a case number d it is also stamped at the bottom with iím not sure which number you will need if there is any other information needed please let me know,"[police, report, filed, to, an, officer, against, wayne, parris, dob, from, brittany, j, parris, the, paperwork, i, have, has, a, case, number, d, it, is, also, stamped, at, the, bottom, with, iím, not, sure, which, number, you, will, need, if, there, is, any, other, information, needed, please, let, me, know]","[(police, NNS), (report, NN), (filed, VBD), (to, TO), (an, DT), (officer, NN), (against, IN), (wayne, JJ), (parris, JJ), (dob, NN), (from, IN), (brittany, JJ), (j, NN), (parris, VBD), (the, DT), (paperwork, NN), (i, NN), (have, VBP), (has, VBZ), (a, DT), (case, NN), (number, NN), (d, NN), (it, PRP), (is, VBZ), (also, RB), (stamped, VBN), (at, IN), (the, DT), (bottom, NN), (with, IN), (iím, JJ), (not, RB), (sure, JJ), (which, WDT), (number, NN), (you, PRP), (will, MD), (need, VB), (if, IN), (there, EX), (is, VBZ), (any, DT), (other, JJ), (information, NN), (needed, VBN), (please, NN), (let, VB), (me, PRP), (know, VB)]","[police, report, file, officer, wayne, parris, dob, brittany, j, parris, paperwork, case, number, also, stamp, bottom, iím, sure, number, need, information, need, please, let, know]",25,"[police_report, report_file, file_officer, officer_wayne, wayne_parris, parris_dob, dob_brittany, brittany_j, j_parris, parris_paperwork, paperwork_case, case_number, number_also, also_stamp, stamp_bottom, bottom_iím, iím_sure, sure_number, number_need, need_information, information_need, need_please, please_let, let_know]"
4,4,4,Email Communications between Stephanie Shook Dave Kraski Bruce Stedman and Chad Schmidt in regards to Fire Protection District billing and passage of contract for ALS Services \n\nAlso any copies of Agenda Bills D Contract and materials presented for review in NovDec time frame in regards to the contract,Arlington,2018-06,"(Email, Communications, between, Stephanie, Shook, Dave, Kraski, Bruce, Stedman, and, Chad, Schmidt, in, regards, to, Fire, Protection, District, , billing, and, passage, of, contract, for, ALS, Services, \n\n, Also, any, copies, of, Agenda, Bills, D, Contract, and, materials, presented, for, review, in, NovDec, time, frame, in, regards, to, the, contract)","[communications, stephanie, dave, kraski, bruce, stedman, chad, schmidt, fire, protection, district, als, agenda, bills, d, contract, novdec]",email communications between stephanie shook dave kraski bruce stedman and chad schmidt in regards to fire protection district billing and passage of contract for als services \n\nalso any copies of agenda bills d contract and materials presented for review in novdec time frame in regards to the contract,"[email, communications, between, stephanie, shook, dave, kraski, bruce, stedman, and, chad, schmidt, in, regards, to, fire, protection, district, billing, and, passage, of, contract, for, als, services, also, any, copies, of, agenda, bills, d, contract, and, materials, presented, for, review, in, novdec, time, frame, in, regards, to, the, contract]","[(email, NN), (communications, NNS), (between, IN), (stephanie, JJ), (shook, NN), (dave, VBP), (kraski, VBN), (bruce, NN), (stedman, NN), (and, CC), (chad, VBD), (schmidt, VBN), (in, IN), (regards, NNS), (to, TO), (fire, VB), (protection, NN), (district, NN), (billing, NN), (and, CC), (passage, NN), (of, IN), (contract, NN), (for, IN), (als, NNS), (services, NNS), (also, RB), (any, DT), (copies, NNS), (of, IN), (agenda, NN), (bills, NNS), (d, VBP), (contract, NN), (and, CC), (materials, NNS), (presented, VBN), (for, IN), (review, NN), (in, IN), (novdec, JJ), (time, NN), (frame, NN), (in, IN), (regards, NNS), (to, TO), (the, DT), (contract, NN)]","[email, communication, stephanie, shook, dave, kraski, bruce, stedman, chad, schmidt, regard, fire, protection, district, billing, passage, contract, al, service, also, copy, agenda, bill, contract, material, present, review, novdec, time, frame, regard, contract]",32,"[email_communication, communication_stephanie, stephanie_shook, shook_dave, dave_kraski, kraski_bruce, bruce_stedman, stedman_chad, chad_schmidt, schmidt_regard, regard_fire, fire_protection, protection_district, district_billing, billing_passage, passage_contract, contract_al, al_service, service_also, also_copy, copy_agenda, agenda_bill, bill_contract, contract_material, material_present, present_review, review_novdec, novdec_time, time_frame, frame_regard, regard_contract]"


### Identify and remove commonly used words in PRRs

In [14]:
word_list = [y for x in list(data['mash']) for y in x]
counts = Counter(word_list)
Counter(word_list).most_common(50)

[('report', 38853),
 ('request', 31053),
 ('record', 27126),
 ('please', 21961),
 ('copy', 20054),
 ('police', 16578),
 ('date', 14011),
 ('property', 12990),
 ('street', 12119),
 ('information', 11886),
 ('number', 11519),
 ('th', 11096),
 ('provide', 10801),
 ('driver', 10752),
 ('would', 10605),
 ('include', 10482),
 ('case', 10033),
 ('document', 9670),
 ('permit', 8986),
 ('accident', 8929),
 ('thank', 8913),
 ('wa', 8841),
 ('email', 8467),
 ('address', 8350),
 ('st', 8346),
 ('department', 8247),
 ('incident', 8221),
 ('like', 8038),
 ('location', 7942),
 ('regard', 7862),
 ('state', 7753),
 ('cw', 6825),
 ('relate', 6725),
 ('ave', 6622),
 ('insurance', 6594),
 ('building', 6551),
 ('call', 6464),
 ('code', 6437),
 ('ne', 6420),
 ('public', 6363),
 ('following', 6331),
 ('file', 6028),
 ('county', 5969),
 ('need', 5915),
 ('plan', 5814),
 ('type', 5766),
 ('name', 5659),
 ('company', 5621),
 ('fire', 5585),
 ('violation', 5494)]

In [15]:
common_list = ['report', 'request', 'record', 'city', 'please', 'copy', 'date', 'information', 'would', 'regard', 'public',
              'include', 'document', 'provide', 'like', 'thank', 'need', 'know', 'thanks', 'pursuant', 'dear', 'file',
              'relate', 'from', 'either', 'hello', 'hi', 'foia', 'requestors', 'requestor', 'receive', 'available', 
               'make', 'attach', 'pertain', 'might', 'see', 'near']

In [16]:
# remove general words that are common to public record requests

#for word in common_list:
data['mash'] = data['mash'].apply(lambda x: [i for i in x if i not in common_list])

In [17]:
# remove number suffixes
suffix_list = ['th', 'nd', 'st', 'rd', 'blvd', 'pkwy']
#for word in suffix_list:
data['mash'] = data['mash'].apply(lambda x: [i for i in x if i not in suffix_list])

In [18]:
# remove city and state abbreviations
abbv_list = ['wa', 'nc', 'co', 'ca', 'oh', 'tx', 'nm', 'fl', 'ma', 'la', 'ok', 'az', 'ri', 'va', 
             'francisco', 'sf', 'okc', 'lv', 'nola', 'slc', 'cw']
#for word in suffix_list:
data['mash'] = data['mash'].apply(lambda x: [i for i in x if i not in abbv_list])

In [19]:
# remove spelled numbers
num_list = ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten']
data['mash'] = data['mash'].apply(lambda x: [i for i in x if i not in  num_list])

In [20]:
# replace 'inc' with 'incident
data['mash'] = data['mash'].apply(lambda x: ['incident' if i=='inc' else i for i in x])

In [21]:
data.count_total.describe()

AttributeError: 'DataFrame' object has no attribute 'count_total'

In [None]:
common_bigrams = ['police_report', 'insurance_company', 'location_loss', 'date_occurrence', 'reportcase_number',
                  'insure_driver', 'auto_accident', 'occurrence_location', 'transactionreference_insurance', 'number_date', 'type_auto',
                  'accident_reportcase', 'code_violation', 'copy_police', 'incident_report', 'police_department', 'certificate_occupancy',
                  'accident_report', 'property_locate', 'storage_tank','driver_note', 'building_permit', 'driver_driver','case_number', 
                  'hazardous_material', 'collision_report', 'state_farm', 'site_plan', 'fire_department', 'ftp_report', 'auto_theft',
                  'fire_code', 'request_police', 'farm_claim', 'claim_compass', 'site_assessment', 'compass_report', 'environmental_site', 
                  'tax_sale', 'loss_cross','city_council', 'code_enforcement', 'subject_property', 'report_case', 'phase_environmental', 
                  'report_incident', 'date_loss', 'police_case', 'witness_statement', 'driving_record', 'break_in', 'birth_certificate', 
                  'death_certificate', 'background_check', 'public_works', 'lease_agreement', 'medical_record', 'billing_record', 
                  'record_check', 'records_check', 'marriage_certificate', 'marriage_record', 'park_ticket', 'miss_person',
                 'marriage_license', 'reckless_driving', 'arrest_report', 'medical_billing', 'medical_report', 'criminal_record',
                 'floor_plan', 'site_plan', 'building_plan', 'building_code', 'code_enforcement', 'personnel_file']
 

In [None]:
bigram_list = [y for x in list(data['bigrams']) for y in x]
counts = Counter(bigram_list)
Counter(bigram_list).most_common(100)

In [None]:
data['common_bigrams'] = data['bigrams'].apply(lambda x: [i for i in x if i in common_bigrams])

In [None]:
noise = ['dr', 'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sept', 'sep', 'oct', 'nov', 'dec', 
        'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 
        'december', 'ne', 'nw', 'se', 'sw', 'ct', 'dr', 'way', 'dv', 'ave', 'aka', 'get', 'look', 'im', 'want', 
        'find', 'could', 'go', 'take', 'e', 'n', 's', 'w', '“', '’', '”', '•', 'northeast', 'northwest', 'southeast', 
        'southwest', 'north', 'south', 'east', 'west', 'orleans', '–', 'a', 'b', 'c', 'd', 'f', 'g', 'h', 'i', 'j', 'k',
        'l', 'm', 'o', 'p', 'q', 'r', 't', 'u', 'v', 'x', 'y', 'z', 'am', 'pm', 'hr', 'mr', 'ms', 'mrs', 'johnson', 
        'jr', 'kent', 'christopher', 'miller', 'joe', 'willows', 'david', 'michael', 'john', 'red', 'robert',
        'ask', 'able', 'let', 'question', 'also', 'snohomish', '¬ß', 'per', 'available', 'test', '√Ø']



In [None]:
data['mash'] = data['mash'].apply(lambda x: [i for i in x if i not in noise])

In [None]:
data['final_mash'] = data['mash'] + data['common_bigrams']

In [None]:
# create column with the length of mash for each PRR

data['mash_len'] = data['final_mash'].apply(len)

In [None]:
# remove entries of length 0

data = data[data['mash_len'] > 0]

In [None]:
data['mash_len'].describe()

In [None]:
data_gp = data.groupby('city').mean()
data_gp['mash_len']

#### We can see a couple of examples of the cleaned mash and the original request:

In [None]:
data.shape

In [None]:
data['Summary'][data.index == 164]

In [None]:
data['final_mash'][data.index == 164]

In [None]:
data['final_mash'][data.index == 60000]

In [None]:
data["Summary"][60000]

In [None]:
data['final_mash'][10000]

In [None]:
data.to_csv('data.csv', index=False)