In [67]:
# system tools
import warnings
import json
import sys
import string
import ast

# data cleaning + analysis tools
import pandas as pd
import datetime as dt
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns

#nlp tools
import lda #Latent Dirichlet Allocation (create topics)
import gensim
import spacy
from gensim import corpora, models #for constructing document term matrix
#from stop_words import get_stop_words
from gensim.models import Phrases
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.util import ngrams

#set notebook preferences
pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.width', 1000)
warnings.filterwarnings('ignore')

%pylab inline
pylab.rcParams['figure.figsize'] = (10, 6)

Populating the interactive namespace from numpy and matplotlib


##  Import data frame from cleaning/preprocessing

In [68]:
data = pd.read_csv('data.csv')
data['final_mash'] = data['final_mash'].apply(lambda x:  ast.literal_eval(x))
data['mash'] = data['mash'].apply(lambda x:  ast.literal_eval(x))
data['common_bigrams'] = data['common_bigrams'].apply(lambda x:  ast.literal_eval(x))
data['token'] = data['token'].apply(lambda x:  ast.literal_eval(x))
data['lemma'] = data['lemma'].apply(lambda x:  ast.literal_eval(x))


In [107]:
data.head()

Unnamed: 0,index,Summary,city,month_year,sum_ed,token,lemma,mash,mash_len,bigrams,count_total,common_bigrams,final_mash
0,0,"We are working with an engineering firm on an upcoming project. They have asked us to gather maps for this project. Would you be able to assist me in gathering maps/records (as builds) for any underground water facilities you may have? Something just showing the route of the water lines would do.\n\n207th ST NE to 92nd Ave NE, Arlington, Cascade Surveying & Engineering \n\nI have attached the scope for your convenience. Please let me know if you have questions.",Arlington,2018-06,we are working with an engineering firm on an upcoming project they have asked us to gather maps for this project would you be able to assist me in gathering maps records as builds for any underground water facilities you may have something just showing the route of the water lines would do\n\nth st ne to nd ave ne cascade surveying engineering \n\ni have attached the scope for your convenience please let me know if you have questions,"[we, are, working, with, an, engineering, firm, on, an, upcoming, project, they, have, asked, us, to, gather, maps, for, this, project, would, you, be, able, to, assist, me, in, gathering, maps, records, as, builds, for, any, underground, water, facilities, you, may, have, something, just, showing, the, route, of, the, water, lines, would, do, th, st, ne, to, nd, ave, ne, cascade, surveying, engineering, i, have, attached, the, scope, for, your, convenience, please, let, me, know, if, you, have, questions]","[(we, PRP), (are, VBP), (working, VBG), (with, IN), (an, DT), (engineering, NN), (firm, NN), (on, IN), (an, DT), (upcoming, JJ), (project, NN), (they, PRP), (have, VBP), (asked, VBN), (us, PRP), (to, TO), (gather, VB), (maps, NNS), (for, IN), (this, DT), (project, NN), (would, MD), (you, PRP), (be, VB), (able, JJ), (to, TO), (assist, VB), (me, PRP), (in, IN), (gathering, VBG), (maps, NNS), (records, NNS), (as, IN), (builds, NNS), (for, IN), (any, DT), (underground, JJ), (water, NN), (facilities, NNS), (you, PRP), (may, MD), (have, VB), (something, NN), (just, RB), (showing, VBG), (the, DT), (route, NN), (of, IN), (the, DT), (water, NN), (lines, NNS), (would, MD), (do, VB), (th, VB), (st, VB), (ne, JJ), (to, TO), (nd, VB), (ave, VB), (ne, JJ), (cascade, NN), (surveying, VBG), (engineering, NN), (i, NN), (have, VBP), (attached, VBN), (the, DT), (scope, NN), (for, IN), (your, PRP$), (convenience, NN), (please, NN), (let, VB), (me, PRP), (know, VB), (if, IN), (you, PRP), (have, VBP), (...","[work, engineering, firm, upcoming, project, gather, map, project, assist, gather, map, build, underground, water, facility, something, show, route, water, line, cascade, survey, engineering, scope, convenience]",25,"['work_engineering', 'engineering_firm', 'firm_upcoming', 'upcoming_project', 'project_ask', 'ask_u', 'u_gather', 'gather_map', 'map_project', 'project_would', 'would_able', 'able_assist', 'assist_gather', 'gather_map', 'map_record', 'record_build', 'build_underground', 'underground_water', 'water_facility', 'facility_may', 'may_something', 'something_show', 'show_route', 'route_water', 'water_line', 'line_would', 'would_th', 'th_st', 'st_ne', 'ne_nd', 'nd_ave', 'ave_ne', 'ne_cascade', 'cascade_survey', 'survey_engineering', 'engineering_attach', 'attach_scope', 'scope_convenience', 'convenience_please', 'please_let', 'let_know', 'know_question']",61815,[],"[work, engineering, firm, upcoming, project, gather, map, project, assist, gather, map, build, underground, water, facility, something, show, route, water, line, cascade, survey, engineering, scope, convenience]"
1,1,"Need copies of contracts and all related documents pertaining to Topcub Aircraft property located at 17922 59th DR NE Arlington WA 98223 between Arlington Airport, Topcub Aircraft, City of Arlington, HCI Steel Buildings and PUD.",Arlington,2018-06,need copies of contracts and all related documents pertaining to topcub aircraft property located at th dr ne wa between airport topcub aircraft of hci steel buildings and pud,"[need, copies, of, contracts, and, all, related, documents, pertaining, to, topcub, aircraft, property, located, at, th, dr, ne, wa, between, airport, topcub, aircraft, of, hci, steel, buildings, and, pud]","[(need, NN), (copies, NNS), (of, IN), (contracts, NNS), (and, CC), (all, DT), (related, JJ), (documents, NNS), (pertaining, VBG), (to, TO), (topcub, VB), (aircraft, NN), (property, NN), (located, VBN), (at, IN), (th, NN), (dr, NN), (ne, JJ), (wa, NN), (between, IN), (airport, NN), (topcub, NN), (aircraft, NN), (of, IN), (hci, NN), (steel, NN), (buildings, NNS), (and, CC), (pud, NN)]","[contract, related, pertain, topcub, aircraft, property, locate, airport, topcub, aircraft, hci, steel, building, pud]",15,"['need_copy', 'copy_contract', 'contract_related', 'related_document', 'document_pertain', 'pertain_topcub', 'topcub_aircraft', 'aircraft_property', 'property_locate', 'locate_th', 'th_dr', 'dr_ne', 'ne_wa', 'wa_airport', 'airport_topcub', 'topcub_aircraft', 'aircraft_hci', 'hci_steel', 'steel_building', 'building_pud']",37214,['property_locate'],"[contract, related, pertain, topcub, aircraft, property, locate, airport, topcub, aircraft, hci, steel, building, pud, property_locate]"
2,2,"Copies of Building Permits of $5,000 valuation and up ($20,000 min for Re-Roofs), ($50,000 min. for Cell Tower upgrades), (Electrical, Mechanical & Plumbing at $100,000 min.) and (Solar Panels, Swimming Pools & Foundations at any valuation)",Arlington,2018-06,copies of building permits of valuation and up min for re roofs min for cell tower upgrades electrical mechanical plumbing at min and solar panels swimming pools foundations at any valuation,"[copies, of, building, permits, of, valuation, and, up, min, for, re, roofs, min, for, cell, tower, upgrades, electrical, mechanical, plumbing, at, min, and, solar, panels, swimming, pools, foundations, at, any, valuation]","[(copies, NNS), (of, IN), (building, VBG), (permits, NNS), (of, IN), (valuation, NN), (and, CC), (up, RB), (min, NN), (for, IN), (re, NN), (roofs, NNS), (min, VBP), (for, IN), (cell, NN), (tower, NN), (upgrades, JJ), (electrical, JJ), (mechanical, JJ), (plumbing, NN), (at, IN), (min, NN), (and, CC), (solar, JJ), (panels, NNS), (swimming, VBG), (pools, JJ), (foundations, NNS), (at, IN), (any, DT), (valuation, NN)]","[build, permit, valuation, min, roof, min, cell, tower, upgrades, electrical, mechanical, plumbing, min, solar, panel, swim, pools, foundation, valuation]",19,"['copy_build', 'build_permit', 'permit_valuation', 'valuation_min', 'min_roof', 'roof_min', 'min_cell', 'cell_tower', 'tower_upgrades', 'upgrades_electrical', 'electrical_mechanical', 'mechanical_plumbing', 'plumbing_min', 'min_solar', 'solar_panel', 'panel_swim', 'swim_pools', 'pools_foundation', 'foundation_valuation']",13672,[],"[build, permit, valuation, min, roof, min, cell, tower, upgrades, electrical, mechanical, plumbing, min, solar, panel, swim, pools, foundation, valuation]"
3,3,"police report filed to an officer against Wayne Parris (DOB 08-03-1957) from Brittany J. Parris. The paperwork I have has a case number D18-39 it is also stamped at the bottom with 18-1294, Iím not sure which number you will need. If there is any other information needed please let me know.",Arlington,2018-06,police report filed to an officer against wayne parris dob from brittany j parris the paperwork i have has a case number d it is also stamped at the bottom with iím not sure which number you will need if there is any other information needed please let me know,"[police, report, filed, to, an, officer, against, wayne, parris, dob, from, brittany, j, parris, the, paperwork, i, have, has, a, case, number, d, it, is, also, stamped, at, the, bottom, with, iím, not, sure, which, number, you, will, need, if, there, is, any, other, information, needed, please, let, me, know]","[(police, NNS), (report, NN), (filed, VBD), (to, TO), (an, DT), (officer, NN), (against, IN), (wayne, JJ), (parris, JJ), (dob, NN), (from, IN), (brittany, JJ), (j, NN), (parris, VBD), (the, DT), (paperwork, NN), (i, NN), (have, VBP), (has, VBZ), (a, DT), (case, NN), (number, NN), (d, NN), (it, PRP), (is, VBZ), (also, RB), (stamped, VBN), (at, IN), (the, DT), (bottom, NN), (with, IN), (iím, JJ), (not, RB), (sure, JJ), (which, WDT), (number, NN), (you, PRP), (will, MD), (need, VB), (if, IN), (there, EX), (is, VBZ), (any, DT), (other, JJ), (information, NN), (needed, VBN), (please, NN), (let, VB), (me, PRP), (know, VB)]","[police, officer, wayne, parris, dob, brittany, parris, paperwork, case, number, stamp, bottom, iím, sure, number]",17,"['police_report', 'report_file', 'file_officer', 'officer_wayne', 'wayne_parris', 'parris_dob', 'dob_brittany', 'brittany_j', 'j_parris', 'parris_paperwork', 'paperwork_case', 'case_number', 'number_also', 'also_stamp', 'stamp_bottom', 'bottom_iím', 'iím_sure', 'sure_number', 'number_need', 'need_information', 'information_need', 'need_please', 'please_let', 'let_know']",70002,"['police_report', 'case_number']","[police, officer, wayne, parris, dob, brittany, parris, paperwork, case, number, stamp, bottom, iím, sure, number, police_report, case_number]"
4,4,"Email Communications between Stephanie Shook, Dave Kraski, Bruce Stedman and Chad Schmidt in regards to Fire Protection District 21 billing and passage of contract for ALS Services. \n\nAlso any copies of Agenda Bills, D21 Contract and materials presented for review in Nov/Dec time frame in regards to the contract.",Arlington,2018-06,email communications between stephanie shook dave kraski bruce stedman and chad schmidt in regards to fire protection district billing and passage of contract for als services \n\nalso any copies of agenda bills d contract and materials presented for review in nov dec time frame in regards to the contract,"[email, communications, between, stephanie, shook, dave, kraski, bruce, stedman, and, chad, schmidt, in, regards, to, fire, protection, district, billing, and, passage, of, contract, for, als, services, also, any, copies, of, agenda, bills, d, contract, and, materials, presented, for, review, in, nov, dec, time, frame, in, regards, to, the, contract]","[(email, NN), (communications, NNS), (between, IN), (stephanie, JJ), (shook, NN), (dave, VBP), (kraski, VBN), (bruce, NN), (stedman, NN), (and, CC), (chad, VBD), (schmidt, VBN), (in, IN), (regards, NNS), (to, TO), (fire, VB), (protection, NN), (district, NN), (billing, NN), (and, CC), (passage, NN), (of, IN), (contract, NN), (for, IN), (als, NNS), (services, NNS), (also, RB), (any, DT), (copies, NNS), (of, IN), (agenda, NN), (bills, NNS), (d, VBP), (contract, NN), (and, CC), (materials, NNS), (presented, VBN), (for, IN), (review, NN), (in, IN), (nov, JJ), (dec, NN), (time, NN), (frame, NN), (in, IN), (regards, NNS), (to, TO), (the, DT), (contract, NN)]","[email, communication, stephanie, shook, dave, kraski, bruce, stedman, chad, schmidt, fire, protection, district, billing, passage, contract, al, service, agenda, bill, contract, material, present, review, time, frame, contract]",27,"['email_communication', 'communication_stephanie', 'stephanie_shook', 'shook_dave', 'dave_kraski', 'kraski_bruce', 'bruce_stedman', 'stedman_chad', 'chad_schmidt', 'schmidt_regard', 'regard_fire', 'fire_protection', 'protection_district', 'district_billing', 'billing_passage', 'passage_contract', 'contract_al', 'al_service', 'service_also', 'also_copy', 'copy_agenda', 'agenda_bill', 'bill_contract', 'contract_material', 'material_present', 'present_review', 'review_nov', 'nov_dec', 'dec_time', 'time_frame', 'frame_regard', 'regard_contract']",47312,[],"[email, communication, stephanie, shook, dave, kraski, bruce, stedman, chad, schmidt, fire, protection, district, billing, passage, contract, al, service, agenda, bill, contract, material, present, review, time, frame, contract]"


In [69]:
data.mash_len.describe()

count    93744.000000
mean        15.715096
std         27.687456
min          1.000000
25%          3.000000
50%          7.000000
75%         18.000000
max       2729.000000
Name: mash_len, dtype: float64

## 1) Remove words at or below certain count

### count = 1

In [70]:
word_list = [y for x in list(data['mash']) for y in x]
counts = Counter(word_list)

In [71]:
count_df = pd.Series(counts, name = 'count') 
count_df.index.name = 'word'
count_df = count_df.reset_index()
count_df['count'].describe()

count    54183.000000
mean        25.364838
std        248.658395
min          1.000000
25%          1.000000
50%          1.000000
75%          4.000000
max      15231.000000
Name: count, dtype: float64

In [113]:
data1 = data.copy()
data1['mash'] = data1['mash'].apply(lambda x: [i for i in x if counts[i] > 10])
data1['final_mash'] = data1['mash'] + data1['common_bigrams']

In [114]:
data1['mash_len'] = data1['final_mash'].apply(len)
data1 = data1[data1['mash_len'] != 0]

In [115]:
data1.mash_len.describe()

count    92563.000000
mean        14.872476
std         26.004775
min          1.000000
25%          3.000000
50%          7.000000
75%         17.000000
max       2411.000000
Name: mash_len, dtype: float64

### count = 10

In [110]:
data10 = data.copy()
data10['mash'] = data10['mash'].apply(lambda x: [i for i in x if counts[i] > 10])
data10['final_mash'] = data10['mash'] + data10['common_bigrams']

In [112]:
data10 = data10[data10['mash_len'] != 0]
data10['mash_len'] = data10['final_mash'].apply(len)
data10.mash_len.describe()

count    93744.000000
mean        14.685111
std         25.893634
min          0.000000
25%          3.000000
50%          7.000000
75%         17.000000
max       2411.000000
Name: mash_len, dtype: float64

## 2) Remove proper names

### 2a) Using NLTK

In [80]:
data_pn_nltk = data.copy()

In [81]:
data_pn_nltk.head()

Unnamed: 0,index,Summary,city,month_year,sum_ed,token,lemma,mash,mash_len,bigrams,count_total,common_bigrams,final_mash
0,0,"We are working with an engineering firm on an upcoming project. They have asked us to gather maps for this project. Would you be able to assist me in gathering maps/records (as builds) for any underground water facilities you may have? Something just showing the route of the water lines would do.\n\n207th ST NE to 92nd Ave NE, Arlington, Cascade Surveying & Engineering \n\nI have attached the scope for your convenience. Please let me know if you have questions.",Arlington,2018-06,we are working with an engineering firm on an upcoming project they have asked us to gather maps for this project would you be able to assist me in gathering maps records as builds for any underground water facilities you may have something just showing the route of the water lines would do\n\nth st ne to nd ave ne cascade surveying engineering \n\ni have attached the scope for your convenience please let me know if you have questions,"[we, are, working, with, an, engineering, firm, on, an, upcoming, project, they, have, asked, us, to, gather, maps, for, this, project, would, you, be, able, to, assist, me, in, gathering, maps, records, as, builds, for, any, underground, water, facilities, you, may, have, something, just, showing, the, route, of, the, water, lines, would, do, th, st, ne, to, nd, ave, ne, cascade, surveying, engineering, i, have, attached, the, scope, for, your, convenience, please, let, me, know, if, you, have, questions]","[(we, PRP), (are, VBP), (working, VBG), (with, IN), (an, DT), (engineering, NN), (firm, NN), (on, IN), (an, DT), (upcoming, JJ), (project, NN), (they, PRP), (have, VBP), (asked, VBN), (us, PRP), (to, TO), (gather, VB), (maps, NNS), (for, IN), (this, DT), (project, NN), (would, MD), (you, PRP), (be, VB), (able, JJ), (to, TO), (assist, VB), (me, PRP), (in, IN), (gathering, VBG), (maps, NNS), (records, NNS), (as, IN), (builds, NNS), (for, IN), (any, DT), (underground, JJ), (water, NN), (facilities, NNS), (you, PRP), (may, MD), (have, VB), (something, NN), (just, RB), (showing, VBG), (the, DT), (route, NN), (of, IN), (the, DT), (water, NN), (lines, NNS), (would, MD), (do, VB), (th, VB), (st, VB), (ne, JJ), (to, TO), (nd, VB), (ave, VB), (ne, JJ), (cascade, NN), (surveying, VBG), (engineering, NN), (i, NN), (have, VBP), (attached, VBN), (the, DT), (scope, NN), (for, IN), (your, PRP$), (convenience, NN), (please, NN), (let, VB), (me, PRP), (know, VB), (if, IN), (you, PRP), (have, VBP), (...","[work, engineering, firm, upcoming, project, gather, map, project, assist, gather, map, build, underground, water, facility, something, show, route, water, line, cascade, survey, engineering, scope, convenience]",25,"['work_engineering', 'engineering_firm', 'firm_upcoming', 'upcoming_project', 'project_ask', 'ask_u', 'u_gather', 'gather_map', 'map_project', 'project_would', 'would_able', 'able_assist', 'assist_gather', 'gather_map', 'map_record', 'record_build', 'build_underground', 'underground_water', 'water_facility', 'facility_may', 'may_something', 'something_show', 'show_route', 'route_water', 'water_line', 'line_would', 'would_th', 'th_st', 'st_ne', 'ne_nd', 'nd_ave', 'ave_ne', 'ne_cascade', 'cascade_survey', 'survey_engineering', 'engineering_attach', 'attach_scope', 'scope_convenience', 'convenience_please', 'please_let', 'let_know', 'know_question']",61815,[],"[work, engineering, firm, upcoming, project, gather, map, project, assist, gather, map, build, underground, water, facility, something, show, route, water, line, cascade, survey, engineering, scope, convenience]"
1,1,"Need copies of contracts and all related documents pertaining to Topcub Aircraft property located at 17922 59th DR NE Arlington WA 98223 between Arlington Airport, Topcub Aircraft, City of Arlington, HCI Steel Buildings and PUD.",Arlington,2018-06,need copies of contracts and all related documents pertaining to topcub aircraft property located at th dr ne wa between airport topcub aircraft of hci steel buildings and pud,"[need, copies, of, contracts, and, all, related, documents, pertaining, to, topcub, aircraft, property, located, at, th, dr, ne, wa, between, airport, topcub, aircraft, of, hci, steel, buildings, and, pud]","[(need, NN), (copies, NNS), (of, IN), (contracts, NNS), (and, CC), (all, DT), (related, JJ), (documents, NNS), (pertaining, VBG), (to, TO), (topcub, VB), (aircraft, NN), (property, NN), (located, VBN), (at, IN), (th, NN), (dr, NN), (ne, JJ), (wa, NN), (between, IN), (airport, NN), (topcub, NN), (aircraft, NN), (of, IN), (hci, NN), (steel, NN), (buildings, NNS), (and, CC), (pud, NN)]","[contract, related, pertain, topcub, aircraft, property, locate, airport, topcub, aircraft, hci, steel, building, pud]",15,"['need_copy', 'copy_contract', 'contract_related', 'related_document', 'document_pertain', 'pertain_topcub', 'topcub_aircraft', 'aircraft_property', 'property_locate', 'locate_th', 'th_dr', 'dr_ne', 'ne_wa', 'wa_airport', 'airport_topcub', 'topcub_aircraft', 'aircraft_hci', 'hci_steel', 'steel_building', 'building_pud']",37214,['property_locate'],"[contract, related, pertain, topcub, aircraft, property, locate, airport, topcub, aircraft, hci, steel, building, pud, property_locate]"
2,2,"Copies of Building Permits of $5,000 valuation and up ($20,000 min for Re-Roofs), ($50,000 min. for Cell Tower upgrades), (Electrical, Mechanical & Plumbing at $100,000 min.) and (Solar Panels, Swimming Pools & Foundations at any valuation)",Arlington,2018-06,copies of building permits of valuation and up min for re roofs min for cell tower upgrades electrical mechanical plumbing at min and solar panels swimming pools foundations at any valuation,"[copies, of, building, permits, of, valuation, and, up, min, for, re, roofs, min, for, cell, tower, upgrades, electrical, mechanical, plumbing, at, min, and, solar, panels, swimming, pools, foundations, at, any, valuation]","[(copies, NNS), (of, IN), (building, VBG), (permits, NNS), (of, IN), (valuation, NN), (and, CC), (up, RB), (min, NN), (for, IN), (re, NN), (roofs, NNS), (min, VBP), (for, IN), (cell, NN), (tower, NN), (upgrades, JJ), (electrical, JJ), (mechanical, JJ), (plumbing, NN), (at, IN), (min, NN), (and, CC), (solar, JJ), (panels, NNS), (swimming, VBG), (pools, JJ), (foundations, NNS), (at, IN), (any, DT), (valuation, NN)]","[build, permit, valuation, min, roof, min, cell, tower, upgrades, electrical, mechanical, plumbing, min, solar, panel, swim, pools, foundation, valuation]",19,"['copy_build', 'build_permit', 'permit_valuation', 'valuation_min', 'min_roof', 'roof_min', 'min_cell', 'cell_tower', 'tower_upgrades', 'upgrades_electrical', 'electrical_mechanical', 'mechanical_plumbing', 'plumbing_min', 'min_solar', 'solar_panel', 'panel_swim', 'swim_pools', 'pools_foundation', 'foundation_valuation']",13672,[],"[build, permit, valuation, min, roof, min, cell, tower, upgrades, electrical, mechanical, plumbing, min, solar, panel, swim, pools, foundation, valuation]"
3,3,"police report filed to an officer against Wayne Parris (DOB 08-03-1957) from Brittany J. Parris. The paperwork I have has a case number D18-39 it is also stamped at the bottom with 18-1294, Iím not sure which number you will need. If there is any other information needed please let me know.",Arlington,2018-06,police report filed to an officer against wayne parris dob from brittany j parris the paperwork i have has a case number d it is also stamped at the bottom with iím not sure which number you will need if there is any other information needed please let me know,"[police, report, filed, to, an, officer, against, wayne, parris, dob, from, brittany, j, parris, the, paperwork, i, have, has, a, case, number, d, it, is, also, stamped, at, the, bottom, with, iím, not, sure, which, number, you, will, need, if, there, is, any, other, information, needed, please, let, me, know]","[(police, NNS), (report, NN), (filed, VBD), (to, TO), (an, DT), (officer, NN), (against, IN), (wayne, JJ), (parris, JJ), (dob, NN), (from, IN), (brittany, JJ), (j, NN), (parris, VBD), (the, DT), (paperwork, NN), (i, NN), (have, VBP), (has, VBZ), (a, DT), (case, NN), (number, NN), (d, NN), (it, PRP), (is, VBZ), (also, RB), (stamped, VBN), (at, IN), (the, DT), (bottom, NN), (with, IN), (iím, JJ), (not, RB), (sure, JJ), (which, WDT), (number, NN), (you, PRP), (will, MD), (need, VB), (if, IN), (there, EX), (is, VBZ), (any, DT), (other, JJ), (information, NN), (needed, VBN), (please, NN), (let, VB), (me, PRP), (know, VB)]","[police, officer, wayne, parris, dob, brittany, parris, paperwork, case, number, stamp, bottom, iím, sure, number]",17,"['police_report', 'report_file', 'file_officer', 'officer_wayne', 'wayne_parris', 'parris_dob', 'dob_brittany', 'brittany_j', 'j_parris', 'parris_paperwork', 'paperwork_case', 'case_number', 'number_also', 'also_stamp', 'stamp_bottom', 'bottom_iím', 'iím_sure', 'sure_number', 'number_need', 'need_information', 'information_need', 'need_please', 'please_let', 'let_know']",70002,"['police_report', 'case_number']","[police, officer, wayne, parris, dob, brittany, parris, paperwork, case, number, stamp, bottom, iím, sure, number, police_report, case_number]"
4,4,"Email Communications between Stephanie Shook, Dave Kraski, Bruce Stedman and Chad Schmidt in regards to Fire Protection District 21 billing and passage of contract for ALS Services. \n\nAlso any copies of Agenda Bills, D21 Contract and materials presented for review in Nov/Dec time frame in regards to the contract.",Arlington,2018-06,email communications between stephanie shook dave kraski bruce stedman and chad schmidt in regards to fire protection district billing and passage of contract for als services \n\nalso any copies of agenda bills d contract and materials presented for review in nov dec time frame in regards to the contract,"[email, communications, between, stephanie, shook, dave, kraski, bruce, stedman, and, chad, schmidt, in, regards, to, fire, protection, district, billing, and, passage, of, contract, for, als, services, also, any, copies, of, agenda, bills, d, contract, and, materials, presented, for, review, in, nov, dec, time, frame, in, regards, to, the, contract]","[(email, NN), (communications, NNS), (between, IN), (stephanie, JJ), (shook, NN), (dave, VBP), (kraski, VBN), (bruce, NN), (stedman, NN), (and, CC), (chad, VBD), (schmidt, VBN), (in, IN), (regards, NNS), (to, TO), (fire, VB), (protection, NN), (district, NN), (billing, NN), (and, CC), (passage, NN), (of, IN), (contract, NN), (for, IN), (als, NNS), (services, NNS), (also, RB), (any, DT), (copies, NNS), (of, IN), (agenda, NN), (bills, NNS), (d, VBP), (contract, NN), (and, CC), (materials, NNS), (presented, VBN), (for, IN), (review, NN), (in, IN), (nov, JJ), (dec, NN), (time, NN), (frame, NN), (in, IN), (regards, NNS), (to, TO), (the, DT), (contract, NN)]","[email, communication, stephanie, shook, dave, kraski, bruce, stedman, chad, schmidt, fire, protection, district, billing, passage, contract, al, service, agenda, bill, contract, material, present, review, time, frame, contract]",27,"['email_communication', 'communication_stephanie', 'stephanie_shook', 'shook_dave', 'dave_kraski', 'kraski_bruce', 'bruce_stedman', 'stedman_chad', 'chad_schmidt', 'schmidt_regard', 'regard_fire', 'fire_protection', 'protection_district', 'district_billing', 'billing_passage', 'passage_contract', 'contract_al', 'al_service', 'service_also', 'also_copy', 'copy_agenda', 'agenda_bill', 'bill_contract', 'contract_material', 'material_present', 'present_review', 'review_nov', 'nov_dec', 'dec_time', 'time_frame', 'frame_regard', 'regard_contract']",47312,[],"[email, communication, stephanie, shook, dave, kraski, bruce, stedman, chad, schmidt, fire, protection, district, billing, passage, contract, al, service, agenda, bill, contract, material, present, review, time, frame, contract]"


In [82]:
# two methods of identifying proper nouns, first tends to type II error (pn), second tends to type I error (pn2)

data_pn_nltk['pn'] = data_pn_nltk['lemma'].apply(lambda x: [i[0] for i in x if i[1] == 'NNP'])
data_pn_nltk['lemma2'] = data_pn_nltk['Summary'].apply(lambda x: nltk.tag.pos_tag(x.split()))
data_pn_nltk['pn2'] = data_pn_nltk['lemma2'].apply(lambda x: [i[0] for i in x if i[1] == 'NNP'])
data_pn_nltk['pn2'] = data_pn_nltk['pn2'].apply(lambda x: [i.lower() for i in x])
data_pn_nltk['final_mash2'] = data_pn_nltk.apply(lambda row: [i for i in row['final_mash'] if i not in row['pn2']], axis =1)
data_pn_nltk['final_mash'] = data_pn_nltk.apply(lambda row: [i for i in row['final_mash'] if i not in row['pn']], axis =1)
data_pn_nltk['mash_len'] = data_pn_nltk['final_mash'].apply(len)
data_pn_nltk['mash_len2'] = data_pn_nltk['final_mash2'].apply(len)

data_pn_nltk2 = data_pn_nltk[data_pn_nltk['mash_len2'] > 0]
data_pn_nltk = data_pn_nltk[data_pn_nltk['mash_len'] > 0]

In [83]:
data_pn_nltk2['mash_len2'].describe()

count    87278.000000
mean        12.408946
std         21.410852
min          1.000000
25%          2.000000
50%          6.000000
75%         14.000000
max       1643.000000
Name: mash_len2, dtype: float64

In [84]:
data_pn_nltk['mash_len'].describe()

count    93743.000000
mean        15.679624
std         27.586053
min          1.000000
25%          3.000000
50%          7.000000
75%         18.000000
max       2724.000000
Name: mash_len, dtype: float64

### 2b) Using spaCy

In [85]:
nlp = spacy.load('en_core_web_sm')
data_sp = data.copy()

In [86]:
data_sp['token_sp'] = data_sp['sum_ed'].apply(lambda x: nlp(x))
data_sp['token_sp2'] = data_sp['Summary'].apply(lambda x: nlp(x))
data_sp['pn'] = data_sp['token_sp'].apply(lambda x: [ i.lemma_ for i in x if i.tag_ == 'NNP'])
data_sp['pn2'] = data_sp['token_sp2'].apply(lambda x: [ i.lemma_ for i in x if i.tag_ == 'NNP'])
data_sp['final_mash2'] = data_sp.apply(lambda row: [i for i in row['final_mash'] if i not in row['pn2']], axis =1)
data_sp['final_mash'] = data_sp.apply(lambda row: [i for i in row['final_mash'] if i not in row['pn']], axis =1)


In [87]:
data_sp['mash_len'] = data_sp['final_mash'].apply(len)
data_sp['mash_len2'] = data_sp['final_mash2'].apply(len)
data_sp2 = data_sp[data_sp['mash_len2'] > 0]
data_sp = data_sp[data_sp['mash_len'] > 0]    

In [37]:
data_sp2['mash_len2'].describe()

count    90748.000000
mean        11.367303
std         20.714069
min          1.000000
25%          2.000000
50%          5.000000
75%         13.000000
max       1674.000000
Name: mash_len2, dtype: float64

In [38]:
data_sp['mash_len'].describe()

count    95292.000000
mean        15.747723
std         27.907075
min          1.000000
25%          3.000000
50%          7.000000
75%         18.000000
max       2746.000000
Name: mash_len, dtype: float64

In [32]:
data_sp.head()

Unnamed: 0,index,Summary,city,month_year,sum_ed,token,lemma,mash,mash_len,bigrams,common_bigrams,final_mash,token_sp,token_sp2,pn,pn2,final_mash2
0,0,"We are working with an engineering firm on an upcoming project. They have asked us to gather maps for this project. Would you be able to assist me in gathering maps/records (as builds) for any underground water facilities you may have? Something just showing the route of the water lines would do.\n\n207th ST NE to 92nd Ave NE, Arlington, Cascade Surveying & Engineering \n\nI have attached the scope for your convenience. Please let me know if you have questions.",Arlington,2018-06,we are working with an engineering firm on an upcoming project they have asked us to gather maps for this project would you be able to assist me in gathering maps records as builds for any underground water facilities you may have something just showing the route of the water lines would do\n\nth st ne to nd ave ne cascade surveying engineering \n\ni have attached the scope for your convenience please let me know if you have questions,"[we, are, working, with, an, engineering, firm, on, an, upcoming, project, they, have, asked, us, to, gather, maps, for, this, project, would, you, be, able, to, assist, me, in, gathering, maps, records, as, builds, for, any, underground, water, facilities, you, may, have, something, just, showing, the, route, of, the, water, lines, would, do, th, st, ne, to, nd, ave, ne, cascade, surveying, engineering, i, have, attached, the, scope, for, your, convenience, please, let, me, know, if, you, have, questions]","[('we', 'PRP'), ('are', 'VBP'), ('working', 'VBG'), ('with', 'IN'), ('an', 'DT'), ('engineering', 'NN'), ('firm', 'NN'), ('on', 'IN'), ('an', 'DT'), ('upcoming', 'JJ'), ('project', 'NN'), ('they', 'PRP'), ('have', 'VBP'), ('asked', 'VBN'), ('us', 'PRP'), ('to', 'TO'), ('gather', 'VB'), ('maps', 'NNS'), ('for', 'IN'), ('this', 'DT'), ('project', 'NN'), ('would', 'MD'), ('you', 'PRP'), ('be', 'VB'), ('able', 'JJ'), ('to', 'TO'), ('assist', 'VB'), ('me', 'PRP'), ('in', 'IN'), ('gathering', 'VBG'), ('maps', 'NNS'), ('records', 'NNS'), ('as', 'IN'), ('builds', 'NNS'), ('for', 'IN'), ('any', 'DT'), ('underground', 'JJ'), ('water', 'NN'), ('facilities', 'NNS'), ('you', 'PRP'), ('may', 'MD'), ('have', 'VB'), ('something', 'NN'), ('just', 'RB'), ('showing', 'VBG'), ('the', 'DT'), ('route', 'NN'), ('of', 'IN'), ('the', 'DT'), ('water', 'NN'), ('lines', 'NNS'), ('would', 'MD'), ('do', 'VB'), ('th', 'VB'), ('st', 'VB'), ('ne', 'JJ'), ('to', 'TO'), ('nd', 'VB'), ('ave', 'VB'), ('ne', 'JJ'), ('c...","[work, engineering, firm, upcoming, project, gather, map, project, assist, gather, map, build, underground, water, facility, something, show, route, water, line, cascade, survey, engineering, attach, scope, convenience]",26,"['work_engineering', 'engineering_firm', 'firm_upcoming', 'upcoming_project', 'project_ask', 'ask_u', 'u_gather', 'gather_map', 'map_project', 'project_would', 'would_able', 'able_assist', 'assist_gather', 'gather_map', 'map_record', 'record_build', 'build_underground', 'underground_water', 'water_facility', 'facility_may', 'may_something', 'something_show', 'show_route', 'route_water', 'water_line', 'line_would', 'would_th', 'th_st', 'st_ne', 'ne_nd', 'nd_ave', 'ave_ne', 'ne_cascade', 'cascade_survey', 'survey_engineering', 'engineering_attach', 'attach_scope', 'scope_convenience', 'convenience_please', 'please_let', 'let_know', 'know_question']",[],"[work, engineering, firm, upcoming, project, gather, map, project, assist, gather, map, build, underground, water, facility, something, show, route, water, line, cascade, survey, engineering, attach, scope, convenience]","(we, are, working, with, an, engineering, firm, on, an, upcoming, project, , they, have, asked, us, to, gather, maps, for, this, project, , would, you, be, able, to, assist, me, in, gathering, maps, records, as, builds, for, any, underground, water, facilities, you, may, have, , something, just, showing, the, route, of, the, water, lines, would, do, \n\n, th, st, ne, to, nd, ave, ne, , cascade, surveying, , engineering, \n\n, i, have, attached, the, scope, for, your, convenience, , please, let, me, know, if, you, have, questions)","(We, are, working, with, an, engineering, firm, on, an, upcoming, project, ., , They, have, asked, us, to, gather, maps, for, this, project, ., , Would, you, be, able, to, assist, me, in, gathering, maps, /, records, (, as, builds, ), for, any, underground, water, facilities, you, may, have, ?, , Something, just, showing, the, route, of, the, water, lines, would, do, ., \n\n, 207th, ST, NE, to, 92nd, Ave, NE, ,, Arlington, ,, Cascade, Surveying, &, Engineering, \n\n, I, have, attached, the, scope, for, your, convenience, ., , Please, let, me, know, if, you, have, questions, .)",[],"[ne, ave, ne, arlington, cascade, surveying, engineering]","[work, firm, upcoming, project, gather, map, project, assist, gather, map, build, underground, water, facility, something, show, route, water, line, survey, attach, scope, convenience]"
1,1,"Need copies of contracts and all related documents pertaining to Topcub Aircraft property located at 17922 59th DR NE Arlington WA 98223 between Arlington Airport, Topcub Aircraft, City of Arlington, HCI Steel Buildings and PUD.",Arlington,2018-06,need copies of contracts and all related documents pertaining to topcub aircraft property located at th dr ne wa between airport topcub aircraft of hci steel buildings and pud,"[need, copies, of, contracts, and, all, related, documents, pertaining, to, topcub, aircraft, property, located, at, th, dr, ne, wa, between, airport, topcub, aircraft, of, hci, steel, buildings, and, pud]","[('need', 'NN'), ('copies', 'NNS'), ('of', 'IN'), ('contracts', 'NNS'), ('and', 'CC'), ('all', 'DT'), ('related', 'JJ'), ('documents', 'NNS'), ('pertaining', 'VBG'), ('to', 'TO'), ('topcub', 'VB'), ('aircraft', 'NN'), ('property', 'NN'), ('located', 'VBN'), ('at', 'IN'), ('th', 'NN'), ('dr', 'NN'), ('ne', 'JJ'), ('wa', 'NN'), ('between', 'IN'), ('airport', 'NN'), ('topcub', 'NN'), ('aircraft', 'NN'), ('of', 'IN'), ('hci', 'NN'), ('steel', 'NN'), ('buildings', 'NNS'), ('and', 'CC'), ('pud', 'NN')]","[contract, related, pertain, topcub, aircraft, property, locate, airport, topcub, aircraft, hci, steel, building, pud]",15,"['need_copy', 'copy_contract', 'contract_related', 'related_document', 'document_pertain', 'pertain_topcub', 'topcub_aircraft', 'aircraft_property', 'property_locate', 'locate_th', 'th_dr', 'dr_ne', 'ne_wa', 'wa_airport', 'airport_topcub', 'topcub_aircraft', 'aircraft_hci', 'hci_steel', 'steel_building', 'building_pud']",['property_locate'],"[contract, related, pertain, topcub, aircraft, property, locate, airport, topcub, aircraft, hci, steel, building, pud, property_locate]","(need, copies, of, contracts, and, all, related, documents, pertaining, to, topcub, aircraft, property, located, at, , th, dr, ne, , wa, , between, , airport, topcub, aircraft, , of, , hci, steel, buildings, and, pud)","(Need, copies, of, contracts, and, all, related, documents, pertaining, to, Topcub, Aircraft, property, located, at, 17922, 59th, DR, NE, Arlington, WA, 98223, between, Arlington, Airport, ,, Topcub, Aircraft, ,, City, of, Arlington, ,, HCI, Steel, Buildings, and, PUD, .)",[],"[topcub, aircraft, dr, ne, arlington, wa, arlington, airport, topcub, aircraft, city, arlington, hci, steel, pud]","[contract, related, pertain, property, locate, building, property_locate]"
2,2,"Copies of Building Permits of $5,000 valuation and up ($20,000 min for Re-Roofs), ($50,000 min. for Cell Tower upgrades), (Electrical, Mechanical & Plumbing at $100,000 min.) and (Solar Panels, Swimming Pools & Foundations at any valuation)",Arlington,2018-06,copies of building permits of valuation and up min for re roofs min for cell tower upgrades electrical mechanical plumbing at min and solar panels swimming pools foundations at any valuation,"[copies, of, building, permits, of, valuation, and, up, min, for, re, roofs, min, for, cell, tower, upgrades, electrical, mechanical, plumbing, at, min, and, solar, panels, swimming, pools, foundations, at, any, valuation]","[('copies', 'NNS'), ('of', 'IN'), ('building', 'VBG'), ('permits', 'NNS'), ('of', 'IN'), ('valuation', 'NN'), ('and', 'CC'), ('up', 'RB'), ('min', 'NN'), ('for', 'IN'), ('re', 'NN'), ('roofs', 'NNS'), ('min', 'VBP'), ('for', 'IN'), ('cell', 'NN'), ('tower', 'NN'), ('upgrades', 'JJ'), ('electrical', 'JJ'), ('mechanical', 'JJ'), ('plumbing', 'NN'), ('at', 'IN'), ('min', 'NN'), ('and', 'CC'), ('solar', 'JJ'), ('panels', 'NNS'), ('swimming', 'VBG'), ('pools', 'JJ'), ('foundations', 'NNS'), ('at', 'IN'), ('any', 'DT'), ('valuation', 'NN')]","[build, permit, valuation, min, roof, min, cell, tower, upgrades, electrical, mechanical, plumbing, min, solar, panel, swim, pools, foundation, valuation]",19,"['copy_build', 'build_permit', 'permit_valuation', 'valuation_min', 'min_roof', 'roof_min', 'min_cell', 'cell_tower', 'tower_upgrades', 'upgrades_electrical', 'electrical_mechanical', 'mechanical_plumbing', 'plumbing_min', 'min_solar', 'solar_panel', 'panel_swim', 'swim_pools', 'pools_foundation', 'foundation_valuation']",[],"[build, permit, valuation, min, roof, min, cell, tower, upgrades, electrical, mechanical, plumbing, min, solar, panel, swim, pools, foundation, valuation]","(copies, of, building, permits, of, , valuation, and, up, , min, for, re, roofs, , min, for, cell, tower, upgrades, electrical, mechanical, , plumbing, at, , min, and, solar, panels, swimming, pools, , foundations, at, any, valuation)","(Copies, of, Building, Permits, of, $, 5,000, valuation, and, up, (, $, 20,000, min, for, Re, -, Roofs, ), ,, (, $, 50,000, min, ., for, Cell, Tower, upgrades, ), ,, (, Electrical, ,, Mechanical, &, Plumbing, at, $, 100,000, min, ., ), and, (, Solar, Panels, ,, Swimming, Pools, &, Foundations, at, any, valuation, ))",[],"[building, re, roofs, cell, tower, mechanical, plumbing, solar, swimming, pools]","[build, permit, valuation, min, roof, min, upgrades, electrical, min, panel, swim, foundation, valuation]"
3,3,"police report filed to an officer against Wayne Parris (DOB 08-03-1957) from Brittany J. Parris. The paperwork I have has a case number D18-39 it is also stamped at the bottom with 18-1294, Iím not sure which number you will need. If there is any other information needed please let me know.",Arlington,2018-06,police report filed to an officer against wayne parris dob from brittany j parris the paperwork i have has a case number d it is also stamped at the bottom with iím not sure which number you will need if there is any other information needed please let me know,"[police, report, filed, to, an, officer, against, wayne, parris, dob, from, brittany, j, parris, the, paperwork, i, have, has, a, case, number, d, it, is, also, stamped, at, the, bottom, with, iím, not, sure, which, number, you, will, need, if, there, is, any, other, information, needed, please, let, me, know]","[('police', 'NNS'), ('report', 'NN'), ('filed', 'VBD'), ('to', 'TO'), ('an', 'DT'), ('officer', 'NN'), ('against', 'IN'), ('wayne', 'JJ'), ('parris', 'JJ'), ('dob', 'NN'), ('from', 'IN'), ('brittany', 'JJ'), ('j', 'NN'), ('parris', 'VBD'), ('the', 'DT'), ('paperwork', 'NN'), ('i', 'NN'), ('have', 'VBP'), ('has', 'VBZ'), ('a', 'DT'), ('case', 'NN'), ('number', 'NN'), ('d', 'NN'), ('it', 'PRP'), ('is', 'VBZ'), ('also', 'RB'), ('stamped', 'VBN'), ('at', 'IN'), ('the', 'DT'), ('bottom', 'NN'), ('with', 'IN'), ('iím', 'JJ'), ('not', 'RB'), ('sure', 'JJ'), ('which', 'WDT'), ('number', 'NN'), ('you', 'PRP'), ('will', 'MD'), ('need', 'VB'), ('if', 'IN'), ('there', 'EX'), ('is', 'VBZ'), ('any', 'DT'), ('other', 'JJ'), ('information', 'NN'), ('needed', 'VBN'), ('please', 'NN'), ('let', 'VB'), ('me', 'PRP'), ('know', 'VB')]","[police, officer, wayne, parris, dob, brittany, parris, paperwork, case, number, stamp, bottom, iím, sure, number]",17,"['police_report', 'report_file', 'file_officer', 'officer_wayne', 'wayne_parris', 'parris_dob', 'dob_brittany', 'brittany_j', 'j_parris', 'parris_paperwork', 'paperwork_case', 'case_number', 'number_also', 'also_stamp', 'stamp_bottom', 'bottom_iím', 'iím_sure', 'sure_number', 'number_need', 'need_information', 'information_need', 'need_please', 'please_let', 'let_know']","['police_report', 'case_number']","[police, officer, wayne, parris, dob, brittany, parris, paperwork, case, number, stamp, bottom, iím, sure, number, police_report, case_number]","(police, report, filed, to, an, officer, against, wayne, parris, dob, , from, brittany, j, parris, the, paperwork, i, have, has, a, case, number, d, , it, is, also, stamped, at, the, bottom, with, , iím, not, sure, which, number, you, will, need, if, there, is, any, other, information, needed, please, let, me, know)","(police, report, filed, to, an, officer, against, Wayne, Parris, (, DOB, 08, -, 03, -, 1957, ), from, Brittany, J., Parris, ., The, paperwork, I, have, has, a, case, number, D18, -, 39, it, is, also, stamped, at, the, bottom, with, 18, -, 1294, ,, Iím, not, sure, which, number, you, will, need, ., If, there, is, any, other, information, needed, please, let, me, know, .)",[],"[wayne, parris, dob, brittany, j., parris, d18]","[police, officer, paperwork, case, number, stamp, bottom, iím, sure, number, police_report, case_number]"
4,4,"Email Communications between Stephanie Shook, Dave Kraski, Bruce Stedman and Chad Schmidt in regards to Fire Protection District 21 billing and passage of contract for ALS Services. \n\nAlso any copies of Agenda Bills, D21 Contract and materials presented for review in Nov/Dec time frame in regards to the contract.",Arlington,2018-06,email communications between stephanie shook dave kraski bruce stedman and chad schmidt in regards to fire protection district billing and passage of contract for als services \n\nalso any copies of agenda bills d contract and materials presented for review in nov dec time frame in regards to the contract,"[email, communications, between, stephanie, shook, dave, kraski, bruce, stedman, and, chad, schmidt, in, regards, to, fire, protection, district, billing, and, passage, of, contract, for, als, services, also, any, copies, of, agenda, bills, d, contract, and, materials, presented, for, review, in, nov, dec, time, frame, in, regards, to, the, contract]","[('email', 'NN'), ('communications', 'NNS'), ('between', 'IN'), ('stephanie', 'JJ'), ('shook', 'NN'), ('dave', 'VBP'), ('kraski', 'VBN'), ('bruce', 'NN'), ('stedman', 'NN'), ('and', 'CC'), ('chad', 'VBD'), ('schmidt', 'VBN'), ('in', 'IN'), ('regards', 'NNS'), ('to', 'TO'), ('fire', 'VB'), ('protection', 'NN'), ('district', 'NN'), ('billing', 'NN'), ('and', 'CC'), ('passage', 'NN'), ('of', 'IN'), ('contract', 'NN'), ('for', 'IN'), ('als', 'NNS'), ('services', 'NNS'), ('also', 'RB'), ('any', 'DT'), ('copies', 'NNS'), ('of', 'IN'), ('agenda', 'NN'), ('bills', 'NNS'), ('d', 'VBP'), ('contract', 'NN'), ('and', 'CC'), ('materials', 'NNS'), ('presented', 'VBN'), ('for', 'IN'), ('review', 'NN'), ('in', 'IN'), ('nov', 'JJ'), ('dec', 'NN'), ('time', 'NN'), ('frame', 'NN'), ('in', 'IN'), ('regards', 'NNS'), ('to', 'TO'), ('the', 'DT'), ('contract', 'NN')]","[email, communication, stephanie, shook, dave, kraski, bruce, stedman, chad, schmidt, fire, protection, district, billing, passage, contract, al, service, agenda, bill, contract, material, present, review, time, frame, contract]",27,"['email_communication', 'communication_stephanie', 'stephanie_shook', 'shook_dave', 'dave_kraski', 'kraski_bruce', 'bruce_stedman', 'stedman_chad', 'chad_schmidt', 'schmidt_regard', 'regard_fire', 'fire_protection', 'protection_district', 'district_billing', 'billing_passage', 'passage_contract', 'contract_al', 'al_service', 'service_also', 'also_copy', 'copy_agenda', 'agenda_bill', 'bill_contract', 'contract_material', 'material_present', 'present_review', 'review_nov', 'nov_dec', 'dec_time', 'time_frame', 'frame_regard', 'regard_contract']",[],"[email, communication, stephanie, shook, dave, kraski, bruce, stedman, chad, schmidt, fire, protection, district, billing, passage, contract, al, service, agenda, bill, contract, material, present, review, time, frame, contract]","(email, communications, between, stephanie, shook, dave, kraski, bruce, stedman, and, chad, schmidt, in, regards, to, fire, protection, district, , billing, and, passage, of, contract, for, als, services, \n\n, also, any, copies, of, agenda, bills, d, contract, and, materials, presented, for, review, in, nov, dec, time, frame, in, regards, to, the, contract)","(Email, Communications, between, Stephanie, Shook, ,, Dave, Kraski, ,, Bruce, Stedman, and, Chad, Schmidt, in, regards, to, Fire, Protection, District, 21, billing, and, passage, of, contract, for, ALS, Services, ., \n\n, Also, any, copies, of, Agenda, Bills, ,, D21, Contract, and, materials, presented, for, review, in, Nov, /, Dec, time, frame, in, regards, to, the, contract, .)",[],"[communications, stephanie, shook, dave, kraski, bruce, stedman, chad, schmidt, fire, protection, district, als, agenda, bills, d21, contract, nov, dec]","[email, communication, billing, passage, al, service, bill, material, present, review, time, frame]"


In [88]:
data_sp2 = data_sp[data_sp['mash_len2'] > 0]
data_sp = data_sp[data_sp['mash_len'] > 0]

In [None]:
data_sp2['mash_len2'].describe()

In [None]:
data_sp['mash_len'].describe()

## 3) Drop cities with low (less than 4) average count - Asheville, Greensboro, Dayton, OKC

In [89]:
data_lg_cities = data.copy()

In [90]:
data_lg_cities = data_lg_cities[(data['city'] != 'Asheville') & (data['city'] != 'Greensboro') & 
                               (data['city'] != 'Dayton') & (data['city'] != 'OKC')]

## 4) Remove observations with small number of words in final mash 

### 4a) Greater than 2

In [91]:
data_4a = data.copy()
data_4a = data_4a[data_4a['mash_len'] > 2]

#### 4a.1) Model 1a

In [116]:
data_4a1 = data1.copy()
data_4a1 = data_4a1[data_4a1['mash_len'] > 2]

#### 4a.2) Model 1b

In [117]:
data_4a2 = data10.copy()
data_4a2 = data_4a2[data_4a2['mash_len'] > 2]

#### 4a.3) Model 2a

In [94]:
data_4a3 = data_sp2.copy()
data_4a3 = data_4a3[data_4a3['mash_len'] > 2]

#### 4a.4) Model 2b

In [95]:
data_4a4 = data_pn_nltk2.copy()
data_4a4 = data_4a4[data_4a4['mash_len'] > 2]

#### 4a.5) Model 3

In [96]:
data_4a5 = data_lg_cities.copy()
data_4a5 = data_4a5[data_4a5['mash_len'] > 2]

### 4b) Less than 3

In [99]:
data_4b = data.copy()
data_4b = data_4b[data_4b['mash_len'] > 3]

#### 4b.1) Model 1a

In [118]:
data_4b1 = data1.copy()
data_4b1 = data_4b1[data_4b1['mash_len'] > 3]

#### 4b.2) Model 1b

In [119]:
data_4b2 = data10.copy()
data_4b2 = data_4b2[data_4b2['mash_len'] > 3]

#### 4b.3) Model 2a

In [102]:
data_4b3 = data_sp2.copy()
data_4b3 = data_4b3[data_4b3['mash_len'] > 3]

#### 4b.4) Model 2b

In [103]:
data_4b4 = data_pn_nltk2.copy()
data_4b4 = data_4b4[data_4b4['mash_len'] > 3]

#### 4b.5) Model 3

In [104]:
data_4b5 = data_lg_cities.copy()
data_4b5 = data_4b5[data_4b5['mash_len'] > 3]

## Run all LDA models

In [105]:
# Model 1a - words count > 1

texts = list(data1['final_mash'])
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lda_data1 = gensim.models.ldamodel.LdaModel(corpus, num_topics=60, id2word = dictionary, 
                                         passes = 60, random_state=7)
model_name = "lda_data1"
lda_data1.save(model_name)
corpus_lda = lda_data1[corpus]
corpus_lda_list = list(corpus_lda)
topics = data1.copy()
topics = topics.assign(topic_comp = corpus_lda_list)
file_name = "topics/lda_data1.csv"
topics.to_csv(file_name)



In [106]:
# Model 1b - words count > 10

texts = list(data10['final_mash'])
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lda_data10 = gensim.models.ldamodel.LdaModel(corpus, num_topics=60, id2word = dictionary, 
                                         passes = 60, random_state=7)
model_name = "lda_data10"
lda_data10.save(model_name)
corpus_lda = lda_data10[corpus]
corpus_lda_list = list(corpus_lda)
topics = data10.copy()
topics = topics.assign(topic_comp = corpus_lda_list)
file_name = "topics/lda_data10.csv"
topics.to_csv(file_name)

# Model 2a

texts = list(data_pn_nltk2['final_mash'])
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lda_data_nltk = gensim.models.ldamodel.LdaModel(corpus, num_topics=60, id2word = dictionary, 
                                         passes = 60, random_state=7)
model_name = "lda_data_nltk"
lda_data_nltk.save(model_name)
corpus_lda = lda_data_nltk[corpus]
corpus_lda_list = list(corpus_lda)
topics = data_pn_nltk2.copy()
topics = topics.assign(topic_comp = corpus_lda_list)
file_name = "topics/lda_data_nltk.csv"
topics.to_csv(file_name)


# Model 2b

texts = list(data_sp2['final_mash'])
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lda_data_sp = gensim.models.ldamodel.LdaModel(corpus, num_topics=60, id2word = dictionary, 
                                         passes = 60, random_state=7)
model_name = "lda_data_sp"
lda_data_sp.save(model_name)
corpus_lda = lda_data_nltk[corpus]
corpus_lda_list = list(corpus_lda)
topics = data_sp2.copy()
topics = topics.assign(topic_comp = corpus_lda_list)
file_name = "topics/lda_data_sp.csv"
topics.to_csv(file_name)


KeyboardInterrupt: 

In [None]:
# Model 3

texts = list(data_lg_cities['final_mash'])
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lda_data_lg_cities = gensim.models.ldamodel.LdaModel(corpus, num_topics=60, id2word = dictionary, 
                                         passes = 60, random_state=7)
model_name = "lda_data_lg_cities"
lda_data_lg_cities.save(model_name)
corpus_lda = lda_data_lg_cities[corpus]
corpus_lda_list = list(corpus_lda)
topics = data_lg_cities.copy()
topics = topics.assign(topic_comp = corpus_lda_list)
file_name = "topics/lda_data_lg_cities.csv"
topics.to_csv(file_name)


# Model 4a

texts = list(data_4a['final_mash'])
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lda_data_4a = gensim.models.ldamodel.LdaModel(corpus, num_topics=60, id2word = dictionary, 
                                         passes = 60, random_state=7)
model_name = "lda_data_4a"
lda_data_4a.save(model_name)
corpus_lda = lda_data_4a[corpus]
corpus_lda_list = list(corpus_lda)
topics = data_4a.copy()
topics = topics.assign(topic_comp = corpus_lda_list)
file_name = "topics/lda_data_4a.csv"
topics.to_csv(file_name)

# Model 4a1

texts = list(data_4a1['final_mash'])
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lda_data_4a1 = gensim.models.ldamodel.LdaModel(corpus, num_topics=60, id2word = dictionary, 
                                         passes = 60, random_state=7)
model_name = "lda_data_4a1"
lda_data_4a1.save(model_name)
corpus_lda = lda_data_4a1[corpus]
corpus_lda_list = list(corpus_lda)
topics = data_4a1.copy()
topics = topics.assign(topic_comp = corpus_lda_list)
file_name = "topics/lda_data_4a1.csv"
topics.to_csv(file_name)

# Model 4a2

texts = list(data_4a2['final_mash'])
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lda_data_4a2 = gensim.models.ldamodel.LdaModel(corpus, num_topics=60, id2word = dictionary, 
                                         passes = 60, random_state=7)
model_name = "lda_data_42a"
lda_data_4a2.save(model_name)
corpus_lda = lda_data_4a2[corpus]
corpus_lda_list = list(corpus_lda)
topics = data_4a2.copy()
topics = topics.assign(topic_comp = corpus_lda_list)
file_name = "topics/lda_data_4a2.csv"
topics.to_csv(file_name)

# Model 4a3

texts = list(data_4a3['final_mash'])
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lda_data_4a3 = gensim.models.ldamodel.LdaModel(corpus, num_topics=60, id2word = dictionary, 
                                         passes = 60, random_state=7)
model_name = "lda_data_4a3"
lda_data_4a3.save(model_name)
corpus_lda = lda_data_4a3[corpus]
corpus_lda_list = list(corpus_lda)
topics = data_4a3.copy()
topics = topics.assign(topic_comp = corpus_lda_list)
file_name = "topics/lda_data_4a3.csv"
topics.to_csv(file_name)

# Model 4a4

texts = list(data_4a4['final_mash'])
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lda_data_4a4 = gensim.models.ldamodel.LdaModel(corpus, num_topics=60, id2word = dictionary, 
                                         passes = 60, random_state=7)
model_name = "lda_data_4a4"
lda_data_4a4.save(model_name)
corpus_lda = lda_data_4a4[corpus]
corpus_lda_list = list(corpus_lda)
topics = data_4a4.copy()
topics = topics.assign(topic_comp = corpus_lda_list)
file_name = "topics/lda_data_4a4.csv"
topics.to_csv(file_name)

# Model 4a5

texts = list(data_4a5['final_mash'])
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lda_data_4a5 = gensim.models.ldamodel.LdaModel(corpus, num_topics=60, id2word = dictionary, 
                                         passes = 60, random_state=7)
model_name = "lda_data_4a5"
lda_data_4a5.save(model_name)
corpus_lda = lda_data_4a5[corpus]
corpus_lda_list = list(corpus_lda)
topics = data_4a5.copy()
topics = topics.assign(topic_comp = corpus_lda_list)
file_name = "topics/lda_data_4a5.csv"
topics.to_csv(file_name)

# Model 4b

texts = list(data_4b['final_mash'])
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lda_data_4b = gensim.models.ldamodel.LdaModel(corpus, num_topics=60, id2word = dictionary, 
                                         passes = 60, random_state=7)
model_name = "lda_data_4b"
lda_data_4b.save(model_name)
corpus_lda = lda_data_4b[corpus]
corpus_lda_list = list(corpus_lda)
topics = data_4b.copy()
topics = topics.assign(topic_comp = corpus_lda_list)
file_name = "topics/lda_data_4b.csv"
topics.to_csv(file_name)

# Model 4b1

texts = list(data_4b1['final_mash'])
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lda_data_4b1 = gensim.models.ldamodel.LdaModel(corpus, num_topics=60, id2word = dictionary, 
                                         passes = 60, random_state=7)
model_name = "lda_data_4b1"
lda_data_4b1.save(model_name)
corpus_lda = lda_data_4b1[corpus]
corpus_lda_list = list(corpus_lda)
topics = data_4b1.copy()
topics = topics.assign(topic_comp = corpus_lda_list)
file_name = "topics/lda_data_4b1.csv"
topics.to_csv(file_name)

# Model 4b2

texts = list(data_4b2['final_mash'])
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lda_data_4b2 = gensim.models.ldamodel.LdaModel(corpus, num_topics=60, id2word = dictionary, 
                                         passes = 60, random_state=7)
model_name = "lda_data_4b2"
lda_data_4b2.save(model_name)
corpus_lda = lda_data_4b2[corpus]
corpus_lda_list = list(corpus_lda)
topics = data_4b2.copy()
topics = topics.assign(topic_comp = corpus_lda_list)
file_name = "topics/lda_data_4b2.csv"
topics.to_csv(file_name)

# Model 4b3

texts = list(data_4b3['final_mash'])
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lda_data_4b3 = gensim.models.ldamodel.LdaModel(corpus, num_topics=60, id2word = dictionary, 
                                         passes = 60, random_state=7)
model_name = "lda_data_4b3"
lda_data_4b3.save(model_name)
corpus_lda = lda_data_4b3[corpus]
corpus_lda_list = list(corpus_lda)
topics = data_4b3.copy()
topics = topics.assign(topic_comp = corpus_lda_list)
file_name = "topics/lda_data_4b3.csv"
topics.to_csv(file_name)

# Model 4b4

texts = list(data_4b4['final_mash'])
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lda_data_4b4 = gensim.models.ldamodel.LdaModel(corpus, num_topics=60, id2word = dictionary, 
                                         passes = 60, random_state=7)
model_name = "lda_data_4b4"
lda_data_4b4.save(model_name)
corpus_lda = lda_data_4b4[corpus]
corpus_lda_list = list(corpus_lda)
topics = data_4b4.copy()
topics = topics.assign(topic_comp = corpus_lda_list)
file_name = "topics/lda_data_4b4.csv"
topics.to_csv(file_name)

# Model 4b5

texts = list(data_4b5['final_mash'])
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lda_data_4b5 = gensim.models.ldamodel.LdaModel(corpus, num_topics=60, id2word = dictionary, 
                                         passes = 60, random_state=7)
model_name = "lda_data_4b5"
lda_data_4b5.save(model_name)
corpus_lda = lda_data_4b5[corpus]
corpus_lda_list = list(corpus_lda)
topics = data_4b5.copy()
topics = topics.assign(topic_comp = corpus_lda_list)
file_name = "topics/lda_data_4b5.csv"
topics.to_csv(file_name)

## Model 5 - Proper Nouns with low count

In [None]:
# Create proper noun counter

pn_list = [y for x in list(data_pn_nltk['pn2']) for y in x]
pn_counts = Counter(pn_list)

In [None]:
words = list(pn_counts.keys())
cnt = list(pn_counts.values())
pn_count_df = pd.DataFrame({'word': words, 'cnt': cnt})

In [None]:
data_pn_nltk['final_mash2'] = data_pn_nltk.apply(lambda row: [i for i in row['final_mash'] if i not in row['pn2']], axis =1)
data_pn_nltk['final_mash'] = data_pn_nltk.apply(lambda row: [i for i in row['final_mash'] if i not in row['pn']], axis =1)
data_pn_nltk['mash_len'] = data_pn_nltk['final_mash'].apply(len)
data_pn_nltk['mash_len2'] = data_pn_nltk['final_mash2'].apply(len)

data_pn_nltk2 = data_pn_nltk[data_pn_nltk['mash_len2'] > 0]
data_pn_nltk = data_pn_nltk[data_pn_nltk['mash_len'] > 0]