In [23]:
# system tools
import warnings
import json
import sys
import string

# data cleaning + analysis tools
import pandas as pd
import datetime as dt
import numpy as np
import re

#nltk tools
import lda #Latent Dirichlet Allocation (create topics)
import gensim
from gensim import corpora, models #for constructing document term matrix
#from stop_words import get_stop_words
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

#set notebook preferences
pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.width', 1000)
warnings.filterwarnings('ignore')

### Import JSON file with city metadata - including which cities have sufficient Public Record Request (PRR) data for analysis

In [7]:
json_file = '../data/cities.json'

with open(json_file, 'r') as f:
    md = json.load(f)

###  Create dataframe of PRR data for all relevant cities

In [8]:
data_raw = pd.DataFrame(columns = ['city', 'month_year', 'Summary'])
city_list = []
for key, value in md.items():
    city = value['name']
    filepath = '/Users/alenastern/Google Drive File Stream/My Drive/Alena_Project/PR_Data/{}.csv'.format(city)
    if value["desc"] == "Y":
        try:
            df = pd.read_csv(filepath)
        except:
            try:
                df = pd.read_csv(filepath, encoding='mac_roman')
            except:
                continue
        print(key)
        name = key.split(' ')
        city_list.append(name[0].lower())
    else:
        continue
    
    try:
        df['Create Date'] = pd.to_datetime(df['Create Date'])
    except:
        df['New'] = pd.to_datetime(df['Create Date'].apply(lambda x: re.findall('^\S*', x)[0]))
        df.drop(columns=['Create Date'], inplace = True)
        df.rename(index=str, columns={"New": "Create Date"}, inplace = True)

    df['month_year'] = df['Create Date'].dt.to_period('M')
    
    mc = df[['month_year', 'Summary']]
    mc['city'] = city
    
    data_raw = pd.concat([data_raw, mc])

Arlington city
Asheville city
Bainbridge Island city
Boulder County
Cathedral City city
Dayton city
Denton city
Everett city
Fort Collins city
Greensboro city
Hayward city
Kirkland city
Las Cruces city
Lynnwood city
Mercer Island city
Miami city
Middleborough town
New Orleans city
Oakland city
Oklahoma City city
Olympia city
Palo Alto city
Peoria city
Pullman city
Rancho Cucamonga city
Redmond city
Renton city
Sacramento city
San Francisco city
Tukwila city
Vallejo city
West Sacramento city
Winchester city


#### We can see the raw data below. Our raw dataset includes 86,416 PRRs from 33 different cities

In [110]:
data_raw.head()

Unnamed: 0,index,Summary,city,month_year
0,0,We are working with an engineering firm on an ...,Arlington,2018-06
1,1,Need copies of contracts and all related docum...,Arlington,2018-06
2,2,"Copies of Building Permits of $5,000 valuation...",Arlington,2018-06
3,3,police report filed to an officer against Wayn...,Arlington,2018-06
4,4,"Email Communications between Stephanie Shook, ...",Arlington,2018-06


In [93]:
data_raw.shape

(86416, 3)

In [81]:
len(data_raw.city.unique())

33

In [9]:
data_raw.index = pd.RangeIndex(len(data_raw.index))

In [10]:
data_raw.reset_index(inplace=True)

### Create dataframe for cleaning by removing null summaries

In [53]:
data = data_raw.dropna(subset=['Summary'])

#### Function to convert nltk part of speech tags to wordnet tags (we use this to stem the words in data cleaning below):

In [12]:
def get_wordnet_pos(tag):

    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

## Clean PRR data to prepare for LDA analysis

In [54]:
# Turn to lowercase
data.Summary = data.Summary.str.lower()

# Remove all punctuation
translator = str.maketrans('','', string.punctuation)
data.Summary = data.Summary.str.translate(translator)

# Remove all city names
for city in city_list:
    data.Summary = data.Summary.str.replace(city, '')

# Remove all state names
state_list = ['washington', 'north carolina', 'colorado', 'california',
             'ohio', 'texas', 'new mexico', 'florida', 'massachusetts',
             'lousiana', 'oklahoma', 'arizona', 'rhode island', 'virginia']
for state in state_list:
     data.Summary = data.Summary.str.replace(state, '')
    
# Remove digits
dig_translator = str.maketrans('','', string.digits)
data.Summary = data.Summary.str.translate(dig_translator)

#remove empty strings, stopwords and stem
stop_words = set(stopwords.words('english'))
lmtzr = WordNetLemmatizer()
data['token'] = data['Summary'].apply(lambda x: nltk.word_tokenize(x))
data['lemma'] = data['token'].apply(lambda x: nltk.pos_tag(x))
data['mash'] = data['lemma'].apply(lambda x: [lmtzr.lemmatize(i[0], get_wordnet_pos(i[1])) for i in x if len(i[0]) > 0 and i[0] not in stop_words])

# Remove whitespace
wsp_translator = str.maketrans('','', string.whitespace)
data['mash'] = data['mash'].apply(lambda x: [i.translate(wsp_translator) for i in x])

# Remove empty lists
data['mash_len'] = data['mash'].apply(lambda x: len(x))
data = data[data['mash_len'] > 0]


### Identify and remove commonly used words in PRRs

In [34]:
common_list = ['report', 'request', 'record', 'city', 'please', 'copy', 'date', 'information', 'would',
              'include', 'document', 'provide', 'like', 'email', 'thank', 'need', 'know', 'thanks']
word_list = [y for x in list(data['mash']) for y in x]
counts = Counter(word_list)
Counter(word_list).most_common(50)

[('report', 30544),
 ('request', 29113),
 ('record', 26821),
 ('city', 18382),
 ('please', 18067),
 ('copy', 17480),
 ('police', 13216),
 ('date', 13110),
 ('property', 11584),
 ('street', 11495),
 ('information', 10945),
 ('th', 10631),
 ('number', 10611),
 ('driver', 10537),
 ('would', 9966),
 ('include', 9398),
 ('document', 9380),
 ('provide', 9242),
 ('wa', 8842),
 ('public', 8796),
 ('permit', 8660),
 ('case', 8620),
 ('accident', 7879),
 ('st', 7845),
 ('location', 7798),
 ('like', 7595),
 ('state', 7446),
 ('email', 7236),
 ('thank', 7219),
 ('incident', 7213),
 ('address', 6804),
 ('regard', 6730),
 ('insurance', 6498),
 ('ne', 6383),
 ('building', 6376),
 ('relate', 6349),
 ('ave', 6224),
 ('department', 6089),
 ('plan', 5736),
 ('type', 5709),
 ('county', 5615),
 ('company', 5538),
 ('following', 5506),
 ('file', 5458),
 ('fire', 5334),
 ('code', 5296),
 ('auto', 5273),
 ('loss', 5202),
 ('need', 5041),
 ('insure', 4963)]

In [None]:
Counter(word_list).most_common()[-50:-1]

In [55]:
common_list = ['report', 'request', 'record', 'city', 'please', 'copy', 'date', 'information', 'would',
              'include', 'document', 'provide', 'like', 'email', 'thank', 'need', 'know', 'thanks']

In [3]:
"request" in common_list

True

In [56]:
# remove general words that are common to public record requests

#for word in common_list:
data['mash2'] = data['mash'].apply(lambda x: [i for i in x if i not in common_list])

In [65]:
# remove general words that are common to public record requests
suffix_list = ['th', 'nd', 'st', 'rd']
#for word in suffix_list:
data['mash2'] = data['mash2'].apply(lambda x: [i for i in x if i not in suffix_list])

In [58]:
# create column with the length of mash for each PRR

data['mash_len'] = data['mash'].apply(lambda x: len(x))

In [59]:
# remove entries of length 0

data = data[data['mash_len'] > 0]

In [60]:
data['mash_len'].describe()

count    74529.000000
mean        21.688511
std         36.966960
min          1.000000
25%          5.000000
50%         12.000000
75%         27.000000
max       2922.000000
Name: mash_len, dtype: float64

#### We can see a couple of examples of the cleaned mash and the original request:

In [61]:
data_raw['Summary'][data_raw.index == 164]

164    Police Traffic Collision Report and any other police reports, records or documents relating to \nCase # 2017- 00022138 \n\nUnit 1 - Carla Jaramillo\nUnit 2 - Jordan Boss\n\nTime of incident - 1616
Name: Summary, dtype: object

In [62]:
data['mash2'][data.index == 164]

164    [police, traffic, collision, police, relate, case, unit, carla, jaramillo, unit, jordan, bos, time, incident]
Name: mash2, dtype: object

In [63]:
data['mash'][86000]

['would',
 'like',
 'request',
 'build',
 'permit',
 'isnpection',
 'history',
 'report',
 'certoificate',
 'occupancy',
 'muscovy',
 'rd',
 'specifically',
 'look',
 'apoproved',
 'final',
 'date',
 'information']

In [64]:
data_raw["Summary"].iloc[86000]

'I would like to request for building permit with isnpection history report or certoificate of occupancy for 1943 Muscovy Rd.\n\xa0\nI am specifically looking for apoproved final date information.'

In [66]:
data['nd'] = data['mash2'].apply(lambda x: "nd" in x)

In [67]:
data_rep = data[data['nd'] == True]

In [68]:
data_rep.head()

Unnamed: 0,index,Summary,city,month_year,token,lemma,mash,mash_len,mash2,nd


# LDA Analysis

In [79]:
data = data.reset_index(drop=True)
data.head()

Unnamed: 0,index,Summary,city,month_year,token,lemma,mash,mash_len,mash2,nd
0,0,we are working with an engineering firm on an upcoming project they have asked us to gather maps for this project would you be able to assist me in gathering mapsrecords as builds for any underground water facilities you may have something just showing the route of the water lines would do\n\nth st ne to nd ave ne cascade surveying engineering \n\ni have attached the scope for your convenience please let me know if you have questions,Arlington,2018-06,"[we, are, working, with, an, engineering, firm, on, an, upcoming, project, they, have, asked, us, to, gather, maps, for, this, project, would, you, be, able, to, assist, me, in, gathering, mapsrecords, as, builds, for, any, underground, water, facilities, you, may, have, something, just, showing, the, route, of, the, water, lines, would, do, th, st, ne, to, nd, ave, ne, cascade, surveying, engineering, i, have, attached, the, scope, for, your, convenience, please, let, me, know, if, you, have, questions]","[(we, PRP), (are, VBP), (working, VBG), (with, IN), (an, DT), (engineering, NN), (firm, NN), (on, IN), (an, DT), (upcoming, JJ), (project, NN), (they, PRP), (have, VBP), (asked, VBN), (us, PRP), (to, TO), (gather, VB), (maps, NNS), (for, IN), (this, DT), (project, NN), (would, MD), (you, PRP), (be, VB), (able, JJ), (to, TO), (assist, VB), (me, PRP), (in, IN), (gathering, VBG), (mapsrecords, NNS), (as, IN), (builds, NNS), (for, IN), (any, DT), (underground, JJ), (water, NN), (facilities, NNS), (you, PRP), (may, MD), (have, VB), (something, NN), (just, RB), (showing, VBG), (the, DT), (route, NN), (of, IN), (the, DT), (water, NN), (lines, NNS), (would, MD), (do, VB), (th, VB), (st, VB), (ne, JJ), (to, TO), (nd, VB), (ave, VB), (ne, JJ), (cascade, NN), (surveying, VBG), (engineering, NN), (i, NN), (have, VBP), (attached, VBN), (the, DT), (scope, NN), (for, IN), (your, PRP$), (convenience, NN), (please, NN), (let, VB), (me, PRP), (know, VB), (if, IN), (you, PRP), (have, VBP), (questions...","[work, engineering, firm, upcoming, project, ask, u, gather, map, project, would, able, assist, gather, mapsrecords, build, underground, water, facility, may, something, show, route, water, line, would, th, st, ne, nd, ave, ne, cascade, survey, engineering, attach, scope, convenience, please, let, know, question]",42,"[work, engineering, firm, upcoming, project, ask, u, gather, map, project, able, assist, gather, mapsrecords, build, underground, water, facility, may, something, show, route, water, line, ne, ave, ne, cascade, survey, engineering, attach, scope, convenience, let, question]",False
1,1,need copies of contracts and all related documents pertaining to topcub aircraft property located at th dr ne wa between airport topcub aircraft city of hci steel buildings and pud,Arlington,2018-06,"[need, copies, of, contracts, and, all, related, documents, pertaining, to, topcub, aircraft, property, located, at, th, dr, ne, wa, between, airport, topcub, aircraft, city, of, hci, steel, buildings, and, pud]","[(need, NN), (copies, NNS), (of, IN), (contracts, NNS), (and, CC), (all, DT), (related, JJ), (documents, NNS), (pertaining, VBG), (to, TO), (topcub, VB), (aircraft, NN), (property, NN), (located, VBN), (at, IN), (th, NN), (dr, NN), (ne, JJ), (wa, NN), (between, IN), (airport, NN), (topcub, NN), (aircraft, NN), (city, NN), (of, IN), (hci, JJ), (steel, NN), (buildings, NNS), (and, CC), (pud, NN)]","[need, copy, contract, related, document, pertain, topcub, aircraft, property, locate, th, dr, ne, wa, airport, topcub, aircraft, city, hci, steel, building, pud]",22,"[contract, related, pertain, topcub, aircraft, property, locate, dr, ne, wa, airport, topcub, aircraft, hci, steel, building, pud]",False
2,2,copies of building permits of valuation and up min for reroofs min for cell tower upgrades electrical mechanical plumbing at min and solar panels swimming pools foundations at any valuation,Arlington,2018-06,"[copies, of, building, permits, of, valuation, and, up, min, for, reroofs, min, for, cell, tower, upgrades, electrical, mechanical, plumbing, at, min, and, solar, panels, swimming, pools, foundations, at, any, valuation]","[(copies, NNS), (of, IN), (building, VBG), (permits, NNS), (of, IN), (valuation, NN), (and, CC), (up, RB), (min, NN), (for, IN), (reroofs, NN), (min, NN), (for, IN), (cell, NN), (tower, NN), (upgrades, JJ), (electrical, JJ), (mechanical, JJ), (plumbing, NN), (at, IN), (min, NN), (and, CC), (solar, JJ), (panels, NNS), (swimming, VBG), (pools, JJ), (foundations, NNS), (at, IN), (any, DT), (valuation, NN)]","[copy, build, permit, valuation, min, reroofs, min, cell, tower, upgrades, electrical, mechanical, plumbing, min, solar, panel, swim, pools, foundation, valuation]",20,"[build, permit, valuation, min, reroofs, min, cell, tower, upgrades, electrical, mechanical, plumbing, min, solar, panel, swim, pools, foundation, valuation]",False
3,3,police report filed to an officer against wayne parris dob from brittany j parris the paperwork i have has a case number d it is also stamped at the bottom with iím not sure which number you will need if there is any other information needed please let me know,Arlington,2018-06,"[police, report, filed, to, an, officer, against, wayne, parris, dob, from, brittany, j, parris, the, paperwork, i, have, has, a, case, number, d, it, is, also, stamped, at, the, bottom, with, iím, not, sure, which, number, you, will, need, if, there, is, any, other, information, needed, please, let, me, know]","[(police, NNS), (report, NN), (filed, VBD), (to, TO), (an, DT), (officer, NN), (against, IN), (wayne, JJ), (parris, JJ), (dob, NN), (from, IN), (brittany, JJ), (j, NN), (parris, VBD), (the, DT), (paperwork, NN), (i, NN), (have, VBP), (has, VBZ), (a, DT), (case, NN), (number, NN), (d, NN), (it, PRP), (is, VBZ), (also, RB), (stamped, VBN), (at, IN), (the, DT), (bottom, NN), (with, IN), (iím, JJ), (not, RB), (sure, JJ), (which, WDT), (number, NN), (you, PRP), (will, MD), (need, VB), (if, IN), (there, EX), (is, VBZ), (any, DT), (other, JJ), (information, NN), (needed, VBN), (please, NN), (let, VB), (me, PRP), (know, VB)]","[police, report, file, officer, wayne, parris, dob, brittany, j, parris, paperwork, case, number, also, stamp, bottom, iím, sure, number, need, information, need, please, let, know]",25,"[police, file, officer, wayne, parris, dob, brittany, j, parris, paperwork, case, number, also, stamp, bottom, iím, sure, number, let]",False
4,4,email communications between stephanie shook dave kraski bruce stedman and chad schmidt in regards to fire protection district billing and passage of contract for als services \n\nalso any copies of agenda bills d contract and materials presented for review in novdec time frame in regards to the contract,Arlington,2018-06,"[email, communications, between, stephanie, shook, dave, kraski, bruce, stedman, and, chad, schmidt, in, regards, to, fire, protection, district, billing, and, passage, of, contract, for, als, services, also, any, copies, of, agenda, bills, d, contract, and, materials, presented, for, review, in, novdec, time, frame, in, regards, to, the, contract]","[(email, NN), (communications, NNS), (between, IN), (stephanie, JJ), (shook, NN), (dave, VBP), (kraski, VBN), (bruce, NN), (stedman, NN), (and, CC), (chad, VBD), (schmidt, VBN), (in, IN), (regards, NNS), (to, TO), (fire, VB), (protection, NN), (district, NN), (billing, NN), (and, CC), (passage, NN), (of, IN), (contract, NN), (for, IN), (als, NNS), (services, NNS), (also, RB), (any, DT), (copies, NNS), (of, IN), (agenda, NN), (bills, NNS), (d, VBP), (contract, NN), (and, CC), (materials, NNS), (presented, VBN), (for, IN), (review, NN), (in, IN), (novdec, JJ), (time, NN), (frame, NN), (in, IN), (regards, NNS), (to, TO), (the, DT), (contract, NN)]","[email, communication, stephanie, shook, dave, kraski, bruce, stedman, chad, schmidt, regard, fire, protection, district, billing, passage, contract, al, service, also, copy, agenda, bill, contract, material, present, review, novdec, time, frame, regard, contract]",32,"[communication, stephanie, shook, dave, kraski, bruce, stedman, chad, schmidt, regard, fire, protection, district, billing, passage, contract, al, service, also, agenda, bill, contract, material, present, review, novdec, time, frame, regard, contract]",False


In [69]:
# create dictionary and corpus
texts = list(data['mash2'])
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

In [70]:
# 30 topics and 45 passes
lda_30_45 = gensim.models.ldamodel.LdaModel(corpus, num_topics=30, id2word = dictionary, 
                                         passes = 45, random_state=7)

In [71]:
# show topics for model
lda_30_45.show_topics(num_topics=30, formatted=False)

[(0,
  [('arrest', 0.08289417),
   ('check', 0.06838536),
   ('amount', 0.025789259),
   ('charge', 0.01968468),
   ('dob', 0.019565014),
   ('background', 0.018853486),
   ('gang', 0.016788905),
   ('payee', 0.015202984),
   ('warrant', 0.01444699),
   ('afternoon', 0.014204199)]),
 (1,
  [('police', 0.34644425),
   ('department', 0.060999524),
   ('usts', 0.022946382),
   ('waste', 0.02238407),
   ('letter', 0.019682474),
   ('activity', 0.016499719),
   ('issue', 0.016077707),
   ('license', 0.015728468),
   ('pertain', 0.013066108),
   ('robert', 0.011728265)]),
 (2,
  [('way', 0.1093242),
   ('collision', 0.062762275),
   ('page', 0.042848866),
   ('see', 0.041105945),
   ('attached', 0.031422168),
   ('first', 0.03063568),
   ('agreement', 0.029097088),
   ('avondale', 0.019600632),
   ('mall', 0.018794566),
   ('evergreen', 0.014810583)]),
 (3,
  [('business', 0.07392372),
   ('november', 0.050002754),
   ('sale', 0.037945986),
   ('license', 0.032632407),
   ('range', 0.0176758

In [198]:
# show topics for model
lda_50_45_2.show_topics(num_topics=50, formatted=False)

[(0,
  [('check', 0.10369195),
   ('see', 0.096796714),
   ('month', 0.03366025),
   ('account', 0.029787252),
   ('payee', 0.022412835),
   ('amount', 0.022256557),
   ('attachment', 0.022072963),
   ('escrow', 0.020911321),
   ('issue', 0.018710237),
   ('still', 0.018030368)]),
 (1,
  [('•', 0.12580512),
   ('east', 0.039862804),
   ('local', 0.038408782),
   ('emergency', 0.028766017),
   ('prevent', 0.024785917),
   ('hereby', 0.024181787),
   ('group', 0.021493912),
   ('impact', 0.019941604),
   ('nature', 0.01577148),
   ('replacement', 0.014921277)]),
 (2,
  [('fine', 0.04978936),
   ('february', 0.04836553),
   ('cam', 0.037812322),
   ('body', 0.036818836),
   ('injunction', 0.03467748),
   ('training', 0.024110764),
   ('stolen', 0.01653608),
   ('andrew', 0.015775861),
   ('lee', 0.015763437),
   ('harrison', 0.01561658)]),
 (3,
  [('lot', 0.105513096),
   ('collision', 0.08798288),
   ('blvd', 0.08653386),
   ('block', 0.072315335),
   ('south', 0.0612986),
   ('northeast

In [73]:
# save good model
lda_30_45.save('lda_30_45_model')

In [74]:
corpus_lda = lda_30_45[corpus] #this is just a wrapper; calculates on the fly when you call it



In [81]:
corpus_lda_list = list(corpus_lda)

In [76]:
for index, score in sorted(lda_30_45[corpus[600]], key=lambda tup: -1*tup[1]): #600th document
    print("Score: {}\t Topic: {} \n".format(score, lda_30_45.print_topic(index, 15))) #15 word topics

Score: 0.39515647292137146	 Topic: 0.106*"incident" + 0.090*"police" + 0.050*"involve" + 0.045*"officer" + 0.038*"accident" + 0.035*"vehicle" + 0.031*"occur" + 0.030*"car" + 0.022*"pm" + 0.015*"assault" + 0.014*"steal" + 0.013*"cad" + 0.013*"take" + 0.012*"footage" + 0.011*"collision" 

Score: 0.18334083259105682	 Topic: 0.028*"relate" + 0.021*"tax" + 0.019*"communication" + 0.018*"regard" + 0.016*"correspondence" + 0.014*"council" + 0.014*"project" + 0.014*"andor" + 0.013*"staff" + 0.012*"public" + 0.011*"meeting" + 0.011*"program" + 0.011*"’" + 0.010*"alto" + 0.010*"june" 

Score: 0.14973963797092438	 Topic: 0.052*"public" + 0.023*"may" + 0.021*"act" + 0.017*"concern" + 0.016*"office" + 0.015*"make" + 0.015*"send" + 0.013*"time" + 0.013*"day" + 0.013*"disclosure" + 0.013*"fax" + 0.012*"via" + 0.011*"available" + 0.011*"matter" + 0.011*"’" 

Score: 0.12363619357347488	 Topic: 0.057*"call" + 0.035*"get" + 0.019*"make" + 0.019*"want" + 0.016*"go" + 0.015*"one" + 0.015*"find" + 0.013*"co

In [80]:
data.mash2[600]

['release',
 'pertain',
 'police',
 'call',
 'fight',
 'timber',
 'vintage',
 'premise',
 'occur',
 'approximately',
 'pm',
 'name',
 'people',
 'involve']

In [82]:
topics = data.copy()
topics = topics.assign(topic_comp = corpus_lda_list)
topics.head() #the topic_comp column are actual Python lists

Unnamed: 0,index,Summary,city,month_year,token,lemma,mash,mash_len,mash2,nd,topic_comp
0,0,we are working with an engineering firm on an upcoming project they have asked us to gather maps for this project would you be able to assist me in gathering mapsrecords as builds for any underground water facilities you may have something just showing the route of the water lines would do\n\nth st ne to nd ave ne cascade surveying engineering \n\ni have attached the scope for your convenience please let me know if you have questions,Arlington,2018-06,"[we, are, working, with, an, engineering, firm, on, an, upcoming, project, they, have, asked, us, to, gather, maps, for, this, project, would, you, be, able, to, assist, me, in, gathering, mapsrecords, as, builds, for, any, underground, water, facilities, you, may, have, something, just, showing, the, route, of, the, water, lines, would, do, th, st, ne, to, nd, ave, ne, cascade, surveying, engineering, i, have, attached, the, scope, for, your, convenience, please, let, me, know, if, you, have, questions]","[(we, PRP), (are, VBP), (working, VBG), (with, IN), (an, DT), (engineering, NN), (firm, NN), (on, IN), (an, DT), (upcoming, JJ), (project, NN), (they, PRP), (have, VBP), (asked, VBN), (us, PRP), (to, TO), (gather, VB), (maps, NNS), (for, IN), (this, DT), (project, NN), (would, MD), (you, PRP), (be, VB), (able, JJ), (to, TO), (assist, VB), (me, PRP), (in, IN), (gathering, VBG), (mapsrecords, NNS), (as, IN), (builds, NNS), (for, IN), (any, DT), (underground, JJ), (water, NN), (facilities, NNS), (you, PRP), (may, MD), (have, VB), (something, NN), (just, RB), (showing, VBG), (the, DT), (route, NN), (of, IN), (the, DT), (water, NN), (lines, NNS), (would, MD), (do, VB), (th, VB), (st, VB), (ne, JJ), (to, TO), (nd, VB), (ave, VB), (ne, JJ), (cascade, NN), (surveying, VBG), (engineering, NN), (i, NN), (have, VBP), (attached, VBN), (the, DT), (scope, NN), (for, IN), (your, PRP$), (convenience, NN), (please, NN), (let, VB), (me, PRP), (know, VB), (if, IN), (you, PRP), (have, VBP), (questions...","[work, engineering, firm, upcoming, project, ask, u, gather, map, project, would, able, assist, gather, mapsrecords, build, underground, water, facility, may, something, show, route, water, line, would, th, st, ne, nd, ave, ne, cascade, survey, engineering, attach, scope, convenience, please, let, know, question]",42,"[work, engineering, firm, upcoming, project, ask, u, gather, map, project, able, assist, gather, mapsrecords, build, underground, water, facility, may, something, show, route, water, line, ne, ave, ne, cascade, survey, engineering, attach, scope, convenience, let, question]",False,"[(5, 0.2501662), (18, 0.549489), (27, 0.1753448)]"
1,1,need copies of contracts and all related documents pertaining to topcub aircraft property located at th dr ne wa between airport topcub aircraft city of hci steel buildings and pud,Arlington,2018-06,"[need, copies, of, contracts, and, all, related, documents, pertaining, to, topcub, aircraft, property, located, at, th, dr, ne, wa, between, airport, topcub, aircraft, city, of, hci, steel, buildings, and, pud]","[(need, NN), (copies, NNS), (of, IN), (contracts, NNS), (and, CC), (all, DT), (related, JJ), (documents, NNS), (pertaining, VBG), (to, TO), (topcub, VB), (aircraft, NN), (property, NN), (located, VBN), (at, IN), (th, NN), (dr, NN), (ne, JJ), (wa, NN), (between, IN), (airport, NN), (topcub, NN), (aircraft, NN), (city, NN), (of, IN), (hci, JJ), (steel, NN), (buildings, NNS), (and, CC), (pud, NN)]","[need, copy, contract, related, document, pertain, topcub, aircraft, property, locate, th, dr, ne, wa, airport, topcub, aircraft, city, hci, steel, building, pud]",22,"[contract, related, pertain, topcub, aircraft, property, locate, dr, ne, wa, airport, topcub, aircraft, hci, steel, building, pud]",False,"[(4, 0.065632835), (21, 0.059163287), (22, 0.061050314), (27, 0.76600546)]"
2,2,copies of building permits of valuation and up min for reroofs min for cell tower upgrades electrical mechanical plumbing at min and solar panels swimming pools foundations at any valuation,Arlington,2018-06,"[copies, of, building, permits, of, valuation, and, up, min, for, reroofs, min, for, cell, tower, upgrades, electrical, mechanical, plumbing, at, min, and, solar, panels, swimming, pools, foundations, at, any, valuation]","[(copies, NNS), (of, IN), (building, VBG), (permits, NNS), (of, IN), (valuation, NN), (and, CC), (up, RB), (min, NN), (for, IN), (reroofs, NN), (min, NN), (for, IN), (cell, NN), (tower, NN), (upgrades, JJ), (electrical, JJ), (mechanical, JJ), (plumbing, NN), (at, IN), (min, NN), (and, CC), (solar, JJ), (panels, NNS), (swimming, VBG), (pools, JJ), (foundations, NNS), (at, IN), (any, DT), (valuation, NN)]","[copy, build, permit, valuation, min, reroofs, min, cell, tower, upgrades, electrical, mechanical, plumbing, min, solar, panel, swim, pools, foundation, valuation]",20,"[build, permit, valuation, min, reroofs, min, cell, tower, upgrades, electrical, mechanical, plumbing, min, solar, panel, swim, pools, foundation, valuation]",False,"[(0, 0.059634298), (21, 0.784754), (27, 0.11061168)]"
3,3,police report filed to an officer against wayne parris dob from brittany j parris the paperwork i have has a case number d it is also stamped at the bottom with iím not sure which number you will need if there is any other information needed please let me know,Arlington,2018-06,"[police, report, filed, to, an, officer, against, wayne, parris, dob, from, brittany, j, parris, the, paperwork, i, have, has, a, case, number, d, it, is, also, stamped, at, the, bottom, with, iím, not, sure, which, number, you, will, need, if, there, is, any, other, information, needed, please, let, me, know]","[(police, NNS), (report, NN), (filed, VBD), (to, TO), (an, DT), (officer, NN), (against, IN), (wayne, JJ), (parris, JJ), (dob, NN), (from, IN), (brittany, JJ), (j, NN), (parris, VBD), (the, DT), (paperwork, NN), (i, NN), (have, VBP), (has, VBZ), (a, DT), (case, NN), (number, NN), (d, NN), (it, PRP), (is, VBZ), (also, RB), (stamped, VBN), (at, IN), (the, DT), (bottom, NN), (with, IN), (iím, JJ), (not, RB), (sure, JJ), (which, WDT), (number, NN), (you, PRP), (will, MD), (need, VB), (if, IN), (there, EX), (is, VBZ), (any, DT), (other, JJ), (information, NN), (needed, VBN), (please, NN), (let, VB), (me, PRP), (know, VB)]","[police, report, file, officer, wayne, parris, dob, brittany, j, parris, paperwork, case, number, also, stamp, bottom, iím, sure, number, need, information, need, please, let, know]",25,"[police, file, officer, wayne, parris, dob, brittany, j, parris, paperwork, case, number, also, stamp, bottom, iím, sure, number, let]",False,"[(0, 0.06567758), (1, 0.09477545), (4, 0.051666666), (9, 0.0669689), (11, 0.1931221), (17, 0.25166667), (22, 0.088629805), (23, 0.15082614)]"
4,4,email communications between stephanie shook dave kraski bruce stedman and chad schmidt in regards to fire protection district billing and passage of contract for als services \n\nalso any copies of agenda bills d contract and materials presented for review in novdec time frame in regards to the contract,Arlington,2018-06,"[email, communications, between, stephanie, shook, dave, kraski, bruce, stedman, and, chad, schmidt, in, regards, to, fire, protection, district, billing, and, passage, of, contract, for, als, services, also, any, copies, of, agenda, bills, d, contract, and, materials, presented, for, review, in, novdec, time, frame, in, regards, to, the, contract]","[(email, NN), (communications, NNS), (between, IN), (stephanie, JJ), (shook, NN), (dave, VBP), (kraski, VBN), (bruce, NN), (stedman, NN), (and, CC), (chad, VBD), (schmidt, VBN), (in, IN), (regards, NNS), (to, TO), (fire, VB), (protection, NN), (district, NN), (billing, NN), (and, CC), (passage, NN), (of, IN), (contract, NN), (for, IN), (als, NNS), (services, NNS), (also, RB), (any, DT), (copies, NNS), (of, IN), (agenda, NN), (bills, NNS), (d, VBP), (contract, NN), (and, CC), (materials, NNS), (presented, VBN), (for, IN), (review, NN), (in, IN), (novdec, JJ), (time, NN), (frame, NN), (in, IN), (regards, NNS), (to, TO), (the, DT), (contract, NN)]","[email, communication, stephanie, shook, dave, kraski, bruce, stedman, chad, schmidt, regard, fire, protection, district, billing, passage, contract, al, service, also, copy, agenda, bill, contract, material, present, review, novdec, time, frame, regard, contract]",32,"[communication, stephanie, shook, dave, kraski, bruce, stedman, chad, schmidt, regard, fire, protection, district, billing, passage, contract, al, service, also, agenda, bill, contract, material, present, review, novdec, time, frame, regard, contract]",False,"[(0, 0.033859484), (4, 0.03333333), (5, 0.049412042), (12, 0.13010752), (15, 0.30020332), (17, 0.06829699), (21, 0.110456906), (22, 0.03333333), (24, 0.072874755), (25, 0.065591395), (26, 0.08210082)]"


In [None]:
def calculate_topic_pop(df):
    results_dict = {}
    
    for row_num in df.index:
        for tup in df.topic_comp[row_num]:
            if not tup[0] in results_dict:
                if tup[1] >= 0.1:
                    results_dict[tup[0]] = (tup[1] * (df.iloc[row_num].download_count + 
                                             df.iloc[row_num].page_views_total_log))
                else:
                    pass
            if tup[0] in results_dict:
                if tup[1] >= 0.1:
                    results_dict[tup[0]] += (tup[1] * (df.iloc[row_num].download_count + 
                                             df.iloc[row_num].page_views_total_log))
    return results_dict