In [2]:
# system tools
import warnings
import json
import sys
import string
import ast

# data cleaning + analysis tools
import pandas as pd
import datetime as dt
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns

#nltk tools
import spacy
import lda #Latent Dirichlet Allocation (create topics)
import gensim
from gensim import corpora, models #for constructing document term matrix
#from stop_words import get_stop_words
from gensim.models import Phrases
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.util import ngrams

# set notebook preferences
# NOTE: the former 'display.height' option is intentionally not set here.
# It is no longer a pandas option and pd.set_option raises OptionError on
# it; 'display.max_rows' controls how many rows are shown.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.width', 1000)
# Silences every warning for the rest of the notebook (including pandas
# chained-assignment warnings), so problems must be caught by inspection.
warnings.filterwarnings('ignore')

%pylab inline
pylab.rcParams['figure.figsize'] = (10, 6)

Populating the interactive namespace from numpy and matplotlib


### Import JSON file with city metadata 

This includes which cities have published the raw Public Record Requests (PRRs) they receive for analysis. 

In [47]:
json_file = '../data/cities.json'

with open(json_file, 'r') as f:
    md = json.load(f)

###  Create dataframe of PRR data for all relevant cities

This dataframe includes PRR data from the **34 cities** in our sample (52 total cities) that had sufficient raw data with the public record request for analysis. Our sample represents cities that host an online PRR portal for submitting requests.  These data were obtained through a variety of methods including:

1. exporting the full archive of PRRs hosted on the online portal as a csv file  
2. scraping the full history of PRR data from portals which publish previous requests, but do not offer a download option (see [scraping notebook](https://github.com/sunlightpolicy/Sunlight_FOIA/blob/master/src/data/NR_Scrape.ipynb))
3. downloading public records request data that has been published on city’s open data portal  
4. submitting a public record request to obtain the archive of PRR data 

It is worth noting that *specificities of the different city portals influence the substance of the public record requests received*. For example, the city of Clearwater, FL has separate request forms for police records and public records, prompting citizens who submit police record requests to provide the specific case number. In addition, while most of the data released by cities is the raw request submitted by citizens, in a few cases the city released a summary of the submitted request prepared by city staff. For example, the Oklahoma City clerk's office released the summary of the request and the department the request was routed to for response. 


In [96]:
# Directory holding one "<city name>.csv" export of public record requests
# per city. Kept in one variable so the location is changed in a single place.
PR_DATA_DIR = '/Users/alenastern/Google Drive File Stream/My Drive/Alena_Project/PR_Data'

city_frames = []  # one (city, month_year, Summary) frame per loaded city
city_list = []    # words of each loaded metadata key, minus its last word
for key, value in md.items():
    # tag in metadata for whether city publishes request content
    if value["desc"] != "Y":
        continue

    city = value['name']
    filepath = '{}/{}.csv'.format(PR_DATA_DIR, city)
    try:
        df = pd.read_csv(filepath)
    except Exception:
        # Some exports are not UTF-8; retry once with the legacy Mac encoding.
        try:
            df = pd.read_csv(filepath, encoding='mac_roman')
        except Exception as err:
            # Still best-effort (the city is skipped), but no longer silent.
            print('Skipping {}: could not read {} ({})'.format(key, filepath, err))
            continue
    print(key)
    city_list.extend(key.split(' ')[:-1])

    try:
        created = pd.to_datetime(df['Create Date'])
    except Exception:
        # Fall back to parsing only the leading run of non-whitespace
        # characters of each value (e.g. the date part of "date time ...").
        created = pd.to_datetime(df['Create Date'].apply(lambda x: re.findall(r'^\S*', x)[0]))

    # Build a fresh frame rather than assigning onto a slice of df.
    city_frames.append(pd.DataFrame({
        'city': city,
        'month_year': created.dt.to_period('M'),
        'Summary': df['Summary'],
    }))

# Concatenate once at the end; growing a frame inside the loop is quadratic.
if city_frames:
    data_raw = pd.concat(city_frames)
else:
    data_raw = pd.DataFrame(columns=['city', 'month_year', 'Summary'])


Arlington city
Asheville city
Bainbridge Island city
Boulder County
Cathedral City city
Clearwater city
Dayton city
Denton city
Everett city
Fort Collins city
Greensboro city
Hayward city
Kirkland city
Las Cruces city
Lynnwood city
Mercer Island city
Miami city
Middleborough town
New Orleans city
Oakland city
Oklahoma City city
Olympia city
Palo Alto city
Peoria city
Pullman city
Rancho Cucamonga city
Redmond city
Renton city
Sacramento city
San Francisco city
Tukwila city
Vallejo city
West Sacramento city
Winchester city
['Arlington', 'Asheville', 'Bainbridge', 'Island', 'Boulder', 'Cathedral', 'City', 'Clearwater', 'Dayton', 'Denton', 'Everett', 'Fort', 'Collins', 'Greensboro', 'Hayward', 'Kirkland', 'Las', 'Cruces', 'Lynnwood', 'Mercer', 'Island', 'Miami', 'Middleborough', 'New', 'Orleans', 'Oakland', 'Oklahoma', 'City', 'Olympia', 'Palo', 'Alto', 'Peoria', 'Pullman', 'Rancho', 'Cucamonga', 'Redmond', 'Renton', 'Sacramento', 'San', 'Francisco', 'Tukwila', 'Vallejo', 'West', 'Sacrame

#### Our raw dataset includes 110,138 PRRs from 34 different cities

NEED TO EDIT BEFORE FINAL

In [153]:
data_raw.to_csv('data_raw.csv')
data_raw.shape

In [91]:
len(data_raw.city.unique())

34

In [12]:
# create sequential numeric index for data

data_raw.index = pd.RangeIndex(len(data_raw.index))
data_raw.reset_index(inplace=True)

NameError: name 'data_raw' is not defined

### Create dataframe for cleaning by removing null summaries

In [51]:
# Drop observations that are null for the raw PRR content field ('Summary')
data = data_raw.dropna(subset=['Summary'])

We see the raw data below. As we can see, the text in the Summary field is very messy and will require a lot of cleaning to prepare the data for analysis!

In [22]:
data.head()

Unnamed: 0,index,Summary,city,month_year
0,0,"We are working with an engineering firm on an upcoming project. They have asked us to gather maps for this project. Would you be able to assist me in gathering maps/records (as builds) for any underground water facilities you may have? Something just showing the route of the water lines would do.\n\n207th ST NE to 92nd Ave NE, Arlington, Cascade Surveying & Engineering \n\nI have attached the scope for your convenience. Please let me know if you have questions.",Arlington,2018-06
1,1,"Need copies of contracts and all related documents pertaining to Topcub Aircraft property located at 17922 59th DR NE Arlington WA 98223 between Arlington Airport, Topcub Aircraft, City of Arlington, HCI Steel Buildings and PUD.",Arlington,2018-06
2,2,"Copies of Building Permits of $5,000 valuation and up ($20,000 min for Re-Roofs), ($50,000 min. for Cell Tower upgrades), (Electrical, Mechanical & Plumbing at $100,000 min.) and (Solar Panels, Swimming Pools & Foundations at any valuation)",Arlington,2018-06
3,3,"police report filed to an officer against Wayne Parris (DOB 08-03-1957) from Brittany J. Parris. The paperwork I have has a case number D18-39 it is also stamped at the bottom with 18-1294, Iím not sure which number you will need. If there is any other information needed please let me know.",Arlington,2018-06
4,4,"Email Communications between Stephanie Shook, Dave Kraski, Bruce Stedman and Chad Schmidt in regards to Fire Protection District 21 billing and passage of contract for ALS Services. \n\nAlso any copies of Agenda Bills, D21 Contract and materials presented for review in Nov/Dec time frame in regards to the contract.",Arlington,2018-06


#### Function to convert nltk part of speech tags to wordnet tags (we use this to stem the words in data cleaning below):

In [52]:
def get_wordnet_pos(tag):
    """Translate an NLTK part-of-speech tag into a WordNet POS constant.

    The decision is made on the first letter of ``tag``: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag falls back
    to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    # default for every tag that is not adjective/verb/noun/adverb
    return wordnet.NOUN

#### Function to turn separate files of the 1000 most popular baby names by year provided by the [Social Security Administration](https://www.ssa.gov/OACT/babynames/) into a single set of unique first names across years


In [None]:
def name_list(st, end, thresh=0, names_dir='../data/names'):
    """Collect the unique first names found in the yearly ``yob<year>.txt`` files.

    Each file is read as a headerless CSV with the columns
    ``name, sex, count``.

    Parameters
    ----------
    st, end : int
        First and last year to read (both inclusive).
    thresh : int, optional
        Minimum ``count`` a name needs within a year to be kept. The default
        of 0 keeps every listed name, so ``name_list(1950, 2017)`` works.
    names_dir : str, optional
        Directory containing the ``yob<year>.txt`` files.

    Returns
    -------
    list of str
        Unique names across all requested years, in no particular order.
    """
    names = set()
    for yr in range(st, end + 1):
        fp = '{}/yob{}.txt'.format(names_dir, yr)
        df = pd.read_csv(fp, sep=',', names=['name', 'sex', 'count'])
        names.update(df.loc[df['count'] >= thresh, 'name'])

    return list(names)

In [14]:
# create list of unique first names that were on the 1000 most popular names each year between 1950-2017

names = name_list(1950,2017)

NameError: name 'name_list' is not defined

#### Create list of common surnames in the United States. Data on surnames is from the U.S. Census Bureau, compiled by FiveThirtyEight and accessed via [data.world](https://data.world/fivethirtyeight/most-common-name/workspace/file?filename=README.md)

In [None]:
last_names = pd.read_csv('../data/names/surnames.csv')
last_names.name = last_names.name.str.title()
ln = list(last_names['name'])

In [None]:
# combine first names and surnames and create dictionary 

all_names = names + ln
all_names_dict = {key: 1 for key in all_names if key}

## Clean PRR data to prepare for LDA analysis

Prior to analysis, we clean our unstructured text data to improve the outcome of our LDA analysis results. Our goals are as follows:

1. Remove "noise" - words that do not provide information on the subject of a PRR (e.g. stop words like "the", proper nouns like people's names or city names, punctuation and digits, and general words/phrases common to PRRs like "good morning" or "record")
2. Stem words so like words are treated as the same (e.g. "photo" and "photos" should be regarded as the same word, as should "assault" and "assaulted")
3. Account for meaningful phrases where the combination of words has particular meaning (to avoid excessive computation time, we only consider two-word phrases)

In [None]:
# Work on an explicit copy so the column assignments below modify `data`
# itself and never a view of the frame it was derived from.
data = data.copy()

# remove common public record request phrases - we remove as phrases because we care about specific combination/order 
# of words (we want to remove "open record request" not all instances of word "open")
phrase_list = ['public record request', 'open record request', 'public records request', 'open records request', 
               'foia request', 'see attached', 'see attachment', 'to whom it may concern', 'public records act',
              'electronic copy', 'electronic copies', 'freedom of information act', 'good afternoon', 'good morning',
              'good day']

for phrase in phrase_list:
    phrase_re = re.compile(re.escape(phrase), re.IGNORECASE)
    data['Summary'] = data['Summary'].apply(lambda text: phrase_re.sub('', text))


# Replace common acronyms in Summary.
# All acronyms are replaced in ONE pass and only when not embedded in a longer
# run of letters. Replacing them one after another corrupted the longer ones
# (e.g. 'CPD' fired inside 'CCPD', leaving 'Cpolice department') and let short
# ones hit the middle of upper-case words (e.g. 'LOP' inside 'DEVELOPMENT').
police_acronyms = ['NOPD', 'OPD', 'SFPD', 'CPD', 'APD', 'GPD', 'KPD', 'TPD', 'DPD', 'EPD', 'HPD', 'LPD',
                   'MDPD', 'PPD', 'SPD', 'VPD', 'CCPD', 'FCPD', 'LCPD', 'OKCPD', 'PAPD', 'RCPD', 'WSPD']
acronym_map = {acronym: 'police department' for acronym in police_acronyms}
acronym_map.update({
    # PDRD = portable digital recording device (body cam) worn by police
    'PDRD': 'police body camera',
    'CPS': 'child protective services',
    #https://www.waterboards.ca.gov/ust/contacts/docs/lop_guide.pdf - water resources local oversight program
    'LOP': 'water',
    #https://www.sfdph.org/dph/EH/HMUPA/HMUPAFormsMenu.asp - hazardous materials
    'HMUPA': 'hazardous materials',
})
acronym_re = re.compile(
    r'(?<![A-Za-z])(?:{})(?![A-Za-z])'.format(
        '|'.join(sorted((re.escape(a) for a in acronym_map), key=len, reverse=True))
    )
)
data['Summary'] = data['Summary'].apply(
    lambda text: acronym_re.sub(lambda match: acronym_map[match.group(0)], text)
)

# Replace key numbers with strings (must happen before digits are stripped)
data.Summary = data.Summary.str.replace(' 911 ', ' nineoneone ')
data.Summary = data.Summary.str.replace(' 311 ', ' threeoneone ')
data.Summary = data.Summary.str.replace(' 9-11 ', ' nineoneone ')
data.Summary = data.Summary.str.replace(' 3-11 ', ' threeoneone ')

# Remove digits
dig_translator = str.maketrans('','', string.digits)
data.Summary = data.Summary.str.translate(dig_translator)

# because "will" is in the NLTK list of stopwords below, we treat 'final will' separately
# (the substitution must use this pattern; it previously reused the last
# phrase pattern from the loop above, so 'final will' was never rewritten)
# NOTE(review): the punctuation strip below also removes '_', so the marker
# ends up as the single token 'finalwill' - confirm that is acceptable.
final_will_re = re.compile(re.escape('final will'), re.IGNORECASE)
data['Summary'] = data['Summary'].apply(lambda text: final_will_re.sub('final final_will', text))

# replace hyphen and slash with space to treat hyphenated words as two separate words
hyphen_translator = str.maketrans('-/','  ')
data.Summary = data.Summary.str.translate(hyphen_translator)

# remove all punctuation
translator = str.maketrans('','', string.punctuation)
data.Summary = data.Summary.str.translate(translator)

## Use SpaCy tokenizer to ID Proper Nouns ##

#nlp = spacy.load('en_core_web_sm')
#data['token_sp'] = data['Summary'].apply(lambda x: nlp(x))
#data['pn'] = data['token_sp'].apply(lambda x: [ i.lemma_ for i in x if i.tag_ == 'NNP'])

# tokenize each cleaned summary into a list of words
data['token'] = data['Summary'].apply(lambda text: nltk.word_tokenize(text))

# drop tokens found in the first/last name dictionary (case-sensitive lookup),
# then lower-case whatever remains
data['token'] = data['token'].apply(
    lambda words: [word.lower() for word in words if word not in all_names_dict]
)

# POS-tag the tokens, then discard empty strings and stopwords and lemmatize
# the rest using the WordNet POS derived from each tag
stop_words = set(stopwords.words('english'))
lmtzr = WordNetLemmatizer()
data['lemma'] = data['token'].apply(lambda words: nltk.pos_tag(words))
data['mash'] = data['lemma'].apply(
    lambda tagged: [
        lmtzr.lemmatize(word, get_wordnet_pos(pos))
        for word, pos in tagged
        if word and word not in stop_words
    ]
)

# strip any whitespace characters left inside individual tokens
wsp_translator = str.maketrans('','', string.whitespace)
data['mash'] = data['mash'].apply(lambda words: [word.translate(wsp_translator) for word in words])

# record the token count per request and keep only non-empty requests
data['mash_len'] = data['mash'].apply(len)
data = data.loc[data['mash_len'] > 0]


In [None]:
# Token groups to strip from every request. The individual lists are kept
# under their own names; they are merged into one lookup set further down.

# number suffixes (and two street-type abbreviations)
suffix_list = ['th', 'nd', 'st', 'rd', 'blvd', 'pkwy']

# city and state abbreviations
abbv_list = ['wa', 'nc', 'co', 'ca', 'oh', 'tx', 'nm', 'fl', 'ma', 'la', 'ok', 'az', 'ri', 'va', 
             'francisco', 'sf', 'okc', 'lv', 'nola', 'slc', 'cw']

# spelled-out numbers
num_list = ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten']

# noise words
noise = ['dr', 'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sept', 'sep', 'oct', 'nov', 'dec', 
        'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 
        'december', 'ne', 'nw', 'se', 'sw', 'ct', 'dr', 'way', 'dv', 'ave', 'aka', 'get', 'look', 'im', 'want', 
        'find', 'could', 'go', 'take', 'e', 'n', 's', 'w', '“', '’', '”', '•', 'northeast', 'northwest', 'southeast', 
        'southwest', 'north', 'south', 'east', 'west', 'orleans', '–', 'a', 'b', 'c', 'd', 'f', 'g', 'h', 'i', 'j', 'k',
        'l', 'm', 'o', 'p', 'q', 'r', 't', 'u', 'v', 'x', 'y', 'z', 'am', 'pm', 'hr', 'mr', 'ms', 'mrs', 'johnson', 
        'jr', 'kent', 'christopher', 'miller', 'joe', 'willows', 'david', 'michael', 'john', 'red', 'robert',
        'ask', 'able', 'let', 'question', 'also', 'snohomish', '¬ß', 'per', 'available', 'test', '√Ø', 'andor', '·', 'etc',
        'ï', 'ce', 'eg', 'sammamish']

# city names (split into their individual words)
city_list = ['arlington', 'asheville', 'bainbridge', 'island', 'boulder', 'cathedral' ,'clearwater', 'dayton', 
            'denton', 'everett', 'fort', 'collins', 'greensboro', 'hayward', 'kirkland', 'las', 'cruces', 'lynnwood',
            'mercer', 'miami', 'middleborough', 'new', 'orleans', 'oakland', 'oklahoma', 'olympia', 'palo', 'alto', 
            'peoria', 'pullman', 'rancho', 'cucamonga', 'redmond', 'renton', 'sacramento', 'san', 'francisco', 
            'tukwila', 'vallejo', 'west', 'sacramento', 'winchester']

# state names (split into their individual words)
state_list = ['washington', 'carolina', 'colorado', 'california',
             'ohio', 'texas', 'florida', 'new', 'mexico','massachusetts',
             'louisiana', 'oklahoma', 'arizona', 'rhode', 'virginia']

# Everything above is dropped; 'inc' and 'pd' are expanded instead.
# Neither expansion key nor expansion value appears in any removal list, so
# one combined pass gives the same result as filtering list by list.
tokens_to_drop = set().union(suffix_list, abbv_list, num_list, noise, city_list, state_list)
expansions = {'inc': 'incident', 'pd': 'police department'}

data['mash'] = data['mash'].apply(
    lambda words: [expansions.get(word, word) for word in words if word not in tokens_to_drop]
)

# Create two-word phrases (bigrams) by joining each adjacent token pair with "_"
data['bigrams'] = data['mash'].apply(lambda words: ["_".join(pair) for pair in ngrams(words, 2)])

#### Identify and remove noise words that are commonly used in PRRs

In [None]:
word_list = [y for x in list(data['mash']) for y in x]
counts = Counter(word_list)
Counter(word_list).most_common(50)

In [None]:
common_list = ['report', 'request', 'record', 'city', 'please', 'copy', 'date', 'information', 'would', 'regard', 'public',
              'include', 'document', 'provide', 'like', 'thank', 'need', 'know', 'thanks', 'pursuant', 'dear', 'file',
              'relate', 'from', 'either', 'hello', 'hi', 'foia', 'requestors', 'requestor', 'receive', 'available', 
               'make', 'attach', 'pertain', 'might', 'see', 'near']

# remove general words that are common to public record requests
data['mash'] = data['mash'].apply(lambda x: [i for i in x if i not in common_list])

#### Identify meaningful phrases by looking at the list of two-word sequences (bigrams) that are frequently used in public record requests. The meaningful phrases that we identify will be added to the list of words to consider in analysis for the PRRs in which they appear.

In [None]:
bigram_list = [y for x in list(data['bigrams']) for y in x]
counts = Counter(bigram_list)
Counter(bigram_list).most_common(100)

In [15]:

common_bigrams = ['police_report', 'insurance_company', 'location_loss', 'date_occurrence', 'reportcase_number',
                  'insure_driver', 'auto_accident', 'occurrence_location', 'transactionreference_insurance', 'number_date', 'type_auto',
                  'accident_reportcase', 'code_violation', 'copy_police', 'incident_report', 'police_department', 'certificate_occupancy',
                  'accident_report', 'property_locate', 'storage_tank','driver_note', 'building_permit', 'driver_driver','case_number', 
                  'hazardous_material', 'collision_report', 'state_farm', 'site_plan', 'fire_department', 'ftp_report', 'auto_theft',
                  'fire_code', 'request_police', 'farm_claim', 'claim_compass', 'site_assessment', 'compass_report', 'environmental_site', 
                  'tax_sale', 'loss_cross','city_council', 'code_enforcement', 'subject_property', 'report_case', 'phase_environmental', 
                  'report_incident', 'date_loss', 'police_case', 'witness_statement', 'driving_record', 'break_in', 'birth_certificate', 
                  'death_certificate', 'background_check', 'public_works', 'lease_agreement', 'medical_record', 'billing_record', 
                  'record_check', 'records_check', 'marriage_certificate', 'marriage_record', 'park_ticket', 'miss_person',
                 'marriage_license', 'reckless_driving', 'arrest_report', 'medical_billing', 'medical_report', 'criminal_record',
                 'floor_plan', 'site_plan', 'building_plan', 'building_code', 'code_enforcement', 'personnel_file']

data['common_bigrams'] = data['bigrams'].apply(lambda x: [i for i in x if i in common_bigrams])

NameError: name 'data' is not defined

#### Combine columns containing cleaned words (mash) and meaningful phrases (common_bigrams) to yield final set of words for analysis for each PRR

In [None]:
data['final_mash'] = data['mash'] + data['common_bigrams']

# Remove empty lists
data['mash_len'] = data['final_mash'].apply(lambda x: len(x))
data = data[data['mash_len'] > 0]

#### We can see the result of the final cleaned data below. 
The final_mash column represents the set of words that will be considered in our analysis.

In [None]:
data.head()

In [None]:
data['mash_len'].describe()

#### We can see that there is significant variation in the average request length per city. 
In some cases, cities with short average length represent cities where the provided data represented a summary of the original request (Oklahoma City) though in other cases, like Dayton, we received the raw data and the average length is still considerably shorter than other cities. 

In [None]:
# Average number of cleaned tokens per request for each city.
# Select the numeric column before aggregating: averaging the whole frame
# also tries the text and list columns, which raises TypeError on current
# pandas (non-numeric columns are no longer silently dropped).
data_gp = data.groupby('city')[['mash_len']].mean()
data_gp['mash_len']

#### We can see a couple of examples of the cleaned mash and the original request:

In [None]:
data['Summary'][data.index == 164]

In [None]:
data['final_mash'][data.index == 164]

In [None]:
data['final_mash'][data.index == 60000]

In [None]:
data["Summary"][60000]

#### Save the cleaned data to csv to use for testing different LDA models

In [None]:
data.to_csv('data.csv', index=False)

Next, we tested a number of different parameters for the LDA models to identify the optimal model. Because these models are very computationally intensive and take a long time to run, we have included the tests and final model in a [separate notebook](https://github.com/sunlightpolicy/Sunlight_FOIA/blob/master/src/analysis/LDA_Model_Tests.ipynb). Below, we conduct the analysis on our final model.

# LDA Analysis

In [None]:
# Test different length restrictions
def fit_lda_min_length(frame, min_len, tag, num_topics=100, passes=60, random_state=7):
    """Fit, save and export an LDA model on the requests longer than ``min_len``.

    Keeps the rows of ``frame`` whose ``mash_len`` is greater than ``min_len``,
    builds a gensim dictionary and bag-of-words corpus from their
    ``final_mash`` token lists, trains an LDA model, saves the model as
    ``lda_<num_topics>_<passes>_model_<tag>`` and writes the rows plus a
    ``topic_comp`` column (per-document topic composition) to
    ``topics/lda_<num_topics>_<passes>_topics_<tag>.csv``.

    Returns
    -------
    tuple
        ``(subset, corpus, model)`` for further inspection.
    """
    subset = frame[frame['mash_len'] > min_len]
    texts = list(subset['final_mash'])
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    model = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word=dictionary,
                                            passes=passes, random_state=random_state)
    model.save("lda_{}_{}_model_{}".format(num_topics, passes, tag))

    # model[corpus] is lazy; materialise it so it can be stored as a column
    topics = subset.copy().assign(topic_comp=list(model[corpus]))
    topics.to_csv("topics/lda_{}_{}_topics_{}.csv".format(num_topics, passes, tag))
    return subset, corpus, model


# 1) as-is (all requests, 60 topics) - disabled; this run built the dictionary
#    and corpus directly from data['final_mash'] without a length filter and
#    saved "lda_60_60_model_all" / "topics/lda_60_60_topics_all.csv".

# 2) mash len > 2
data_ml2, corpus_ml2, lda_100_60_ml2 = fit_lda_min_length(data, 2, 'ml2')

# 3) mash len > 3
data_ml3, corpus_ml3, lda_100_60_ml3 = fit_lda_min_length(data, 3, 'ml3')

In [None]:
data_ml = data[data['mash_len'] > 2]

In [None]:
# create dictionary and corpus
texts = list(data['final_mash'])
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

In [None]:
# 30 topics and 30 passes
lda_30_45 = gensim.models.ldamodel.LdaModel(corpus, num_topics=30, id2word = dictionary, 
                                         passes = 30, random_state=7)

In [None]:
# show topics for model
lda_30_45.show_topics(num_topics=30, formatted=False)

In [None]:
# show topics for model
lda_50_45_2.show_topics(num_topics=50, formatted=False)

In [None]:
# save good model
lda_40_45.save('lda_40_45_model')

In [None]:
lda_40_45 = gensim.models.ldamodel.LdaModel.load('lda_40_45_model')

In [None]:
corpus_lda = lda_40_45[corpus] #this is just a wrapper; calculates on the fly when you call it



In [None]:
corpus_lda_list = list(corpus_lda)

In [None]:
for index, score in sorted(lda_30_45[corpus[600]], key=lambda tup: -1*tup[1]): #600th document
    print("Score: {}\t Topic: {} \n".format(score, lda_30_45.print_topic(index, 15))) #15 word topics

In [None]:
data.mash2[600]

In [None]:
topics = data.copy()
topics = topics.assign(topic_comp = corpus_lda_list)
topics.head() #the topic_comp column are actual Python lists

In [None]:
def highest_topic(fp):
    """Load a topics CSV and attach each document's dominant topic.

    The CSV must have a ``topic_comp`` column holding the string form of a
    list of ``(topic_id, score)`` tuples. Rows whose list is empty are
    dropped.

    Parameters
    ----------
    fp : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The loaded rows with ``topic_comp`` parsed into Python lists and
        three extra columns: ``comp_len`` (number of topics in the list),
        ``top_topic`` (id of the highest-scoring topic) and
        ``top_topic_comp`` (that topic's score).
    """
    df = pd.read_csv(fp)
    # literal_eval only parses Python literals, so the cell text is not executed
    df['topic_comp'] = df['topic_comp'].apply(ast.literal_eval)
    df['comp_len'] = df['topic_comp'].apply(len)
    # copy so the new columns are added to an independent frame, not a slice
    df = df[df['comp_len'] > 0].copy()

    # find the winning (topic_id, score) pair once and reuse it for both columns
    best = df['topic_comp'].apply(lambda comp: max(comp, key=lambda item: item[1]))
    df['top_topic'] = best.apply(lambda pair: pair[0])
    df['top_topic_comp'] = best.apply(lambda pair: pair[1])

    return df
    
    

In [None]:
def topics_to_csv(df, num_topics, out_dir='topics'):
    """Write one CSV per topic containing the rows whose ``top_topic`` is that topic.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``top_topic`` column (as produced by ``highest_topic``).
    num_topics : int
        Number of topics; files are written for topic ids ``0 .. num_topics-1``,
        including ids with no matching rows (header-only files).
    out_dir : str, optional
        Output directory; created if missing. Defaults to ``'topics'``.

    Files are named ``<num_topics>_PRR_topic_<topic>.csv``.
    """
    import os  # local import: the notebook's import cell does not load os

    os.makedirs(out_dir, exist_ok=True)
    for topic in range(num_topics):
        subset = df[df['top_topic'] == topic]
        file_name = os.path.join(out_dir, '{}_PRR_topic_{}.csv'.format(num_topics, topic))
        subset.to_csv(file_name)

In [None]:
def process_csv(model_list):
    """Split the saved topic assignments of several models into per-topic CSVs.

    For every topic count ``n`` in ``model_list`` this reads
    ``topics/lda_<n>_45_topics.csv`` through ``highest_topic`` (which adds the
    ``top_topic`` column) and passes the result to ``topics_to_csv``.

    Parameters
    ----------
    model_list : iterable of int
        Topic counts identifying the models to process.
    """
    for model in model_list:
        fp = 'topics/lda_{}_45_topics.csv'.format(model)
        # highest_topic takes only the path and returns the frame with the
        # 'top_topic' column that topics_to_csv filters on
        df = highest_topic(fp)
        topics_to_csv(df, model)
    

In [None]:
fp = 'topics/lda_data_c2000_3.csv'
# highest_topic returns the frame with 'top_topic'; topics_to_csv needs that
# frame, not the file path
df = highest_topic(fp)
topics_to_csv(df, 60)

In [None]:
fp2 = 'topics/lda_data10.csv'
# highest_topic returns the frame with 'top_topic'; topics_to_csv needs that
# frame, not the file path
df = highest_topic(fp2)
topics_to_csv(df, 60)

In [None]:
df.head()

In [None]:
d = highest_topic('topics/lda_data_avg.csv')

In [None]:
d.head()

In [None]:
df = df[df['comp_len'] > 0]

In [None]:
df_blank = df[df['comp_len'] == 0 ]

In [None]:
df_blank

#### We test a variety of different numbers of topics to identify the number of topics that yields the best results:

In [None]:
num_topics_list = [20, 30, 40, 50, 60]
for n in num_topics_list:
    lda = gensim.models.ldamodel.LdaModel(corpus, num_topics=n, id2word = dictionary, 
                                         passes = 60, random_state=7)
    
    model_name = "lda_{}_45_model".format(n)
    lda.save(model_name)
    corpus_lda = lda[corpus]
    corpus_lda_list = list(corpus_lda)
    topics = data.copy()
    topics = topics.assign(topic_comp = corpus_lda_list)
    file_name = "topics/lda_{}_45_topics.csv".format(n)
    topics.to_csv(file_name)
    

In [None]:
data = pd.read_csv('topics/lda_60_45_topics.csv')

In [None]:
data.groupby('top_topic').size()

In [None]:
data.groupby('city').size()

In [None]:
process_csv(num_topics_list)

In [None]:
data['topic_comp'][500]

In [None]:
# create small dataset for testing functions

In [None]:
data_sm = data[:50]

In [None]:
highest_topic(data)

In [None]:
data.head()

In [None]:
data.shape

In [None]:
data2 = data[~data['top_topic'].isna()]

In [None]:
data2.shape

In [None]:
data['top_topic'].hist(bins=50)
plt.title('Count by Topic')
plt.show()

In [None]:
lda_50_45 = gensim.models.ldamodel.LdaModel.load('lda_50_45_model')

In [None]:
topics_to_csv(data, 50)

## Evaluate Categories

In [None]:
def eval_cat(df, num_topics, city,
             data_dir='/Users/alenastern/Google Drive File Stream/My Drive/Alena_Project/PR_Data'):
    """Plot how a city's requests are distributed over departments, overall and per topic.

    Reads ``<data_dir>/<city>.csv`` (which must have ``Dept`` and ``Summary``
    columns), draws a bar chart of request counts per department, then joins
    the city's rows of ``df`` to that file on ``Summary`` and draws one
    department bar chart per topic.

    Parameters
    ----------
    df : pandas.DataFrame
        Topic assignments with at least ``city``, ``Summary`` and
        ``top_topic`` columns.
    num_topics : int
        Number of topics; charts are drawn for topic ids ``0 .. num_topics-1``.
    city : str
        City to evaluate; used for the ``city`` filter and as the file name.
    data_dir : str, optional
        Directory holding the raw per-city CSV files.
    """
    df_raw = pd.read_csv('{}/{}.csv'.format(data_dir, city))

    df_raw.groupby('Dept').count()['Summary'].plot(kind='bar')
    plt.title('PRR Departments for {}'.format(city))
    plt.show()

    df_city = df[df['city'] == city]
    # left join: keeps every categorised request even if no raw row matches
    df_merge = df_city.merge(df_raw, how='left', on='Summary')
    df_merge['top_topic'] = pd.to_numeric(df_merge['top_topic'])
    for topic in range(num_topics):
        dept_counts = df_merge[df_merge['top_topic'] == topic].groupby('Dept').count()['Summary']
        if dept_counts.empty:
            # no request of this city has this top topic (or none matched a
            # department); there is nothing to draw
            continue
        dept_counts.plot(kind='bar')
        plt.title('Departments for topic {} in {}'.format(topic, city))
        plt.show()

In [None]:
city_list = ['Bainbridge', 'LasCruces', 'FortCollins', 'Mercer', 'Nola', 'Oakland', 'PaloAlto', 'Redmond', 
             'SanFrancisco', 'Vallejo']
fp = 'topics/lda_20_45_topics.csv'
df = pd.read_csv(fp)

In [None]:
eval_cat(df, 20, 'PaloAlto')

In [None]:
final_model = gensim.models.ldamodel.LdaModel.load('lda_data_avg_2')
final_fp = 'topics/lda_data_avg_2.csv'
final_model.show_topics(num_topics=60, formatted=False)


### Load Best Model and Data

In [None]:
final_model = gensim.models.ldamodel.LdaModel.load('lda_data_c2000_3_2')
final_fp = 'topics/lda_data_c2000_3_2.csv'

In [None]:
final_model.show_topics(num_topics=60, formatted=False)

## Identify Most Popular Categories

### "Winner take all" popularity metric:
* Scoring Rules:
    * Only the topic that composes the largest share of a document scores "points" for its "Adjusted Popularity" total.
    * If a topic composes the largest share of that document, its "points" are its composition score.


In [None]:
# categorize data by final model and identify each request's highest topic
final_df = highest_topic(final_fp)
final_df = final_df[['top_topic', 'top_topic_comp']]

# total composition score per winning topic
topic_gp = final_df.groupby('top_topic').sum()
topic_gp = topic_gp.reset_index()
topic_gp = topic_gp.rename(index=str, columns={"top_topic_comp": "total_pop"})
topic_gp = topic_gp[['top_topic', 'total_pop']]

# add in topic words
# Look the words up by topic id instead of assigning the list positionally:
# a topic that never wins has no row, so a positional assignment would pair
# topics with the wrong words or fail on the length mismatch.
words_by_topic = dict(final_model.show_topics(num_topics=60, formatted=False)) # update w/ winning model
topic_gp['topic'] = topic_gp['top_topic'].map(words_by_topic)

# one column per (word, weight) pair; assumes 10 words per topic, which is
# what show_topics returned when this was written - TODO confirm if num_words changes
topic_gp[["topic1", "topic2", "topic3", "topic4", 
       "topic5", "topic6", "topic7", "topic8", "topic9", "topic10"]] = topic_gp.topic.apply(pd.Series)

topic_gp = topic_gp.sort_values(by='total_pop', ascending=False)
topic_gp

## "Winner Take All with Thresholds" Rules:

Scoring Rules:
* Same as "Winner Take All", except a winning topic must compose at least a certain threshold of a document to get any points.
* We'll try 0.2 (low) and 0.5 (high) thresholds.

In [None]:
def winner_thresh(fp, thresh, model=None, num_topics=60):
    """Winner-take-all topic popularity with a minimum composition threshold.

    Each document scores only for its top topic, and only when that topic's
    composition is at least ``thresh``.

    Parameters
    ----------
    fp : str
        Path handed to ``highest_topic`` (defined elsewhere in this notebook),
        which must return a frame with ``top_topic`` and ``top_topic_comp`` columns.
    thresh : float
        Minimum composition (inclusive) the winning topic needs to score.
    model : optional
        Object exposing ``show_topics(num_topics=..., formatted=False)``;
        defaults to the notebook-level ``final_model``.
    num_topics : int, optional
        Number of topics requested from ``show_topics`` (default 60).

    Returns
    -------
    pandas.DataFrame
        Columns ``top_topic``, ``total_pop``, ``topic`` (word list) and
        ``topic1`` .. ``topic10``, sorted by ``total_pop`` descending.
    """
    if model is None:
        model = final_model  # notebook-level model loaded in an earlier cell

    final_df = highest_topic(fp)
    final_df = final_df[['top_topic', 'top_topic_comp']]
    final_df = final_df[final_df['top_topic_comp'] >= thresh]

    # groupby returns the topics in sorted order, so no extra sort is needed
    topic_gp = final_df.groupby('top_topic').sum()
    topic_gp.reset_index(inplace = True)
    topic_gp.rename(index=str, columns={"top_topic_comp": "total_pop"}, inplace = True)
    topic_gp = topic_gp[['top_topic', 'total_pop']].copy()

    # add in topic words, matched on topic id: after thresholding some topics
    # have no row, so assigning the raw list of word lists positionally would fail
    # with a length mismatch.
    # NOTE(review): assumes top_topic holds the model's integer topic ids -- confirm.
    words_by_topic = dict(model.show_topics(num_topics=num_topics, formatted=False))
    topic_gp['topic'] = [words_by_topic.get(topic_id) for topic_id in topic_gp['top_topic']]

    # one column per (word, weight) pair; None when the topic has no such word
    word_cols = ["topic1", "topic2", "topic3", "topic4",
                 "topic5", "topic6", "topic7", "topic8", "topic9", "topic10"]
    for word_pos, word_col in enumerate(word_cols):
        topic_gp[word_col] = topic_gp['topic'].apply(
            lambda words, word_pos=word_pos: words[word_pos]
            if isinstance(words, (list, tuple)) and word_pos < len(words) else None)

    topic_gp = topic_gp.sort_values(by='total_pop', ascending=False)
    return topic_gp

In [None]:
# Winner-take-all tables at the low (0.2) and high (0.5) thresholds
topic2 = winner_thresh(final_fp, 0.2)
topic5 = winner_thresh(final_fp, 0.5)

In [None]:
# display the 0.5-threshold table
topic5

In [None]:
# display the 0.2-threshold table
topic2

## Partial Credit Approach

* All topics assigned to a given PRR get credit for that PRR's topic composition score, provided the score is above the established threshold

In [None]:
def prop_calc(fp, thresh, model=None, num_topics=60):
    """Partial-credit topic popularity.

    Every (topic, composition) pair of every document adds its composition to
    that topic's total, provided the composition is at least ``thresh``.

    Parameters
    ----------
    fp : str
        CSV path; the ``topic_comp`` column holds the string form of a list of
        ``(topic_id, composition)`` tuples.
    thresh : float
        Minimum composition (inclusive) for a pair to count.
    model : optional
        Object exposing ``show_topics(num_topics=..., formatted=False)``;
        defaults to the notebook-level ``final_model``.
    num_topics : int, optional
        Number of topics requested from ``show_topics`` (default 60).

    Returns
    -------
    pandas.DataFrame
        Columns ``topic``, ``total_score``, ``topic_words`` and
        ``topic1`` .. ``topic10``, sorted by ``total_score`` descending.
        Empty (but with the same columns) when nothing reaches the threshold.
    """
    if model is None:
        model = final_model  # notebook-level model loaded in an earlier cell

    df = pd.read_csv(fp)
    df['topic_comp'] = df['topic_comp'].apply(ast.literal_eval)

    # Accumulate each qualifying score exactly once. The previous version inserted
    # a new topic and then immediately incremented it, counting the first
    # qualifying score of every topic twice.
    results_dict = {}
    for tup_list in df['topic_comp']:
        for tup in tup_list:
            if tup[1] >= thresh:
                results_dict[tup[0]] = results_dict.get(tup[0], 0) + tup[1]

    topic_ids = sorted(results_dict)
    pd_df = pd.DataFrame({'topic': topic_ids,
                          'total_score': [results_dict[t] for t in topic_ids]},
                         columns=['topic', 'total_score'])
    pd_df = pd_df.rename(index=str)  # keep the string row labels callers already get
    pd_df['topic'] = pd.to_numeric(pd_df['topic'])

    # add in topic words, matched on topic id so topics absent from the results
    # cannot shift the labels or trigger a length-mismatch error
    words_by_topic = dict(model.show_topics(num_topics=num_topics, formatted=False))
    pd_df['topic_words'] = [words_by_topic.get(t) for t in pd_df['topic']]

    # one column per (word, weight) pair; None when the topic has no such word
    word_cols = ["topic1", "topic2", "topic3", "topic4",
                 "topic5", "topic6", "topic7", "topic8", "topic9", "topic10"]
    for word_pos, word_col in enumerate(word_cols):
        pd_df[word_col] = pd_df['topic_words'].apply(
            lambda words, word_pos=word_pos: words[word_pos]
            if isinstance(words, (list, tuple)) and word_pos < len(words) else None)

    pd_df = pd_df.sort_values(by='total_score', ascending=False)

    return pd_df

In [None]:
# Partial-credit scores at the low (0.2) threshold
pc2 = prop_calc(final_fp, .2)

In [None]:
# prop_calc returns rows sorted by total_score descending, so the first ten
# rows are the ten highest-scoring topic ids
top_10 = list(pc2.topic[:10])

In [None]:
# display the topic ids collected above
top_10

In [None]:
# Partial-credit scores at the high (0.5) threshold
prop_calc(final_fp, .5)

## Normalize Metrics within a City/County (Dampened Popularity):
* For each city/county, we add up the total score for each topic and then take the log of that total. We then add up the logged scores across all cities/counties.
* For winner-take-all, only the score for the top topic is included (provided it is above the threshold).
* For partial-credit, scores for all topics are included (provided they are above the threshold).
* This is an extra control to keep cities with a large number of PRRs from skewing our results.


In [None]:
# The 34 city identifiers compared against the `city` column of the topic CSVs.
# Replaces the shorter city_list defined earlier in the notebook.
city_list = ['Arlington', 'Asheville', 'Bainbridge', 'Boulder', 'CathedralCity' ,'Clearwater', 'Dayton', 
            'Denton', 'Everett', 'FortCollins', 'Greensboro', 'Hayward', 'Kirkland', 'LasCruces', 'Lynnwood',
            'Mercer', 'Miami', 'Middleborough', 'Nola', 'Oakland', 'OKC', 'Olympia', 'PaloAlto', 
            'Peoria', 'Pullman', 'RanchoCucamonga', 'Redmond', 'Renton', 'Sacramento', 'SanFrancisco', 
            'Tukwila', 'Vallejo', 'WestSacramento', 'Winchester']

In [None]:
def norm_pop(fp, city_list, thresh, winner_take_all, model=None, num_topics=60):
    """Dampened (per-city log-normalised) topic popularity.

    For every city, topic scores are summed, the natural log of each topic's
    total is taken, and the logged totals are then added up across cities.

    Parameters
    ----------
    fp : str
        CSV path with a ``city`` column and a ``topic_comp`` column holding the
        string form of a list of ``(topic_id, composition)`` tuples.
    city_list : iterable of str
        Values of ``city`` to include.
    thresh : float
        Minimum composition (inclusive) for a score to count. Both modes now
        use ``>=``; winner-take-all previously used a strict ``>``.
    winner_take_all : bool
        True: only each document's highest-composition topic can score.
        False: every topic of the document can score (partial credit).
    model : optional
        Object exposing ``show_topics(num_topics=..., formatted=False)``;
        defaults to the notebook-level ``final_model``.
    num_topics : int, optional
        Number of topics requested from ``show_topics`` (default 60).

    Returns
    -------
    pandas.DataFrame
        Columns ``topic``, ``total_score``, ``topic_words`` and
        ``topic1`` .. ``topic10``, sorted by ``total_score`` descending.

    Notes
    -----
    NOTE(review): a city total below 1 has a negative log, so it lowers the
    topic's overall score. Left as is because it matches the stated metric.
    """
    if model is None:
        model = final_model  # notebook-level model loaded in an earlier cell

    df = pd.read_csv(fp)
    df['topic_comp'] = df['topic_comp'].apply(ast.literal_eval)

    popularity_dict = {}

    for city in city_list:
        # total qualifying score per topic within this city
        city_totals = {}

        for tup_list in df.loc[df.city == city, 'topic_comp']:
            if not tup_list:
                continue  # document without topic assignments: nothing to score

            if winner_take_all:
                # only the tuple with the highest composition can score
                candidates = [max(tup_list, key=lambda item: item[1])]
            else:
                candidates = tup_list

            # Add each qualifying score exactly once. The previous version inserted
            # a new topic and then incremented it, double-counting its first score.
            for tup in candidates:
                if tup[1] >= thresh:
                    city_totals[tup[0]] = city_totals.get(tup[0], 0) + tup[1]

        # dampen the city's influence: add the log of each topic total
        for topic_id, total in city_totals.items():
            if total > 0:  # log is undefined at 0 (only reachable when thresh <= 0)
                popularity_dict[topic_id] = popularity_dict.get(topic_id, 0) + np.log(total)

    topic_ids = sorted(popularity_dict)
    pd_df = pd.DataFrame({'topic': topic_ids,
                          'total_score': [popularity_dict[t] for t in topic_ids]},
                         columns=['topic', 'total_score'])
    pd_df = pd_df.rename(index=str)  # keep the string row labels callers already get
    pd_df['topic'] = pd.to_numeric(pd_df['topic'])

    # add in topic words, matched on topic id so topics absent from the results
    # cannot shift the labels or trigger a length-mismatch error
    words_by_topic = dict(model.show_topics(num_topics=num_topics, formatted=False))
    pd_df['topic_words'] = [words_by_topic.get(t) for t in pd_df['topic']]

    # one column per (word, weight) pair; None when the topic has no such word
    word_cols = ["topic1", "topic2", "topic3", "topic4",
                 "topic5", "topic6", "topic7", "topic8", "topic9", "topic10"]
    for word_pos, word_col in enumerate(word_cols):
        pd_df[word_col] = pd_df['topic_words'].apply(
            lambda words, word_pos=word_pos: words[word_pos]
            if isinstance(words, (list, tuple)) and word_pos < len(words) else None)

    pd_df = pd_df.sort_values(by='total_score', ascending=False)

    return pd_df

In [None]:
# Dampened popularity, winner-take-all, high (0.5) threshold
norm_pop(final_fp, city_list, .5, True)

In [None]:
# Dampened popularity, winner-take-all, low (0.2) threshold
norm_pop(final_fp, city_list, .2, True)

In [None]:
# Dampened popularity, partial credit, high (0.5) threshold.
# Uses final_fp so the compositions come from the same model whose topic words
# norm_pop attaches; the previous hard-coded 'topics/lda_data_avg.csv' belongs
# to a different model than the final_model loaded above.
norm_pop(final_fp, city_list, .5, False)

In [None]:
# Dampened popularity, partial credit, low (0.2) threshold.
# Uses final_fp so the compositions come from the same model whose topic words
# norm_pop attaches; the previous hard-coded 'topics/lda_data_avg.csv' belongs
# to a different model than the final_model loaded above.
norm_pop(final_fp, city_list, .2, False)

## Topic Popularity by City

In [None]:
def pop_by_city(fp, city_list, num_topics, thresh, winner_take_all):
    """Per-city topic popularity and each topic's share of the city's total.

    Parameters
    ----------
    fp : str
        CSV path with a ``city`` column and a ``topic_comp`` column holding the
        string form of a list of ``(topic_id, composition)`` tuples.
    city_list : iterable of str
        Values of ``city`` to report on; a city without rows gets all-zero totals.
    num_topics : int
        Topic ids ``0 .. num_topics - 1`` are always reported, even with score 0.
    thresh : float
        Minimum composition (inclusive) for a score to count. Both modes now
        use ``>=``; winner-take-all previously used a strict ``>``.
    winner_take_all : bool
        True: only each document's highest-composition topic can score.
        False: every topic of the document can score (partial credit).

    Returns
    -------
    pandas.DataFrame
        Columns ``city``, ``topic``, ``total_pop``, ``pct`` with one row per
        city and topic. ``pct`` is the topic's percentage of the city's total
        and is NaN when that total is zero.
    """
    df = pd.read_csv(fp)
    df['topic_comp'] = df['topic_comp'].apply(ast.literal_eval)

    cols = ['city', 'topic', 'total_pop', 'pct']
    city_frames = []  # concatenated once at the end rather than on every iteration

    for city in city_list:
        # start every topic at 0 so topics that never score still get a row
        results_dict = {i: 0 for i in range(num_topics)}

        for tup_list in df.loc[df.city == city, 'topic_comp']:
            if not tup_list:
                continue  # document without topic assignments: nothing to score

            if winner_take_all:
                # only the tuple with the highest composition can score
                candidates = [max(tup_list, key=lambda item: item[1])]
            else:
                candidates = tup_list

            # .get also covers topic ids outside range(num_topics), which were
            # previously inserted and then incremented (counted twice)
            for tup in candidates:
                if tup[1] >= thresh:
                    results_dict[tup[0]] = results_dict.get(tup[0], 0) + tup[1]

        pd_df = pd.DataFrame({'topic': list(results_dict.keys()),
                              'total_pop': list(results_dict.values())},
                             columns=['topic', 'total_pop'])
        pd_df = pd_df.rename(index=str)  # keep the per-city string row labels
        pd_df['city'] = city

        city_total = pd_df['total_pop'].sum()
        if city_total:
            pd_df['pct'] = (pd_df['total_pop'] / city_total) * 100
        else:
            pd_df['pct'] = float('nan')  # share is undefined when nothing scored

        city_frames.append(pd_df[cols])

    if not city_frames:
        return pd.DataFrame(columns=cols)

    return pd.concat(city_frames)[cols]

In [None]:
# Partial-credit popularity per city over 60 topics at the 0.2 threshold
pbc = pop_by_city(final_fp, city_list, 60, .2, False)

In [None]:
# Write the per-city table to pbc.csv in the current working directory
pbc.to_csv('pbc.csv')

## Topic Popularity Over Time

In [None]:
def topic_popularity(city):
    """Plot a line chart over ``date_posted`` (unfinished placeholder).

    The original definition was named ``topic popularity`` (with a space), which
    is a SyntaxError and stopped the notebook from running top to bottom.

    NOTE(review): the body still refers to ``avg_month`` and to price columns
    that are not defined anywhere in the visible notebook, and ``city`` is never
    used. It looks carried over from another analysis; calling it raises
    NameError until it is ported to the topic data.
    """
    avg_month.plot(x='date_posted', y = ['total_price_excluding_optional_support', 'total_price_including_optional_support'], kind = 'line' )
    plt.show()

## Scratch

In [None]:
# Scratch data: list-like columns arrive as strings and are parsed in the next cell
df_sp = pd.read_csv('topics/lda_data_sp.csv')

In [None]:
# Convert the columns that were written to CSV as Python literals back into objects.
# Only values that are still strings are parsed, so re-running this cell is a
# no-op instead of raising on already-parsed values; missing (NaN) cells are
# left untouched.
literal_cols = ['final_mash', 'mash', 'common_bigrams', 'token', 'lemma', 'pn2']
for literal_col in literal_cols:
    df_sp[literal_col] = df_sp[literal_col].apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) else x)


In [None]:
# Flatten the per-document pn2 lists and count how often each entry occurs
pn_list = [pn for doc_pns in df_sp['pn2'].tolist() for pn in doc_pns]
pn_counts = Counter(pn_list)

# one row per distinct entry, in first-seen order
words = [word for word in pn_counts]
cnt = [pn_counts[word] for word in words]
pn_count_df = pd.DataFrame({'word': words, 'cnt': cnt})

In [None]:
# summary statistics of the occurrence counts
pn_count_df.cnt.describe()

In [None]:
# entries that occur exactly four times
pn_count_df4 = pn_count_df[pn_count_df['cnt'] == 4]

In [None]:
# first 100 rows of the count-4 entries
pn_count_df4[:100]

In [None]:
# sort in place, most frequent first (mutates pn_count_df for every later cell)
pn_count_df.sort_values(by = ['cnt'], ascending = False, inplace = True)

In [None]:
# rows at positions 1000-1099; these are frequency ranks only if the sort cell above has run
pn_count_df[1000:1100]