In [142]:
# system tools
import warnings
import json
import sys
import string

# data cleaning + analysis tools
import pandas as pd
import datetime as dt
import numpy as np
import re

#nltk tools
import lda #Latent Dirichlet Allocation (create topics)
import gensim
from gensim import corpora, models #for constructing document term matrix
#from stop_words import get_stop_words
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

#set notebook preferences
pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
warnings.filterwarnings('ignore')

### Import JSON file with city metadata - including which cities have sufficient Public Record Request (PRR) data for analysis

In [2]:
json_file = '../data/cities.json'

with open(json_file, 'r') as f:
    md = json.load(f)

###  Create dataframe of PRR data for all relevant cities

In [92]:
data_raw = pd.DataFrame(columns = ['city', 'month_year', 'Summary'])
city_list = []
for key, value in md.items():
    city = value['name']
    filepath = '/Users/alenastern/Google Drive File Stream/My Drive/Alena_Project/PR_Data/{}.csv'.format(city)
    if value["desc"] == "Y":
        try:
            df = pd.read_csv(filepath)
        except:
            try:
                df = pd.read_csv(filepath, encoding='mac_roman')
            except:
                continue
        print(key)
        name = key.split(' ')
        city_list.append(name[0].lower())
    else:
        continue
    
    try:
        df['Create Date'] = pd.to_datetime(df['Create Date'])
    except:
        df['New'] = pd.to_datetime(df['Create Date'].apply(lambda x: re.findall('^\S*', x)[0]))
        df.drop(columns=['Create Date'], inplace = True)
        df.rename(index=str, columns={"New": "Create Date"}, inplace = True)

    df['month_year'] = df['Create Date'].dt.to_period('M')
    
    mc = df[['month_year', 'Summary']]
    mc['city'] = city
    
    data_raw = pd.concat([data_raw, mc])

Arlington city
Asheville city
Bainbridge Island city
Boulder County
Cathedral City city
Dayton city
Denton city
Everett city
Fort Collins city
Greensboro city
Hayward city
Kirkland city
Las Cruces city
Lynnwood city
Mercer Island city
Miami city
Middleborough town
New Orleans city
Oakland city
Oklahoma City city
Olympia city
Palo Alto city
Peoria city
Pullman city
Rancho Cucamonga city
Redmond city
Renton city
Sacramento city
San Francisco city
Tukwila city
Vallejo city
West Sacramento city
Winchester city


#### We can see the raw data below. Our raw dataset includes 86,416 PRRs from 33 different cities

In [110]:
data_raw.head()

Unnamed: 0,index,Summary,city,month_year
0,0,We are working with an engineering firm on an ...,Arlington,2018-06
1,1,Need copies of contracts and all related docum...,Arlington,2018-06
2,2,"Copies of Building Permits of $5,000 valuation...",Arlington,2018-06
3,3,police report filed to an officer against Wayn...,Arlington,2018-06
4,4,"Email Communications between Stephanie Shook, ...",Arlington,2018-06


In [93]:
data_raw.shape

(86416, 3)

In [81]:
len(data_raw.city.unique())

33

In [94]:
data_raw.index = pd.RangeIndex(len(data_raw.index))

In [98]:
data_raw.reset_index(inplace=True)

### Create dataframe for cleaning by removing null summaries

In [157]:
data = data_raw.dropna(subset=['Summary'])

#### Function to convert nltk part of speech tags to wordnet tags (we use this to stem the words in data cleaning below):

In [171]:
def get_wordnet_pos(tag):

    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

## Clean PRR data to prepare for LDA analysis

In [158]:
# Turn to lowercase
data.Summary = data.Summary.str.lower()

# Remove all punctuation
translator = str.maketrans('','', string.punctuation)
data.Summary = data.Summary.str.translate(translator)

# Remove all city names
for city in city_list:
    data.Summary = data.Summary.str.replace(city, '')
    
# Remove digits
dig_translator = str.maketrans('','', string.digits)
data.Summary = data.Summary.str.translate(dig_translator)

#remove empty strings, stopwords and stem
stop_words = set(stopwords.words('english'))
lmtzr = WordNetLemmatizer()
data['token'] = data['Summary'].apply(lambda x: nltk.word_tokenize(x))
data['lemma'] = data['token'].apply(lambda x: nltk.pos_tag(x))
data['mash'] = data['lemma'].apply(lambda x: [lmtzr.lemmatize(i[0], get_wordnet_pos(i[1])) for i in x if len(i[0]) > 0 and i[0] not in stop_words])

# Remove whitespace
wsp_translator = str.maketrans('','', string.whitespace)
data['mash'] = data['mash'].apply(lambda x: [i.translate(wsp_translator) for i in x])

# Remove empty lists
data['mash_len'] = data['mash'].apply(lambda x: len(x))
data = data[data['mash_len'] > 0]


### Identify and remove commonly used words in PRRs

In [None]:
word_list = [y for x in list(data['mash']) for y in x]
counts = Counter(word_list)
Counter(word_list).most_common(50)

In [None]:
Counter(word_list).most_common()[-50:-1]

In [163]:
common_list = ['report', 'request', 'record', 'city', 'please', 'copy', 'date', 'information', 'would',
              'include', 'document', 'provide']

In [169]:
# remove general words that are common to public record requests

for word in common_list:
    for i in range(len(data['mash'])):
        if word in data['mash'].loc[i]:
            data['mash'].loc[i].remove(word)

In [None]:
# create column with the length of mash for each PRR

data['mash_len'] = data['mash'].apply(lambda x: len(x))

In [175]:
# remove entries of length 0

data = data[data['mash_len'] > 0]

In [176]:
data['mash_len'].describe()

count    73946.000000
mean        20.043451
std         36.064379
min          1.000000
25%          4.000000
50%         10.000000
75%         24.000000
max       2924.000000
Name: mash_len, dtype: float64

#### We can see a couple of examples of the cleaned mash and the original request:

In [177]:
data['mash'].iloc[7]

['build', 'permit', 'conversion', 'portion', 'garage', 'live', 'space']

In [180]:
data_raw["Summary"].iloc[7]

'Building permit from 2000 for the conversion of a portion of the garage to living space.'

In [179]:
data['mash'][12]

['build',
 'permit',
 'valuation',
 'min',
 'reroofs',
 'min',
 'cell',
 'tower',
 'upgrades',
 'electrical',
 'mechanical',
 'plumbing',
 'min',
 'solar',
 'panel',
 'swim',
 'pools',
 'foundation',
 'valuation',
 'issue',
 'may',
 'thru',
 'may']

In [181]:
data_raw["Summary"].iloc[12]

'Copies of Building Permits of $5,000 valuation and up ($20,000 min for Re-Roofs), ($50,000 min. for Cell Tower upgrades), (Electrical, Mechanical & Plumbing at $100,000 min.) and (Solar Panels, Swimming Pools & Foundations at any valuation)     \nIssued  ____May 3, 2018________  thru __May 25, 2018______'

# LDA Analysis

In [None]:
# create dictionary and corpus
texts = list(data['mash'])
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

In [None]:
# 50 topics and 45 passes
lda_50_45 = gensim.models.ldamodel.LdaModel(corpus, num_topics=50, id2word = dictionary, 
                                         passes = 45, random_state=7)

In [None]:
# show topics for model
lda_50_45.show_topics(num_topics=50, formatted=False)