## Data Loading

In [4]:
## Import libraries and load data.
import json
import string
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from collections import Counter

# Read the complaint records from the JSON file into `data`.
with open('illegal_pets.json') as json_file:
    data = json.load(json_file)

In [22]:
## Build an empty data frame whose columns are derived from the first entry.
length = len(data['entry'])

# The first entry's content is converted to a string and cut at every comma.
# A piece is kept only if it contains ': '; the text just before the last
# ': ' in that piece becomes a column name.
first_entry_text = str(data['entry'][0]['content'])
columns = []
for element in first_entry_text.split(','):
    pair = element.split(': ')[-2:]
    if len(pair) == 2:
        columns.append(pair[0])

# Two extra columns that are filled in later from the complaint text.
columns.extend(['animal1', 'animal2'])

complaints = pd.DataFrame(columns=columns)
complaints

Unnamed: 0,u'details,date,streetaddress,city,zip,latitude,longitude,fulladdress,u'type',animal1,animal2


In [23]:
## Populate the data frame: one row per entry, one cell per "key: value" piece.
for i in range(length):
    entry_text = str(data['entry'][i]['content'])
    for element in entry_text.split(','):
        # Same parsing as for the column names: keep the last two
        # ': '-separated parts of each comma-separated piece.
        pair = element.split(': ')[-2:]
        if len(pair) == 2:
            complaints.loc[i, pair[0]] = pair[1]

    # When the main details cell is still empty, fall back to the
    # alternative columns, tried in this order.
    for fallback in ('u"details', '10Th Request In 3Yrs'):
        if pd.isnull(complaints.loc[i, "u'details"]):
            complaints.loc[i, "u'details"] = complaints.loc[i, fallback]

## Common Words

In [24]:
## Most common animal words across all complaints.
count_all = Counter()
punctuation = list(string.punctuation)
# Tokens to ignore when counting general words (used by the per-city cell).
stop = stopwords.words('english') + punctuation

# Animal keywords searched for in the complaint text.
# NOTE: a comma was missing after 'bird', which silently merged it with
# 'racoons' into the single string 'birdracoons', so neither was ever matched.
animals = ['roosters', 'rooster', 'snakes', 'snake', 'chickens', 'chicken',
           'pigs', 'pig', 'crows', 'crow', 'birds', 'bird',
           'racoons', 'racoon', 'pet']

for i in complaints.index:
    # Tokenize each complaint once and reuse the tokens for both the
    # overall count and the per-row animal label.
    words = word_tokenize(complaints.loc[i, "u'details"].lower())
    animal_terms = [word for word in words if word in animals]
    count_all.update(animal_terms)

    if animal_terms:
        # Use the first animal mentioned in the text; popping from a set
        # returned an arbitrary element. 'animal2' is left unfilled.
        complaints.loc[i, "animal1"] = animal_terms[0]

# Parenthesized form prints the same text under Python 2 and Python 3.
print(count_all.most_common(60))

[('rooster', 89), ('roosters', 56), ('pet', 44), ('snake', 18), ('chickens', 16), ('snakes', 15), ('pig', 12), ('crows', 8), ('chicken', 6), ('birds', 4), ('crow', 3), ('racoon', 1), ('pigs', 1)]


In [25]:
# Remove the columns at positions 7, 8, 11 and 12, then save what is left.
# The selection is positional, so re-running this cell drops further columns.
dropped_columns = complaints.columns[[7,8,11,12]]
complaints = complaints.drop(dropped_columns, axis = 1)
complaints
complaints.to_csv("complaints_details.csv", index = False)

In [63]:
## Words per city: the ten most frequent non-stop-word tokens for each city.
common_words = {}

# The city is read by position (column 3). `.iloc` replaces the deprecated
# `.ix`, which was removed in pandas 1.0; with string column labels an integer
# passed to `.ix` was positional as well, so the selection is unchanged.
city_column = complaints.iloc[:, 3]

for city in city_column.unique():
    count_city = Counter()
    city_df = complaints[city_column == city]

    for i in city_df.index:
        # Lower-case, tokenize, and drop English stop words and punctuation.
        terms = [term for term in word_tokenize(city_df.loc[i, "u'details"].lower())
                 if term not in stop]
        count_city.update(terms)

    common_words[city] = count_city.most_common(10)

In [64]:
# Per city, keep only the (word, count) pairs whose count is greater than 2;
# cities with no such pair are left out entirely.
common_animals = {}

# The loop variable is deliberately not called `animals`: that name holds the
# animal keyword list defined in an earlier cell and must not be overwritten.
# `.items()` works on both Python 2 and Python 3 (`.iteritems()` is 2-only).
for city, word_counts in common_words.items():
    frequent = [x for x in word_counts if x[1] > 2]
    if frequent:
        common_animals[city] = frequent

# One row per city, one column per rank; shorter lists are padded with NaN.
animals_df = pd.DataFrame.from_dict(common_animals, orient='index')
animals_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Staten Island,"(rooster, 7)","(neighbor, 5)","(apartment, 4)","(roosters, 4)","(yard, 4)","(keeping, 4)","(lizard, 4)","(kept, 4)","(health, 3)","(morning, 3)"
Middle Village,"(rooster, 4)",,,,,,,,,
Corona,"(roosters, 6)","(back, 3)","(yard, 3)",,,,,,,
Ridgewood,"(rooster, 4)","(snake, 3)",,,,,,,,
Brooklyn,"(rooster, 31)","(neighbor, 21)","(pet, 18)","(keeping, 17)","(kept, 15)","(caller, 13)","(backyard, 12)","(roosters, 9)","(large, 8)","(pets, 8)"
New York,"(rooster, 5)","(2, 4)","(customer, 4)","(chicken, 3)","(apartment, 3)","(pet, 3)","(roosters, 3)","(sugar, 3)","(snakes, 3)","(afraid, 3)"
Elmhurst,"(rooster, 4)","(kept, 3)",,,,,,,,
Rockaway Park,"(kept, 3)","(rooster, 3)",,,,,,,,
Bronx,"(roosters, 23)","(rooster, 17)","(kept, 12)","(pet, 10)","(backyard, 8)","(building, 8)","(like, 6)","('s, 5)","(neighbor, 5)","(apartment, 4)"
College Point,"(pig, 5)","(even, 3)",,,,,,,,


In [65]:
# Save the per-city table of frequent words to a CSV file (default options).
animals_df.to_csv("common_animals.csv")