In [36]:
import pandas as pd
import numpy as np
import shapefile

<h3>We want to keep Seattle + SF data separate to prevent extra work (i.e. no comparisons between neighborhoods in the same city are necessary). </h3>

In [6]:
# Open datasets
sf_categories = pd.read_csv(open('../data/eventbrite_SF_categories.csv'))
sf_subcats = pd.read_csv(open('../data/eventbrite_SF_subcats.csv'))
sea_categories = pd.read_csv(open('../data/eventbrite_Seattle_categories.csv'))
sea_subcats = pd.read_csv(open('../data/eventbrite_Seattle_subcats.csv'))

In [12]:
# Merge datasets by city
sf_merged = sf_categories.merge(sf_subcats, on='hood', how='outer')
sea_merged = sea_categories.merge(sea_subcats, on='hood', how='outer')

<h3>Q: Do the categories and sub-categories in each city match? (Are there any missing columns?)
<br>A: No - SF has 128, Seattle has 98. Unclear if categories themselves mix/match.</h3>

In [14]:
len(sf_merged.columns)

128

In [15]:
len(sea_merged.columns)

98

<h3>To create comparable vectors, columns must match. Time to fill in missing columns in each set.</h3>

In [22]:
sf_columns = []
sea_columns = []
missing_in_sea = []
missing_in_sf = []

for column in sf_merged.columns:
    sf_columns.append(column)
    
for column in sea_merged.columns:
    sea_columns.append(column)
    
for item in sf_columns:
    if item not in sea_columns:
        missing_in_sea.append(item)
        
for item in sea_columns:
    if item not in sf_columns:
        missing_in_sf.append(item)
        
print missing_in_sea
print missing_in_sf

['Accessories', 'Adult', 'Animal Welfare', 'Anime/Comics', 'Ballet', 'Boat', 'Books', 'Bridal', 'Buddhism', 'Country', 'County/Municipal Government ', 'Eastern Religion', 'Exercise', 'Federal Government', 'Fighting & Martial Arts', 'Folk', 'Halloween/Haunt', 'Independence Day', 'Indie', 'Latin', 'Literary Arts', 'Media', 'Medicine', 'Musical', 'New Years Eve', 'Opera', 'Other Party', 'Reggae', 'Robotics', 'Rock', 'State', 'Swimming & Water Sports', 'TV', 'Theatre', 'Top 40', 'Walking']
['Climbing', 'Kayaking', 'Real Estate', 'Renaissance', 'Soccer', 'Tennis']


In [31]:
# Add missing columns to each merged df as needed
for item in missing_in_sea:
    sea_merged[item] = 0

for item in missing_in_sf:
    sf_merged[item] = 0
    
sf_merged.head()

Unnamed: 0,hood,Arts,"Auto, Boat & Air",Business,Charity & Causes,Community,Family & Education,Fashion_x,Film & Media,Food & Drink,...,Travel,Walking,Wine,Yoga,Climbing,Kayaking,Real Estate,Renaissance,Soccer,Tennis
0,Alamo Square,0,0,0,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,0
1,Bayview,7,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,0
2,Bernal Heights,0,0,1,2,0,3,0,0,0,...,0.0,0.0,0.0,1.0,0,0,0,0,0,0
3,Central Richmond,1,0,0,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,0
4,Central Sunset,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,0


In [32]:
# Check to see if the number of columns in each city's dataset are equal
len(sf_merged.columns) == len(sea_merged.columns)

True

<h3>Uh oh, the shape of our dfs look suspicious. Not all neighborhoods appear to be represented.
<br>Let's add the missing neighborhoods to each df (with rows full of zeroes, since no events were recorded).
<br>While this doesn't matter for Eventbrite data, it WILL matter for our final data merging script.</h3>

In [33]:
sf_merged.shape

(66, 134)

In [34]:
len(sf_merged['hood'])

66

In [38]:
# Import shape files
sf_shapefile = shapefile.Reader('../downloads/sf_neighborhoods/geo_export_197f44fb-6cc0-472b-81f7-347deefb57df')
sea_shapefile = shapefile.Reader('../downloads/seattle_neighborhoods/Neighborhoods')

In [39]:
# Grab the records from each polygon shape to get the list of all neighborhoods
sf_records = sf_shapefile.records()
sea_records = sea_shapefile.records()

In [41]:
sf_neighborhoods = []
for record in sf_records:
    sf_neighborhoods.append(record[2])

sea_neighborhoods = []
for record in sea_records:
    if record[5] != 'OOO' and record[5][1] != ' ':
        sea_neighborhoods.append(record[5])

['Loyal Heights', 'Adams', 'Whittier Heights', 'West Woodland', 'Phinney Ridge', 'Wallingford', 'Fremont', 'Green Lake', 'View Ridge', 'Ravenna', 'Sand Point', 'Bryant', 'Windermere', 'Laurelhurst', 'Roosevelt', 'University District', 'East Queen Anne', 'West Queen Anne', 'Lower Queen Anne', 'North Queen Anne', 'Westlake', 'Eastlake', 'South Lake Union', 'Lawton Park', 'Briarcliff', 'Southeast Magnolia', 'Madrona', 'Harrison/Denny-Blaine', 'Minor', 'Leschi', 'Mann', 'Atlantic', 'Pike-Market', 'Belltown', 'International District', 'Central Business District', 'First Hill', 'Yesler Terrace', 'Pioneer Square', 'Interbay', 'Industrial District', 'Georgetown', 'South Park', 'Harbor Island', 'Seaview', 'Gatewood', 'Arbor Heights', 'Alki', 'North Admiral', 'Fairmount Park', 'Genesee', 'Fauntleroy', 'North Beacon Hill', 'Mid-Beacon Hill', 'South Beacon Hill', 'Holly Park', 'Brighton', 'Dunlap', 'Rainier Beach', 'Rainier View', 'Mount Baker', 'Columbia City', 'Highland Park', 'North Delridge', 

In [55]:
# Check to see which neighborhoods are missing and add them to a missing list
# for item in sea_neighborhoods:
    # if item not in sea_merged.hood:
        # print item
        
print sea_merged.hood[32]
print sea_neighborhoods[0]

Loyal Heights
Loyal Heights


In [56]:
sea_merged.hood[32] == sea_neighborhoods[0]

True

In [64]:
sea_current_hoods = set(sea_merged.hood)
sf_current_hoods = set(sf_merged.hood)

In [66]:
sea_missing_hoods = []
for item in sea_neighborhoods:
    if item not in sea_current_hoods:
        sea_missing_hoods.append(item)
        
sf_missing_hoods = []
for item in sf_neighborhoods:
    if item not in sf_current_hoods:
        sf_missing_hoods.append(item)

In [69]:
# Add missing neighborhoods to each city's data set


In [72]:
sf_merged.tail(2)

Unnamed: 0,hood,Arts,"Auto, Boat & Air",Business,Charity & Causes,Community,Family & Education,Fashion_x,Film & Media,Food & Drink,...,Travel,Walking,Wine,Yoga,Climbing,Kayaking,Real Estate,Renaissance,Soccer,Tennis
64,Western Addition,0,0,4,2,0,5,0,0,0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,0
65,Yerba Buena,9,1,159,10,9,6,3,12,7,...,0.0,0.0,1.0,1.0,0,0,0,0,0,0


In [85]:
sf_merged.loc[len(sf_merged)] = 0
sf_merged.loc[(len(sf_merged) - 1)]['hood'] = 'test'
sf_merged.tail()
                 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,hood,Arts,"Auto, Boat & Air",Business,Charity & Causes,Community,Family & Education,Fashion_x,Film & Media,Food & Drink,...,Travel,Walking,Wine,Yoga,Climbing,Kayaking,Real Estate,Renaissance,Soccer,Tennis
65,Yerba Buena,9,1,159,10,9,6,3,12,7,...,0.0,0.0,1.0,1.0,0,0,0,0,0,0
test,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,0
67,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,0
68,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,0
69,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,0


In [78]:
sf_merged.tail(2)

Unnamed: 0,hood,Arts,"Auto, Boat & Air",Business,Charity & Causes,Community,Family & Education,Fashion_x,Film & Media,Food & Drink,...,Travel,Walking,Wine,Yoga,Climbing,Kayaking,Real Estate,Renaissance,Soccer,Tennis
64,Western Addition,0,0,4,2,0,5,0,0,0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,0
65,Yerba Buena,9,1,159,10,9,6,3,12,7,...,0.0,0.0,1.0,1.0,0,0,0,0,0,0


In [77]:
sf_merged = sf_merged.drop(66, 1)