In [54]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
pd.set_option('display.max_columns', None)

In [3]:
sf = pd.read_csv('../data/sf_trees_all_records_tagged.csv')
sf.head()

Unnamed: 0.1,Unnamed: 0,City,Latitude,Longitude,Neighborhood
0,0,San Francisco,37.7783,-122.507268,Outer Richmond
1,1,San Francisco,37.752547,-122.393785,Potrero Hill
2,2,San Francisco,37.780364,-122.40543,South of Market
3,3,San Francisco,37.78166,-122.438551,Western Addition
4,4,San Francisco,37.789498,-122.38845,South Beach


In [4]:
sf = sf.drop(sf.columns[0], axis=1)

In [5]:
sf.head()
sf['Number of Trees'] = 1
sf = sf[['Neighborhood', 'Number of Trees']]
sf.head()

Unnamed: 0,Neighborhood,Number of Trees
0,Outer Richmond,1
1,Potrero Hill,1
2,South of Market,1
3,Western Addition,1
4,South Beach,1


In [6]:
sf['Neighborhood'].value_counts().count()

92

In [7]:
trees_sf = sf.groupby(['Neighborhood']).count()

In [8]:
trees_sf

Unnamed: 0_level_0,Number of Trees
Neighborhood,Unnamed: 1_level_1
Alamo Square,576
Anza Vista,412
Balboa Terrace,271
Bayview,3750
Bayview Heights,181
Bernal Heights,3595
Buena Vista Park/Ashbury Heights,636
Candlestick Point,77
Central Richmond,2318
Central Sunset,1701


In [33]:
sea = pd.read_csv('../data/trees_by_hood_Seattle.csv')
sea.head()

Unnamed: 0,Neighborhood,Number of Trees
0,Adams,2486
1,Alki,800
2,Arbor Heights,39
3,Atlantic,2277
4,Belltown,2286


In [34]:
sea['Number of Trees'] = 1
sea = sea[['Neighborhood', 'Number of Trees']]
sea.head()

Unnamed: 0,Neighborhood,Number of Trees
0,Adams,1
1,Alki,1
2,Arbor Heights,1
3,Atlantic,1
4,Belltown,1


In [11]:
trees_sea = sea.groupby(['Neighborhood']).count()

In [12]:
trees_sea.to_csv('../data/sea_trees_by_hood.csv')

In [35]:
sea_trees = pd.read_csv('../data/trees_by_hood_Seattle.csv')
sf_trees = pd.read_csv('../data/trees_by_hood_SF.csv')

In [36]:
sea_trees.head()

Unnamed: 0,Neighborhood,Number of Trees
0,Adams,2486
1,Alki,800
2,Arbor Heights,39
3,Atlantic,2277
4,Belltown,2286


In [37]:
sea_hood_areas = pd.read_csv('../data/sea_hood_areas.csv')
sea_hood_areas.head()

Unnamed: 0,Neighborhood,Area (sq km)
0,Loyal Heights,2.933042
1,Adams,3.085047
2,Whittier Heights,1.952875
3,West Woodland,3.053022
4,Phinney Ridge,4.418331


In [38]:
sea_trees = sea_trees.merge(sea_hood_areas, on='Neighborhood', how='outer')

In [39]:
sea_trees.head()

Unnamed: 0,Neighborhood,Number of Trees,Area (sq km)
0,Adams,2486,3.085047
1,Alki,800,3.374849
2,Arbor Heights,39,3.591894
3,Atlantic,2277,2.837951
4,Belltown,2286,2.057272


In [40]:
sea_trees['Tree Density (Trees / sq km)'] = sea_trees['Number of Trees'] / sea_trees['Area (sq km)']

In [41]:
sea_trees.head()

Unnamed: 0,Neighborhood,Number of Trees,Area (sq km),Tree Density (Trees / sq km)
0,Adams,2486,3.085047,805.822394
1,Alki,800,3.374849,237.047635
2,Arbor Heights,39,3.591894,10.85778
3,Atlantic,2277,2.837951,802.339512
4,Belltown,2286,2.057272,1111.180508


In [29]:
sea_trees.groupby(sea_trees['Tree Density (Trees / sq km)']).max().tail(10)

Unnamed: 0_level_0,Neighborhood,Number of Trees,Area (sq km)
Tree Density (Trees / sq km),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
955.835277,Portage Bay,581,0.607845
991.758455,Wallingford,5737,5.784675
1012.377646,Minor,2494,2.463508
1028.401479,Mann,1573,1.529558
1034.278426,Stevens,3701,3.57834
1063.560198,Whittier Heights,2077,1.952875
1111.180508,Belltown,2286,2.057272
1111.305463,First Hill,1453,1.307471
1173.713818,Lower Queen Anne,2825,2.40689
1276.699484,Holly Park,1479,1.158456


In [42]:
sea_trees = pd.read_csv('../data/tree_densities_Seattle.csv')
sf_trees = pd.read_csv('../data/tree_densities_SF.csv')

In [44]:
sf_trees.head()

Unnamed: 0,Neighborhood,Tree Density (Trees / sq km)
0,Alamo Square,1403.863583
1,Anza Vista,1128.401692
2,Balboa Terrace,981.679913
3,Bayview,591.854865
4,Bayview Heights,325.368159


<h3>Can we get a cosine similarity between neighborhoods with a single feature? Mathematically, yes, but it won't be useful since the 'angle' between the vectors will be the same. Let's look at both tree density and walk score for a general 'stroll enjoyment' feature.</h3>

In [60]:
sea_hood = np.array(sea_trees['Tree Density (Trees / sq km)'][0])
sf_hood = np.array(sf_trees['Tree Density (Trees / sq km)'][10])
#input_hood = np.array(test_hood)
print type(sea_hood), type(sf_hood), cosine_similarity(sea_hood, sf_hood)

<type 'numpy.ndarray'> <type 'numpy.ndarray'> [[ 1.]]


In [47]:
def convert_hood_to_array(input_hood):
    # convert to numpy array in format needed for cosine similarity comparison
    input_hood = np.array(input_hood)[0][1:]
    input_hood = input_hood.reshape(1, -1)
    return input_hood



def compare_hood_to_all_city_hoods(input_hood, hood_city_df, comparison_city_df, input_dict):
    hood1 = hood_city_df.loc[hood_city_df['Neighborhood'] == input_hood]
    hood1 = convert_hood_to_array(hood1)
    for hood in comparison_city_df['Neighborhood']:
        hood2 = comparison_city_df.loc[comparison_city_df['hood'] == hood]
        hood2 = convert_hood_to_array(hood2)
        if input_hood not in input_dict:
            input_dict[input_hood] = {}
            input_dict[input_hood][hood] = cosine_similarity(hood1, hood2)[0][0]
        else:
            input_dict[input_hood][hood] = cosine_similarity(hood1, hood2)[0][0]
    return input_dict

# comparisons = {}
# for hood in sea_trees['Neighborhood']:
#     compare_hood_to_all_city_hoods(hood, sea_trees, sf_trees, comparisons)
# for hood in sf_trees['Neighborhood']:
#     compare_hood_to_all_city_hoods(hood, sf_trees, sea_trees, comparisons)