In [13]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
pd.set_option('display.max_columns', None)

In [7]:
# import tree density, walk score, and transit score data for merging
tree_density_sf = pd.read_csv('../data/tree_densities_SF.csv')
tree_density_sea = pd.read_csv('../data/tree_densities_Seattle.csv')
walk_score_sf = pd.read_csv('../data/walk_scores_San Francisco.csv')
walk_score_sea = pd.read_csv('../data/walk_scores_Seattle.csv')
transit_score_sf = pd.read_csv('../data/transit_scores_San Francisco.csv')
transit_score_sea = pd.read_csv('../data/transit_scores_Seattle.csv')

In [19]:
# merge each category for each city
merged_sf = tree_density_sf.merge(walk_score_sf, on='Neighborhood', how='outer')
merged_sf = merged_sf.merge(transit_score_sf, on='Neighborhood', how='outer')
merged_sea = tree_density_sea.merge(walk_score_sea, on='Neighborhood', how='outer')
merged_sea = merged_sea.merge(transit_score_sea, on='Neighborhood', how='outer')

<h3>The magnitude of the tree density data is substantially higher than the walk and transit scores. Does scaling the tree density down by an order of magnitude change/improve the cosine similarity recommendations? I checked this to be sure, but it looks like it does not (which makes sense, given that we're looking at the angle, not the magnitude of the distance).</h3>

In [36]:
merged_sf_scaled = merged_sf
merged_sea_scaled = merged_sea
merged_sf_scaled['Tree Density (Trees / sq km)'] = merged_sf_scaled['Tree Density (Trees / sq km)'] / 10
merged_sea_scaled['Tree Density (Trees / sq km)'] = merged_sea_scaled['Tree Density (Trees / sq km)'] / 10

In [38]:
merged_sf_scaled.head()

Unnamed: 0,Neighborhood,Tree Density (Trees / sq km),Walk Score,Transit Score
0,Alamo Square,140.386358,95,91
1,Anza Vista,112.840169,93,82
2,Balboa Terrace,98.167991,85,72
3,Bayview,59.185486,88,70
4,Bayview Heights,32.536816,73,69


In [39]:
def convert_hood_to_array(input_hood):
    # convert to numpy array in format needed for cosine similarity comparison
    input_hood = np.array(input_hood)[0][1:]
    input_hood = input_hood.reshape(1, -1)
    return input_hood

def compare_hood_to_all_city_hoods(input_hood, hood_city_df, comparison_city_df, input_dict):
    hood1 = hood_city_df.loc[hood_city_df['Neighborhood'] == input_hood]
    hood1 = convert_hood_to_array(hood1)
    for hood in comparison_city_df['Neighborhood']:
        hood2 = comparison_city_df.loc[comparison_city_df['Neighborhood'] == hood]
        hood2 = convert_hood_to_array(hood2)
        if input_hood not in input_dict:
            input_dict[input_hood] = {}
            input_dict[input_hood][hood] = cosine_similarity(hood1, hood2)[0][0]
        else:
            input_dict[input_hood][hood] = cosine_similarity(hood1, hood2)[0][0]
    return input_dict

# compare all Seattle neighborhoods with all SF neighborhoods
# note: one optimization for this will be to, instead of a dictionary of dictionaries,
# have a dictionary of tuples (hood_name, cosine_similarity), sorted by c_s
comparisons = {}
for hood in merged_sea['Neighborhood']:
    compare_hood_to_all_city_hoods(hood, merged_sea, merged_sf, comparisons)
for hood in merged_sf['Neighborhood']:
    compare_hood_to_all_city_hoods(hood, merged_sf, merged_sea, comparisons)

comparisons_scaled = {}
for hood in merged_sea_scaled['Neighborhood']:
    compare_hood_to_all_city_hoods(hood, merged_sea_scaled, merged_sf_scaled, comparisons_scaled)
for hood in merged_sf_scaled['Neighborhood']:
    compare_hood_to_all_city_hoods(hood, merged_sf_scaled, merged_sea_scaled, comparisons_scaled)

In [41]:
city = comparisons['Fremont']
for k in sorted(city, key=city.get, reverse=True)[:5]:
    print k, city[k]

Cole Valley/Parnassus Heights 0.999931936497
Balboa Terrace 0.99902622132
Inner Mission 0.998944898608
Bernal Heights 0.998696736341
Anza Vista 0.998461120058


In [42]:
city_scaled = comparisons_scaled['Fremont']
for k in sorted(city_scaled, key=city_scaled.get, reverse=True)[:5]:
    print k, city_scaled[k]

Cole Valley/Parnassus Heights 0.999931936497
Balboa Terrace 0.99902622132
Inner Mission 0.998944898608
Bernal Heights 0.998696736341
Anza Vista 0.998461120058


In [43]:
merged_sf[merged_sf['Neighborhood'] == 'Cole Valley/Parnassus Heights']

Unnamed: 0,Neighborhood,Tree Density (Trees / sq km),Walk Score,Transit Score
12,Cole Valley/Parnassus Heights,105.846444,91,71


In [44]:
merged_sea[merged_sea['Neighborhood'] == 'Fremont']

Unnamed: 0,Neighborhood,Tree Density (Trees / sq km),Walk Score,Transit Score
21,Fremont,89.017674,78,59


In [45]:
city = comparisons['Wallingford']
for k in sorted(city, key=city.get, reverse=True)[:5]:
    print k, city[k]

Cole Valley/Parnassus Heights 0.99742846124
Presidio Heights 0.995920157921
Balboa Terrace 0.994913037768
Bernal Heights 0.994772836823
Inner Mission 0.994378413362
