In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
# Lift pandas' column display limit so wide DataFrames are shown without truncation.
pd.set_option('display.max_columns', None)

In [3]:
# Load the rush-hour morning commute data, one CSV per city
# (San Francisco first, then Seattle).
sf, sea = map(pd.read_csv, ('../data/commute_by_hood_sf.csv',
                            '../data/commute_by_hood_sea.csv'))

In [4]:
# Give both city frames the same column labels. Labels are assigned by
# position, so the CSV column order must already match this list.
commute_columns = ['Neighborhood', 'Bicycling (min)', 'Driving (min)', 'Transit (min)', 'Walking (min)']
for city_df in (sf, sea):
    city_df.columns = list(commute_columns)

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric Seattle columns.
sea.describe()

Unnamed: 0,Bicycling (min),Driving (min),Transit (min),Walking (min)
count,90.0,90.0,90.0,90.0
mean,37.992778,15.121111,41.437037,100.785741
std,16.264942,4.561824,27.195529,47.959177
min,2.9,3.35,5.45,5.45
25%,25.770833,12.375,28.1875,61.333333
50%,39.733333,15.525,34.425,107.35
75%,48.470833,17.958333,47.545833,137.383333
max,74.166667,24.516667,227.75,187.033333


In [13]:
# Sanity check: show the Python type of one bicycling-time value from the SF frame.
type(sf['Bicycling (min)'][0])

numpy.float64

In [7]:
def convert_hood_to_array(input_hood):
    """Turn a one-neighborhood selection into a single 2-D feature row.

    The first row of ``input_hood`` is taken, its first column (treated as a
    label, e.g. the neighborhood name) is dropped, and the remaining values are
    returned with shape ``(1, n_features)`` -- the single-sample layout used
    for the cosine similarity comparison.

    Parameters
    ----------
    input_hood : pandas.DataFrame or 2-D array-like
        Rows of ``[label, feature_1, ..., feature_n]``. Only the first row is
        used.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(1, n_features)``.

    Raises
    ------
    IndexError
        If ``input_hood`` has no rows (for example, a neighborhood filter that
        matched nothing).
    ValueError
        If the values after the first column cannot be converted to float.
    """
    rows = np.asarray(input_hood)
    if rows.shape[0] == 0:
        # Same exception type as indexing row 0 of an empty array, but the
        # message names the likely cause instead of "index 0 is out of bounds".
        raise IndexError(
            'no rows to convert: the neighborhood selection is empty')
    # A frame mixing a string column with numeric columns converts to
    # dtype=object; cast the numeric part so callers get a real float array.
    features = rows[0][1:].astype(float)
    return features.reshape(1, -1)

def compare_hood_to_all_city_hoods(input_hood, hood_city_df, comparison_city_df, input_dict):
    """Score one neighborhood against every neighborhood of another city.

    The row of ``hood_city_df`` whose 'Neighborhood' equals ``input_hood`` is
    compared, by cosine similarity, with each row of ``comparison_city_df``.
    Results are stored in ``input_dict[input_hood][other_name] = score``.

    Parameters
    ----------
    input_hood : str
        Name to look up in ``hood_city_df['Neighborhood']``.
    hood_city_df : pandas.DataFrame
        Frame containing ``input_hood``. The first column is treated as the
        label and every later column as a feature.
    comparison_city_df : pandas.DataFrame
        Frame of neighborhoods to compare against, same column layout.
    input_dict : dict
        Accumulator that is mutated in place. Existing scores stored under
        ``input_hood`` are kept unless overwritten by the same name.

    Returns
    -------
    dict
        The same ``input_dict`` object, for convenience.

    Raises
    ------
    IndexError
        If ``input_hood`` is not present in ``hood_city_df`` (raised by
        ``convert_hood_to_array``).
    """
    source_row = hood_city_df.loc[hood_city_df['Neighborhood'] == input_hood]
    source_vector = convert_hood_to_array(source_row)

    # If a name appears more than once, only its first row is scored -- the
    # same row a per-name lookup followed by "take row 0" would select.
    candidates = comparison_city_df.drop_duplicates(subset='Neighborhood')
    if candidates.empty:
        # Nothing to compare against: leave input_dict untouched.
        return input_dict

    # Score all candidates with one call instead of re-filtering the whole
    # frame once per neighborhood. Column 0 is the label, as in
    # convert_hood_to_array.
    candidate_matrix = candidates.iloc[:, 1:].values.astype(float)
    scores = cosine_similarity(source_vector, candidate_matrix)[0]

    # setdefault replaces the duplicated "create the inner dict, then assign"
    # branches.
    input_dict.setdefault(input_hood, {}).update(
        zip(candidates['Neighborhood'], scores))
    return input_dict

# Score every Seattle neighborhood against every SF neighborhood, and then
# every SF neighborhood against every Seattle neighborhood.
# note: one optimization would be to replace the dictionary of dictionaries
# with, per neighborhood, (hood_name, cosine_similarity) tuples sorted by
# cosine similarity.
# NOTE(review): results are keyed by bare neighborhood name, so a name present
# in both cities would share one inner dict -- confirm no names overlap.
comparisons = {}
for source_df, target_df in ((sea, sf), (sf, sea)):
    for hood in source_df['Neighborhood']:
        compare_hood_to_all_city_hoods(hood, source_df, target_df, comparisons)

In [8]:
# Five neighborhoods most similar to Fremont, highest cosine similarity first.
city = comparisons['Fremont']
for k in sorted(city, key=city.get, reverse=True)[:5]:
    # A single-argument print(...) prints the same text on Python 2 and also
    # runs on Python 3, where the `print k, v` statement form is a SyntaxError.
    print('{} {}'.format(k, city[k]))

Pine Lake Park 0.999702221459
Merced Manor 0.999561778611
Saint Francis Wood 0.999517581379
Balboa Terrace 0.999491700096
Mission Terrace 0.998978641891


In [9]:
# Five neighborhoods most similar to Yerba Buena, highest cosine similarity first.
city = comparisons['Yerba Buena']
for k in sorted(city, key=city.get, reverse=True)[:5]:
    # A single-argument print(...) prints the same text on Python 2 and also
    # runs on Python 3, where the `print k, v` statement form is a SyntaxError.
    print('{} {}'.format(k, city[k]))

Central Business District 0.996451652517
Pike-Market 0.995356736707
Minor 0.993520077133
First Hill 0.991868497163
Pioneer Square 0.991725499933


In [10]:
# Five neighborhoods most similar to Wallingford, highest cosine similarity first.
city = comparisons['Wallingford']
for k in sorted(city, key=city.get, reverse=True)[:5]:
    # A single-argument print(...) prints the same text on Python 2 and also
    # runs on Python 3, where the `print k, v` statement form is a SyntaxError.
    print('{} {}'.format(k, city[k]))

Ingleside Heights 0.999882305592
Stonestown 0.999678774486
Oceanview 0.99953667224
Lake Shore 0.999475509464
Outer Mission 0.999406291487
