Visualize data

In [4]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    openssl-1.0.2p             |       h470a237_1         3.1 MB  conda-forge
    certifi-2018.10.15         |        py36_1000         138 KB  conda-forge
    geopy-1.17.0               |             py_0          49 KB  conda-forge
    ca-certificates-2018.10.15 |       ha4d7672_0         135 KB  conda-forge
    conda-4.5.11               |        py36_1000         651 KB  conda-forge
    geographiclib-1.49         |             py_0          32 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         4.1 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.49-py_0            conda-forge
    geopy:           

In [5]:
result = pd.read_csv("result.csv")

In [6]:
result

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Lawrence Heights",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Not assigned",43.718518,-79.464763
4,M7A,Queen's Park,Not assigned,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Don Mills North",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Ryerson",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Glencairn",43.657162,-79.378937


In [7]:
address = 'Toronto'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

# create map of New York using latitude and longitude values
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(result['Latitude'], result['Longitude'],result['Postal Code'], result['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork



The geograpical coordinate of Toronto are 43.653963, -79.387207.


Analizing different venues in the East York borough Neighborhood

In [9]:
ey_data = result[result['Borough'] == 'East York'].reset_index(drop=True)

In [10]:
map = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(ey_data['Latitude'], ey_data['Longitude'], ey_data['Borough'], ey_data['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=6,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map)
    
map

Define Foursquare Credentials and Version

In [11]:
CLIENT_ID = 'TJHUBCYUEQZJCRJKXN2PPE1CC2U3INVHPWRDNINFWBIX1ZBH' # your Foursquare ID
CLIENT_SECRET = 'EB2CX2KIWDLQTSAVUSAC55M3CYRUPVJVXYZY41QGCSXJ1W4Z' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: TJHUBCYUEQZJCRJKXN2PPE1CC2U3INVHPWRDNINFWBIX1ZBH
CLIENT_SECRET:EB2CX2KIWDLQTSAVUSAC55M3CYRUPVJVXYZY41QGCSXJ1W4Z


In [12]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [13]:
LIMIT=500
ey_venues = getNearbyVenues(names=ey_data['Neighborhood'],
                                   latitudes=ey_data['Latitude'],
                                   longitudes=ey_data['Longitude']
                                  )

Woodbine Gardens, Ryerson
Woodbine Heights
Leaside
Thorncliffe Park
East Toronto


In [14]:
ey_venues.shape

(75, 7)

In [15]:
# one hot encoding
ey_onehot = pd.get_dummies(ey_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
ey_onehot['Neighborhood'] = ey_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [ey_onehot.columns[-1]] + list(ey_onehot.columns[:-1])
ey_onehot = ey_onehot[fixed_columns]

ey_onehot.head()

Unnamed: 0,Neighborhood,Athletics & Sports,Bagel Shop,Bank,Beer Store,Bike Shop,Breakfast Spot,Brewery,Burger Joint,Bus Line,Bus Station,Bus Stop,Café,Clothing Store,Coffee Shop,Convenience Store,Cosmetics Shop,Curling Ice,Dessert Shop,Discount Store,Electronics Store,Fast Food Restaurant,Fish & Chips Shop,Furniture / Home Store,Gastropub,Grocery Store,Gym,Gym / Fitness Center,Indian Restaurant,Intersection,Liquor Store,Mexican Restaurant,Park,Pet Store,Pharmacy,Pizza Place,Record Shop,Restaurant,Rock Climbing Spot,Sandwich Place,Shopping Mall,Skating Rink,Smoothie Shop,Spa,Sporting Goods Shop,Supermarket,Sushi Restaurant,Video Store,Warehouse Store,Yoga Studio
0,"Woodbine Gardens, Ryerson",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,"Woodbine Gardens, Ryerson",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,"Woodbine Gardens, Ryerson",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,"Woodbine Gardens, Ryerson",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,"Woodbine Gardens, Ryerson",0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [16]:
ey_onehot.shape

(75, 50)

In [17]:
ey_grouped = ey_onehot.groupby('Neighborhood').mean().reset_index()
ey_grouped

Unnamed: 0,Neighborhood,Athletics & Sports,Bagel Shop,Bank,Beer Store,Bike Shop,Breakfast Spot,Brewery,Burger Joint,Bus Line,Bus Station,Bus Stop,Café,Clothing Store,Coffee Shop,Convenience Store,Cosmetics Shop,Curling Ice,Dessert Shop,Discount Store,Electronics Store,Fast Food Restaurant,Fish & Chips Shop,Furniture / Home Store,Gastropub,Grocery Store,Gym,Gym / Fitness Center,Indian Restaurant,Intersection,Liquor Store,Mexican Restaurant,Park,Pet Store,Pharmacy,Pizza Place,Record Shop,Restaurant,Rock Climbing Spot,Sandwich Place,Shopping Mall,Skating Rink,Smoothie Shop,Spa,Sporting Goods Shop,Supermarket,Sushi Restaurant,Video Store,Warehouse Store,Yoga Studio
0,East Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Leaside,0.0,0.030303,0.030303,0.030303,0.030303,0.030303,0.030303,0.060606,0.0,0.0,0.0,0.0,0.030303,0.090909,0.0,0.0,0.0,0.030303,0.0,0.030303,0.0,0.030303,0.030303,0.030303,0.030303,0.030303,0.0,0.0,0.0,0.030303,0.030303,0.0,0.030303,0.0,0.0,0.030303,0.030303,0.0,0.060606,0.030303,0.0,0.030303,0.0,0.090909,0.030303,0.030303,0.0,0.0,0.0
2,Thorncliffe Park,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0625,0.0625,0.0625,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0625,0.0625,0.0,0.125,0.0,0.0,0.0,0.0625,0.0,0.0625,0.0625,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0625
3,"Woodbine Gardens, Ryerson",0.071429,0.0,0.071429,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.071429,0.0,0.0,0.071429,0.0,0.071429,0.0,0.0,0.0,0.071429,0.071429,0.142857,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Woodbine Heights,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.125,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.125,0.0,0.0,0.0,0.125,0.0,0.0


In [18]:
ey_grouped.shape

(5, 50)

In [19]:
num_top_venues = 5

for hood in ey_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = ey_grouped[ey_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----East Toronto----
                    venue  freq
0                    Park  0.50
1  Furniture / Home Store  0.25
2       Convenience Store  0.25
3      Athletics & Sports  0.00
4              Restaurant  0.00


----Leaside----
                 venue  freq
0  Sporting Goods Shop  0.09
1          Coffee Shop  0.09
2       Sandwich Place  0.06
3         Burger Joint  0.06
4        Grocery Store  0.03


----Thorncliffe Park----
               venue  freq
0  Indian Restaurant  0.12
1      Grocery Store  0.06
2        Coffee Shop  0.06
3    Warehouse Store  0.06
4     Sandwich Place  0.06


----Woodbine Gardens, Ryerson----
                  venue  freq
0  Fast Food Restaurant  0.14
1           Pizza Place  0.14
2    Athletics & Sports  0.07
3                  Café  0.07
4             Gastropub  0.07


----Woodbine Heights----
         venue  freq
0  Curling Ice  0.12
1   Beer Store  0.12
2  Video Store  0.12
3         Park  0.12
4          Spa  0.12




In [20]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [21]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = ey_grouped['Neighborhood']

for ind in np.arange(ey_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(ey_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,East Toronto,Park,Furniture / Home Store,Convenience Store,Clothing Store,Fish & Chips Shop,Fast Food Restaurant,Electronics Store,Discount Store,Dessert Shop,Curling Ice
1,Leaside,Coffee Shop,Sporting Goods Shop,Sandwich Place,Burger Joint,Grocery Store,Brewery,Dessert Shop,Electronics Store,Clothing Store,Fish & Chips Shop
2,Thorncliffe Park,Indian Restaurant,Yoga Studio,Pizza Place,Bank,Burger Joint,Bus Line,Bus Station,Coffee Shop,Discount Store,Warehouse Store
3,"Woodbine Gardens, Ryerson",Pizza Place,Fast Food Restaurant,Rock Climbing Spot,Gastropub,Gym / Fitness Center,Intersection,Pet Store,Pharmacy,Café,Athletics & Sports
4,Woodbine Heights,Video Store,Beer Store,Spa,Skating Rink,Curling Ice,Cosmetics Shop,Park,Bus Stop,Yoga Studio,Clothing Store


Separate the neighborhoods in clusters

In [22]:
# set number of clusters
kclusters = 4

ey_grouped_clustering = ey_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(ey_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_ 

array([1, 0, 0, 3, 2], dtype=int32)

In [23]:
ey_merged = ey_data

# add clustering labels
ey_merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
ey_merged = ey_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

ey_merged.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4B,East York,"Woodbine Gardens, Ryerson",43.706397,-79.309937,1,Pizza Place,Fast Food Restaurant,Rock Climbing Spot,Gastropub,Gym / Fitness Center,Intersection,Pet Store,Pharmacy,Café,Athletics & Sports
1,M4C,East York,Woodbine Heights,43.695344,-79.318389,0,Video Store,Beer Store,Spa,Skating Rink,Curling Ice,Cosmetics Shop,Park,Bus Stop,Yoga Studio,Clothing Store
2,M4G,East York,Leaside,43.70906,-79.363452,0,Coffee Shop,Sporting Goods Shop,Sandwich Place,Burger Joint,Grocery Store,Brewery,Dessert Shop,Electronics Store,Clothing Store,Fish & Chips Shop
3,M4H,East York,Thorncliffe Park,43.705369,-79.349372,3,Indian Restaurant,Yoga Studio,Pizza Place,Bank,Burger Joint,Bus Line,Bus Station,Coffee Shop,Discount Store,Warehouse Store
4,M4J,East York,East Toronto,43.685347,-79.338106,2,Park,Furniture / Home Store,Convenience Store,Clothing Store,Fish & Chips Shop,Fast Food Restaurant,Electronics Store,Discount Store,Dessert Shop,Curling Ice


In [24]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(ey_merged['Latitude'], ey_merged['Longitude'], ey_merged['Neighborhood'], ey_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters