# Part 1
Scraping Toronto Neighborhood data from https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M and transferring it to a Pandas dataframe

In [1]:
pip install mechanize

Collecting mechanize
  Downloading mechanize-0.4.5-py2.py3-none-any.whl (109 kB)
[K     |████████████████████████████████| 109 kB 17.1 MB/s eta 0:00:01
Installing collected packages: mechanize
Successfully installed mechanize-0.4.5
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as bs
import mechanize as mech

In [3]:
# Fetch the Wikipedia page listing the postal codes that start with "M"
postal_codes_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
webpage = mech.urlopen(postal_codes_url)

In [4]:
# Column layout of the neighborhoods dataframe
columns = ['Postal Code', 'Borough', 'Neighborhood']

# Pass the webpage to BeautifulSoup. The parser is named explicitly so the
# result does not depend on which optional parsers happen to be installed.
page_stream = bs(webpage.read(), 'html.parser')
rows = page_stream.table.find_all('tr')

# Collect the rows in a plain list and build the dataframe once at the end:
# growing a DataFrame row by row is quadratic, and DataFrame.append no longer
# exists in pandas 2.x.
records = []
for row in rows[1:]:
    # get_text() also works for cells that wrap their text in nested tags,
    # where .string would be None
    data = [cell.get_text().strip() for cell in row.find_all('td')]
    if len(data) < 3:
        # not a data row (e.g. a header or spacer row)
        continue

    postal_code, borough, neighborhood = data[:3]
    if borough == 'Not assigned':
        # postal codes without a borough are left out entirely
        continue
    if neighborhood == 'Not assigned':
        # a borough without a named neighborhood uses the borough name instead
        neighborhood = borough

    records.append({'Postal Code': postal_code, 'Borough': borough, 'Neighborhood': neighborhood})

toronto_neighborhoods = pd.DataFrame(records, columns=columns)

In [5]:
# Sanity check: (rows, columns) of the scraped neighborhoods table
toronto_neighborhoods.shape

(103, 3)

# Part 2
Adding the latitude and longitude of each postal code from http://cocl.us/Geospatial_data

In [6]:
# Download the geospatial CSV quietly (-q) and save it as postal_code_data.csv (-O)
!wget -q -O 'postal_code_data.csv' http://cocl.us/Geospatial_data

In [7]:
# Load the downloaded coordinates file
postal_code_data = pd.read_csv('postal_code_data.csv')

# Key the coordinates by postal code, then attach them to every neighborhood row
coordinates_by_postal_code = postal_code_data.set_index('Postal Code')
toronto_neighborhoods = toronto_neighborhoods.join(coordinates_by_postal_code, on='Postal Code')

# Part 3
Exploring and clustering the Toronto neighborhoods using Foursquare venue data

In [8]:
# Sanity check: the join should keep the row count and add the columns from postal_code_data
toronto_neighborhoods.shape

(103, 5)

In [9]:
pip install folium

Collecting folium
  Downloading folium-0.12.1-py2.py3-none-any.whl (94 kB)
[K     |████████████████████████████████| 94 kB 4.0 MB/s  eta 0:00:01
Collecting branca>=0.3.0
  Downloading branca-0.4.2-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.4.2 folium-0.12.1
Note: you may need to restart the kernel to use updated packages.


In [10]:
import requests
import folium
from sklearn.cluster import KMeans

In [29]:
# Empty table of venues: one (neighborhood, venue category) pair per row
venue_columns = ['Neighborhood', 'Venue Category']
places = pd.DataFrame(columns=venue_columns)

In [30]:
url = "https://api.foursquare.com/v2/venues/search"

# API credentials are read from the environment instead of being stored in the
# notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before running.
# NOTE(review): keys that were previously committed in this cell should be
# treated as compromised and rotated.
params = dict(
    client_id = os.environ['FOURSQUARE_CLIENT_ID'],
    client_secret = os.environ['FOURSQUARE_CLIENT_SECRET'],
    v = '20180605',
    ll = '',
    radius = 500,
    limit = 100
    )

# Rows are gathered in a list and turned into a dataframe once at the end:
# appending to a DataFrame inside the loop is quadratic (and DataFrame.append
# is gone in pandas 2.x). Rebuilding `places` from scratch also makes the cell
# safe to re-run without duplicating rows.
venue_records = []
skipped_neighborhoods = []

for neighborhood, lat, long in zip(toronto_neighborhoods['Neighborhood'], toronto_neighborhoods['Latitude'], toronto_neighborhoods['Longitude']):
    params['ll'] = str(lat) + ", " + str(long)

    try:
        # the timeout keeps one stalled request from hanging the whole notebook
        venues = requests.get(url, params = params, timeout = 10).json()['response']['venues']
    except (ValueError, KeyError):
        # body was not JSON, or it carried no venue list (e.g. an API error
        # payload): skip this neighborhood but remember it for the report below
        skipped_neighborhoods.append(neighborhood)
        continue

    for venue in venues:
        categories = venue.get('categories')
        if not categories:
            # an uncategorized venue is skipped on its own; it must not discard
            # the remaining venues of the same neighborhood
            continue
        venue_records.append({'Neighborhood': neighborhood, 'Venue Category': categories[0]['name']})

places = pd.DataFrame(venue_records, columns=['Neighborhood', 'Venue Category'])

if skipped_neighborhoods:
    print("No venue data for {} neighborhood(s): {}".format(len(skipped_neighborhoods), skipped_neighborhoods))

### Compute the mean frequency of each venue category in each neighborhood

In [31]:
# One-hot encode the venue category, then average the indicator columns per
# neighborhood so every value is the share of that neighborhood's venues that
# fall in the category.
places = (
    pd.get_dummies(places, columns=['Venue Category'], prefix="")
    .groupby(by='Neighborhood')
    .mean()
    .reset_index()
)

In [35]:
# Display the top 5 most common amenities in each neighborhood
num_top_venues = 5
print(places.head())

# Index by neighborhood so each row holds only that neighborhood's frequencies.
# Iterating over the rows avoids re-filtering the whole frame per neighborhood
# and avoids assigning into a sliced frame (the source of SettingWithCopyWarning).
venue_frequencies = places.set_index('Neighborhood')

for neighborhood, frequencies in venue_frequencies.iterrows():
    print("----"+neighborhood+"----")
    # two-column table: venue type and its frequency rounded to 2 decimals
    temp = (
        frequencies.astype(float)
        .round(2)
        .rename_axis('venue')
        .reset_index(name='freq')
    )
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

                                      Neighborhood  _ATM  _Accessories Store  \
0                                        Agincourt   0.0                 0.0   
1                           Alderwood, Long Branch   0.0                 0.0   
2  Bathurst Manor, Wilson Heights, Downsview North   0.0                 0.0   
3                                  Bayview Village   0.0                 0.0   
4                Bedford Park, Lawrence Manor East   0.0                 0.0   

   _Adult Boutique  _Advertising Agency  _African Restaurant  _Airport  \
0              0.0                  0.0                  0.0       0.0   
1              0.0                  0.0                  0.0       0.0   
2              0.0                  0.0                  0.0       0.0   
3              0.0                  0.0                  0.0       0.0   
4              0.0                  0.0                  0.0       0.0   

   _Airport Food Court  _Airport Gate  _Airport Lounge  ...  \
0          

### Cluster neighborhoods into 5 groups

In [80]:
# Use only the venue-frequency columns as features. 'Cluster' is dropped as well
# when present, so re-running this cell after the labels were attached to
# `places` does not feed the previous labels back into the model.
features = places.drop(columns=['Neighborhood', 'Cluster'], errors='ignore')

# A fixed random_state makes the cluster assignment reproducible between runs.
kmeans = KMeans(init="k-means++", n_clusters=5, max_iter=10, random_state=0)
kmeans.fit(features)

KMeans(max_iter=10, n_clusters=5)

In [84]:
# Attach each neighborhood's cluster label right after the 'Neighborhood' column.
# DataFrame.insert raises if the column already exists, so a 'Cluster' column
# left over from an earlier run is dropped first to keep the cell re-runnable.
if 'Cluster' in places.columns:
    places = places.drop(columns=['Cluster'])
places.insert(loc=1, column='Cluster', value=kmeans.labels_)

### Display top 5 venue types in each cluster

In [85]:
def find_top_five(row, n=5):
    """Return the labels of the highest-valued venue types in a row.

    The identifier entries 'Neighborhood' and 'Cluster' are removed by label
    (when present) rather than by position. Skipping a fixed number of leading
    entries silently dropped the first venue type whenever the row carried only
    one identifier column.

    Parameters
    ----------
    row : pandas.Series
        One row whose index holds venue-type names, optionally together with
        'Neighborhood' and/or 'Cluster'.
    n : int, default 5
        How many venue types to return.

    Returns
    -------
    numpy.ndarray
        Up to ``n`` venue-type labels ordered from highest to lowest value.
    """
    venue_types = row.drop(labels=['Neighborhood', 'Cluster'], errors='ignore')
    venue_types_sorted = venue_types.sort_values(ascending=False)

    return venue_types_sorted.index.values[0:n]

In [86]:
# Ordinal suffixes for 1, 2 and 3; every later rank uses "th"
endings = ['st', 'nd', 'rd']

columns = ['Cluster']

# Build the headers "1st ...", "2nd ...", "3rd ...", "4th ...", "5th ...".
# The comparison is inclusive so that rank 3 picks up 'rd' instead of "3th".
for i in range(1, 6):
    if i <= len(endings):
        columns.append("{}{} most common venue type".format(i, endings[i - 1]))
    else:
        columns.append("{}th most common venue type".format(i))

most_common_venues_by_cluster = pd.DataFrame(columns=columns)

# Add up the venue frequencies of all neighborhoods that share a cluster
venues_by_cluster = places.drop(columns=['Neighborhood']).groupby(by='Cluster').sum().reset_index()
most_common_venues_by_cluster['Cluster'] = venues_by_cluster['Cluster']

# Fill the five ranking columns of each cluster row
for i in range(venues_by_cluster.shape[0]):
    most_common_venues_by_cluster.iloc[i, 1:] = find_top_five(venues_by_cluster.iloc[i,:])

most_common_venues_by_cluster

Unnamed: 0,Cluster,1st most common venue type,2nd most common venue type,3th most common venue type,4th most common venue type,5th most common venue type
0,0,_Coffee Shop,_Café,_Pharmacy,_Dentist's Office,_Bank
1,1,_Park,_Church,_Dog Run,_Hospital,_Furniture / Home Store
2,2,_Beach,_Café,_Playground,_Yoga Studio,_Dog Run
3,3,_Automotive Shop,_Playground,_Church,_Fast Food Restaurant,_Elementary School
4,4,_Residential Building (Apartment / Condo),_Park,_Office,_Coffee Shop,_Liquor Store


### Display the clusters on a map


In [57]:
# NOTE(review): this cell cannot run on a fresh kernel as written. `kclusters` and
# `manhattan_merged` are not defined in any cell visible in this notebook, and
# `latitude`, `longitude`, `cm` and `colors` are only defined by a later cell.
# It looks like a leftover from another notebook - TODO confirm, then remove it
# or port it to the Toronto data.
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
# (ys is only used for its length: one evenly spaced rainbow color per entry)
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(manhattan_merged['Latitude'], manhattan_merged['Longitude'], manhattan_merged['Neighborhood'], manhattan_merged['Cluster Labels']):
    # popup text shown when a marker is clicked
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

Unnamed: 0,Neighborhood,Cluster,_ATM,_Accessories Store,_Adult Boutique,_Advertising Agency,_African Restaurant,_Airport,_Airport Food Court,_Airport Gate,...,_Vegetarian / Vegan Restaurant,_Veterinarian,_Video Game Store,_Vietnamese Restaurant,_Warehouse,_Wine Bar,_Wine Shop,_Winery,_Women's Store,_Yoga Studio
0,Agincourt,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Alderwood, Long Branch",3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Bedford Park, Lawrence Manor East",3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.032258,0.0,0.0,0.0,0.0,0.0,0.0


In [93]:
from geopy.geocoders import Nominatim
import matplotlib.cm as cm
import matplotlib.colors as colors

address = 'Toronto, CA'

# Look up the coordinates used to center the map
geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
if location is None:
    # the geocoder found nothing: center on the neighborhoods themselves
    latitude = toronto_neighborhoods['Latitude'].mean()
    longitude = toronto_neighborhoods['Longitude'].mean()
else:
    latitude = location.latitude
    longitude = location.longitude

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per
# cluster, taken from the fitted model rather than a hard-coded count
num_clusters = kmeans.n_clusters
colors_array = cm.rainbow(np.linspace(0, 1, num_clusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# legend: cluster i is drawn with colors_array[i]
for i in range(num_clusters):
    print("Cluster {} is {}".format(i, colors_array[i]))

# Match cluster labels to coordinates by neighborhood name. `places` is grouped
# (and therefore sorted) by neighborhood and can have fewer rows than
# `toronto_neighborhoods`, so pairing the two frames row by row would attach
# labels to the wrong neighborhoods. Neighborhoods without a label are left out.
clustered_neighborhoods = toronto_neighborhoods.merge(
    places[['Neighborhood', 'Cluster']], on='Neighborhood', how='inner')

# add markers to the map
for lat, lon, poi, cluster in zip(clustered_neighborhoods['Latitude'], clustered_neighborhoods['Longitude'], clustered_neighborhoods['Neighborhood'], clustered_neighborhoods['Cluster']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # index the palette with the label itself so the markers agree with the
    # legend printed above
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

Cluster 0 is [0.5 0.  1.  1. ]
Cluster 1 is [0.00196078 0.70928131 0.92328911 1.        ]
Cluster 2 is [0.50392157 0.99998103 0.70492555 1.        ]
Cluster 3 is [1.         0.70054304 0.37841105 1.        ]
Cluster 4 is [1.0000000e+00 1.2246468e-16 6.1232340e-17 1.0000000e+00]
