**Here, I import the libraries that we'll need for all 3 parts of the assignment.**

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import folium
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

** In the next few cells, I will create a data frame based on the table from 
the Wikipedia article. **

In [2]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

soup = BeautifulSoup(source,'lxml')
td_elements = soup.find('table').find_all('td')

content = []
data = []

for el in td_elements:
    content.append(el.text.split('\n')[0])

# Create rough draft of data frame
for j in range(0,len(content),3):
    Postcode = content[j]
    Borough = content[j+1]
    Neighborhood = content[j+2]
    
    # 'Process' if Borough is assigned
    if Borough != 'Not assigned':
    
        # Set Neighborhood to Borough if Neighborhood is not assigned
        if Neighborhood == 'Not assigned':
            Neighborhood = Borough 
    
        data.append({'Postcode':Postcode,'Borough':Borough,'Neighborhood':Neighborhood})
                   
df = pd.DataFrame(data)

print('This is the first 5 rows of the first draft of the dataframe:')
df.head()

This is the first 5 rows of the first draft of the dataframe:


Unnamed: 0,Borough,Neighborhood,Postcode
0,North York,Parkwoods,M3A
1,North York,Victoria Village,M4A
2,Downtown Toronto,Harbourfront,M5A
3,Downtown Toronto,Regent Park,M5A
4,North York,Lawrence Heights,M6A


In [3]:
# Change order of the columns of the data frame to the right order
df = pd.DataFrame([df['Postcode'],df['Borough'],df['Neighborhood']]).T

print('This is the first 5 rows of the second draft of the dataframe:')
df.head()

This is the first 5 rows of the second draft of the dataframe:


Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


** In the next 2 cells, I create a subset of the above data frame where each borough contains the word 'Toronto', and add latitudes and longitudes corresponding to each neighborhood. **

In [4]:
# Create new data frame with Boroughs which contain the word 'Toronto'
# ('West Toronto', 'East Toronto', ...)
# and whose rows are ordered by 'Neighborhood' alphabetically

# List for Trues and Falses 
Toronto_TF_list = []

# Append True if 'Toronto' is found in the name of the Borough,
# otherwise append False

for Borough in df['Borough']:
    if Borough.find('Toronto') == -1:
        Toronto_TF_list.append(False)
    else:
        Toronto_TF_list.append(True)

tdf = df[Toronto_TF_list]
tdf.sort_values('Neighborhood',inplace = True)
tdf.reset_index(drop = True, inplace = True)

print('First 5 rows of tdf:')
tdf.head()

First 5 rows of tdf:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Postcode,Borough,Neighborhood
0,M5H,Downtown Toronto,Adelaide
1,M5V,Downtown Toronto,Bathurst Quay
2,M5E,Downtown Toronto,Berczy Park
3,M6K,West Toronto,Brockton
4,M7Y,East Toronto,Business reply mail Processing Centre969 Eastern


In [5]:
# Populate dataframe with latitudes and longitudes
# of the neighborhoods using geopy.geocoders

from geopy.geocoders import Nominatim
latitudes = []
longitudes =[]

for i in range(0,len(tdf)):
        address = '{}, Toronto'.format(tdf['Neighborhood'][i])
        geolocator = Nominatim()
        location = geolocator.geocode(address)
        if (location != None):
            latitudes.append(location.latitude)
            longitudes.append(location.longitude)
        else:
            latitudes.append('')
            longitudes.append('')

tdf['Latitude'] = latitudes
tdf['Longitude'] = longitudes

print('First 5 rows of tdf')
tdf.head()



First 5 rows of tdf


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M5H,Downtown Toronto,Adelaide,43.6503,-79.3805
1,M5V,Downtown Toronto,Bathurst Quay,43.6358,-79.3983
2,M5E,Downtown Toronto,Berczy Park,43.648,-79.3754
3,M6K,West Toronto,Brockton,43.651,-79.44
4,M7Y,East Toronto,Business reply mail Processing Centre969 Eastern,,


In [7]:
# Create a map for the subset of neighborhoods in Toronto which 
# are not missing coordinates and whose Borough contained 'Toronto' 

tdf = tdf[(tdf['Latitude'] != '') & (tdf['Longitude'] != '')]
tdf.reset_index(drop = True,inplace = True)

Toronto_Map = folium.Map(location=[tdf['Latitude'].mean(), tdf['Longitude'].mean()], zoom_start=3)

# Add markers to map
for lat, lng, borough, neighborhood in zip(tdf['Latitude'], tdf['Longitude'], tdf['Borough'], tdf['Neighborhood']):
    label = '{} , {}'.format(neighborhood,borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(Toronto_Map)
        
Toronto_Map

**Some latitudes and longitudes seem to be wrong, so I remove some rows below.**

In [8]:
# Remove entries whose coordinates are probably wrong
tdf = tdf[(tdf['Neighborhood'] != 'Railway Lands')&\
(tdf['Neighborhood'] != 'North Midtown')&\
(tdf['Neighborhood'] != 'South Niagara')]

tdf.reset_index(drop = True,inplace = True)

In [13]:
# Create a map for the subset of neighborhoods in Toronto which 
# do not have any coordinates missing or likely wrong coordinates, and whose Borough 
# contained 'Toronto'
Toronto_Map = folium.Map(location=[tdf['Latitude'].mean(), tdf['Longitude'].mean()], zoom_start=10)

# Add markers to map
for lat, lng, borough, neighborhood in zip(tdf['Latitude'], tdf['Longitude'], tdf['Borough'], tdf['Neighborhood']):
    label = '{} , {}'.format(neighborhood,borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='orange',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(Toronto_Map)
 
Toronto_Map

** Some of the latitudes and longitudes still seem to be incorrect. For example, the left most point and top most point 
are both supposedly part of Downtown Toronto yet the nearest 4 points to the left most point are supposedly part of West Toronto.**

**For the sake of not spending too much time cleaning the data, lets continue anyways.** 

In [10]:
CLIENT_ID = 'Z3DIG2FE2ODBVM4WHAHRBVAYMPFMQFE4FZERKYKPXRQBIVCL' # your Foursquare ID
CLIENT_SECRET = 'TRLMJL4F13DZSBEJBFSLDFSMXZJHS454BWV4EU03WBT00XIY' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

**For the rest of the notebook, I often use the code given in the ungraded lab for Week 3 (but with some minor changes like name changes) **

In [11]:
def getNearbyVenues(names, latitudes, longitudes, radius= 100):
    
    LIMIT = 100
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
            
        # Create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # Make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # Return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

**Here, I create a dataframe containing some venues within some distance of the coordinates of each Neighborhood, and order the rows by Neighborhood in alphabetical order.**

In [12]:
Some_Toronto_Venues = getNearbyVenues(tdf['Neighborhood'],tdf['Latitude'],tdf['Longitude'])
Some_Toronto_Venues.sort_values('Neighborhood',inplace = True)
Some_Toronto_Venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Adelaide,43.650298,-79.380477,John & Sons Oyster House,43.650656,-79.381613,Seafood Restaurant
10,Adelaide,43.650298,-79.380477,Ben McNally Books,43.65085,-79.381178,Bookstore
8,Adelaide,43.650298,-79.380477,Sweet Lulu,43.650557,-79.381175,Asian Restaurant
7,Adelaide,43.650298,-79.380477,Cloud Gardens,43.65107,-79.37988,Park
6,Adelaide,43.650298,-79.380477,Mercatto,43.650243,-79.38082,Italian Restaurant


In [14]:
print('Number of rows of Some_Toronto_Venues:',Some_Toronto_Venues.shape[0])

Number of rows of Some_Toronto_Venues: 402


** In the next couple of cells, I make a data frame based on the one above for use in Kmeans clustering**

In [15]:
# One hot encoding
Toronto_onehot = pd.get_dummies(Some_Toronto_Venues[['Venue Category']], prefix="", prefix_sep="")

# Add neighborhood column back to dataframe
Toronto_onehot['Neighborhood'] = Some_Toronto_Venues['Neighborhood'] 

Toronto_onehot = Toronto_onehot[['Neighborhood'] + list(Toronto_onehot.columns[Toronto_onehot.columns != 'Neighborhood'])]

Toronto_grouped = Toronto_onehot.groupby('Neighborhood').mean().reset_index()
Toronto_grouped.head()

Unnamed: 0,Neighborhood,Adult Boutique,African Restaurant,American Restaurant,Amphitheater,Aquarium,Art Gallery,Art Museum,Asian Restaurant,Athletics & Sports,...,Thai Restaurant,Theater,Theme Park,Toy / Game Store,Train Station,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,Adelaide,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.090909,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Bathurst Quay,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.066667,0.066667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Brockton,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0
4,CN Tower,0.0,0.0,0.083333,0.0,0.333333,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0


In [16]:
# Add empty Latitude and Longitude columns
Toronto_grouped['Latitude']=''
Toronto_grouped['Latitude']=''

# Change order of columns of Toronto_grouped
Toronto_grouped = Toronto_grouped.loc[:,['Neighborhood','Latitude','Longitude']\
+ list(Toronto_grouped.columns[(Toronto_grouped.columns != 'Latitude')&\
(Toronto_grouped.columns != 'Longitude') &\
(Toronto_grouped.columns != 'Neighborhood')])]

# Populate Latitude and Longitude columns of Toronto_grouped
for i,Neighborhood in enumerate(Toronto_grouped['Neighborhood']):  
    j = 0
    while(j < len(tdf)):
        if tdf['Neighborhood'][j] == Neighborhood:
            Toronto_grouped['Latitude'][i] = tdf['Latitude'][j]
            Toronto_grouped['Longitude'][i] = tdf['Longitude'][j]
            break
        else:
            j = j + 1   

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


**In the next few cells, I classify each point as being part of one of 5 groups and display the results visually.**

In [17]:
# Set number of clusters
kclusters = 5

Toronto_grouped_clustering = Toronto_grouped.drop(['Neighborhood','Latitude','Longitude'], 1)

# Run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Toronto_grouped_clustering)

# Check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 3, 0, 3, 0, 0, 0, 0, 0, 0], dtype=int32)

In [18]:
# Add column for cluster labels
Toronto_grouped['Cluster Labels'] = kmeans.labels_

# Change order of columns
Toronto_grouped = Toronto_grouped.loc[:,['Neighborhood','Latitude','Longitude', 'Cluster Labels']\
+ list(Toronto_grouped.columns[(Toronto_grouped.columns != 'Latitude')&\
(Toronto_grouped.columns != 'Longitude') &\
(Toronto_grouped.columns != 'Neighborhood')&
(Toronto_grouped.columns != 'Cluster Labels')])]

Toronto_grouped.head()

Unnamed: 0,Neighborhood,Latitude,Longitude,Cluster Labels,Adult Boutique,African Restaurant,American Restaurant,Amphitheater,Aquarium,Art Gallery,...,Thai Restaurant,Theater,Theme Park,Toy / Game Store,Train Station,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,Adelaide,43.6503,-79.380477,0,0.0,0.0,0.090909,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Bathurst Quay,43.6358,-79.398329,3,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Berczy Park,43.648,-79.375385,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.066667,0.066667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Brockton,43.651,-79.440029,3,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0
4,CN Tower,43.6426,-79.387087,0,0.0,0.0,0.083333,0.0,0.333333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0


In [19]:
# Create map
map_clusters = folium.Map(location=[tdf['Latitude'].mean(), tdf['Longitude'].mean()], zoom_start=11)

# Set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto_grouped['Latitude'], Toronto_grouped['Longitude'], Toronto_grouped['Neighborhood'], Toronto_grouped['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters