# Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto

In [1]:
import pandas as pd
import numpy as np

## **Submission 1**

**1. Scrape Wikipedia page and transform the data into a pandas dataframe**

In [2]:
#!conda install -c conda-forge lxml --yes

# read_html returns a list of the tables parsed from the page; the first one is kept.
wiki_tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
df_wiki = wiki_tables[0]

**2. Ignore cells with a borough that is 'Not assigned'**

In [3]:
# Keep only the rows whose borough is assigned, then renumber the index from 0.
has_borough = df_wiki['Borough'] != 'Not assigned'
df_wiki_clean = df_wiki[has_borough].reset_index(drop=True)

**3. If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.**

In [4]:
# Index labels of the rows whose neighborhood is still 'Not assigned'.
unassigned_idx = df_wiki_clean.index[df_wiki_clean['Neighborhood'] == 'Not assigned']

# Copy the borough name into the neighborhood column for those rows in one step.
df_wiki_clean.loc[unassigned_idx, 'Neighborhood'] = df_wiki_clean.loc[unassigned_idx, 'Borough']

# Report each row that was filled in.
for idx in unassigned_idx:
    print('Neighborhood of index ', idx, ' is assigned')

df_wiki_clean.head(12)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [5]:
# Remark: several postal codes share the same neighborhood name, for example, Downsview.
#df_wiki_clean.groupby('Neighborhood').count().reset_index().head(30)

**4. Print the number of rows of your dataframe**

In [6]:
# Report how many rows remain after cleaning.
row_count = len(df_wiki_clean)
print('The cleaned dataframe has ', row_count, ' rows.')

The cleaned dataframe has  103  rows.


## **Submission 2**

**Use the Geocoder package to get the latitude and longitude coordinates of given postal code**

**1. Download the csv file**

In [7]:
# Fetch the geospatial CSV with the shell's wget (-q: quiet, -O: output file name).
!wget -q -O 'Geospatial_data.csv' http://cocl.us/Geospatial_data
# NOTE(review): this message is printed unconditionally; it does not confirm that wget succeeded.
print('Data downloaded!')

Data downloaded!


**2. Read the csv file into a dataframe**

In [8]:
# Load the CSV file written by the download step into a dataframe.
geospatial_csv = 'Geospatial_data.csv'
df_geospatial = pd.read_csv(geospatial_csv)

**3. Assign the geospatial coordinates to the Neighborhood dataframe**

In [9]:
# Index the coordinates by postal code so they can be looked up from the
# 'Postal Code' column of the cleaned Wikipedia table.
coords_by_postal_code = df_geospatial.set_index('Postal Code')
df_wiki_geo = df_wiki_clean.join(coords_by_postal_code, on='Postal Code')

df_wiki_geo.head(12)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


## **Submission 3**

**Explore and cluster the neighborhoods in Toronto.**

**1. Before we get the data and start exploring it, let's download all the dependencies that we will need.**

In [10]:
import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!pip -q install folium
import folium # map rendering library

**2. Use geopy library to get the latitude and longitude values of Toronto.**

In [11]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)

# geocode() returns None when the address cannot be resolved; fail with a clear
# message instead of an AttributeError on the attribute accesses below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


**3. Create a map of Toronto with neighborhoods**

In [12]:
# Base map centred on the Toronto coordinates obtained from the geocoder.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One circle marker per row of df_wiki_geo, with a "neighborhood, borough" popup.
for lat, lng, borough, neighborhood in zip(df_wiki_geo['Latitude'], df_wiki_geo['Longitude'], df_wiki_geo['Borough'], df_wiki_geo['Neighborhood']):
    label_text = '{}, {}'.format(neighborhood, borough)
    # parse_html belongs to the Popup; it is not a CircleMarker option, so it is
    # no longer passed to CircleMarker below.
    popup = folium.Popup(label_text, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto

**4. Define Foursquare Credentials and Version**

In [13]:
# The code was removed by Watson Studio for sharing.

**5. Explore the neighborhoods in our dataframe**

In [14]:
def getNearbyVenues(postal_codes, names, latitudes, longitudes, radius=2000, limit=None):
    """Query the Foursquare ``venues/explore`` endpoint around each location.

    Relies on the module-level ``CLIENT_ID``, ``CLIENT_SECRET`` and ``VERSION``
    credentials, and on the ``requests`` and ``pd`` imports of the notebook.

    Parameters
    ----------
    postal_codes, names, latitudes, longitudes : iterable
        Parallel sequences describing each location. They are consumed with
        ``zip``, so iteration stops at the shortest one.
    radius : int, default 2000
        Value sent as the ``radius`` query parameter (reported in metres by
        the "no venues" message).
    limit : int, optional
        Value sent as the ``limit`` query parameter. When omitted, the
        module-level ``LIMIT`` is used.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Postal Code',
        'Neighborhood', 'Postal Code Latitude', 'Postal Code Longitude',
        'Venue', 'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame has these columns and zero rows when nothing is returned.
        'Venue Category' is None for a venue that carries no category.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    columns = ['Postal Code',
               'Neighborhood',
               'Postal Code Latitude',
               'Postal Code Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    if limit is None:
        # Fall back to the notebook-level setting.
        limit = LIMIT

    rows = []
    for postal_code, name, lat, lng in zip(postal_codes, names, latitudes, longitudes):
        # Let requests build and encode the query string.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': limit,
            },
            timeout=30,
        )
        # Surface HTTP errors here rather than as a KeyError on the payload below.
        response.raise_for_status()

        results = response.json()['response']['groups'][0]['items']
        if len(results) == 0:
            print('Foursquare does not find any venues within the radius ', radius,' m near ', postal_code, name)

        # Keep only the relevant information for each nearby venue.
        for item in results:
            venue = item['venue']
            categories = venue.get('categories') or []
            # A venue without any category would otherwise raise IndexError.
            category = categories[0]['name'] if categories else None
            rows.append((
                postal_code,
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category))

    # Passing the columns to the constructor also works when `rows` is empty,
    # whereas assigning `.columns` on an empty frame raises a length mismatch.
    return pd.DataFrame(rows, columns=columns)

**6. Get the venues of each neighborhood and create a new dataframe called *toronto_venues*.**

In [15]:
# Number of venues requested per location; getNearbyVenues reads this global.
LIMIT = 5

# Collect the venues around every postal code of the geocoded table.
toronto_venues = getNearbyVenues(
    postal_codes=df_wiki_geo['Postal Code'],
    names=df_wiki_geo['Neighborhood'],
    latitudes=df_wiki_geo['Latitude'],
    longitudes=df_wiki_geo['Longitude'],
)

toronto_venues.head()

Unnamed: 0,Postal Code,Neighborhood,Postal Code Latitude,Postal Code Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M3A,Parkwoods,43.753259,-79.329656,Allwyn's Bakery,43.75984,-79.324719,Caribbean Restaurant
1,M3A,Parkwoods,43.753259,-79.329656,Donalda Golf & Country Club,43.752816,-79.342741,Golf Course
2,M3A,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
3,M3A,Parkwoods,43.753259,-79.329656,Island Foods,43.745866,-79.346035,Caribbean Restaurant
4,M3A,Parkwoods,43.753259,-79.329656,Galleria Supermarket,43.75352,-79.349518,Supermarket


**7. Analyze each neighborhood**

In [16]:
# One-hot encode the venue categories: one indicator column per category.
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
toronto_onehot.insert(loc=0, column='Postal Code', value=toronto_venues['Postal Code'])

# Group rows by postal code; the mean of each indicator column is the frequency
# of occurrence of that category among the postal code's venues.
toronto_grouped = toronto_onehot.groupby('Postal Code').mean().reset_index()

# Remove the 'Neighborhood' feature. Such a column exists only if a venue
# category of that name was returned, so a missing column must not raise
# (errors='ignore'). When present it has to go, because it would clash with the
# 'Neighborhood' column of df_wiki_geo when the two frames are joined.
toronto_grouped = toronto_grouped.drop(columns='Neighborhood', errors='ignore')

**8. Cluster neighborhoods: Run *k*-means to cluster the neighborhoods into 5 clusters.**

In [17]:
# Number of clusters.
kclusters = 5

# Feature matrix: every column except the 'Postal Code' key. 'Cluster Labels'
# is also excluded when present, so that re-running this cell after the labels
# have been attached does not feed the previous labels back in as a feature.
toronto_features = (
    toronto_grouped
    .drop(columns='Postal Code')
    .drop(columns='Cluster Labels', errors='ignore')
)

# Run k-means clustering. n_init is set explicitly because its default value
# differs between scikit-learn releases.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(toronto_features)

# Cluster labels of the first ten rows.
kmeans.labels_[0:10]

array([1, 3, 0, 2, 2, 2, 1, 2, 0, 2], dtype=int32)

**9. Let's create a new dataframe that includes the cluster as well as the venues for each neighborhood.**

In [18]:
# add clustering labels
toronto_grouped.insert(0, 'Cluster Labels', kmeans.labels_)
#toronto_grouped.head()

In [19]:
# Look up each row's cluster label and category frequencies by postal code and
# append them to the table that carries borough, neighborhood and coordinates.
cluster_features = toronto_grouped.set_index('Postal Code')
toronto_merged = df_wiki_geo.join(cluster_features, on='Postal Code')

toronto_merged.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,Afghan Restaurant,Airport,American Restaurant,Arts & Crafts Store,...,Trail,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Warehouse Store,Wings Joint,Xinjiang Restaurant,Zoo,Zoo Exhibit
0,M3A,North York,Parkwoods,43.753259,-79.329656,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4A,North York,Victoria Village,43.725882,-79.315572,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,4,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,2,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**10. Finally, let's visualize the resulting clusters**

In [20]:
# Base map centred on Toronto.
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# One hex colour per cluster, evenly spaced along matplotlib's rainbow colormap.
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]

# Rows without a cluster label (NaN after the join, i.e. postal codes absent
# from the clustered table) cannot be coloured and are skipped; the remaining
# labels are cast to int so they can index the colour list.
clustered = toronto_merged.dropna(subset=['Cluster Labels'])

# Add one marker per clustered row.
for lat, lon, poi, cluster in zip(clustered['Latitude'], clustered['Longitude'], clustered['Neighborhood'], clustered['Cluster Labels'].astype(int)):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Colour index is cluster-1, as before: cluster 0 wraps around to the last
    # colour through negative indexing.
    marker_color = rainbow[cluster-1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters