# **Segmenting and Clustering Neighborhoods in Toronto, Canada**

### PART 1 : CLEANING UP THE DATASET 

**Step 1** : Import the relevant libraries.

In [1]:
import pandas as pd # library for tabular data analysis (DataFrames)
import requests # library to send HTTP requests
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import json # library to handle JSON files
from bs4 import BeautifulSoup # library to parse HTML and XML documents
import folium # map rendering library
print('Libraries imported.')

Libraries imported.


**Step 2** : Download the dataset and transform it into a _pandas_ DataFrame.

In [2]:
# Download the Wikipedia page that lists the postal codes starting with "M".
# The timeout keeps the cell from hanging forever on a stalled connection.
website_url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
# Fail fast on an HTTP error instead of parsing an error page.
website_url.raise_for_status()
html_content = website_url.content
html_soup = BeautifulSoup(html_content, 'html.parser')
# find() returns None when nothing matches, so check before the next cell uses the table.
Wiki_table = html_soup.find('table', class_="wikitable sortable")
if Wiki_table is None:
    raise ValueError('No table with class "wikitable sortable" was found on the page.')

In [3]:
# read_html returns a list of DataFrames; the HTML passed in holds a single
# table, so the first entry is the postal code table.
parsed_tables = pd.read_html(str(Wiki_table))
df = parsed_tables[0]
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


**Step 3** : Rename the relevant columns so that they read PostalCode, Borough, and Neighborhood.

In [4]:
# Only 'Postal code' needs renaming; the other two columns already have the target names.
column_renames = {'Postal code': 'PostalCode'}
df = df.rename(columns=column_renames)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


**Step 4** : Remove rows with a Borough that is Not assigned.

In [5]:
# Count the rows per borough value, including the 'Not assigned' placeholder.
df["Borough"].value_counts()

Not assigned        77
North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
East Toronto         5
York                 5
East York            5
Mississauga          1
Name: Borough, dtype: int64

In [6]:
# Keep only the rows whose borough is assigned. The explicit copy makes `df`
# an independent DataFrame, so later column assignments do not act on a slice
# of the unfiltered table (which triggers pandas' SettingWithCopyWarning).
df = df[df.Borough != 'Not assigned'].copy()
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


**Step 5** : Combine rows with the same PostalCode into one row, with the neighborhoods separated by commas. (This includes replacing the slash separator used in some cells.)

In [7]:
# Replace the " /" separator with "," so that "A / B" becomes "A, B".
# - assign() builds a new DataFrame instead of writing into `df`, which may be
#   a filtered slice of another frame (avoids SettingWithCopyWarning).
# - regex=False states explicitly that " /" is literal text; the default of
#   this flag differs between pandas versions.
df = df.assign(Neighborhood=df["Neighborhood"].str.replace(" /", ",", regex=False))
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [8]:
# One row per (PostalCode, Borough) pair; the remaining column is collapsed by
# joining its values with ', '. sort=False keeps the groups in order of first
# appearance.
by_postal_code = df.groupby(by=['PostalCode', 'Borough'], sort=False)
result = by_postal_code.agg(', '.join)
# Turn the group keys back into ordinary columns.
df = result.reset_index()
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


**Step 6** : If a borough has no assigned neighborhood, the neighborhood should be the same as the borough. (Note: the check below shows that no neighborhood is missing, so no replacement is needed.)

In [9]:
# Boolean frame marking every missing cell; for each column, print the column
# name followed by the tally of True (missing) / False (present) flags.
missing_data = df.isnull()
for column_name, is_missing in missing_data.items():
    print(column_name)
    print(is_missing.value_counts())
    print("")

PostalCode
False    103
Name: PostalCode, dtype: int64

Borough
False    103
Name: Borough, dtype: int64

Neighborhood
False    103
Name: Neighborhood, dtype: int64



In [10]:
# Dimensions of the cleaned table as a (rows, columns) tuple.
df.shape

(103, 3)

### PART 2 : LATITUDE AND LONGITUDE FOR EACH NEIGHBORHOOD

**Step 1** : Get the geospatial data.

In [11]:
# CSV file with one latitude/longitude pair per postal code.
filename = 'https://cocl.us/Geospatial_data'
df2 = pd.read_csv(filepath_or_buffer=filename)
df2.head(5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


**Step 2** : Merge geospatial data with postal codes data. 

In [12]:
# Give the key column the same name ("PostalCode") in both tables so they can be merged on it
postal_code_rename = {"Postal Code": "PostalCode"}
df2 = df2.rename(columns=postal_code_rename)
df2.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [13]:
# Left-join the coordinates onto the postal code table via "PostalCode";
# every row of df is kept, whether or not df2 has a matching postal code.
df3 = pd.merge(df, df2, how="left", on="PostalCode")
df3.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [14]:
# Dimensions of the merged table as a (rows, columns) tuple.
df3.shape

(103, 5)

### PART 3 : EXPLORE AND MAP THE NEIGHBORHOODS OF TORONTO

**Step 1** : Use geopy library to get the latitude and longitude values of Toronto.

In [15]:
address = 'Toronto'

# Resolve the address to coordinates through the Nominatim geocoding service.
geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address, timeout=10)
# geocode() returns None when the address cannot be resolved; stop with a
# clear message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


**Step 2** : Create a map of Toronto with neighborhoods superimposed on top.

In [16]:
# Base map centred on the coordinates geocoded for Toronto.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Draw one circle marker per row of df3; the popup text is "<Neighborhood>, <Borough>".
marker_rows = df3[['Latitude', 'Longitude', 'Borough', 'Neighborhood']]
for lat, lng, borough, neighborhood in marker_rows.itertuples(index=False, name=None):
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    )
    marker.add_to(map_toronto)

map_toronto

**Step 3** : Filter boroughs that contain the word Toronto.

In [17]:
# Distinct borough names, then keep those containing "toronto" (case-insensitive)
borough_names = df3['Borough'].unique().tolist()

borough_with_toronto = [name for name in borough_names if "toronto" in name.lower()]

borough_with_toronto

['Downtown Toronto', 'East Toronto', 'West Toronto', 'Central Toronto']

In [18]:
# Subset of df3 limited to the boroughs selected above, re-indexed from 0
in_selected_borough = df3['Borough'].isin(borough_with_toronto)
df4 = df3.loc[in_selected_borough].reset_index(drop=True)
print(df4.shape)
df4.head()

(39, 5)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [19]:
# Fresh base map centred on Toronto (replaces the map built from df3).
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Draw one circle marker per row of df4; the popup text is "<Neighborhood>, <Borough>".
marker_rows = df4[['Latitude', 'Longitude', 'Borough', 'Neighborhood']]
for lat, lng, borough, neighborhood in marker_rows.itertuples(index=False, name=None):
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    )
    marker.add_to(map_toronto)

map_toronto

**Step 4** : Use the Foursquare API to explore the venues around each neighborhood.

In [25]:
#Define Foursquare Credentials and Version
# The credentials are read from the environment variables FOURSQUARE_CLIENT_ID
# and FOURSQUARE_CLIENT_SECRET so they never have to be typed into (and saved
# with) the notebook. The blank placeholders remain the fallback.
import os  # local import: only this cell needs it

CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', ' ') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', ' ') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentials:')
print('CLIENT_ID: ' + CLIENT_ID)
# Never echo the secret itself: cell outputs are stored inside the notebook file.
print('CLIENT_SECRET: ' + ('<set>' if CLIENT_SECRET.strip() else '<not set>'))

Your credentials:
CLIENT_ID:  
CLIENT_SECRET: 


In [21]:
#Top 10 venues within a 500 m radius of each postal code's coordinates.
radius = 500  # search radius in metres
LIMIT = 10    # maximum number of venues requested per location

venues = []

url = "https://api.foursquare.com/v2/venues/explore"

for lat, long, post, borough, neighborhood in zip(df4['Latitude'], df4['Longitude'], df4['PostalCode'], df4['Borough'], df4['Neighborhood']):
    # Let requests build and URL-encode the query string instead of
    # formatting the credentials and coordinates into the URL by hand.
    response = requests.get(
        url,
        params={
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, long),
            'radius': radius,
            'limit': LIMIT,
        },
        timeout=30)
    # Surface HTTP errors (bad credentials, exhausted quota, ...) here rather
    # than as a KeyError while indexing into the JSON payload below.
    response.raise_for_status()

    results = response.json()["response"]['groups'][0]['items']

    for venue in results:
        # A venue may come back without any category; record None instead of
        # failing with an IndexError on categories[0].
        categories = venue['venue']['categories']
        venues.append((
            post,
            borough,
            neighborhood,
            lat,
            long,
            venue['venue']['name'],
            venue['venue']['location']['lat'],
            venue['venue']['location']['lng'],
            categories[0]['name'] if categories else None))

In [22]:
#Convert venues list into a new DataFrame
# The column names are passed to the constructor so that an empty `venues`
# list still yields a (0, 9) frame; assigning nine names afterwards to a frame
# built from an empty list raises a length-mismatch ValueError.
# Each name labels the matching position of the tuples appended to `venues`.
venue_columns = ['PostalCode', 'Borough', 'Neighborhood', 'BoroughLatitude', 'BoroughLongitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']
venues_df = pd.DataFrame(venues, columns=venue_columns)

print(venues_df.shape)
venues_df.head()

(346, 9)


Unnamed: 0,PostalCode,Borough,Neighborhood,BoroughLatitude,BoroughLongitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot
3,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa


In [23]:
# Non-null value count of every remaining column per (PostalCode, Borough, Neighborhood) group
group_keys = ["PostalCode", "Borough", "Neighborhood"]
venues_df.groupby(by=group_keys).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,BoroughLatitude,BoroughLongitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
PostalCode,Borough,Neighborhood,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
M4E,East Toronto,The Beaches,4,4,4,4,4,4
M4K,East Toronto,"The Danforth West, Riverdale",10,10,10,10,10,10
M4L,East Toronto,"India Bazaar, The Beaches West",10,10,10,10,10,10
M4M,East Toronto,Studio District,10,10,10,10,10,10
M4N,Central Toronto,Lawrence Park,4,4,4,4,4,4
M4P,Central Toronto,Davisville North,6,6,6,6,6,6
M4R,Central Toronto,North Toronto West,10,10,10,10,10,10
M4S,Central Toronto,Davisville,10,10,10,10,10,10
M4T,Central Toronto,"Moore Park, Summerhill East",2,2,2,2,2,2
M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park",10,10,10,10,10,10


In [24]:
# Distinct venue categories in order of first appearance: report how many
# there are, then display the first 50.
unique_categories = venues_df['VenueCategory'].unique()
print('There are {} uniques categories.'.format(len(unique_categories)))
unique_categories[:50]

There are 119 uniques categories.


array(['Bakery', 'Coffee Shop', 'Breakfast Spot', 'Distribution Center',
       'Spa', 'Restaurant', 'Park', 'Gym / Fitness Center',
       'Historic Site', 'Sushi Restaurant', 'Italian Restaurant',
       'Creperie', 'Mexican Restaurant', 'Beer Bar', 'Clothing Store',
       'Tea Room', 'Comic Shop', 'Theater', 'Plaza', 'Café',
       'Burrito Place', 'Music Venue', 'Japanese Restaurant',
       'Food Truck', 'Cosmetics Shop', 'Middle Eastern Restaurant', 'Gym',
       'Gastropub', 'Trail', 'Health Food Store', 'Pub', 'Neighborhood',
       'Liquor Store', 'Vegetarian / Vegan Restaurant', 'Concert Hall',
       'Museum', 'Farmers Market', 'Cocktail Bar', 'Fountain',
       'Modern European Restaurant', 'Bubble Tea Shop', 'Grocery Store',
       'Diner', 'Candy Store', 'Speakeasy', 'Steakhouse', 'Hotel', 'Bar',
       'Brewery', 'Bank'], dtype=object)

## END OF ASSIGNMENT.