# Segmenting and Clustering Neighborhoods in Toronto

##  ----------------- (Part 1 of 3) -----------------

In [35]:
# Import libraries
import os

import requests
import pandas as pd
from bs4 import BeautifulSoup

# !conda install -c conda-forge geopy --yes # uncomment this line if haven't installed geopy
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# !conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't installed folium
import folium # map rendering library

### Scrape postal codes, boroughs & neighborhoods of Canada from the Wikipedia page.

In [12]:
# Wikipedia page listing the Canadian postal codes that start with "M".
url_canada_list = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Fail fast on HTTP errors and never wait forever on a stalled connection.
response = requests.get(url_canada_list, timeout=30)
response.raise_for_status()
source_code = response.text

# The page is HTML, so parse it with an HTML parser rather than the XML one.
soup = BeautifulSoup(source_code, 'html.parser')

# The first <table> on the page is taken to be the postal-code table.
table_postal_codes = soup.find('table')
if table_postal_codes is None:
    raise ValueError('No <table> element found at {}'.format(url_canada_list))

# <tbody> is optional in HTML; fall back to the table itself when it is absent
# so the later find_all('tr') still sees every row.
row_postal_codes = table_postal_codes.find('tbody')
if row_postal_codes is None:
    row_postal_codes = table_postal_codes

### Create an empty dataframe.

In [13]:
# Schema of the postal-code table, in display order.
column_names = [
    'PostalCode',
    'Borough',
    'Neighborhood',
]

# Start from a frame that has the target columns but no rows yet.
df_postal_codes = pd.DataFrame(data=None, columns=column_names)
df_postal_codes

Unnamed: 0,PostalCode,Borough,Neighborhood


###  Loop through the data and fill the dataframe.

In [14]:
# Collect the text of every data row first, then build the dataframe in one go.
# Growing a dataframe row by row with .loc is quadratic, and appending to the
# existing frame made this cell add duplicate rows whenever it was re-run.
expected_cells = len(column_names)
records = []
skipped_rows = 0

for row in row_postal_codes.find_all('tr'):
    data = [cell.text.strip() for cell in row.find_all('td')]

    # Rows without any <td> cell (e.g. a header row) carry no data.
    if not data:
        continue

    # A row whose cell count differs from the schema cannot be mapped onto
    # the columns; count it instead of crashing the whole cell.
    if len(data) != expected_cells:
        skipped_rows += 1
        continue

    records.append(data)

if skipped_rows:
    print('Skipped {} row(s) that did not have {} cells.'.format(skipped_rows, expected_cells))

# Rebuilding the frame from scratch keeps the cell idempotent.
df_postal_codes = pd.DataFrame(records, columns=column_names)

df_postal_codes.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Remove rows with a borough that is 'Not assigned'.

In [15]:
# Index labels of the rows whose borough is the 'Not assigned' placeholder.
unassigned_rows = df_postal_codes.index[df_postal_codes['Borough'].eq('Not assigned')]

# Remove those rows from the existing frame (index labels are left untouched).
df_postal_codes.drop(index=unassigned_rows, inplace=True)
df_postal_codes.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [16]:
# Rows that still carry the 'Not assigned' placeholder as their neighborhood.
missing_neighborhood = df_postal_codes['Neighborhood'] == 'Not assigned'

# Those rows take the borough name of the same row as their neighborhood.
df_postal_codes.loc[missing_neighborhood, 'Neighborhood'] = df_postal_codes.loc[missing_neighborhood, 'Borough']
df_postal_codes.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### Combine neighborhoods when the same postal code exists in multiple rows.

In [17]:
def join_neighborhood(dataframe):
    """Return the 'Neighborhood' values of ``dataframe`` as one sorted, comma-separated string."""
    names = list(dataframe['Neighborhood'])
    names.sort()
    return ', '.join(names)
                    
# Bucket the rows by (postal code, borough) ...
temp_df = df_postal_codes.groupby(by=['PostalCode', 'Borough'])

# ... collapse every bucket into one joined neighborhood string ...
joined_neighborhoods = temp_df.apply(join_neighborhood)

# ... and turn the grouped result back into a flat three-column frame.
df_postal_codes = joined_neighborhoods.reset_index(name='Neighborhood')

In [18]:
# Preview up to the first 20 rows of the grouped postal-code table.
df_postal_codes.head(20)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


### Print the number of rows of the dataframe.

In [19]:
# (number of rows, number of columns) of the cleaned postal-code table.
df_postal_codes.shape

(103, 3)

##  ----------------- (Part 2 of 3) -----------------

### Download the file of geographical coordinates for Toronto and load it into a dataframe

In [23]:
# Location of the CSV that pairs each postal code with a latitude and longitude.
geospatial_data_url = 'https://cocl.us/Geospatial_data'

# pandas reads the CSV straight from the URL.
geo_df = pd.read_csv(filepath_or_buffer=geospatial_data_url)
geo_df.head(5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge geographical coordinates with each postal code.

In [24]:
# Give the coordinate table the same key column name as the postal-code table.
geo_df.rename({'Postal Code': 'PostalCode'}, axis='columns', inplace=True)

# Join the two tables on the shared postal code (default inner join).
geo_merged = geo_df.merge(df_postal_codes, on='PostalCode')
geo_merged.head()

Unnamed: 0,PostalCode,Latitude,Longitude,Borough,Neighborhood
0,M1B,43.806686,-79.194353,Scarborough,"Malvern, Rouge"
1,M1C,43.784535,-79.160497,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,43.763573,-79.188711,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,43.770992,-79.216917,Scarborough,Woburn
4,M1H,43.773136,-79.239476,Scarborough,Cedarbrae


### Reposition columns

In [25]:
# Put the descriptive columns first and the coordinates last.
df = geo_merged.loc[
    :,
    ['PostalCode', 'Borough', 'Neighborhood', 'Latitude', 'Longitude'],
]
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


##  ----------------- (Part 3 of 3) -----------------

In [27]:
# Keep only the rows whose borough name contains "Toronto".
toronto_df = df.loc[df['Borough'].str.contains("Toronto")]
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [28]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, timeout=30):
    """Query the Foursquare "venues/explore" endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Neighborhood label and its coordinates; iterated in lockstep with zip,
        so iteration stops at the shortest of the three.
    radius : int, default 500
        Value sent as the ``radius`` query parameter (presumably meters;
        confirm against the Foursquare documentation).
    timeout : float, default 30
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        has these columns even when no venue could be retrieved.

    Notes
    -----
    Reads the module-level names CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT
    at call time, so they must be defined before the function is called.
    A location whose request or response cannot be processed is skipped with
    a one-line notice; the remaining locations are still queried.
    """
    venue_columns = ['Neighborhood',
                     'Neighborhood Latitude',
                     'Neighborhood Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string.
        query = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        try:
            # make the GET request
            response = requests.get(endpoint, params=query, timeout=timeout)
            response.raise_for_status()
            results = response.json()["response"]['groups'][0]['items']

            # keep only the relevant information for each nearby venue
            neighborhood_rows = [(
                name,
                lat,
                lng,
                v['venue']['name'],
                v['venue']['location']['lat'],
                v['venue']['location']['lng'],
                v['venue']['categories'][0]['name']) for v in results]
        except (requests.exceptions.RequestException,
                ValueError, KeyError, IndexError, TypeError) as error:
            # Best effort: skip this location but say so. Only the exception
            # type is printed because requests error messages can contain the
            # full URL, which includes the client secret.
            print('  skipped {} ({})'.format(name, type(error).__name__))
            continue

        venue_rows.extend(neighborhood_rows)

    # Passing the column names to the constructor keeps an empty result valid;
    # assigning them afterwards fails when there are no rows at all.
    nearby_venues = pd.DataFrame(venue_rows, columns=venue_columns)

    return nearby_venues

In [34]:
# Free-text address handed to the geocoder.
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="trt_explorer")
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match; report that
# explicitly instead of failing on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [42]:
# Base map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# One circle marker per row, with a popup reading "[ borough ] neighborhood(s)".
marker_fields = zip(
    toronto_df['Latitude'],
    toronto_df['Longitude'],
    toronto_df['Neighborhood'],
    toronto_df['Borough'],
)
for lat, lng, neighborhood, borough in marker_fields:
    popup_text = '[ {} ] {}'.format(borough, neighborhood)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

### Define Foursquare Credentials and Version

In [45]:
# @hidden_cell
# Credentials are read from the environment so that no secret is stored in the
# notebook. NOTE(review): a key pair was previously committed in this cell;
# treat it as leaked and rotate it in the Foursquare developer console.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20200605' # Foursquare API version

# Warn early; without both values every Foursquare request will be rejected.
if not (CLIENT_ID and CLIENT_SECRET):
    print('Warning: set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET '
          'in the environment before calling the Foursquare API.')