Scrape the Wikipedia page using the Beautiful Soup package

Get url, decode page source

In [27]:
import requests

# Download the Wikipedia page listing Toronto ("M") postal codes.
# NOTE: despite its name, `website_url` holds the page's HTML text, not a URL.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,  # do not hang the notebook forever on a stalled connection
)
response.raise_for_status()  # fail loudly instead of parsing an HTTP error page
website_url = response.text

In [28]:
from bs4 import BeautifulSoup

# Parse the downloaded HTML text into a searchable tree with the lxml parser.
soup = BeautifulSoup(markup=website_url, features='lxml')

Find source code containing table, extract table columns

In [29]:
# First <table> whose class attribute is exactly "wikitable sortable".
My_table = soup.find(name='table', attrs={'class': 'wikitable sortable'})

In [30]:
# Every <tr> is one table row; the first row carries the <th> header cells.
rows = My_table.find_all('tr')
header_cells = rows[0].find_all('th')
columns = []
for header_cell in header_cells:
    # Header text with embedded newlines removed becomes the column name.
    columns.append(header_cell.text.replace('\n', ''))

Extract table values:
1. Ignore rows whose borough is 'Not assigned'
2. If a neighbourhood is 'Not assigned', set it to the borough name
3. Sort the table by postal code for convenience

In [31]:
import pandas as pd

# Collect the cleaned rows in a plain list and build the DataFrame once.
# (Growing a DataFrame row by row is quadratic, and DataFrame.append no
# longer exists in pandas >= 2.0.)
records = []
for table_row in rows[1:]:  # rows[0] is the header row
    values = [cell.text.replace('\n', '') for cell in table_row.find_all('td')]
    if len(values) != len(columns):
        # Not a regular data row (e.g. no <td> cells) -- nothing to extract.
        continue
    if values[1].lower() == 'not assigned':
        # Rule 1: drop rows whose borough is not assigned.
        continue
    if values[2].lower() == 'not assigned':
        # Rule 2: an unassigned neighbourhood takes the borough's name.
        values[2] = values[1]
    records.append(values)

df = pd.DataFrame(records, columns=columns)
# Rule 3: order by postal code and renumber the index from 0.
df = df.sort_values(by='Postcode').reset_index(drop=True)

Merge rows with same postal code, print the final table

In [32]:
# Collapse all rows sharing a postal code into a single row:
#   - Borough: taken from the first row of the group
#   - Neighbourhood: all neighbourhood names joined with ', '
# groupby replaces the manual two-pointer loop, which indexed past the end of
# `df` whenever the last postal code spanned several rows and relied on
# chained assignment (`df2.iloc[k][...] = ...`) to update values.
df2 = (
    df.groupby('Postcode', as_index=False)
      .agg({'Borough': 'first', 'Neighbourhood': ', '.join})
)

Print the shape of final table

In [33]:
# Report (number of rows, number of columns) of the merged table.
print((len(df2.index), len(df2.columns)))

(103, 3)


Import csv file

In [34]:
# Lookup table read from the local CSV file; later cells use its
# 'Postal Code', 'Latitude' and 'Longitude' columns.
post2coor = pd.read_csv(filepath_or_buffer="Geospatial_Coordinates.csv")

Look up coordinates and add new columns to the final table df2

In [35]:
# Index the lookup table by postal code so each code is resolved with a single
# lookup instead of rescanning the whole table for every row of df2.
# If a code appears more than once, the first occurrence is used.
coords_by_postcode = (
    post2coor.drop_duplicates(subset='Postal Code').set_index('Postal Code')
)

# Fail with an explicit message (rather than an opaque IndexError) when a
# postal code has no coordinates.
missing_codes = sorted(set(df2['Postcode']) - set(coords_by_postcode.index))
if missing_codes:
    raise KeyError(
        'No coordinates found for postal codes: {}'.format(
            ', '.join(map(str, missing_codes))))

df2['Latitude'] = df2['Postcode'].map(coords_by_postcode['Latitude'])
df2['Longitude'] = df2['Postcode'].map(coords_by_postcode['Longitude'])
pd.options.display.max_colwidth = 20  # truncate long neighbourhood lists in the printout
print(df2)

    Postcode      Borough        Neighbourhood   Latitude  Longitude
0        M1B  Scarborough       Rouge, Malvern  43.806686 -79.194353
1        M1C  Scarborough  Port Union, Roug...  43.784535 -79.160497
2        M1E  Scarborough  Guildwood, Morni...  43.763573 -79.188711
3        M1G  Scarborough               Woburn  43.770992 -79.216917
4        M1H  Scarborough            Cedarbrae  43.773136 -79.239476
..       ...          ...                  ...        ...        ...
98       M9N         York               Weston  43.706876 -79.518188
99       M9P    Etobicoke            Westmount  43.696319 -79.532242
100      M9R    Etobicoke  Richview Gardens...  43.688905 -79.554724
101      M9V    Etobicoke  Albion Gardens, ...  43.739416 -79.588437
102      M9W    Etobicoke            Northwest  43.706748 -79.594054

[103 rows x 5 columns]


Select rows with Borough containing 'Toronto'

In [36]:
# Keep only the rows whose borough name contains 'toronto' (case-insensitive).
# A boolean mask replaces the row-by-row DataFrame.append loop (append was
# removed in pandas 2.0) and preserves the numeric dtype of the coordinates.
is_toronto = df2['Borough'].str.contains('toronto', case=False, regex=False, na=False)
toronto = df2[is_toronto].reset_index(drop=True)
print(toronto)

   Postcode           Borough        Neighbourhood   Latitude  Longitude
0       M4E      East Toronto          The Beaches  43.676357 -79.293031
1       M4K      East Toronto  Riverdale, The D...  43.679557 -79.352188
2       M4L      East Toronto  The Beaches West...  43.668999 -79.315572
3       M4M      East Toronto      Studio District  43.659526 -79.340923
4       M4N   Central Toronto        Lawrence Park  43.728020 -79.388790
5       M4P   Central Toronto     Davisville North  43.712751 -79.390197
6       M4R   Central Toronto   North Toronto West  43.715383 -79.405678
7       M4S   Central Toronto           Davisville  43.704324 -79.388790
8       M4T   Central Toronto  Summerhill East,...  43.689574 -79.383160
9       M4V   Central Toronto  South Hill, Summ...  43.686412 -79.400049
10      M4W  Downtown Toronto             Rosedale  43.679563 -79.377529
11      M4X  Downtown Toronto  St. James Town, ...  43.667967 -79.367675
12      M4Y  Downtown Toronto  Church and Welle... 

Create map of Toronto

In [37]:
import folium

# Centre the map on the mean coordinates of the selected postal codes.
latitude = toronto['Latitude'].mean()
longitude = toronto['Longitude'].mean()

map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One blue circle marker per row, with a "<neighbourhood>, <borough>" popup.
marker_rows = zip(
    toronto['Latitude'],
    toronto['Longitude'],
    toronto['Borough'],
    toronto['Neighbourhood'],
)
for lat, lng, borough, neighbourhood in marker_rows:
    popup = folium.Popup('{}, {}'.format(neighbourhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_Toronto)

map_Toronto

Acquire data from Foursquare API

In [38]:
import os
import warnings

# Foursquare credentials are read from the environment so that secrets are
# never stored in (or shared with) the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')  # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # your Foursquare Secret
if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn(
        'FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
        'Foursquare API requests will be rejected.')
VERSION = '20180605' # Foursquare API version
LIMIT=100  # maximum number of venues requested per location

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare "explore" endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of each location to explore; they are
        consumed in parallel.
    radius : int, default 500
        Value sent as the ``radius`` query parameter of each request.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame has these columns even when no venue was returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.

    Uses the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.
    """
    column_names = ['Neighborhood',
                    'Neighborhood Latitude',
                    'Neighborhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        # Let requests build and escape the query string instead of
        # formatting it by hand.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,  # do not block forever on a stalled connection
        )
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups', [])
        items = groups[0].get('items', []) if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue may come without any category
                categories[0]['name'] if categories else None))

    # Passing the column names up front keeps the schema stable even when
    # no venue was found at all.
    nearby_venues = pd.DataFrame(venue_rows, columns=column_names)

    return(nearby_venues)

Get nearby venues

In [39]:
# Fetch the venues around every selected postal code (default radius).
toronto_venues = getNearbyVenues(
    toronto['Neighbourhood'],
    toronto['Latitude'],
    toronto['Longitude'],
)

The Beaches
Riverdale, The Danforth West
The Beaches West, India Bazaar
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Summerhill East, Moore Park
South Hill, Summerhill West, Rathnelly, Forest Hill SE, Deer Park
Rosedale
St. James Town, Cabbagetown
Church and Wellesley
Harbourfront
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Adelaide, Richmond, King
Harbourfront East, Union Station, Toronto Islands
Design Exchange, Toronto Dominion Centre
Commerce Court, Victoria Hotel
Roselawn
Forest Hill North, Forest Hill West
Yorkville, The Annex, North Midtown
University of Toronto, Harbord
Kensington Market, Chinatown, Grange Park
King and Spadina, Railway Lands, South Niagara, CN Tower, Island airport, Harbourfront West, Bathurst Quay
Stn A PO Boxes 25 The Esplanade
Underground city, First Canadian Place
Christie
Dufferin, Dovercourt Village
Little Portugal, Trinity
Exhibition Place, Brockton, Parkdale Village
High Park, The Junction Sout

In [40]:
# Number of non-null entries per column for each neighbourhood label.
toronto_venues.groupby(by='Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, Richmond, King",100,100,100,100,100,100
Berczy Park,57,57,57,57,57,57
Business Reply Mail Processing Centre 969 Eastern,18,18,18,18,18,18
Central Bay Street,79,79,79,79,79,79
Christie,17,17,17,17,17,17
Church and Wellesley,87,87,87,87,87,87
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
Davisville,34,34,34,34,34,34
Davisville North,9,9,9,9,9,9
"Design Exchange, Toronto Dominion Centre",100,100,100,100,100,100


In [41]:
# Count the distinct values of the 'Venue Category' column.
category_count = len(toronto_venues['Venue Category'].unique())
print('There are {} uniques categories.'.format(category_count))

There are 231 uniques categories.


In [42]:
# one hot encoding: one indicator column per venue category
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
# A venue category may itself be called 'Neighborhood'; rename it so it is not
# silently overwritten by (and confused with) the key column added below.
toronto_onehot = toronto_onehot.rename(
    columns={'Neighborhood': 'Neighborhood (Venue Category)'})
# add the neighborhood label back as the first column
toronto_onehot.insert(0, 'Neighborhood', toronto_venues['Neighborhood'])
# mean of the indicators = relative frequency of each category per neighborhood
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

In [43]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of ``row``.

    The first entry of ``row`` is skipped; the remaining entries are ordered
    by value, largest first, and the index labels of the leading
    ``num_top_venues`` entries are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    ranked_labels = ranked.index.values
    return ranked_labels[:num_top_venues]

Categorize neighborhoods based on their top 5 venues

In [44]:
import numpy as np

num_top_venues = 5

# ordinal suffixes for 1st, 2nd, 3rd; every later rank uses 'th'
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in range(num_top_venues):
    # explicit bounds check instead of a bare `except:` around the lookup
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe: one row per neighborhood, one column per rank
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

# fill each row with that neighborhood's most frequent venue categories
for ind in range(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = \
    return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

In [45]:
from sklearn.cluster import KMeans

kclusters = 5  # number of clusters to fit
# Cluster on the category frequencies only; the label column is not a feature.
# `columns=` replaces the positional axis argument (`drop('Neighborhood', 1)`),
# which pandas 2.0 no longer accepts.
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood')
# run k-means clustering (fixed random_state for reproducible labels)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

In [46]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
toronto_merged = toronto
# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')

In [47]:
import matplotlib.cm as cm
import matplotlib.colors as colors

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color each
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in \
zip(toronto_merged['Latitude'], toronto_merged['Longitude'],\
    toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    if pd.isna(cluster):
        # no cluster label was joined onto this row, so there is nothing to
        # color it by
        continue
    cluster = int(cluster)  # the join turns labels into floats when NaN is present
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # cluster 0 maps to the last color via negative indexing; kept so the
    # colors match the original rendering
    marker_color = rainbow[cluster-1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters