# Getting Wikipedia Data

In [1]:
import os

from bs4 import BeautifulSoup as bs
import geocoder
import pandas as pd
import requests

# Download the Wikipedia page listing the "M" postal codes.
# A timeout keeps the cell from hanging forever, and raise_for_status() makes
# an HTTP error fail here instead of silently parsing an error page.
wiki_response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
wiki_response.raise_for_status()
wiki_page = wiki_response.text

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed and emits a warning, so results could vary by machine.
soup = bs(wiki_page, 'html.parser')
mytable = soup.find('table', {'class': 'wikitable sortable'})
if mytable is None:
    raise ValueError("No table with class 'wikitable sortable' found on the Wikipedia page")
data = mytable.find_all('tr')

# Maps postal code -> [borough, comma-separated neighborhoods]
data_dic = {}

for row in data:
    cells = row.find_all("td")
    # Only rows with exactly three <td> cells carry a data record
    if len(cells) != 3:
        continue

    # PostalCode p, borough b and neighborhood n, with surrounding
    # whitespace and newlines removed
    p, b, n = (cell.text.strip() for cell in cells)

    # Postal codes without a borough are dropped entirely
    if b == "Not assigned":
        continue

    # A neighborhood that is not assigned falls back to the borough name
    if n == "Not assigned":
        n = b

    # If the postal code was already seen, append this neighborhood after a comma
    if p in data_dic:
        data_dic[p][1] += ", " + n
    else:
        data_dic[p] = [b, n]

# Flatten the dictionary into [postal code, borough, neighborhoods] records
li = [[k, v[0], v[1]] for k, v in data_dic.items()]

# Convert list of lists to DF
dataframe = pd.DataFrame(li, columns=['PostalCode', 'Borough', 'Neighborhood'])
print(dataframe.head())

  PostalCode           Borough                      Neighborhood
0        M3A        North York                         Parkwoods
1        M4A        North York                  Victoria Village
2        M5A  Downtown Toronto         Harbourfront, Regent Park
3        M6A        North York  Lawrence Heights, Lawrence Manor
4        M7A      Queen's Park                      Queen's Park


In [2]:
# Show the (rows, columns) size of the postal-code table.
print(dataframe.shape)

(103, 3)


# Getting Geodata

In [3]:
# Coordinates per postal code, read from a local CSV file.
geospat = pd.read_csv('Geospatial_Coordinates.csv')

# Left-join the coordinates onto the postal-code table (validate enforces a
# strict one-to-one match on the key), then discard the duplicate key column
# that comes from the CSV side.
dataframe = (
    dataframe
    .merge(geospat, how='left', left_on='PostalCode', right_on='Postal Code', validate="1:1")
    .drop(labels='Postal Code', axis=1)
)
print(dataframe.head())

  PostalCode           Borough                      Neighborhood   Latitude  \
0        M3A        North York                         Parkwoods  43.753259   
1        M4A        North York                  Victoria Village  43.725882   
2        M5A  Downtown Toronto         Harbourfront, Regent Park  43.654260   
3        M6A        North York  Lawrence Heights, Lawrence Manor  43.718518   
4        M7A      Queen's Park                      Queen's Park  43.662301   

   Longitude  
0 -79.329656  
1 -79.315572  
2 -79.360636  
3 -79.464763  
4 -79.389494  


# Foursquare API to get venues

In [8]:
# Foursquare credentials are read from the environment so that secrets are
# never stored in (or published with) the notebook. Export
# FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before running this cell;
# a missing variable raises KeyError naming the variable.
CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID'] # your Foursquare ID
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET'] # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 10 # maximum number of venues requested per postal code
RADIUS = 1000 # search radius sent to the API (presumably metres - confirm against the Foursquare docs)
EXPLORE_URL = 'https://api.foursquare.com/v2/venues/explore'

# One inner list of venue tuples per postal code, in dataframe row order
venues_list = []

for _, row in dataframe.iterrows():

    lat = row['Latitude']
    lng = row['Longitude']
    name = row['Neighborhood']
    code = row['PostalCode']

    # Let requests build and escape the query string instead of formatting
    # the URL by hand.
    response = requests.get(
        EXPLORE_URL,
        params={
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': RADIUS,
            'limit': LIMIT,
        },
        timeout=30,
    )
    # Fail loudly on HTTP errors (bad credentials, quota exceeded, ...)
    response.raise_for_status()
    results = response.json()

    # A response without recommendation groups is treated as "no venues"
    groups = results.get('response', {}).get('groups') or []
    venues = groups[0].get('items', []) if groups else []

    venue_rows = []
    for v in venues:
        venue = v['venue']
        # Some venues may carry no category; record None rather than crash
        categories = venue.get('categories') or []
        category = categories[0]['shortName'] if categories else None
        venue_rows.append(
            (code, name, lat, lng, venue['name'],
             venue['location']['lat'], venue['location']['lng'], category)
        )
    venues_list.append(venue_rows)

In [9]:
# Column labels for the venue table, one per field of a venue tuple
venue_columns = [
    'PostalCode',
    'Neighborhood',
    'Neighborhood Latitude',
    'Neighborhood Longitude',
    'Venue',
    'Venue Latitude',
    'Venue Longitude',
    'Venue Category',
]

# Flatten the per-postal-code lists into a single list of venue tuples
venue_rows = []
for venue_list in venues_list:
    venue_rows.extend(venue_list)

nearby_venues = pd.DataFrame(venue_rows)
nearby_venues.columns = venue_columns

print(nearby_venues.head())

  PostalCode Neighborhood  Neighborhood Latitude  Neighborhood Longitude  \
0        M3A    Parkwoods              43.753259              -79.329656   
1        M3A    Parkwoods              43.753259              -79.329656   
2        M3A    Parkwoods              43.753259              -79.329656   
3        M3A    Parkwoods              43.753259              -79.329656   
4        M3A    Parkwoods              43.753259              -79.329656   

                      Venue  Venue Latitude  Venue Longitude Venue Category  
0           Allwyn's Bakery       43.759840       -79.324719      Caribbean  
1           Brookbanks Park       43.751976       -79.332140           Park  
2               Tim Hortons       43.760668       -79.326368           Café  
3                A&W Canada       43.760643       -79.326865      Fast Food  
4  High Street Fish & Chips       43.745260       -79.324949   Fish & Chips  


# Grouping data and getting counts

In [18]:
# Count the non-null entries of every column per postal code, which shows
# how many venue rows each postal code has.
nearby_venues.groupby('PostalCode').count()

Unnamed: 0_level_0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
M1B,10,10,10,10,10,10,10
M1C,4,4,4,4,4,4,4
M1E,10,10,10,10,10,10,10
M1G,8,8,8,8,8,8,8
M1H,10,10,10,10,10,10,10
M1J,10,10,10,10,10,10,10
M1K,10,10,10,10,10,10,10
M1L,10,10,10,10,10,10,10
M1M,10,10,10,10,10,10,10
M1N,10,10,10,10,10,10,10


# One-hot

In [48]:
# Work on a copy so the venue table itself is left untouched
toronto_venues = nearby_venues.copy()

# One-hot encode the venue category; the empty prefix and separator keep the
# bare category names as column labels
category_frame = toronto_venues[['Venue Category']]
toronto_onehot = pd.get_dummies(category_frame, prefix="", prefix_sep="")

# Attach the postal code of each venue row to the encoded frame
toronto_onehot['PostalCode'] = toronto_venues['PostalCode']

# Rotate the last column to the front of the frame
column_order = list(toronto_onehot.columns)
fixed_columns = column_order[-1:] + column_order[:-1]
toronto_onehot = toronto_onehot[fixed_columns]

print(toronto_onehot.shape)
toronto_onehot.head()

(987, 178)


Unnamed: 0,PostalCode,Afghan,Airport,American,Apparel,Art Gallery,Arts & Crafts,Asian,Athletics & Sports,Auto Dealer,...,Toys & Games,Trail,Train Station,Turkish,Vegetarian / Vegan,Vietnamese,Warehouse Store,Wings,Yoga Studio,Yogurt
0,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
# Average the one-hot indicators within each postal code, keeping PostalCode
# as a regular column: each value is the mean of that category's indicator
# over the postal code's venue rows.
toronto_grouped = toronto_onehot.groupby('PostalCode', as_index=False).mean()

print(toronto_grouped.shape)
toronto_grouped.head()

(102, 178)


Unnamed: 0,PostalCode,Afghan,Airport,American,Apparel,Art Gallery,Arts & Crafts,Asian,Athletics & Sports,Auto Dealer,...,Toys & Games,Trail,Train Station,Turkish,Vegetarian / Vegan,Vietnamese,Warehouse Store,Wings,Yoga Studio,Yogurt
0,M1B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M1C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Clustering

In [50]:
from sklearn.cluster import KMeans
kclusters = 5

# Feature matrix for clustering: every column except the key.
# `columns=` replaces the positional axis argument (`drop('PostalCode', 1)`),
# which newer pandas versions reject. 'label' is dropped too (when present) so
# that re-running this cell after labels were attached to toronto_grouped does
# not feed the previous cluster labels back in as a feature.
toronto_grouped_clustering = toronto_grouped.drop(columns=['PostalCode', 'label'], errors='ignore')

# run k-means clustering (fixed random_state for reproducible labels)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for the first ten rows of the dataframe
kmeans.labels_[0:10]

array([4, 0, 4, 4, 0, 4, 4, 4, 1, 1])

In [59]:
# Record the cluster label of each postal code next to its feature row
toronto_grouped['label'] = kmeans.labels_

# Bring in borough, neighborhood and coordinates for plotting; the join is
# validated as strictly one-to-one on PostalCode
toronto_plot = toronto_grouped.merge(
    dataframe,
    how='left',
    on='PostalCode',
    validate="1:1",
)
toronto_plot.head()

Unnamed: 0,PostalCode,Afghan,Airport,American,Apparel,Art Gallery,Arts & Crafts,Asian,Athletics & Sports,Auto Dealer,...,Vietnamese,Warehouse Store,Wings,Yoga Studio,Yogurt,label,Borough,Neighborhood_y,Latitude,Longitude
0,M1B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4,Scarborough,Woburn,43.770992,-79.216917
4,M1H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,Scarborough,Cedarbrae,43.773136,-79.239476


# Plotting a map

In [93]:
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

# One hex colour per cluster, looked up by the integer cluster label
mycolors = ["#FF0000","#00FF00","#0000FF","#000000","#00FFFF"]

# Base map centred on the given coordinates
torontomap = folium.Map(location=[43.6532, -79.3832], zoom_start=11)

# Walk the coordinate, postal-code and label columns in lockstep
marker_fields = zip(
    toronto_plot['Latitude'],
    toronto_plot['Longitude'],
    toronto_plot['PostalCode'],
    toronto_plot['label'],
)

for point_lat, point_lon, point_code, cluster in marker_fields:
    cluster_color = mycolors[cluster]
    code_popup = folium.Popup("PostalCode: "+str(point_code))
    # Draw one circle per postal code, outlined and filled in its cluster colour
    circle = folium.CircleMarker(
        location=[point_lat, point_lon],
        radius=5,
        popup=code_popup,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.3,
    )
    circle.add_to(torontomap)


torontomap

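<h3>The interactive map does not show up on GitHub because its notebook viewer renders static output only, so here is a screenshot of the generated map:</h3>
<img src="Capture.PNG" alt="Screenshot of the generated Toronto map with postal codes coloured by cluster">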