### 1. This is the code for the Capstone Project - Coursera (IBM ML With Python)

In [1]:
# start by importing the relevant libraries
import pandas as pd
from  bs4 import BeautifulSoup as bs
import json
import requests
from pandas.io.json import json_normalize


### 2. Set the url and read the data from the website

In [2]:
# initialise and set the url variable from which data is to be scraped

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
data = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
data_list = bs(data, 'html.parser')

### 3. Setup the dataframe 

In [3]:
#setup the column headers in the Dataframe
df_list = pd.DataFrame (columns = ['Postcode', 'Borough','Neighbourhood'])

# populate the dataframe
for row in data_list.find('table').find_all('tr'):
    cells = row.find_all('td')
    if(len(cells) > 0):
        check_str = cells[1].text

# ignore all lines that have Borough set to "Not Assigned"
        if check_str.find ("Not assigned") == -1:
            new_row = {'Postcode':cells[0].text, 'Borough':cells[1].text,'Neighbourhood':cells[2].text}
            df_list = df_list.append(new_row, ignore_index=True)


### 4. Remove the /n characters from the data elements

In [4]:
# replace the /n characters being appended
df_list = df_list.replace('\n',' ', regex=True)


### 5. Print the dataframe shape as per instructions

In [5]:
print(df_list.shape)

(103, 3)


### 6. Before we start converting the post codes into latitude and longitude,  must install and import packages

In [33]:
!pip install pgeocode
!pip install folium
import pgeocode
import folium
from geopy.geocoders import Nominatim
import numpy as np
from sklearn.cluster import KMeans
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors




### 7. Now find the longtitude and latitude information

In this logic, we first obtain the location details for TORONTO.  As we process the postcodes, some are returned as NaN, which causes issues for the folium and mapping commands.  So, while inelegant, this code defaults to the Toront co-ordinates for any postcode that returns NaN.

In this case I have used pgeocode rather than geocode or even the Excel sheet provided.  This is just so as to practice alternative approach to achieving the same effect.

In [7]:
#Start mapping - first create an initial map of the toronto Neighbourhood - this is located here as some postcodes are nulls.  In this case, the borough location is being defaulted to the Toronto location details.

address = 'Toronto'
gl = Nominatim(user_agent="govinda")
loc = gl.geocode(address)
lat = loc.latitude
long = loc.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(lat, long))

# Look up longitude and latitude using the poscode.  Using pgeocodes for this converison
nomi = pgeocode.Nominatim('ca')
pcodes = pd.DataFrame(columns= ['Postcode', 'Long','Lat'])

for index, row in df_list.iterrows():
    out = nomi.query_postal_code(row['Postcode'])
    if numpy.isnan(out[9]) or numpy.isnan(out[10]):
        new_row = {'Postcode':row['Postcode'], 'Long':long, 'Lat':lat}
    else:
        new_row = {'Postcode':row['Postcode'], 'Long':out[10], 'Lat':out[9]}
    pcodes = pcodes.append(new_row, ignore_index=True)


The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


### 8. Now merge the two DataFrames

In [8]:
#now merge the two dataframes

df_list_full = df_list.merge(pcodes, on='Postcode', how = 'left')

# print header to confirm that the data is updated correctly
print (df_list_full.head())


  Postcode            Borough                                 Neighbourhood  \
0     M3A         North York                                     Parkwoods    
1     M4A         North York                              Victoria Village    
2     M5A   Downtown Toronto                     Regent Park, Harbourfront    
3     M6A         North York              Lawrence Manor, Lawrence Heights    
4     M7A   Downtown Toronto   Queen's Park, Ontario Provincial Government    

      Long      Lat  
0 -79.3300  43.7545  
1 -79.3148  43.7276  
2 -79.3626  43.6555  
3 -79.4504  43.7223  
4 -79.3889  43.6641  


In [9]:
# filter borough names that contain the word Toronto
borough_names = list(df_list.Borough.unique())

borough_with_toronto = []

for x in borough_names:
    if "toronto" in x.lower():
        borough_with_toronto.append(x)
        
borough_with_toronto

# create a new DataFrame with only boroughs that contain the word Toronto
df_full_list = df_list_full[df_list_full['Borough'].isin(borough_with_toronto)].reset_index(drop=True)
print(df_list_full.shape)
df_list_full.head()

(103, 5)


Unnamed: 0,Postcode,Borough,Neighbourhood,Long,Lat
0,M3A,North York,Parkwoods,-79.33,43.7545
1,M4A,North York,Victoria Village,-79.3148,43.7276
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",-79.3626,43.6555
3,M6A,North York,"Lawrence Manor, Lawrence Heights",-79.4504,43.7223
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",-79.3889,43.6641


### 9.  Now we start plotting the initial map to show the locations as identified above.  Using different fill colour for practice only.

In [10]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[lat, long], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_list_full['Lat'], df_list_full['Long'], df_list_full['Borough'], df_list_full['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='#000000',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto


### 10. Now start formatting the 4Square query parameters - I have removed the CLIENT ID AND CLIENT SECRET FOR PUBLISHING TO GITHUB

In [11]:
#define Foursquare Credentials and Version
CLIENT_ID = 'CLIENT ID' # your Foursquare ID
CLIENT_SECRET = 'CLIENT SECRET' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)
LIMIT = 100
radius = 450


Your credentails:
CLIENT_ID: JMC0RA50DJHNPEPHWCHB4SDL5FJRAPX12WJRNAM2DYDP0GZJ
CLIENT_SECRET:JO1TNVIKI4TNNZZ11NEIJDXS2FSCH0PUU1FC43TXOREDTP5T


### 11. Retrieve data and create a new dataframe that shows the venue names by Borough and Neighbourhoods.  Add in the Lat, Long and Category

As an output print the number of Boroughs and Neighbour hoods

In [60]:
# set up output DataFrame
venue_data = pd.DataFrame (columns = ['Borough','Neighbourhood', 'VName', 'vLat', 'vLong', 'Category'])
x = 0
for lat, long, post, borough, neighborhood in zip(df_list_full['Lat'], df_list_full['Long'], df_list_full['Postcode'], df_list_full['Borough'], df_list_full['Neighbourhood']):
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius,
        LIMIT)

    results = requests.get(url)
    test = results.json()
        
    group = test['response']['groups']
    for init_items in group:
        items = init_items['items']
        for get_names in items:
            cat_data = get_names['venue']['categories']
            for cat_name in cat_data:
                new_row = {'Borough': borough, 'Neighbourhood': neighborhood,'VName':get_names['venue']['name'], 'vLat':get_names['venue']['location']['lat'], 'vLong':get_names['venue']['location']['lng'], 'Category':cat_name['name']}
                venue_data = venue_data.append (new_row, ignore_index=True)

# print how many boroughs and neighborhoods counted
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(venue_data['Borough'].unique()),
        venue_data.shape[0]))


The dataframe has 10 boroughs and 1878 neighborhoods.


As a check lets print the first 10 lines of the newly formed data frame

In [15]:
print (venue_data.head(10))


             Borough               Neighbourhood  \
0        North York                   Parkwoods    
1        North York                   Parkwoods    
2        North York                   Parkwoods    
3        North York            Victoria Village    
4        North York            Victoria Village    
5        North York            Victoria Village    
6        North York            Victoria Village    
7        North York            Victoria Village    
8        North York            Victoria Village    
9  Downtown Toronto   Regent Park, Harbourfront    

                                       VName       vLat      vLong  \
0                            Brookbanks Park  43.751976 -79.332140   
1                              Variety Store  43.751974 -79.333114   
2          Corrosion Service Company Limited  43.752432 -79.334661   
3                                  Portugril  43.725819 -79.312785   
4                                Tim Hortons  43.725517 -79.313103   
5  Egli

### 12 a) Explore the data - for this I want to know the count of each category type

In [16]:
venue_data.groupby(["Category"])["VName"].count()

Category
Accessories Store                 2
Afghan Restaurant                 2
Airport                           1
American Restaurant              23
Art Gallery                       9
Art Museum                        2
Arts & Crafts Store               5
Asian Restaurant                 16
Athletics & Sports                3
Auto Dealership                   1
Auto Garage                       3
BBQ Joint                         4
Baby Store                        1
Bagel Shop                        3
Bakery                           32
Bank                             25
Bar                              24
Baseball Field                    6
Basketball Court                  1
Basketball Stadium                2
Beach Bar                         1
Beer Bar                         11
Beer Store                        7
Belgian Restaurant                1
Bike Rental / Bike Share          1
Bistro                            4
Bookstore                        14
Boutique           

### 12 b) explore the data - list all the steakhouses 

In [17]:
steak_house = venue_data.loc[venue_data['Category'] == 'Steakhouse']
print(steak_house)

                Borough                              Neighbourhood  \
135   Downtown Toronto                   Garden District, Ryerson    
398   Downtown Toronto                                Berczy Park    
460   Downtown Toronto                         Central Bay Street    
534   Downtown Toronto                   Richmond, Adelaide, King    
562   Downtown Toronto                   Richmond, Adelaide, King    
586   Downtown Toronto                   Richmond, Adelaide, King    
820   Downtown Toronto   Toronto Dominion Centre, Design Exchange    
862   Downtown Toronto   Toronto Dominion Centre, Design Exchange    
930       East Toronto             India Bazaar, The Beaches West    
947   Downtown Toronto             Commerce Court, Victoria Hotel    
1007  Downtown Toronto             Commerce Court, Victoria Hotel    
1019  Downtown Toronto             Commerce Court, Victoria Hotel    
1100        North York                Willowdale, Willowdale East    
1314       Mississau

#### Mapping the steakhouses - just for fun :)

In [18]:
# create map of Toronto using latitude and longitude values
map_steakhouses = folium.Map(location=[lat, long], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(steak_house['vLat'], steak_house['vLong'], steak_house['Borough'], steak_house['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='#000000',
        fill_opacity=0.7).add_to(map_steakhouses)

map_steakhouses

###  13  Now set up the cluster groups - I initially did three clusters but later changed to 4.  This was just for me to see how this worked

In [39]:
# set number of clusters
kclusters = 4

venue_data_interim = venue_data
venue_grouped_clustering = venue_data_interim.drop(['Neighbourhood','Category','VName','Borough'], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(venue_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 1], dtype=int32)

Run the clustering

In [40]:
venue_data_sorted = venue_data.sort_values (by=['Neighbourhood'])

venue_data_sorted["Cluster Labels"] = kmeans.labels_

venue_data_sorted.head(20)

Unnamed: 0,Borough,Neighbourhood,VName,vLat,vLong,Category,Cluster Labels
1347,Scarborough,Agincourt,Panagio's Breakfast & Lunch,43.79237,-79.260203,Breakfast Spot,2
1348,Scarborough,Agincourt,Commander Arena,43.794867,-79.267989,Skating Rink,2
1644,Etobicoke,"Alderwood, Long Branch",Rexall,43.601951,-79.545694,Pharmacy,2
1643,Etobicoke,"Alderwood, Long Branch",Delta Variety,43.603897,-79.536385,Convenience Store,2
1642,Etobicoke,"Alderwood, Long Branch",Tim Hortons,43.602396,-79.545048,Coffee Shop,2
1641,Etobicoke,"Alderwood, Long Branch",Timothy's Pub,43.600165,-79.544699,Pub,2
1639,Etobicoke,"Alderwood, Long Branch",Il Paesano Pizzeria & Restaurant,43.60128,-79.545028,Pizza Place,2
1640,Etobicoke,"Alderwood, Long Branch",Toronto Gymnastics International,43.599832,-79.542924,Gym,2
492,North York,"Bathurst Manor, Wilson Heights, Downsview North",Orly Restaurant & Grill,43.754493,-79.443507,Middle Eastern Restaurant,2
491,North York,"Bathurst Manor, Wilson Heights, Downsview North",Wolfie's Deli,43.754875,-79.442438,Deli / Bodega,1


###  14 Map the clusters using folium

In [41]:
# create map
map_clusters = folium.Map(location=[lat, long], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(venue_data_sorted['vLat'], venue_data_sorted['vLong'], venue_data_sorted['Neighbourhood'], venue_data_sorted['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### 15 Examine the clusters - only printing the first 10 points.  In this case I have chosen not to out the data with Category ratings

In [57]:
venue_clustered = venue_data_sorted.loc[venue_data_sorted['Cluster Labels'] == 0, venue_data_sorted.columns[[1] + list(range(5, venue_data_sorted.shape[1]))]]
venue_clustered.groupby(['Neighbourhood','Category']).count()
venue_clustered.head(10)

Unnamed: 0,Neighbourhood,Category,Cluster Labels
366,Berczy Park,Food Truck,0
1097,"Birch Cliff, Cliffside West",General Entertainment,0
1096,"Birch Cliff, Cliffside West",Café,0
1098,"Birch Cliff, Cliffside West",Skating Rink,0
1313,Canada Post Gateway Processing Centre,Bank,0
1312,Canada Post Gateway Processing Centre,Modern European Restaurant,0
1311,Canada Post Gateway Processing Centre,Clothing Store,0
1310,Canada Post Gateway Processing Centre,Bar,0
1309,Canada Post Gateway Processing Centre,Dessert Shop,0
1824,Church and Wellesley,Burrito Place,0


In [58]:
venue_clustered = venue_data_sorted.loc[venue_data_sorted['Cluster Labels'] == 1, venue_data_sorted.columns[[1] + list(range(5, venue_data_sorted.shape[1]))]]
venue_clustered.groupby(['Neighbourhood','Category']).count()
venue_clustered.head(10)

Unnamed: 0,Neighbourhood,Category,Cluster Labels
491,"Bathurst Manor, Wilson Heights, Downsview North",Deli / Bodega,1
490,"Bathurst Manor, Wilson Heights, Downsview North",Mediterranean Restaurant,1
495,"Bathurst Manor, Wilson Heights, Downsview North",Pizza Place,1
494,"Bathurst Manor, Wilson Heights, Downsview North",Fried Chicken Joint,1
493,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,1
764,Bayview Village,Park,1
762,Bayview Village,Gas Station,1
761,Bayview Village,Dog Run,1
760,Bayview Village,Trail,1
763,Bayview Village,Flower Shop,1


In [59]:
venue_clustered = venue_data_sorted.loc[venue_data_sorted['Cluster Labels'] == 2, venue_data_sorted.columns[[1] + list(range(5, venue_data_sorted.shape[1]))]]
venue_clustered.groupby(['Neighbourhood','Category']).count()
venue_clustered.head(10)

Unnamed: 0,Neighbourhood,Category,Cluster Labels
1347,Agincourt,Breakfast Spot,2
1348,Agincourt,Skating Rink,2
1644,"Alderwood, Long Branch",Pharmacy,2
1643,"Alderwood, Long Branch",Convenience Store,2
1642,"Alderwood, Long Branch",Coffee Shop,2
1641,"Alderwood, Long Branch",Pub,2
1639,"Alderwood, Long Branch",Pizza Place,2
1640,"Alderwood, Long Branch",Gym,2
492,"Bathurst Manor, Wilson Heights, Downsview North",Middle Eastern Restaurant,2
1099,"Birch Cliff, Cliffside West",College Stadium,2
