### 1. This is the code for the Capstone Project - Coursera (IBM ML With Python)

In [1]:
# start by importing the relevant libraries
import pandas as pd
from  bs4 import BeautifulSoup as bs
import json
import requests
from pandas.io.json import json_normalize


### 2. Set the url and read the data from the website

In [2]:
# initialise and set the url variable from which data is to be scraped

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
data = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
data_list = bs(data, 'html.parser')

### 3. Setup the dataframe 

In [3]:
#setup the column headers in the Dataframe
df_list = pd.DataFrame (columns = ['Postcode', 'Borough','Neighbourhood'])

# populate the dataframe
for row in data_list.find('table').find_all('tr'):
    cells = row.find_all('td')
    if(len(cells) > 0):
        check_str = cells[1].text

# ignore all lines that have Borough set to "Not Assigned"
        if check_str.find ("Not assigned") == -1:
            new_row = {'Postcode':cells[0].text, 'Borough':cells[1].text,'Neighbourhood':cells[2].text}
            df_list = df_list.append(new_row, ignore_index=True)


### 4. Remove the /n characters from the data elements

In [4]:
# replace the /n characters being appended
df_list = df_list.replace('\n',' ', regex=True)


### 5. Print the dataframe shape as per instructions

In [5]:
print(df_list.shape)

(103, 3)


### 6. Before we start converting the post codes into latitude and longitude,  must install and import packages

In [9]:
!pip install pgeocode
!pip install folium
import pgeocode
import folium
from geopy.geocoders import Nominatim
import numpy
from sklearn.cluster import KMeans



### 7. Now find the longtitude and latitude information

In this logic, we first obtain the location details for TORONTO.  As we process the postcodes, some are returned as NaN, which causes issues for the folium and mapping commands.  So, while inelegant, this code defaults to the Toront co-ordinates for any postcode that returns NaN.

In this case I have used pgeocode rather than geocode or even the Excel sheet provided.  This is just so as to practice alternative approach to achieving the same effect.

In [10]:
#Start mapping - first create an initial map of the toronto Neighbourhood - this is located here as some postcodes are nulls.  In this case, the borough location is being defaulted to the Toronto location details.

address = 'Toronto'
gl = Nominatim(user_agent="govinda")
loc = gl.geocode(address)
lat = loc.latitude
long = loc.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(lat, long))

# Look up longitude and latitude using the poscode.  Using pgeocodes for this converison
nomi = pgeocode.Nominatim('ca')
pcodes = pd.DataFrame(columns= ['Postcode', 'Long','Lat'])

for index, row in df_list.iterrows():
    out = nomi.query_postal_code(row['Postcode'])
    if numpy.isnan(out[9]) or numpy.isnan(out[10]):
        new_row = {'Postcode':row['Postcode'], 'Long':long, 'Lat':lat}
    else:
        new_row = {'Postcode':row['Postcode'], 'Long':out[10], 'Lat':out[9]}
    pcodes = pcodes.append(new_row, ignore_index=True)


The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


### 8. Now merge the two DataFrames

In [11]:
#now merge the two dataframes

df_list_full = df_list.merge(pcodes, on='Postcode', how = 'left')

# print header to confirm that the data is updated correctly
print (df_list_full.head())


  Postcode            Borough                                 Neighbourhood  \
0     M3A         North York                                     Parkwoods    
1     M4A         North York                              Victoria Village    
2     M5A   Downtown Toronto                     Regent Park, Harbourfront    
3     M6A         North York              Lawrence Manor, Lawrence Heights    
4     M7A   Downtown Toronto   Queen's Park, Ontario Provincial Government    

      Long      Lat  
0 -79.3300  43.7545  
1 -79.3148  43.7276  
2 -79.3626  43.6555  
3 -79.4504  43.7223  
4 -79.3889  43.6641  


In [12]:
# filter borough names that contain the word Toronto
borough_names = list(df_list.Borough.unique())

borough_with_toronto = []

for x in borough_names:
    if "toronto" in x.lower():
        borough_with_toronto.append(x)
        
borough_with_toronto

# create a new DataFrame with only boroughs that contain the word Toronto
df_full_list = df_list_full[df_list_full['Borough'].isin(borough_with_toronto)].reset_index(drop=True)
print(df_list_full.shape)
df_list_full.head()

(103, 5)


Unnamed: 0,Postcode,Borough,Neighbourhood,Long,Lat
0,M3A,North York,Parkwoods,-79.33,43.7545
1,M4A,North York,Victoria Village,-79.3148,43.7276
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",-79.3626,43.6555
3,M6A,North York,"Lawrence Manor, Lawrence Heights",-79.4504,43.7223
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",-79.3889,43.6641


### 9.  Now we start plotting the initial map to show the locations as identified above.  Using different fill colour for practice only.

In [13]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[lat, long], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_list_full['Lat'], df_list_full['Long'], df_list_full['Borough'], df_list_full['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='#000000',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto


### 10. Now start formatting the 4Square query parameters

In [14]:
#define Foursquare Credentials and Version
CLIENT_ID = 'JMC0RA50DJHNPEPHWCHB4SDL5FJRAPX12WJRNAM2DYDP0GZJ' # your Foursquare ID
CLIENT_SECRET = 'JO1TNVIKI4TNNZZ11NEIJDXS2FSCH0PUU1FC43TXOREDTP5T' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)
LIMIT = 100
radius = 450


Your credentails:
CLIENT_ID: JMC0RA50DJHNPEPHWCHB4SDL5FJRAPX12WJRNAM2DYDP0GZJ
CLIENT_SECRET:JO1TNVIKI4TNNZZ11NEIJDXS2FSCH0PUU1FC43TXOREDTP5T


### 11. Retrieve data and create a new dataframe that shows the venue names by Borough and Neighbourhoods.  Add in the Lat, Long and Category

As an output print the number of Boroughs and Neighbour hoods

In [15]:
# set up output DataFrame
venue_data = pd.DataFrame (columns = ['Borough','Neighbourhood', 'VName', 'vLat', 'vLong', 'Category'])
x = 0
for lat, long, post, borough, neighborhood in zip(df_list_full['Lat'], df_list_full['Long'], df_list_full['Postcode'], df_list_full['Borough'], df_list_full['Neighbourhood']):
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius,
        LIMIT)

    results = requests.get(url)
    test = results.json()
    
    group = test['response']['groups']
    for init_items in group:
        items = init_items['items']
        for get_names in items:
            cat_data = get_names['venue']['categories']
            for cat_name in cat_data:
                new_row = {'Borough': borough, 'Neighbourhood': neighborhood,'VName':get_names['venue']['name'], 'vLat':get_names['venue']['location']['lat'], 'vLong':get_names['venue']['location']['lng'], 'Category':cat_name['name']}
                venue_data = venue_data.append (new_row, ignore_index=True)

# print how many boroughs and neighborhoods counted
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(venue_data['Borough'].unique()),
        venue_data.shape[0]))


The dataframe has 10 boroughs and 1868 neighborhoods.


As a check lets print the first 10 lines of the newly formed data frame

In [16]:
print (venue_data.head(10))


             Borough               Neighbourhood  \
0        North York                   Parkwoods    
1        North York                   Parkwoods    
2        North York            Victoria Village    
3        North York            Victoria Village    
4        North York            Victoria Village    
5        North York            Victoria Village    
6        North York            Victoria Village    
7  Downtown Toronto   Regent Park, Harbourfront    
8  Downtown Toronto   Regent Park, Harbourfront    
9  Downtown Toronto   Regent Park, Harbourfront    

                                       VName       vLat      vLong  \
0                            Brookbanks Park  43.751976 -79.332140   
1                              Variety Store  43.751974 -79.333114   
2                                  Portugril  43.725819 -79.312785   
3                                Tim Hortons  43.725517 -79.313103   
4  Eglinton Ave E & Sloane Ave/Bermondsey Rd  43.726086 -79.313620   
5      

### 12 a) Explore the data - for this I want to know the count of each category type

In [19]:
venue_data.groupby(["Category"])["VName"].count()

Category
ATM                               1
Accessories Store                 1
Afghan Restaurant                 2
Airport                           1
American Restaurant              22
Art Gallery                      10
Art Museum                        2
Arts & Crafts Store               5
Asian Restaurant                 16
Athletics & Sports                3
Auto Dealership                   1
Auto Garage                       1
BBQ Joint                         4
Baby Store                        1
Bagel Shop                        2
Bakery                           32
Bank                             25
Bar                              24
Baseball Field                    6
Basketball Court                  1
Basketball Stadium                2
Beach Bar                         1
Beer Bar                         11
Beer Store                        7
Belgian Restaurant                1
Bistro                            4
Board Shop                        1
Bookstore          

### 12 b) explore the data - list all the steakhouses 

In [23]:
steak_house = venue_data.loc[venue_data['Category'] == 'Steakhouse']
print(steak_house)

                Borough                              Neighbourhood  \
135   Downtown Toronto                   Garden District, Ryerson    
402   Downtown Toronto                                Berczy Park    
463   Downtown Toronto                         Central Bay Street    
538   Downtown Toronto                   Richmond, Adelaide, King    
566   Downtown Toronto                   Richmond, Adelaide, King    
590   Downtown Toronto                   Richmond, Adelaide, King    
814   Downtown Toronto   Toronto Dominion Centre, Design Exchange    
856   Downtown Toronto   Toronto Dominion Centre, Design Exchange    
875   Downtown Toronto   Toronto Dominion Centre, Design Exchange    
922       East Toronto             India Bazaar, The Beaches West    
942   Downtown Toronto             Commerce Court, Victoria Hotel    
1002  Downtown Toronto             Commerce Court, Victoria Hotel    
1014  Downtown Toronto             Commerce Court, Victoria Hotel    
1095        North Yo

#### Mapping the steakhouses - just for fun :)

In [25]:
# create map of Toronto using latitude and longitude values
map_steakhouses = folium.Map(location=[lat, long], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(steak_house['vLat'], steak_house['vLong'], steak_house['Borough'], steak_house['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='#000000',
        fill_opacity=0.7).add_to(map_steakhouses)

map_steakhouses