# This notebook contains the Peer Graded Assignment

### Part 1 of the code is below

In [1]:
# importing prerequisites
import pandas as pd
import numpy as np
!conda install lxml --yes

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 4.8.3
  latest version: 4.8.4

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    - lxml


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2020.6.24  |                0         125 KB
    certifi-2020.6.20          |           py36_0         156 KB
    libxml2-2.9.10             |       he19cac6_1         1.2 MB
    libxslt-1.1.34             |       hc22bd24_0         432 KB
    lxml-4.5.2                 |   py36hefd8a0e_0         1.2 MB
    openssl-1.1.1g             |       h7b6447c_0         2.5 MB
    ------------------------------------------------------------
                                           Total:         5.6 MB

T

In [2]:
# the pandas read_html in this case returns multiple dataframes in a list,
# hence the [0] after scraping the list. the header=0 argument sets the column names
# to Postal Code, Borough, and Neighborhood respectively.


df = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', header=0)[0]
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [3]:
# then we process the dataframe. First, getting rid of cells without a borough.
# a bit messy, but the reset index properly fixes the index so that it isn't whack
df = df[df['Borough'] != 'Not assigned'].reset_index(drop=False).drop(columns=['index'])
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [4]:
# next the cells without neighborhoods being assigned the borough as the neighborhood
df[df['Neighbourhood'] == 'Not assigned']
# since there aren't any, no need to worry and deal with it

Unnamed: 0,Postal Code,Borough,Neighbourhood


In [5]:
# the final part is to display the shape of the dataframe
print('The dataframe has', df.shape[0], 'rows!')

The dataframe has 103 rows!


### Now begins part two

In [6]:
# had some trouble with geocoder, so here we are
coords = pd.read_csv('Geospatial_Coordinates.csv')
coords.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [7]:
# then we combine the two dataframes, first we sort them both so they are in the same order, then continue
# below sorting the coordinates dataframe
coords.sort_values('Postal Code', inplace=True)
coords.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [8]:
# next sorting the borough and neighborhood dataframe, as well as reindexing
df.sort_values('Postal Code', inplace=True)
df.reset_index(drop=False, inplace=True)
df.drop(columns=['index'], inplace=True)
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [9]:
# now combining the two dataframes
final = pd.DataFrame([df['Postal Code'], df['Borough'], df['Neighbourhood'], coords['Latitude'], coords['Longitude']])
final.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,93,94,95,96,97,98,99,100,101,102
Postal Code,M1B,M1C,M1E,M1G,M1H,M1J,M1K,M1L,M1M,M1N,...,M9A,M9B,M9C,M9L,M9M,M9N,M9P,M9R,M9V,M9W
Borough,Scarborough,Scarborough,Scarborough,Scarborough,Scarborough,Scarborough,Scarborough,Scarborough,Scarborough,Scarborough,...,Etobicoke,Etobicoke,Etobicoke,North York,North York,York,Etobicoke,Etobicoke,Etobicoke,Etobicoke
Neighbourhood,"Malvern, Rouge","Rouge Hill, Port Union, Highland Creek","Guildwood, Morningside, West Hill",Woburn,Cedarbrae,Scarborough Village,"Kennedy Park, Ionview, East Birchmount Park","Golden Mile, Clairlea, Oakridge","Cliffside, Cliffcrest, Scarborough Village West","Birch Cliff, Cliffside West",...,"Islington Avenue, Humber Valley Village","West Deane Park, Princess Gardens, Martin Grov...","Eringate, Bloordale Gardens, Old Burnhamthorpe...",Humber Summit,"Humberlea, Emery",Weston,Westmount,"Kingsview Village, St. Phillips, Martin Grove ...","South Steeles, Silverstone, Humbergate, Jamest...","Northwest, West Humber - Clairville"
Latitude,43.8067,43.7845,43.7636,43.771,43.7731,43.7447,43.7279,43.7111,43.7163,43.6927,...,43.6679,43.6509,43.6435,43.7563,43.7248,43.7069,43.6963,43.6889,43.7394,43.7067
Longitude,-79.1944,-79.1605,-79.1887,-79.2169,-79.2395,-79.2395,-79.262,-79.2846,-79.2395,-79.2648,...,-79.5322,-79.5547,-79.5772,-79.566,-79.5322,-79.5182,-79.5322,-79.5547,-79.5884,-79.5941


In [10]:
# transposing to fix how it was
final = final.transpose()
final.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.8067,-79.1944
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.7845,-79.1605
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.7636,-79.1887
3,M1G,Scarborough,Woburn,43.771,-79.2169
4,M1H,Scarborough,Cedarbrae,43.7731,-79.2395


In [11]:
# and that is the finished, or should I say 'final' dataframe

### Part 3: using foursquare and various analysis

In [12]:
# first figure out the unique boroughs
set(final['Borough'])

{'Central Toronto',
 'Downtown Toronto',
 'East Toronto',
 'East York',
 'Etobicoke',
 'Mississauga',
 'North York',
 'Scarborough',
 'West Toronto',
 'York'}

In [13]:
# then filter out those which don't have toronto in the name
final = final[final['Borough'] != 'Scarborough']
final = final[final['Borough'] != 'East York']
final = final[final['Borough'] != 'Etobicoke']
final = final[final['Borough'] != 'Mississauga']
final = final[final['Borough'] != 'North York']
final = final[final['Borough'] != 'York']
final.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.6764,-79.293
41,M4K,East Toronto,"The Danforth West, Riverdale",43.6796,-79.3522
42,M4L,East Toronto,"India Bazaar, The Beaches West",43.669,-79.3156
43,M4M,East Toronto,Studio District,43.6595,-79.3409
44,M4N,Central Toronto,Lawrence Park,43.728,-79.3888


In [14]:
# so now only toronto is left
set(final['Borough'])

{'Central Toronto', 'Downtown Toronto', 'East Toronto', 'West Toronto'}

In [31]:
# for the map
import folium
import requests

In [29]:
# first we should visualize toronto
toronto_coords = [43.6532, -79.3832]

toronto_map = folium.Map(location=toronto_coords, zoom_start=12)

for lat, lng, borough, neighborhood in zip(final['Latitude'], final['Longitude'], final['Borough'], final['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map)  

toronto_map

In [21]:
# a function to get nearby venues
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [22]:
# defining credentials
CLIENT_ID = '0E0SHR0UJ0RR0UQIBCIGO20HBCGV3DY5N3IQNPFQTRHWFAOU' # your Foursquare ID
CLIENT_SECRET = 'INNJXIJR51XLV24HFRTPLL5UE1O0JWF0DOF1OLRYWNXVMIAO' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 0E0SHR0UJ0RR0UQIBCIGO20HBCGV3DY5N3IQNPFQTRHWFAOU
CLIENT_SECRET:INNJXIJR51XLV24HFRTPLL5UE1O0JWF0DOF1OLRYWNXVMIAO


In [24]:
#defining other key parts of foursquare uri
LIMIT = 100
radius = 500

In [32]:
toronto_venues = getNearbyVenues(names=final['Neighbourhood'],
                                   latitudes=final['Latitude'],
                                   longitudes=final['Longitude']
                                  )
toronto_venues.head()

The Beaches
The Danforth West, Riverdale
India Bazaar, The Beaches West
Studio District
Lawrence Park
Davisville North
North Toronto West, Lawrence Park
Davisville
Moore Park, Summerhill East
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
Rosedale
St. James Town, Cabbagetown
Church and Wellesley
Regent Park, Harbourfront
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Richmond, Adelaide, King
Harbourfront East, Union Station, Toronto Islands
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel
Roselawn
Forest Hill North & West, Forest Hill Road Park
The Annex, North Midtown, Yorkville
University of Toronto, Harbord
Kensington Market, Chinatown, Grange Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Stn A PO Boxes
First Canadian Place, Underground city
Christie
Dufferin, Dovercourt Village
Little Portugal, Trinity
Brockton, Parkdale Village, Exhibition Place
High 

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,"The Danforth West, Riverdale",43.679557,-79.352188,MenEssentials,43.67782,-79.351265,Cosmetics Shop


In [36]:
# map of toronto and the venues
toronto_map = folium.Map(location=toronto_coords, zoom_start=12)  

for lat, lng, venue in zip(toronto_venues['Venue Latitude'], toronto_venues['Venue Longitude'], toronto_venues['Venue']):
    label = folium.Popup(venue, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='red',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map)

for lat, lng, borough, neighborhood in zip(final['Latitude'], final['Longitude'], final['Borough'], final['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map)

toronto_map

In [37]:
# now the k-means to show the clustering
import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

In [51]:
# groups neighborhoods into clusters
k_clusters = 4

final_clus = final.drop('Neighbourhood', axis=1)
final_clus = final_clus.drop('Borough', axis=1)
final_clus = final_clus.drop('Postal Code', axis=1)

k_means = KMeans(n_clusters=k_clusters, random_state=0).fit(final_clus)
k_means.labels_

array([3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       2, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 3], dtype=int32)

In [53]:
# add the which cluster the neighborhood is in into the final dataframe
final.insert(0, 'Cluster Labels', k_means.labels_)
final.head()

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighbourhood,Latitude,Longitude
37,3,M4E,East Toronto,The Beaches,43.6764,-79.293
41,3,M4K,East Toronto,"The Danforth West, Riverdale",43.6796,-79.3522
42,3,M4L,East Toronto,"India Bazaar, The Beaches West",43.669,-79.3156
43,3,M4M,East Toronto,Studio District,43.6595,-79.3409
44,2,M4N,Central Toronto,Lawrence Park,43.728,-79.3888


In [58]:
# map it out
# create map
map_clusters = folium.Map(location=toronto_coords, zoom_start=12)

# set color scheme for the clusters
x = np.arange(k_clusters)
ys = [i + x + (i*x)**2 for i in range(k_clusters)]
colors_array = cm.rainbow(np.linspace(0, 0.5, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(final['Latitude'], final['Longitude'], final['Neighbourhood'], final['Cluster Labels']):
    label = folium.Popup(str(poi) + ': Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [59]:
# in short, I decided to first view toronto and its neighborhoods, then also the venues, and then I clustered just the neighborhoods