In [2]:
import os
import warnings

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from geopy.geocoders import Nominatim
from IPython.display import HTML
from sklearn.cluster import KMeans

## IBM Week 3 Capstone Project
#### The code below scrapes the provided Wikipedia page for the raw data and cleans it. The data includes the postal codes, boroughs, and neighborhoods in Toronto.

In [3]:
# Parse every HTML table on the Wikipedia page, treating the first row as the header.
raw_data = pd.read_html(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    header=0,
)

# The first table is the postal-code listing. Rows without a neighborhood are
# dropped and the index is renumbered from zero.
postal_table = pd.DataFrame(raw_data[0])
data = postal_table.dropna(subset=['Neighborhood']).reset_index(drop=True)
print(data.head())

  Postal Code           Borough                                 Neighborhood
0         M3A        North York                                    Parkwoods
1         M4A        North York                             Victoria Village
2         M5A  Downtown Toronto                    Regent Park, Harbourfront
3         M6A        North York             Lawrence Manor, Lawrence Heights
4         M7A  Downtown Toronto  Queen's Park, Ontario Provincial Government


#### The code below prints the shape of the data obtained from wikipedia.

In [4]:
# Show the (rows, columns) tuple of the cleaned postal-code frame.
print(data.shape)

(103, 3)


#### The code below reads the CSV file containing the latitude and longitude of the borough and neighborhood data of Toronto.

In [5]:
# Load the coordinates table from the working directory. Later cells read its
# 'Postal Code', 'Latitude' and 'Longitude' columns.
PC = pd.read_csv('Geospatial_Coordinates.csv')

#### The code below creates the full dataframe of postal codes and their associated latitude and longitude.

In [6]:
# For every postal code in `data`, find the index labels of the matching rows in `PC`.
matching_rows = [
    PC.index[PC['Postal Code'] == postal_code].values
    for postal_code in data['Postal Code']
]

# NOTE: selecting with an array of labels yields a Series (label + value), not a
# bare float, so each cell of the two new columns holds a small Series. Code that
# consumes these columns has to unwrap the value before using it as a number.
latitude_list = [PC.loc[rows, 'Latitude'] for rows in matching_rows]
longitude_list = [PC.loc[rows, 'Longitude'] for rows in matching_rows]

data['Latitude'] = latitude_list
data['Longitude'] = longitude_list

print(data.head())
print(data.shape)

# The dataframe is complete; the printed preview below is condensed because each
# coordinate cell is displayed as a Series.
# The shape of the dataframe is also printed as confirmation.

  Postal Code           Borough                                 Neighborhood  \
0         M3A        North York                                    Parkwoods   
1         M4A        North York                             Victoria Village   
2         M5A  Downtown Toronto                    Regent Park, Harbourfront   
3         M6A        North York             Lawrence Manor, Lawrence Heights   
4         M7A  Downtown Toronto  Queen's Park, Ontario Provincial Government   

                                         Latitude  \
0  25    43.753259
Name: Latitude, dtype: float64   
1  34    43.725882
Name: Latitude, dtype: float64   
2   53    43.65426
Name: Latitude, dtype: float64   
3  71    43.718518
Name: Latitude, dtype: float64   
4  85    43.662301
Name: Latitude, dtype: float64   

                                         Longitude  
0  25   -79.329656
Name: Longitude, dtype: float64  
1  34   -79.315572
Name: Longitude, dtype: float64  
2  53   -79.360636
Name: Longitude, dtype

#### The code below narrows the boroughs down to the ones containing Toronto in them.

In [7]:
# Keep only the rows whose borough name contains "Toronto", then renumber the index.
is_toronto_borough = data['Borough'].str.contains("Toronto")
Toronto_data = data.loc[is_toronto_borough].reset_index(drop=True)
print(Toronto_data.shape)

(39, 5)


#### The code below contains the information required to use Foursquare.

In [8]:
# Foursquare API settings.
#
# The credentials are read from the environment instead of being stored in the
# notebook, so they are not published whenever the notebook is shared. Set
# FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting the kernel.
# Any key pair that was previously committed in this cell should be treated as
# compromised and regenerated.
client_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
client_secret = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
if not (client_ID and client_secret):
    warnings.warn(
        'Foursquare credentials are not configured: set the FOURSQUARE_CLIENT_ID '
        'and FOURSQUARE_CLIENT_SECRET environment variables before requesting venues.'
    )

version = '20200416'  # sent to the API as the `v` query parameter
# The two values below mirror the defaults of get_neighborhood_venues(); the
# function reads its own parameters, not these module-level names.
limit = 100
radius = 500

#### The function defined below extracts all the venues in the neighborhoods.

In [9]:
def get_neighborhood_venues(names, latitudes, longitudes, limit=100, radius=500):
    """Collect the venues Foursquare reports around each neighborhood.

    One request is sent to the Foursquare ``venues/explore`` endpoint per
    neighborhood, using the module-level ``client_ID``, ``client_secret`` and
    ``version`` settings.

    Parameters
    ----------
    names, latitudes, longitudes : iterable
        Neighborhood names and their coordinates, consumed in parallel with
        ``zip``. Each coordinate may be a plain number or a one-element
        Series/array; only its first value is used.
    limit : int, default 100
        Sent as the ``limit`` query parameter.
    radius : int, default 500
        Sent as the ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame is
        empty (but still has these columns) when no venues were found.
    """
    column_names = [
        'Neighborhood',
        'Neighborhood Latitude',
        'Neighborhood Longitude',
        'Venue',
        'Venue Latitude',
        'Venue Longitude',
        'Venue Category'
    ]
    venue_rows = []

    for name, lat, lng in zip(names, latitudes, longitudes):
        # Accept both bare numbers and one-element Series/arrays.
        lat = np.ravel(lat)[0]
        lng = np.ravel(lng)[0]

        uri = f'https://api.foursquare.com/v2/venues/explore?&client_id={client_ID}&client_secret={client_secret}' \
              f'&ll={lat},{lng}&v={version}&radius={radius}&limit={limit}'

        # A timeout keeps one stalled request from hanging the whole notebook.
        results = requests.get(uri, timeout=30).json()['response']['groups'][0]['items']

        for v in results:
            venue = v['venue']
            categories = venue['categories']
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # A venue without any category is recorded as missing instead of
                # aborting the whole download with an IndexError.
                categories[0]['name'] if categories else None,
            ))

    # Build the frame once, after the loop: this avoids rebuilding it for every
    # neighborhood and also works when there is no input or no venue at all.
    return pd.DataFrame(venue_rows, columns=column_names)


#### The code below uses the previously defined function to obtain all the venues.

In [10]:
# Download the venues around every Toronto neighborhood, using the function's
# default limit and radius.
Toronto_venues = get_neighborhood_venues(
    names=Toronto_data['Neighborhood'],
    latitudes=Toronto_data['Latitude'],
    longitudes=Toronto_data['Longitude'],
)
print(Toronto_venues.shape)

(1612, 7)


#### The code below preprocesses the data to prepare it for clustering.

In [11]:
# One indicator column per venue category, one row per venue.
Toronto_onehot = pd.get_dummies(Toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Attach the neighborhood each venue belongs to.
Toronto_onehot['Neighborhood'] = Toronto_venues['Neighborhood']

# Move the last column to the front.
fixed_columns = [Toronto_onehot.columns[-1]] + list(Toronto_onehot.columns[:-1])
Toronto_onehot = Toronto_onehot[fixed_columns]

# Averaging the indicators per neighborhood gives the share of each category.
Toronto_grouped = Toronto_onehot.groupby('Neighborhood').mean().reset_index()

# Feature matrix for clustering: everything except the neighborhood name.
# `columns=` is used because pandas 2.0 no longer accepts the axis as a
# positional argument, so drop('Neighborhood', 1) raises a TypeError there.
Toronto_grouped_clustering = Toronto_grouped.drop(columns='Neighborhood')

#### The function below returns the most common venues.

In [12]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest values in a row, skipping its first entry.

    The first element of ``row`` is ignored; the remaining entries are sorted
    from largest to smallest and the index labels of the first
    ``num_top_venues`` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Build the column headers: '1st', '2nd' and '3rd' take their suffix from
# `indicators`; every later position uses 'th'. An explicit bounds check
# replaces the former bare `except:`, which would also have hidden unrelated errors.
columns = ['Neighborhood']
for ind in range(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# Create the result frame with one row per neighborhood.
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = Toronto_grouped['Neighborhood']

# Fill each row with that neighborhood's most frequent venue categories.
for ind in range(Toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Toronto_grouped.iloc[ind, :], num_top_venues)

print(neighborhoods_venues_sorted.shape)

(39, 11)


#### The code below applies k-means clustering to the preprocessed data.

In [13]:
kclusters = 5

# n_init is pinned to 10 (the long-standing scikit-learn default) so the result
# does not change with the library version; random_state keeps it repeatable.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(Toronto_grouped_clustering)

# insert() refuses to add a column that already exists, so re-running this cell
# used to raise a ValueError. Remove any previous label column first.
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

#### The code below joins the cluster labels and most common venues onto the Toronto data, matching rows by neighborhood.

In [14]:
# Index the per-neighborhood results by name, then attach them to the Toronto
# rows through their 'Neighborhood' column. join() returns a new frame, so
# Toronto_data itself is left unchanged.
venues_by_neighborhood = neighborhoods_venues_sorted.set_index('Neighborhood')
Toronto_merged = Toronto_data.join(venues_by_neighborhood, on='Neighborhood')

#### The code below creates a map of Toronto with the associated 5 clusters previously found.

In [15]:
# Geocode the city so the map opens centred on Toronto.
address = 'Toronto, Ontario'
geolocator = Nominatim(user_agent="Toronto_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next line.
    raise RuntimeError('Could not geocode ' + repr(address))
Toronto_latitude = location.latitude
Toronto_longitude = location.longitude

map_clusters = folium.Map(location=[Toronto_latitude, Toronto_longitude], zoom_start=12)

# One hex colour per cluster, evenly spaced along the rainbow colormap.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

for lat, lon, poi, cluster in zip(Toronto_merged['Latitude'], Toronto_merged['Longitude'],
                                  Toronto_merged['Neighborhood'], Toronto_merged['Cluster Labels']):
    if pd.isna(cluster):
        # A neighborhood with no cluster label cannot be coloured.
        continue

    # The coordinate cells may hold one-element Series rather than bare numbers;
    # hand folium plain floats so the markers are serialised correctly.
    lat = float(np.ravel(lat)[0])
    lon = float(np.ravel(lon)[0])
    cluster = int(cluster)

    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # `cluster - 1` is kept from the original colour assignment: cluster 0 wraps
    # around to the last colour through negative indexing.
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster - 1],
        fill=True,
        fill_color=rainbow[cluster - 1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters
# For unknown reasons, the display on the Skills Network Labs does not render the markers (clusters).
# However, it does work on my local devices when running the program on Pycharm.

In [16]:
# Render a previously saved copy of the map from an HTML file in the working directory.
# NOTE(review): no visible cell writes "map_clusters.html"; this presumably relies on a
# file that was saved and uploaded separately - confirm it exists before a full re-run.
HTML(filename="map_clusters.html")
# After uploading a saved HTML file of the full map, the display window in this cell
# appears cropped in the Skills Network Labs, for unknown reasons.

In [17]:
# Print the most common venue of every neighborhood, one cluster at a time.
# Looping over `kclusters` keeps this in step with the clustering cell instead
# of repeating the statement once per hard-coded label.
for cluster_label in range(kclusters):
    in_cluster = Toronto_merged['Cluster Labels'] == cluster_label
    print(Toronto_merged.loc[in_cluster, '1st Most Common Venue'])

0            Coffee Shop
1            Coffee Shop
2         Clothing Store
3                   Café
5            Coffee Shop
6            Coffee Shop
7          Grocery Store
8            Coffee Shop
9               Pharmacy
10           Coffee Shop
11                   Bar
12      Greek Restaurant
13           Coffee Shop
14                  Café
15                  Park
16           Coffee Shop
17                  Café
20                  Park
21         Jewelry Store
22    Mexican Restaurant
23           Coffee Shop
24        Sandwich Place
25             Gift Shop
26        Sandwich Place
27                  Café
28           Pizza Place
30                  Café
31                   Pub
32       Airport Service
34           Coffee Shop
35           Coffee Shop
36           Coffee Shop
37           Coffee Shop
38    Light Rail Station
Name: 1st Most Common Venue, dtype: object
19    Pool
Name: 1st Most Common Venue, dtype: object
29    Park
33    Park
Name: 1st Most Common Venue, dt