# Coursera Capstone #

### This notebook contains the Coursera capstone project. ###

In [2]:
#Import libraries
import os
import urllib.request

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
import sklearn.utils
from bs4 import BeautifulSoup
from sklearn.cluster import DBSCAN, KMeans
from sklearn.preprocessing import StandardScaler

### Scrape the Toronto postal-code table from Wikipedia with BeautifulSoup and urllib, and load it into a pandas DataFrame ###

In [3]:
# Download the Wikipedia page that lists the "M" (Toronto) postal codes.
quote_page = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
with urllib.request.urlopen(quote_page) as page:
    # Parse while the response is still open; the context manager closes it afterwards.
    soup = BeautifulSoup(page, 'html.parser')

# The first <table> on the page is taken to be the postal-code table.
postal_table = soup.find('table')
if postal_table is None:
    raise ValueError('No <table> element found at {}'.format(quote_page))

# Column names come from the <th> cells. get_text() is used rather than
# .string because .string is None when a header cell contains nested tags.
header = [th.get_text().strip() for th in postal_table.find_all('th')]

# Build one record per table row by reading its <td> cells individually.
# Rows without any <td> (i.e. the header row) are skipped.
body = []
for tr in postal_table.find_all('tr'):
    cells = [td.get_text().strip() for td in tr.find_all('td')]
    if cells:
        body.append(cells)

# Construct the dataframe directly from the list of rows.
postaldf = pd.DataFrame(body, columns=header)

# Display the dataframe
postaldf

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


### This cell goes through to remove bad 'Borough' entries ###

In [4]:
# Remove every row whose borough was never assigned.
# Row labels of the surviving rows are left untouched.
unassigned_mask = postaldf['Borough'].eq('Not assigned')
postaldf.drop(index=postaldf.index[unassigned_mask.values], inplace=True)

postaldf

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


### Report the number of rows and columns in the cleaned DataFrame ###

In [5]:
# Show the cleaned table's dimensions as a (rows, columns) tuple
postaldf.shape

(103, 3)

### Merge latitude and longitude information ###

In [6]:
# Load the per-postal-code coordinates and attach them to the cleaned table.
# The merge uses pandas' default inner join on the shared 'Postal Code' column.
latlong = pd.read_csv('Geospatial_Coordinates.csv')
compdf = pd.merge(postaldf, latlong, on='Postal Code')

compdf

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


### Create folium map of Toronto for review ###

In [7]:
# Base map centred on the Toronto coordinates below.
latitude = 43.6532
longitude = -79.3832
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# One circle marker per row of compdf, with a "<neighbourhood>, <borough>" popup.
marker_fields = compdf[['Latitude', 'Longitude', 'Borough', 'Neighbourhood']]
for lat, lng, borough, neighbourhood in marker_fields.itertuples(index=False, name=None):
    popup = folium.Popup(f'{neighbourhood}, {borough}', parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

### Define Foursquare credentials and API settings (remove the credentials before sharing) ###

In [8]:
# Foursquare credentials are read from the environment so that they never
# have to be typed into (and accidentally shared with) the notebook.
# Both fall back to an empty string when the variable is not set.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')  # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # your Foursquare Secret
VERSION = '20210206'  # Foursquare API version
LIMIT = 120  # value sent as the `limit` query parameter of each request


### Define Venue function. This will be used to capture venues in the area for analysis. ###

In [9]:
def getNearbyVenues(names, borough, latitudes, longitudes, radius=500):
    """Query the Foursquare "explore" endpoint around each location.

    Parameters
    ----------
    names, borough, latitudes, longitudes : iterable
        Parallel sequences describing each location: its name, its borough
        and its coordinates. They are consumed together with ``zip``, so
        iteration stops at the shortest one.
    radius : int, optional
        Value forwarded as the ``radius`` query parameter (default 500).

    Returns
    -------
    pandas.DataFrame
        One row per venue returned by the API, with the columns
        'Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude',
        'Borough', 'Venue', 'Venue Latitude', 'Venue Longitude' and
        'Venue Category'. When no venue is found at all (or the inputs are
        empty) an empty frame with those same columns is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Borough',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    # The loop variable is named borough_name so it does not shadow the
    # `borough` parameter it is drawn from.
    for name, borough_name, lat, lng in zip(names, borough, latitudes, longitudes):
        # Let requests build and encode the query string.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }
        response = requests.get('https://api.foursquare.com/v2/venues/explore',
                                params=params,
                                timeout=30)
        # Fail with a clear HTTP error rather than a KeyError further down.
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # Keep only the relevant information for each nearby venue.
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            rows.append((
                name,
                lat,
                lng,
                borough_name,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # A venue may carry no category; record it as missing.
                categories[0]['name'] if categories else None))

    # Passing the column names here keeps the schema even when `rows` is empty.
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return nearby_venues

In [10]:
# Fetch the venues around every location listed in compdf.
toronto_venues = getNearbyVenues(
    names=compdf['Neighbourhood'],
    borough=compdf['Borough'],
    latitudes=compdf['Latitude'],
    longitudes=compdf['Longitude'],
)

### Run the DBSCAN algorithm to cluster the gathered venues by their longitude/latitude. ###

In [16]:
# DBSCAN hyper-parameters; both apply to the standardised coordinates below.
DBSCAN_EPS = 0.15
DBSCAN_MIN_SAMPLES = 10

# Feature matrix: venue coordinates, with NaNs replaced by 0 and each column
# standardised to zero mean / unit variance.
Clus_dataSet = toronto_venues[['Venue Longitude', 'Venue Latitude']]
Clus_dataSet = np.nan_to_num(Clus_dataSet)
Clus_dataSet = StandardScaler().fit_transform(Clus_dataSet)

# Compute DBSCAN. The estimator takes no random_state, so no seeding is done.
db = DBSCAN(eps=DBSCAN_EPS, min_samples=DBSCAN_MIN_SAMPLES).fit(Clus_dataSet)
labels = db.labels_

# Boolean mask flagging the samples DBSCAN identified as core samples.
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

# Store each venue's cluster label next to the venue itself.
toronto_venues["Clus_Db"] = labels

# Label -1 is excluded from the "real" cluster count but included in the total.
realClusterNum = len(set(labels)) - (1 if -1 in labels else 0)
clusterNum = len(set(labels))

# A sample of clusters
toronto_venues[["Borough", "Neighborhood", "Venue Category", "Clus_Db"]]

Unnamed: 0,Borough,Neighborhood,Venue Category,Clus_Db
0,North York,Parkwoods,Park,-1
1,North York,Parkwoods,Pool,-1
2,North York,Parkwoods,Food & Drink Shop,-1
3,North York,Victoria Village,Hockey Arena,-1
4,North York,Victoria Village,Portuguese Restaurant,-1
...,...,...,...,...
2093,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,...",Social Club,26
2094,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,...",Tanning Salon,26
2095,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,...",Kids Store,26
2096,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,...",Thrift / Vintage Store,26


In [17]:
# Mean coordinates per cluster label. numeric_only=True restricts the mean to
# the numeric columns, so the text columns (names, categories) are ignored
# instead of raising on recent pandas versions.
toronto_clusters = toronto_venues.groupby('Clus_Db').mean(numeric_only=True)

# Expose the cluster label as a regular first column as well. It is taken from
# the group index, so it always matches whatever labels DBSCAN produced rather
# than a hard-coded list.
toronto_clusters.insert(loc=0,
                        column='Clus_Db',
                        value=toronto_clusters.index.to_numpy())
toronto_clusters.head()

Unnamed: 0_level_0,Clus_Db,Neighborhood Latitude,Neighborhood Longitude,Venue Latitude,Venue Longitude
Clus_Db,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
-1,-1,43.723538,-79.39395,43.723415,-79.393828
0,0,43.65204,-79.380856,43.652186,-79.381073
1,1,43.718518,-79.464763,43.718653,-79.464443
2,2,43.706397,-79.309937,43.706471,-79.312642
3,3,43.7259,-79.340923,43.72577,-79.34031
4,4,43.707625,-79.357976,43.707378,-79.356531
5,5,43.669542,-79.422564,43.670399,-79.423918
6,6,43.754328,-79.442259,43.755672,-79.440344
7,7,43.669005,-79.442259,43.668282,-79.441407
8,8,43.778517,-79.346556,43.777782,-79.344651


### Create map of new clusters to review. ###

In [18]:
# Base map centred on the Toronto coordinates below.
latitude = 43.6532
longitude = -79.3832
map_venues = folium.Map(location=[latitude, longitude], zoom_start=11)

# One circle marker per venue, with a "<cluster>, <borough>, <venue>" popup.
venue_fields = toronto_venues[['Venue Latitude', 'Venue Longitude', 'Borough',
                               'Neighborhood', 'Venue', 'Clus_Db']]
for lat, lng, borough, neighborhood, venue, cluster in venue_fields.itertuples(index=False, name=None):
    popup = folium.Popup(f'{cluster}, {borough}, {venue}', parse_html=True)
    circle = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    circle.add_to(map_venues)

# One pin per cluster at its mean venue position; the popup shows the cluster label.
centroid_fields = toronto_clusters[['Venue Latitude', 'Venue Longitude', 'Clus_Db']]
for lat, lng, cluster in centroid_fields.itertuples(index=False, name=None):
    popup = folium.Popup(f'{cluster}', show=True, parse_html=True, sticky=True)
    pin = folium.Marker([lat, lng], popup=popup)
    pin.add_to(map_venues)

map_venues

In [20]:
# One-hot encode the venue categories: one indicator column per category.
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Place each venue's cluster label in front of the indicator columns.
toronto_onehot.insert(0, 'Clus_Db', toronto_venues['Clus_Db'])

toronto_onehot.head()

Unnamed: 0,Clus_Db,Accessories Store,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,American Restaurant,Antique Shop,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,-1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,-1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,-1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,-1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,-1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Average the indicator columns within each cluster, keeping the cluster label
# as an ordinary column rather than as the index.
toronto_grouped = toronto_onehot.groupby('Clus_Db', as_index=False).mean()
toronto_grouped

Unnamed: 0,Clus_Db,Accessories Store,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,American Restaurant,Antique Shop,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,-1,0.0,0.0,0.004608,0.0,0.0,0.0,0.0,0.004608,0.0,...,0.0,0.0,0.004608,0.009217,0.0,0.0,0.0,0.0,0.004608,0.0
1,0,0.0,0.000852,0.0,0.0,0.0,0.0,0.0,0.015332,0.001704,...,0.011925,0.001704,0.0,0.003407,0.0,0.006814,0.000852,0.0,0.0,0.005963
2,1,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0
3,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.018519,0.0,0.0,0.0,0.0,0.018519
6,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015385,0.0,...,0.0,0.015385,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of ``row``.

    The first entry of ``row`` is skipped (in this notebook it holds the
    cluster label rather than a category value). The remaining entries are
    sorted in descending order and the index labels of the first
    ``num_top_venues`` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

### Order top five venues based on clusters to review output ###

In [23]:
num_top_venues = 5

# Ordinal suffixes for the first three ranks; every later rank uses 'th'.
indicators = ['st', 'nd', 'rd']

# Build the column names: the cluster label followed by one column per rank.
columns = ['Clus_Db']
for ind in range(num_top_venues):
    # Explicit bounds check instead of catching the IndexError with a bare except.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# Rank the categories of every cluster row, then assemble the frame in one go
# rather than filling it row by row.
top_venues = [
    return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)
    for ind in range(toronto_grouped.shape[0])
]
borough_venues_sorted = pd.DataFrame(top_venues,
                                     columns=columns[1:],
                                     index=toronto_grouped.index)
borough_venues_sorted.insert(0, 'Clus_Db', toronto_grouped['Clus_Db'])

borough_venues_sorted.head()

Unnamed: 0,Clus_Db,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,-1,Park,Pizza Place,Coffee Shop,Bakery,Grocery Store
1,0,Coffee Shop,Café,Hotel,Restaurant,Japanese Restaurant
2,1,Clothing Store,Furniture / Home Store,Accessories Store,Coffee Shop,Miscellaneous Shop
3,2,Pizza Place,Gym / Fitness Center,Flea Market,Bank,Intersection
4,3,Gym,Restaurant,Beer Store,Coffee Shop,Art Gallery


### Report out all clusters top five venues ###

In [24]:
# Display the complete table of top venues, one row per cluster label
borough_venues_sorted

Unnamed: 0,Clus_Db,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,-1,Park,Pizza Place,Coffee Shop,Bakery,Grocery Store
1,0,Coffee Shop,Café,Hotel,Restaurant,Japanese Restaurant
2,1,Clothing Store,Furniture / Home Store,Accessories Store,Coffee Shop,Miscellaneous Shop
3,2,Pizza Place,Gym / Fitness Center,Flea Market,Bank,Intersection
4,3,Gym,Restaurant,Beer Store,Coffee Shop,Art Gallery
5,4,Coffee Shop,Sporting Goods Shop,Burger Joint,Bank,Restaurant
6,5,Grocery Store,Café,Park,Baby Store,Coffee Shop
7,6,Bank,Coffee Shop,Pet Store,Restaurant,Middle Eastern Restaurant
8,7,Bakery,Pharmacy,Coffee Shop,Furniture / Home Store,Bar
9,8,Clothing Store,Coffee Shop,Fast Food Restaurant,Restaurant,Japanese Restaurant


In [35]:
# Show only the row labelled 1 (Clus_Db 0 in the output below); the double
# brackets keep the result a one-row DataFrame
borough_venues_sorted.loc[[1]]

Unnamed: 0,Clus_Db,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
1,0,Coffee Shop,Café,Hotel,Restaurant,Japanese Restaurant


In [37]:
# Show every cluster except the -1 label, selected by label value so the cell
# does not depend on a fixed number of clusters
borough_venues_sorted[borough_venues_sorted['Clus_Db'] != -1]

Unnamed: 0,Clus_Db,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
1,0,Coffee Shop,Café,Hotel,Restaurant,Japanese Restaurant
2,1,Clothing Store,Furniture / Home Store,Accessories Store,Coffee Shop,Miscellaneous Shop
3,2,Pizza Place,Gym / Fitness Center,Flea Market,Bank,Intersection
4,3,Gym,Restaurant,Beer Store,Coffee Shop,Art Gallery
5,4,Coffee Shop,Sporting Goods Shop,Burger Joint,Bank,Restaurant
6,5,Grocery Store,Café,Park,Baby Store,Coffee Shop
7,6,Bank,Coffee Shop,Pet Store,Restaurant,Middle Eastern Restaurant
8,7,Bakery,Pharmacy,Coffee Shop,Furniture / Home Store,Bar
9,8,Clothing Store,Coffee Shop,Fast Food Restaurant,Restaurant,Japanese Restaurant
10,9,Bar,Café,Coffee Shop,Restaurant,Vegetarian / Vegan Restaurant


In [39]:
# Show only the row labelled 0 (Clus_Db -1 in the output below); the double
# brackets keep the result a one-row DataFrame
borough_venues_sorted.loc[[0]]

Unnamed: 0,Clus_Db,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,-1,Park,Pizza Place,Coffee Shop,Bakery,Grocery Store
