# Capstone Project
#### By: Ajay Sharma

## Introduction
This project revisits the concepts I have learned throughout this course. Through this capstone project, I will also demonstrate how data analysis skills can be used to solve a real-world problem.

In [1]:
import os

import pandas as pd
import numpy as np
#!conda install -c conda-forge geopy --yes

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
#!conda install -c conda-forge pgeocode --yes
import pgeocode
import requests
#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans
print("Hello Capstone Project Course!")

Hello Capstone Project Course!


## Download Toronto Neighborhood data from Wikipedia

In [2]:
#! conda install beautifulsoup4
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re


def parse_data():
    """Scrape the Toronto ("M") postal-code table from Wikipedia.

    Each table cell is expected to hold the postal code in a ``<b>`` tag and
    the text ``Borough(Neighborhood / Neighborhood ...)`` in a ``<span>`` tag.
    Cells whose borough is "Not assigned", and cells that do not follow that
    layout, are skipped.

    Returns:
        list: one ``[postal_code, borough, neighborhoods]`` list per assigned
        postal code, where ``neighborhoods`` is a comma-separated string.

    Raises:
        ValueError: if the page contains no ``<table>`` element.
    """
    url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
    # Use a context manager so the HTTP response is always closed.
    with urlopen(url) as page:
        html = page.read().decode("utf-8")
    soup = BeautifulSoup(html, 'html.parser')

    table = soup.find("table")
    if table is None:
        raise ValueError("No <table> element found at " + url)

    parsed_data = []
    for cell in table.findAll("td"):
        code_tag = cell.find("b")
        borough_tag = cell.find("span")
        # Cells without both tags would otherwise raise AttributeError on `.text`.
        if code_tag is None or borough_tag is None:
            continue

        postal_code = code_tag.text.strip()
        borough_text = borough_tag.text.strip()
        if not postal_code or borough_text == "Not assigned":
            continue

        # Text before the first "(" is the borough; the text up to the next "("
        # (with ")" removed) lists the neighborhoods.
        parts = re.split('[(]', borough_text)
        borough = parts[0].strip()
        if len(parts) > 1:
            raw_names = parts[1].replace(")", "").replace(" / ", ",")
            neighborhoods = ",".join(name.strip() for name in raw_names.split(","))
        else:
            # No "(...)" part: fall back to the borough name instead of raising
            # IndexError. NOTE(review): confirm this fallback is the desired one.
            neighborhoods = borough

        parsed_data.append([postal_code, borough, neighborhoods])
    return parsed_data

In [3]:
# Scrape the postal-code table once and keep the parsed rows for the cells below.
data = parse_data()

## 1. EXPLORE DATA

In [4]:
# Column names for the scraped rows, in the order parse_data() emits them.
headers = ["PostalCode", "Borough", "Neighborhood"]

# Build the table first, then label its columns.
neighborhoods = pd.DataFrame(data)
neighborhoods.columns = headers

# Preview the first rows.
neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park,Harbourfront"
3,M6A,North York,"Lawrence Manor,Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


### Prepare data for visualization

In [5]:
# Shared pgeocode lookup object, created on first use. The original code built
# a new pgeocode.Nominatim("ca") for every row (twice per row across both
# helpers), repeating the constructor's setup work each time.
_CA_GEOCODER = None


def _get_ca_geocoder():
    """Return the shared ``pgeocode.Nominatim("ca")`` instance, creating it on first use."""
    global _CA_GEOCODER
    if _CA_GEOCODER is None:
        _CA_GEOCODER = pgeocode.Nominatim("ca")
    return _CA_GEOCODER


def get_lat(row):
    """Return the latitude pgeocode reports for ``row['PostalCode']``.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with a 'PostalCode' entry.

    Returns:
        The ``latitude`` attribute of the pgeocode lookup result. A later cell
        replaces NaN values, so the lookup can presumably come back empty.
    """
    location = _get_ca_geocoder().query_postal_code(row['PostalCode'])
    return location.latitude


def get_lon(row):
    """Return the longitude pgeocode reports for ``row['PostalCode']``.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with a 'PostalCode' entry.

    Returns:
        The ``longitude`` attribute of the pgeocode lookup result.
    """
    location = _get_ca_geocoder().query_postal_code(row['PostalCode'])
    return location.longitude

In [6]:
# Foursquare credentials are read from the environment so that secrets are not
# stored in (or shared with) the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel. Keys that were
# previously committed in this cell should be treated as compromised and rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
if not CLIENT_ID or not CLIENT_SECRET:
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; Foursquare requests will fail.')
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

In [7]:
# Look up coordinates one row at a time from each row's postal code.
neighborhoods["latitude"] = neighborhoods.apply(get_lat, axis="columns")
neighborhoods["longitude"] = neighborhoods.apply(get_lon, axis="columns")

In [8]:
# Rows whose lookup produced no coordinates get a fixed fallback position.
# NOTE(review): the fallback looks like a manually looked-up coordinate for one
# specific postal code - confirm it is correct for every row that is missing.
neighborhoods['latitude'] = neighborhoods['latitude'].fillna(43.754500)
neighborhoods['longitude'] = neighborhoods['longitude'].fillna(-79.330000)

In [9]:
# Centre of the Toronto base map; the cluster map further down reuses these values.
latitude, longitude = 43.7182197, -79.4482688
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One blue circle per postal-code row, with a "neighborhood, borough" popup.
marker_fields = ['latitude', 'longitude', 'Borough', 'Neighborhood']
for lat, lng, borough, neighborhood in neighborhoods[marker_fields].itertuples(index=False, name=None):
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_toronto)

# Render the map as the cell output.
map_toronto

### Add Neighborhood venue data

In [11]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, timeout=10):
    """Query the Foursquare "explore" endpoint around each location.

    Args:
        names: iterable of neighborhood names.
        latitudes: iterable of latitudes, aligned with ``names``.
        longitudes: iterable of longitudes, aligned with ``names``.
        radius: search radius passed to the API (default 500).
        timeout: seconds to wait for each HTTP request before giving up.

    Returns:
        pandas.DataFrame with one row per returned venue and the columns
        'Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude',
        'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category'.
        The frame has these columns even when no venue is returned.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and encode the query string instead of formatting
        # the credentials into the URL by hand.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=timeout,
        )
        # Fail loudly on HTTP errors rather than with a KeyError on the payload.
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups') or []
        items = groups[0].get('items', []) if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # A venue without categories is kept with a missing category
                # instead of raising IndexError.
                categories[0]['name'] if categories else None))

    # Passing `columns` here keeps the schema stable for an empty result.
    return pd.DataFrame(venue_rows, columns=columns)

In [12]:
# Fetch nearby venues for every postal-code row, using the default search radius.
toronto_venues = getNearbyVenues(
    names=neighborhoods['Neighborhood'],
    latitudes=neighborhoods['latitude'],
    longitudes=neighborhoods['longitude'],
)

In [13]:
# Display the collected venue table.
toronto_venues

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.7545,-79.3300,Brookbanks Park,43.751976,-79.332140,Park
1,Parkwoods,43.7545,-79.3300,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Parkwoods,43.7545,-79.3300,Brookbanks Pool,43.751389,-79.332184,Pool
3,Victoria Village,43.7276,-79.3148,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,Victoria Village,43.7276,-79.3148,Portugril,43.725819,-79.312785,Portuguese Restaurant
...,...,...,...,...,...,...,...
2167,"Mimico NW,The Queensway West,South of Bloor,Ki...",43.6256,-79.5231,Kingsway Boxing Club,43.627254,-79.526684,Gym
2168,"Mimico NW,The Queensway West,South of Bloor,Ki...",43.6256,-79.5231,Solmaz,43.626773,-79.527259,Food & Drink Shop
2169,"Mimico NW,The Queensway West,South of Bloor,Ki...",43.6256,-79.5231,Buon Giorno Cafe,43.622801,-79.519322,Italian Restaurant
2170,"Mimico NW,The Queensway West,South of Bloor,Ki...",43.6256,-79.5231,Queensway Fish & Chips,43.621720,-79.524588,Fish & Chips Shop


In [14]:

# Preview the first rows of the venue table.
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.7545,-79.33,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.7545,-79.33,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Parkwoods,43.7545,-79.33,Brookbanks Pool,43.751389,-79.332184,Pool
3,Victoria Village,43.7276,-79.3148,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,Victoria Village,43.7276,-79.3148,Portugril,43.725819,-79.312785,Portuguese Restaurant


## 2. SEGMENT DATA
### Group data by Neighborhood

In [15]:
# Count the venue rows collected for each neighborhood.
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,4,4,4,4,4,4
"Alderwood,Long Branch",8,8,8,8,8,8
"Bathurst Manor,Wilson Heights,Downsview North",6,6,6,6,6,6
Bayview Village,4,4,4,4,4,4
"Bedford Park,Lawrence Manor East",24,24,24,24,24,24
...,...,...,...,...,...,...
WillowdaleWest,4,4,4,4,4,4
Woburn,2,2,2,2,2,2
Woodbine Heights,5,5,5,5,5,5
York Mills West,3,3,3,3,3,3


#### Let's find out how many unique categories can be curated from all the returned venues

In [16]:
# Report how many distinct venue categories were returned.
print('There are {} uniques categories.'.format(toronto_venues['Venue Category'].unique().size))

There are 257 uniques categories.


### Explore data for each Neighborhood

In [17]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Afghan Restaurant,Airport,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


And let's examine the new dataframe size.

In [18]:
# (rows, columns) of the one-hot encoded venue table.
toronto_onehot.shape

(2172, 257)

Next, let's group rows by neighborhood, taking the mean frequency of occurrence of each category.

In [19]:
# Average the indicator columns per neighborhood, keeping 'Neighborhood' as a
# regular column rather than as the index.
toronto_grouped = toronto_onehot.groupby('Neighborhood', as_index=False).mean()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Afghan Restaurant,Airport,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,Agincourt,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Alderwood,Long Branch",0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bathurst Manor,Wilson Heights,Downsview North",0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Bedford Park,Lawrence Manor East",0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,WillowdaleWest,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,Woburn,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,Woodbine Heights,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98,York Mills West,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Look at the new size

In [20]:
# (rows, columns) after aggregating to one row per neighborhood.
toronto_grouped.shape

(100, 257)

Let's print each neighborhood along with its top 5 most common venues

In [21]:
num_top_venues = 5

# For every neighborhood, print its most frequent venue categories.
for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")

    # Turn the neighborhood's single row into a two-column (venue, freq) table.
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']

    # The first row holds the neighborhood name itself, not a frequency.
    temp = (
        temp.iloc[1:]
        .assign(freq=lambda frame: frame['freq'].astype(float))
        .round({'freq': 2})
    )

    ranked = temp.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')

----Agincourt ----
                       venue  freq
0         Chinese Restaurant  0.25
1  Latin American Restaurant  0.25
2             Breakfast Spot  0.25
3            Badminton Court  0.25
4                Yoga Studio  0.00


----Alderwood,Long Branch----
            venue  freq
0    Dance Studio  0.12
1        Pharmacy  0.12
2  Sandwich Place  0.12
3             Gym  0.12
4             Pub  0.12


----Bathurst Manor,Wilson Heights,Downsview North----
                      venue  freq
0       Fried Chicken Joint  0.17
1               Pizza Place  0.17
2  Mediterranean Restaurant  0.17
3               Coffee Shop  0.17
4             Deli / Bodega  0.17


----Bayview Village----
                        venue  freq
0                        Park  0.25
1                       Trail  0.25
2  Construction & Landscaping  0.25
3                 Flower Shop  0.25
4         Monument / Landmark  0.00


----Bedford Park,Lawrence Manor East----
                venue  freq
0  Italian Restaurant 

                       venue  freq
0                     Bakery   0.5
1           Basketball Court   0.5
2                  Nightclub   0.0
3  Middle Eastern Restaurant   0.0
4         Miscellaneous Shop   0.0


----North Toronto West----
                venue  freq
0          Playground  0.25
1              Garden  0.25
2            Gym Pool  0.25
3                Park  0.25
4  Miscellaneous Shop  0.00


----Northwood Park,York University----
                        venue  freq
0                  Sports Bar  0.14
1              Sandwich Place  0.14
2   Middle Eastern Restaurant  0.14
3              Massage Studio  0.14
4  Modern European Restaurant  0.14


----Old Mill South,King's Mill Park,Sunnylea,Humber Bay,Mimico NE,The Queensway East,Royal York South East,Kingsway Park South East----
                 venue  freq
0       Baseball Field  0.33
1    Convenience Store  0.33
2                 Park  0.33
3  Moroccan Restaurant  0.00
4          Music Venue  0.00


----Ontario Provincial

First, let's write a function to sort the venues in descending order.

In [22]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first entry of ``row`` is skipped (callers pass rows whose first entry
    is the neighborhood name); the remaining entries are sorted in descending
    order and the index labels of the first ``num_top_venues`` are returned as
    an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

Let's put that into a pandas dataframe
Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [23]:
num_top_venues = 10
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for rank in range(1, num_top_venues + 1):
    # Choose the ordinal suffix explicitly instead of relying on a bare
    # `except:` around a list lookup: ranks ending in 1/2/3 take st/nd/rd,
    # except 11-13, and everything else takes "th".
    if rank % 10 in (1, 2, 3) and rank % 100 not in (11, 12, 13):
        suffix = indicators[rank % 10 - 1]
    else:
        suffix = 'th'
    columns.append('{}{} Most Common Venue'.format(rank, suffix))

# create a new dataframe with one row per neighborhood
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

# fill the ranked venue categories row by row
for ind in range(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Latin American Restaurant,Badminton Court,Breakfast Spot,Chinese Restaurant,Financial or Legal Service,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Field,Women's Store
1,"Alderwood,Long Branch",Pizza Place,Dance Studio,Sandwich Place,Gym,Pub,Coffee Shop,Convenience Store,Pharmacy,Curling Ice,Electronics Store
2,"Bathurst Manor,Wilson Heights,Downsview North",Pizza Place,Mediterranean Restaurant,Coffee Shop,Middle Eastern Restaurant,Deli / Bodega,Fried Chicken Joint,Fast Food Restaurant,Event Space,Falafel Restaurant,Farmers Market
3,Bayview Village,Flower Shop,Construction & Landscaping,Park,Trail,Women's Store,Fast Food Restaurant,Ethiopian Restaurant,Event Space,Falafel Restaurant,Farmers Market
4,"Bedford Park,Lawrence Manor East",Sandwich Place,Italian Restaurant,Coffee Shop,Butcher,Pizza Place,Juice Bar,Pub,Thai Restaurant,Comfort Food Restaurant,Liquor Store


## 3. DATA CLUSTERING

Run k-means to cluster the neighborhoods into 5 clusters.

In [24]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 1, 1, 0, 1, 1, 1, 1, 1, 0], dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [28]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = neighborhoods

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,latitude,longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.7545,-79.3300,0.0,Food & Drink Shop,Pool,Park,Flea Market,Flower Shop,Fish Market,Fish & Chips Shop,Financial or Legal Service,Eastern European Restaurant,Field
1,M4A,North York,Victoria Village,43.7276,-79.3148,1.0,Pizza Place,Financial or Legal Service,Portuguese Restaurant,Intersection,Park,Hockey Arena,Coffee Shop,Fish & Chips Shop,Fish Market,Flea Market
2,M5A,Downtown Toronto,"Regent Park,Harbourfront",43.6555,-79.3626,1.0,Coffee Shop,Breakfast Spot,Yoga Studio,Pub,Food Truck,Spa,Beer Store,Event Space,Restaurant,Electronics Store
3,M6A,North York,"Lawrence Manor,Lawrence Heights",43.7223,-79.4504,1.0,Clothing Store,Coffee Shop,Women's Store,Restaurant,Sushi Restaurant,Toy / Game Store,Food Court,Furniture / Home Store,Men's Store,Bakery
4,M7A,Queen's Park,Ontario Provincial Government,43.6641,-79.3889,1.0,Sushi Restaurant,Italian Restaurant,Ethiopian Restaurant,Dance Studio,Burrito Place,Martial Arts School,Café,Mexican Restaurant,Bubble Tea Shop,Beer Bar
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway,Montgomery Road,Old Mill North",43.6518,-79.5076,1.0,Breakfast Spot,Sushi Restaurant,Bank,Burger Joint,Pub,Liquor Store,Boutique,Coffee Shop,Restaurant,Bakery
99,M4Y,Downtown Toronto,Church and Wellesley,43.6656,-79.3830,1.0,Japanese Restaurant,Sushi Restaurant,Coffee Shop,Restaurant,Yoga Studio,Gay Bar,Hotel,Fast Food Restaurant,Mediterranean Restaurant,Men's Store
100,M7Y,East TorontoBusiness reply mail Processing Cen...,Enclave of M4L,43.7804,-79.2505,1.0,Coffee Shop,Restaurant,Yoga Studio,Italian Restaurant,Martial Arts School,Breakfast Spot,Bookstore,Sushi Restaurant,Japanese Restaurant,Bank
101,M8Y,Etobicoke,"Old Mill South,King's Mill Park,Sunnylea,Humbe...",43.6325,-79.4939,0.0,Park,Baseball Field,Convenience Store,Financial or Legal Service,Event Space,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Field,Women's Store


In [29]:
# Drop any rows with NaN values (e.g. rows that received no cluster label in
# the join), then renumber the remaining rows from 0. The original cell also
# evaluated a filter expression whose result was discarded; it is removed.
toronto_merged = toronto_merged.dropna().reset_index(drop=True)

### Visualize the Clusters

In [30]:
  
# The join produced float labels (NaN-capable); convert them to int so they
# can be used as list indices below.
toronto_merged["Cluster Labels"] = toronto_merged["Cluster Labels"].astype(int)

# create map, centred on the same coordinates as the base map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(toronto_merged['latitude'], toronto_merged['longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Cluster labels are 0-based, so index the palette directly; the previous
    # `cluster-1` only worked for cluster 0 through negative-index wraparound.
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

### DataFrame Shape

In [31]:
# Final (rows, columns) of the merged table after rows with missing values were dropped.
toronto_merged.shape

(100, 16)