# Part 1
## Reading and cleaning the data

In [1]:
import pandas as pd
import numpy as np

### Reading the data

In [2]:
# Download the Wikipedia page and parse every HTML table on it;
# `data` is the resulting list of tables.
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
data = pd.read_html(wiki_url)

In [3]:
# Keep only the first table parsed from the page; from here on `data` is that single table.
data=data[0]

### Cleaning the data

In [4]:
# Discard the rows whose Borough is the placeholder 'Not assigned'.
has_borough = data['Borough'].ne('Not assigned')
data = data.loc[has_borough]

In [5]:
from sklearn.utils import shuffle

Please note that the order of the observations is not the same as shown on the Coursera 'My Submission' page. After applying `groupby`, the observations came out in ascending order of postal code, so I applied the `shuffle` function to put them in random order and make the table look more realistic.

In [6]:
# Collapse rows that share a postal code: one row per (Postcode, Borough),
# with the neighbourhood names joined into a single comma-separated string.
data=data.groupby(['Postcode','Borough'])['Neighbourhood'].apply(', '.join).reset_index()
# change order of the columns
data=data[['Postcode','Borough','Neighbourhood']]
# Shuffle with a fixed seed so the row order (and the table shown below)
# is the same on every run of the notebook.
data=shuffle(data, random_state=0)
data=data.reset_index(drop=True)
# Show a sample rather than the whole frame.
data.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M7A,Downtown Toronto,Queen's Park
1,M2K,North York,Bayview Village
2,M5E,Downtown Toronto,Berczy Park
3,M2J,North York,"Fairview, Henry Farm, Oriole"
4,M5L,Downtown Toronto,"Commerce Court, Victoria Hotel"
5,M6E,York,Caledonia-Fairbanks
6,M1W,Scarborough,L'Amoreaux West
7,M4H,East York,Thorncliffe Park
8,M5V,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo..."
9,M9P,Etobicoke,Westmount


In [7]:
data.shape  # (number of rows, number of columns) of the grouped table

(103, 3)

# Part 2
## Adding Latitude and Longitude to the DataFrame
The Geocoder library was not working reliably, so I downloaded the 'Geospatial_Coordinates' file and imported its data into the notebook.

In [8]:
import os

# Location of the coordinates CSV. Set the GEOSPATIAL_COORDINATES_CSV environment
# variable to run the notebook on another machine; when it is not set, the
# original author's local path is used, so existing behaviour is unchanged.
coord_path = os.environ.get(
    'GEOSPATIAL_COORDINATES_CSV',
    'C:/Users/aryam/Desktop/Python Classes/Advance/4) Applied Data Science Capstone/3. Neighbourhood Segmentation and Clustering/Geospatial_Coordinates.csv',
)
coord=pd.read_csv(coord_path)

In order to match the coordinates to their respective Post Codes, I have arranged both the dataframes 'data' and 'coord' in ascending order of Post Codes and then joined the dataframes.    
Again, I have used shuffle() to arrange observations in random order.

In [9]:
# Sort both frames by postal code and give each a fresh 0..n-1 index.
# Resetting coord's index matters: sort_values alone keeps the original row
# labels, so any index-aligned operation between data and coord would pair rows
# by their position in the CSV file instead of by their sorted position.
data=data.sort_values(by='Postcode',ascending=True).reset_index(drop=True)
coord=coord.sort_values(by='Postal Code',ascending=True).reset_index(drop=True)

In [10]:
# Attach coordinates by matching postal codes instead of relying on both frames
# having the same row order and index. A left merge keeps every row of `data`
# in its current order; validate= raises if a postal code appears more than
# once in `coord`, which would otherwise silently duplicate rows.
data=data.merge(
    coord[['Postal Code','Latitude','Longitude']],
    how='left',
    left_on='Postcode',
    right_on='Postal Code',
    validate='many_to_one',
).drop(columns='Postal Code')

In [11]:
# Shuffle with a fixed seed so the row order is reproducible between runs.
data=shuffle(data, random_state=0)
data=data.reset_index(drop=True)
# Show a sample rather than the whole frame.
data.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M9C,Etobicoke,"Bloordale Gardens, Eringate, Markland Wood, Ol...",43.643515,-79.577201
1,M4E,East Toronto,The Beaches,43.676357,-79.293031
2,M6L,North York,"Downsview, North Park, Upwood Park",43.713756,-79.490074
3,M6G,Downtown Toronto,Christie,43.669542,-79.422564
4,M8Z,Etobicoke,"Kingsway Park South West, Mimico NW, The Queen...",43.628841,-79.520999
5,M6C,York,Humewood-Cedarvale,43.693781,-79.428191
6,M3M,North York,Downsview Central,43.728496,-79.495697
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M3C,North York,"Flemingdon Park, Don Mills South",43.725900,-79.340923
9,M9N,York,Weston,43.706876,-79.518188


# Part 3
## Exploring and Clustering

I will explore and cluster only the boroughs that have the word 'Toronto' in their names.

In [12]:
# Keep the rows whose Borough contains "Toronto". na=False treats a missing
# Borough as "no match" instead of producing an invalid boolean mask.
# reset_index is chained (not in-place) so that `toronto` is an independent
# frame rather than a mutated slice of `data`.
toronto=data[data.Borough.str.contains("Toronto", na=False)].reset_index(drop=True)
toronto

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M6G,Downtown Toronto,Christie,43.669542,-79.422564
2,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
3,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
4,M6R,West Toronto,"Parkdale, Roncesvalles",43.64896,-79.456325
5,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049
6,M6H,West Toronto,"Dovercourt Village, Dufferin",43.669005,-79.442259
7,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
8,M4P,Central Toronto,Davisville North,43.712751,-79.390197
9,M5N,Central Toronto,Roselawn,43.711695,-79.416936


In [13]:
import os

# Foursquare credentials are read from the environment so they are never stored
# in (or published with) the notebook. Export FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel; a KeyError here means one
# of them is missing.
# NOTE(review): the key pair that used to be hardcoded in this cell has been
# exposed and should be revoked and regenerated.
CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID']
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET']
VERSION = '20200116'  # sent as the `v` query parameter of every request
LIMIT=100  # sent as the `limit` query parameter of every request

### Explore Neighbourhoods

In [14]:
import requests
import json

#### Create function to explore venues

In [15]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, timeout=30):
    """Collect the venues Foursquare returns around each neighbourhood.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Neighbourhood label and its coordinates; the three are walked together
        with ``zip``, so they are expected to have the same length.
    radius : int, default 500
        Sent as the ``radius`` query parameter (presumably metres - confirm
        against the Foursquare API documentation).
    timeout : float, default 30
        Seconds to wait for each HTTP response, so that a stalled connection
        cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighbourhood',
        'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame has
        these columns even when no venue was returned at all.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    KeyError
        If the JSON payload has no "response" entry.

    Uses the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.
    """
    columns = ['Neighbourhood',
               'Neighbourhood Latitude',
               'Neighbourhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # make the GET request; fail loudly on an HTTP error instead of on a
        # confusing KeyError further down
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # an empty or missing 'groups' list simply means "no venues here"
        groups = response.json()["response"].get('groups') or []
        results = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            # some venues carry no category at all; record None rather than
            # crashing on categories[0]
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # passing columns= keeps the schema stable even when `rows` is empty
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return(nearby_venues)

In [16]:
# Query the venues around every Toronto neighbourhood (one API request per row of `toronto`).
toronto_venues = getNearbyVenues(
    toronto['Neighbourhood'],
    toronto['Latitude'],
    toronto['Longitude'],
)

In [17]:
# Preview the first rows only instead of dumping the whole venue table.
toronto_venues.head(10)

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,The Beaches,43.676357,-79.293031,Glen Stewart Ravine,43.676300,-79.294784,Other Great Outdoors
4,The Beaches,43.676357,-79.293031,Domino's Pizza,43.679058,-79.297382,Pizza Place
5,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
6,Christie,43.669542,-79.422564,Fiesta Farms,43.668471,-79.420485,Grocery Store
7,Christie,43.669542,-79.422564,Contra Cafe,43.669107,-79.426105,Café
8,Christie,43.669542,-79.422564,Vinny’s Panini,43.670679,-79.426148,Italian Restaurant
9,Christie,43.669542,-79.422564,Starbucks,43.671530,-79.421400,Coffee Shop


#### Take a look at how many unique categories can be curated from all the returned venues

In [45]:
# Count the distinct values in the 'Venue Category' column and report the total.
unique_categories = toronto_venues['Venue Category'].unique()
print('Number of unique categories of venues: {}'.format(len(unique_categories)))

Number of unique categories of venues: 238


### Analyze Neighbourhoods

In [32]:
# One indicator column per venue category; empty prefix so each column is named
# exactly after its category.
toronto_onehot = pd.get_dummies(
    toronto_venues[['Venue Category']],
    prefix="",
    prefix_sep="",
)

# Re-attach the neighbourhood each venue belongs to (this lands as the last column) ...
toronto_onehot['Neighbourhood'] = toronto_venues['Neighbourhood']

# ... and rotate the column order so that the last column comes first.
all_columns = list(toronto_onehot.columns)
fixed_columns = all_columns[-1:] + all_columns[:-1]
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighbourhood,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,The Beaches,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# Average the indicator columns per neighbourhood, keeping 'Neighbourhood'
# as an ordinary column of the result.
toronto_grouped = toronto_onehot.groupby('Neighbourhood', as_index=False).mean()

#### Print each neighborhood along with the top 5 most common venues

In [35]:
num_top_venues = 5

for hood in toronto_grouped['Neighbourhood']:
    print("----"+hood+"----")
    # This neighbourhood's row turned on its side: one line per column of toronto_grouped
    temp = toronto_grouped[toronto_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    # Drop the first line (the 'Neighbourhood' label itself). The explicit copy
    # makes the column assignment below write to an independent frame instead
    # of a slice, which avoids pandas' SettingWithCopyWarning.
    temp = temp.iloc[1:].copy()
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide, King, Richmond----
              venue  freq
0       Coffee Shop  0.07
1        Steakhouse  0.04
2               Bar  0.04
3              Café  0.04
4  Asian Restaurant  0.03


----Berczy Park----
                venue  freq
0         Coffee Shop  0.07
1        Cocktail Bar  0.05
2          Steakhouse  0.04
3  Seafood Restaurant  0.04
4         Cheese Shop  0.04


----Brockton, Exhibition Place, Parkdale Village----
            venue  freq
0            Café  0.14
1  Breakfast Spot  0.09
2     Coffee Shop  0.09
3    Intersection  0.05
4         Stadium  0.05


----Business Reply Mail Processing Centre 969 Eastern----
                  venue  freq
0    Light Rail Station  0.11
1  Gym / Fitness Center  0.06
2               Brewery  0.06
3                   Spa  0.06
4            Smoke Shop  0.06


----CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara----
                venue  freq
0      Airport Lounge  0.12
1     Air

#### Putting values into dataframe

In [36]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of ``row``, skipping its first entry.

    ``row`` is a pandas Series whose first element is not a frequency (the
    caller passes a row that starts with the neighbourhood name). The remaining
    elements are ranked in descending order and the index labels of the top
    ``num_top_venues`` are returned as a NumPy array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    top = ranked.head(num_top_venues)
    return top.index.values

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    # Ranks 1-3 take their suffix from `indicators`; every later rank uses 'th'.
    # An explicit bounds check replaces the former bare `except:`, which would
    # also have hidden any unrelated error raised inside the try block.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

# fill every row with that neighbourhood's top categories, most frequent first
for ind in np.arange(toronto_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,Bar,Steakhouse,Clothing Store,Sushi Restaurant,Hotel,Burger Joint,Thai Restaurant,Asian Restaurant
1,Berczy Park,Coffee Shop,Cocktail Bar,Seafood Restaurant,Beer Bar,Steakhouse,Bakery,Farmers Market,Cheese Shop,Café,Italian Restaurant
2,"Brockton, Exhibition Place, Parkdale Village",Café,Coffee Shop,Breakfast Spot,Intersection,Bar,Stadium,Bakery,Italian Restaurant,Restaurant,Climbing Gym
3,Business Reply Mail Processing Centre 969 Eastern,Light Rail Station,Yoga Studio,Auto Workshop,Park,Comic Shop,Pizza Place,Restaurant,Butcher,Burrito Place,Brewery
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Lounge,Airport Service,Airport Terminal,Sculpture Garden,Rental Car Location,Plane,Boat or Ferry,Boutique,Harbor / Marina,Coffee Shop


### Cluster Neighbourhoods

In [37]:
# KMeans is not imported anywhere else in the notebook; import it here so the
# cell runs on a fresh kernel.
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

# Features are the per-category frequencies only. `columns=` is used because the
# positional axis argument of DataFrame.drop was removed in pandas 2.0.
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighbourhood')

# run k-means clustering; n_init is pinned because its default differs between
# scikit-learn releases, which would change the clusters from one environment
# to another
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [38]:
# add clustering labels; drop a copy left by an earlier run of this cell first,
# because insert() raises if the column already exists
if 'Cluster Labels' in neighbourhoods_venues_sorted.columns:
    neighbourhoods_venues_sorted = neighbourhoods_venues_sorted.drop(columns='Cluster Labels')
neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# add the cluster label and the top venues to toronto, which carries the
# latitude/longitude of each neighbourhood; join() returns a new frame, so
# toronto itself is left unchanged
toronto_merged = toronto.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

# A neighbourhood with no row in neighbourhoods_venues_sorted comes out of the
# join with a missing label, which also turns the whole column into floats.
# Drop those rows and restore integer labels so they can be used as list indices.
toronto_merged = toronto_merged.dropna(subset=['Cluster Labels'])
toronto_merged['Cluster Labels'] = toronto_merged['Cluster Labels'].astype(int)

toronto_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Trail,Other Great Outdoors,Pizza Place,Health Food Store,Neighborhood,Pub,Doner Restaurant,Dim Sum Restaurant,Diner,Discount Store
1,M6G,Downtown Toronto,Christie,43.669542,-79.422564,0,Grocery Store,Café,Park,Gas Station,Italian Restaurant,Diner,Restaurant,Candy Store,Athletics & Sports,Baby Store
2,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,0,Greek Restaurant,Coffee Shop,Italian Restaurant,Ice Cream Shop,Furniture / Home Store,Yoga Studio,Sports Bar,Spa,Juice Bar,Brewery
3,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316,1,Playground,Yoga Studio,Dessert Shop,Event Space,Ethiopian Restaurant,Empanada Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop
4,M6R,West Toronto,"Parkdale, Roncesvalles",43.64896,-79.456325,0,Gift Shop,Bookstore,Dessert Shop,Eastern European Restaurant,Italian Restaurant,Bar,Dog Run,Restaurant,Movie Theater,Breakfast Spot


#### Visualizing the clusters

In [43]:
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

In [44]:
# create map
map_clusters = folium.Map(location=(43.6532,-79.3832), zoom_start=11)

# set color scheme for the clusters: one hex colour per cluster, evenly spaced
# over matplotlib's rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    # rows without a cluster label cannot be coloured; skip them
    if pd.isna(cluster):
        continue
    # labels may arrive as floats if the column contained missing values
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # cluster k uses rainbow[k]; the previous rainbow[cluster-1] only worked for
    # cluster 0 through negative-index wraparound
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters