## Data Science Capstone Final Project
### Recommendation of Neighbourhood for New Business by segmenting and Clustering Neighborhoods in Toronto.

###### Lets import the required libraries.

In [16]:
!conda install -c conda-forge beautifulsoup4 --yes
!conda install -c conda-forge lxml --yes
!conda install -c conda-forge scikit-learn --yes
!conda install -c conda-forge folium=0.5.0 --yes
print ("Installed required libraries...")

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00   3.31 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  33.34 MB/s
vincent-0.4.4- 100% |################################| Time: 0:00:00  36.88 MB/s
folium-0.5.0-p 100% |################################| Time: 0:00:00  44.75 MB/s
Installed required libraries...


#####  Extract the html file from the URL , by using Web scraping we will read the data into Pandas Dataframe. Also clean the unwanted data , removed the misleading data.

In [5]:
from bs4 import BeautifulSoup 
import requests
import pandas as pd


res= requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
soup = BeautifulSoup(res.content,'lxml')
table = soup.find_all('table')[0] 
df_list = pd.read_html(str(table))
df=pd.DataFrame({'Postal Code':df_list[0][0],'Borough':df_list[0][1],'Neighborhood':df_list[0][2]})
df=df.drop([0],axis=0).reset_index(drop=True)
df=df[df.Borough != 'Not assigned'].reset_index(drop=True)

df1=df.groupby(['Postal Code','Borough'])['Neighborhood'].apply(','.join)
df_postcode=pd.DataFrame(df1).reset_index(drop=False)
for index,row in df_postcode.iterrows():
   if row['Neighborhood']== 'Not assigned' :
     row['Neighborhood'] = row['Borough']
df_postcode.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


##### Downloading csv file that has the geographical coordinates of each postal code.

In [6]:
df_lanlat=pd.read_csv('http://cocl.us/Geospatial_data/Geospatial_Coordinates.csv')
df_final = pd.merge(df_postcode, df_lanlat, how='inner')
df_final.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


#### Import the required libraries , Define Foursquare credentials and version and Define getNearbyVenues function for extracting venues.

In [10]:
from geopy.geocoders import Nominatim
#import folium
import matplotlib.cm as cm
import matplotlib.colors as colors


address = 'Toronto, Canada'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

df_toronto=df_final
df_toronto.head()

CLIENT_ID = 'H55E5BKXI4F0YD0TDDZSBNYWWELW1VZZS1QAHO3EBPQZEKYF' # your Foursquare ID
CLIENT_SECRET = 'EC0X5VYQUZ0B45CZAKRKO4MP0FAZ23NOE5MY43MGDZLKSX01' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 50
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

def getNearbyVenues(names, latitudes, longitudes, radius=200):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

df_toronto_venues = getNearbyVenues(names=df_toronto['Neighborhood'],
                                   latitudes=df_toronto['Latitude'],
                                   longitudes=df_toronto['Longitude']
                                  )

The geograpical coordinate of Toronto City are 43.653963, -79.387207.
Your credentails:
CLIENT_ID: H55E5BKXI4F0YD0TDDZSBNYWWELW1VZZS1QAHO3EBPQZEKYF
CLIENT_SECRET:EC0X5VYQUZ0B45CZAKRKO4MP0FAZ23NOE5MY43MGDZLKSX01
Rouge,Malvern
Highland Creek,Rouge Hill,Port Union
Guildwood,Morningside,West Hill
Woburn
Cedarbrae
Scarborough Village
East Birchmount Park,Ionview,Kennedy Park
Clairlea,Golden Mile,Oakridge
Cliffcrest,Cliffside,Scarborough Village West
Birch Cliff,Cliffside West
Dorset Park,Scarborough Town Centre,Wexford Heights
Maryvale,Wexford
Agincourt
Clarks Corners,Sullivan,Tam O'Shanter
Agincourt North,L'Amoreaux East,Milliken,Steeles East
L'Amoreaux West,Steeles West
Upper Rouge
Hillcrest Village
Fairview,Henry Farm,Oriole
Bayview Village
Silver Hills,York Mills
Newtonbrook,Willowdale
Willowdale South
York Mills West
Willowdale West
Parkwoods
Don Mills North
Flemingdon Park,Don Mills South
Bathurst Manor,Downsview North,Wilson Heights
Northwood Park,York University
CFB Toronto,Downsvie


#### Create a one hot encoding dataframe for the neighborhood's venues

In [11]:
# one hot encoding Central_Toronto_venues
df_toronto_venues_onehot = pd.get_dummies(df_toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
df_toronto_venues_onehot['Neighborhood'] = df_toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [df_toronto_venues_onehot.columns[-1]] + list(df_toronto_venues_onehot.columns[:-1])
df_toronto_venues_onehot = df_toronto_venues_onehot[fixed_columns]

df_toronto_venues_onehot.head()
#df_toronto_venues_onehot.shape[0]

Unnamed: 0,Yoga Studio,Adult Boutique,American Restaurant,Arepa Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Auto Workshop,Bakery,Bank,...,Thai Restaurant,Theater,Thrift / Vintage Store,Toy / Game Store,Trail,Turkish Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Bar,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Group the one hot encoded dataframe by Neighborhood to obtain frequencies for each venue and Define the function for extracting the most common venues. Build the top 5 most common venues for each neighborhood dataframe

In [12]:
import numpy as np

df_toronto_venues_onehot.shape
df_toronto_grouped = df_toronto_venues_onehot.groupby('Neighborhood').mean().reset_index()
df_toronto_grouped.head()
df_toronto_grouped.shape

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = df_toronto_grouped['Neighborhood']

for ind in np.arange(df_toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(df_toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.shape[0]

62

### Cluster the neighborhoods based on the common venues (by using k-means clustering where k=3)

In [31]:
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 3

df_toronto_grouped_clustering = df_toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df_toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2,
       1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2], dtype=int32)

#### Create the dataframe for displaying the cluster which each neighborhood belongs to

In [32]:
df_toronto_merged= df_toronto_grouped[['Neighborhood']]

# merge df_toronto_merged with df_final to add latitude/longitude for each neighborhood
df_toronto_merged = df_toronto_merged.join(df_final.set_index('Neighborhood'), on='Neighborhood')

# add clustering labels
df_toronto_merged['Cluster_Labels'] = kmeans.labels_

# merge df_toronto_merged with neighborhoods_venues_sorted to add first five most common venues
df_toronto_merged = df_toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

df_toronto_merged.shape[0]

62

### Generate Folium map of Toronto and show the created clusters

In [33]:
import folium

# Assuming Toronto coordinates as 43.653170, -79.383541
tor_latitude = 43.653170
tor_longitude = -79.383541

# create map
map_clusters = folium.Map(location=[tor_latitude, tor_longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_toronto_merged['Latitude'], df_toronto_merged['Longitude'], df_toronto_merged['Neighborhood'], df_toronto_merged['Cluster_Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Cluster 1 : There are many restaurants,Cafe's and eateries in this region

In [38]:
df_toronto_merged.loc[df_toronto_merged['Cluster_Labels'] == 0, df_toronto_merged.columns[[0] + [2] + list(range(5, df_toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Borough,Cluster_Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,"Adelaide,King,Richmond",Downtown Toronto,0,Steakhouse,Asian Restaurant,Park,Greek Restaurant,Hotel
1,"Alderwood,Long Branch",Etobicoke,0,Pharmacy,Coffee Shop,Dance Studio,Bank,Pizza Place
2,"Bathurst Manor,Downsview North,Wilson Heights",North York,0,Fried Chicken Joint,Deli / Bodega,Pizza Place,Coffee Shop,Restaurant
3,"Bedford Park,Lawrence Manor East",North York,0,Italian Restaurant,Comfort Food Restaurant,Coffee Shop,Sandwich Place,Juice Bar
4,"Birch Cliff,Cliffside West",Scarborough,0,Café,Women's Store,Dim Sum Restaurant,Farmers Market,Ethiopian Restaurant
5,"Bloordale Gardens,Eringate,Markland Wood,Old B...",Etobicoke,0,Liquor Store,Pizza Place,Café,Dim Sum Restaurant,Farmers Market
6,"Brockton,Exhibition Place,Parkdale Village",West Toronto,0,Playground,Women's Store,Dessert Shop,Farmers Market,Ethiopian Restaurant
7,Business reply mail Processing Centre969 Eastern,East Toronto,0,Auto Workshop,Brewery,Women's Store,Dim Sum Restaurant,Fast Food Restaurant
8,"CN Tower,Bathurst Quay,Island airport,Harbourf...",Downtown Toronto,0,Performing Arts Venue,Dessert Shop,Farmers Market,Ethiopian Restaurant,Electronics Store
9,"Cabbagetown,St. James Town",Downtown Toronto,0,Restaurant,Indian Restaurant,Café,Market,General Entertainment


### Cluster 2 : This is famous for Women Health & Beauty Service and Women's Store

In [39]:
df_toronto_merged.loc[df_toronto_merged['Cluster_Labels'] == 1, df_toronto_merged.columns[[0] + [2] + list(range(5, df_toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Borough,Cluster_Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
46,Roselawn,Central Toronto,1,Health & Beauty Service,Women's Store,Dim Sum Restaurant,Fast Food Restaurant,Farmers Market


### Cluster 3 : There are parks and Dessert Shop and Farmer's Market

In [41]:
df_toronto_merged.loc[df_toronto_merged['Cluster_Labels'] == 2, df_toronto_merged.columns[[0] + [2] + list(range(5, df_toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Borough,Cluster_Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
42,"Moore Park,Summerhill East",Central Toronto,2,Park,Women's Store,Dessert Shop,Farmers Market,Ethiopian Restaurant
45,Rosedale,Downtown Toronto,2,Park,Women's Store,Dessert Shop,Farmers Market,Ethiopian Restaurant
54,"The Beaches West,India Bazaar",East Toronto,2,Fish & Chips Shop,Park,Dessert Shop,Farmers Market,Ethiopian Restaurant
61,York Mills West,North York,2,Park,Women's Store,Dessert Shop,Farmers Market,Ethiopian Restaurant


#### As Custer 1 looks like quite of happening place and there are quite restorants , Cafes are present so lets explore and see is there any Indian Restaurant and Fitnes Club

In [91]:
df_cluster1= df_toronto_merged.loc[df_toronto_merged['Cluster_Labels'] == 0, df_toronto_merged.columns[[0] + [2] + list(range(5, df_toronto_merged.shape[1]))]]
df_result1=df_cluster1[df_cluster1['1st Most Common Venue'].str.contains('Indian Restaurant') | df_cluster1['2nd Most Common Venue'].str.contains('Indian Restaurant') | df_cluster1['3rd Most Common Venue'].str.contains('Indian Restaurant') ].reset_index(drop=False)

## Recommended location for Indian Fast food and snaks center:
#### In Cabbagetown,St. James Town, Thorncliffe Park Neighbourhood there are two Indian Restaurant.Our Contractor wants to open new restaurant for Indian Snaks and Fast Food.
#### So If he open here he can lure the Customer which are visiting Indian restaurants present at these locations and as these restaunrants are not offering/selling any Indian fast food and snaks(They are just selling Indian Meals). So opening new Indian Fast food and snaks center he can make the good start by attracting Indian Customers already vising here.Hence Cabbagetown,St. James Town, Thorncliffe Park Neighbourhood is recommended location for our Contractor.

### Now, Lets find out suitable location for fitness club

In [86]:
df_result2=df_cluster1[df_cluster1['1st Most Common Venue'].str.contains('Gym')| df_cluster1['1st Most Common Venue'].str.contains('Yoga') 
                      | df_cluster1['2nd Most Common Venue'].str.contains('Gym') | df_cluster1['2nd Most Common Venue'].str.contains('Yoga')
                      | df_cluster1['3rd Most Common Venue'].str.contains('Gym') | df_cluster1['3rd Most Common Venue'].str.contains('Yoga')
                     ].reset_index(drop=True)
df_result2.head()


Unnamed: 0,Neighborhood,Borough,Cluster_Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Church and Wellesley,Downtown Toronto,0,Burger Joint,Japanese Restaurant,Gym / Fitness Center,Sushi Restaurant,Nightclub
1,"Dovercourt Village,Dufferin",West Toronto,0,Gym / Fitness Center,Music Venue,Supermarket,Bank,Pharmacy
2,"Harbord,University of Toronto",Downtown Toronto,0,Sandwich Place,Coffee Shop,College Gym,College Cafeteria,Food Truck
3,"Harbourfront,Regent Park",Downtown Toronto,0,Gym / Fitness Center,Coffee Shop,Spa,Bakery,Breakfast Spot
4,Lawrence Park,Central Toronto,0,Gym / Fitness Center,Dessert Shop,Farmers Market,Ethiopian Restaurant,Electronics Store


In [90]:
#df_result3.loc[1, 'Neighborhood'] = df_cluster1.loc[1, 'Neighborhood'] - df_result2.loc[1, 'Neighborhood']
df_result3= pd.concat([df_cluster1, df_result2]).drop_duplicates(keep=False)
#df_result3

df_result3=df_result3[df_result3['1st Most Common Venue'].str.contains('Park')| df_result3['1st Most Common Venue'].str.contains('Outdoors') 
                      | df_result3['2nd Most Common Venue'].str.contains('Park') | df_result3['2nd Most Common Venue'].str.contains('Outdoors')
                      | df_result3['3rd Most Common Venue'].str.contains('Park') | df_result3['3rd Most Common Venue'].str.contains('Outdoors')
                     ].reset_index(drop=True)
df_result3.head()

Unnamed: 0,Neighborhood,Borough,Cluster_Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,"Adelaide,King,Richmond",Downtown Toronto,0,Steakhouse,Asian Restaurant,Park,Greek Restaurant,Hotel
1,The Beaches,East Toronto,0,Park,Other Great Outdoors,Playground,Trail,Dance Studio


## Recommended location for Fitness Club:
#### In The Beaches ,East Toronto seems to be the best place for  Fitness Club. This area is having park and outdoor activities but not the Indoor activities. Also all the activities are not at one place e.g Dance studio is seperate , plaground is at different spot,park is somewhere else so user or customer has to visit different places if he has to do all the sport for fitness activities.
#### So If he open the Fitness Club which will have all the Indoor activities like Gym, Zumba dance , Swimming  Pool, Table tennis, badminton and tennis Court etc will make good business here.