In [15]:
import getpass
import os
import re
from urllib.request import urlopen

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values


# Segmenting London to find the best Borough to open up a Sports Shop 

## 1. Download and Explore Dataset

In [16]:

import json # library to handle JSON files
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
# import k-means from clustering stage
from sklearn.cluster import KMeans

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# install and import folium library 
!pip -q install folium
import folium 

# import k-means from clustering stage
from sklearn.cluster import KMeans

In [17]:
# Download the Wikipedia list of London boroughs and check that the table the
# following cell parses is actually present.
#
# The row loop that used to live here indexed single characters
# (`borough_name[0]`, `ll[2]`, `lat_long[1]`), so every row raised IndexError,
# was silently skipped, and `df_list` was always left empty. The working
# parser is in the next cell, so this cell is limited to a checked fetch.
URL = "https://en.wikipedia.org/wiki/List_of_London_boroughs"

response = requests.get(URL, timeout=30)
response.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page
res = response.text
soup = BeautifulSoup(res, 'lxml')

df_list = []  # populated by the parsing cell below

borough_table = soup.find('table', class_='wikitable sortable')
if borough_table is None:
    raise ValueError('Could not find the boroughs table on {}'.format(URL))

# Number of rows after the first (header) row.
len(borough_table.find_all('tr')[1:])

In [18]:
URL = "https://en.wikipedia.org/wiki/List_of_London_boroughs"
NAME_CELL = 0   # index of the <td> that holds the borough name
COORD_CELL = 8  # index of the <td> that holds the coordinates


def parse_borough_row(cells):
    """Convert the <td> cells of one table row into (borough, latitude, longitude).

    Parameters
    ----------
    cells : list
        The <td> elements of a single <tr>, in document order.

    Returns
    -------
    tuple or None
        ``(borough_name, latitude, longitude)`` with the coordinates as floats,
        or ``None`` when the row has too few cells or its coordinate text does
        not have the expected shape.
    """
    if len(cells) <= COORD_CELL:
        return None

    # Keep only the text before the first '[' (drops bracketed suffixes).
    borough_name = cells[NAME_CELL].get_text().split('[')[0].strip()

    # The coordinate text is split on '/'; the third segment is expected to
    # look like "51.5607; 0.1557 (Barking and Dagenham)".
    segments = cells[COORD_CELL].get_text().split('/')
    if len(segments) < 3:
        return None

    # Cut off the "(...)" suffix and any byte-order marks, then split lat;long.
    decimal_pair = segments[2].split('(')[0].replace(u'\ufeff', '').split(';')
    if len(decimal_pair) < 2:
        return None

    try:
        latitude = float(decimal_pair[0].strip())
        longitude = float(decimal_pair[1].strip())
    except ValueError:
        return None
    return borough_name, latitude, longitude


response = requests.get(URL, timeout=30)
response.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page
res = response.text
soup = BeautifulSoup(res, 'lxml')

borough_table = soup.find('table', class_='wikitable sortable')
if borough_table is None:
    raise ValueError('Could not find the boroughs table on {}'.format(URL))

df_list = []       # (borough name, latitude, longitude) tuples
skipped_rows = []  # rows that could not be parsed, kept for inspection
for items in borough_table.find_all('tr')[1:]:
    parsed = parse_borough_row(items.find_all(['td']))
    if parsed is None:
        skipped_rows.append(items)
    else:
        df_list.append(parsed)

# Report skipped rows instead of dropping them silently.
if skipped_rows:
    print('Skipped {} table row(s) that could not be parsed.'.format(len(skipped_rows)))

In [19]:
# Turn the scraped (name, latitude, longitude) tuples into a DataFrame.
borough_columns = ['Borough', 'Latitude', 'Longitude']
df_boroughs = pd.DataFrame(data=df_list, columns=borough_columns)

df_boroughs.head()

Unnamed: 0,Borough,Latitude,Longitude
0,Barking and Dagenham,51.5607,0.1557
1,Barnet,51.6252,-0.1517
2,Bexley,51.4549,0.1505
3,Brent,51.5588,-0.2817
4,Bromley,51.4039,0.0198


In [20]:
# Show the dtype of each column in df_boroughs.
df_boroughs.dtypes

Borough       object
Latitude     float64
Longitude    float64
dtype: object

In [21]:
# Look up the coordinates of London; they are used to centre the map.
address = 'London, UK'

geolocator = Nominatim(user_agent="ny_explorer")
# An explicit timeout avoids depending on the geocoder's default.
location = geolocator.geocode(address, timeout=10)
if location is None:
    # geocode() returns None when nothing matches; raise a clear error
    # rather than an AttributeError on the attribute access below.
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of London, UK are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of London, UK are 51.5073219, -0.1276474.


In [62]:
# Base map centred on `latitude` / `longitude`.
map_ldn = folium.Map(location=[latitude, longitude], zoom_start=10)

# Draw one circle marker per borough, using the borough name as popup text.
marker_rows = df_boroughs[['Latitude', 'Longitude', 'Borough']].itertuples(index=False, name=None)
for lat, lng, borough in marker_rows:
    popup = folium.Popup('{}'.format(borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_ldn)

map_ldn

### Define Foursquare Credentials and Version

In [22]:
# Foursquare credentials are read from the environment (FOURSQUARE_CLIENT_ID /
# FOURSQUARE_CLIENT_SECRET) and prompted for when a variable is unset, so that
# no secret is stored in the notebook. The values are never echoed.
# NOTE(review): the key pair that was previously hard-coded here should be
# considered leaked and rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID') or getpass.getpass('Foursquare CLIENT_ID: ') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET') or getpass.getpass('Foursquare CLIENT_SECRET: ') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

# Confirm the credentials were loaded without printing them.
print('Your credentails:')
print('CLIENT_ID: ' + '*' * 8)
print('CLIENT_SECRET:' + '*' * 8)

Your credentails:
CLIENT_ID: ********
CLIENT_SECRET:********


## 2. Explore Venues in Boroughs

In [30]:
def getNearbyVenues(names, latitudes, longitudes, radius=60, limit=100):
    """Query the Foursquare "explore" endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterable
        Parallel sequences; one request is made per (name, lat, lng) triple.
    radius : int, default 60
        Value sent as the ``radius`` query parameter.
        NOTE(review): 60 looks very small for a whole borough — confirm it is
        intended.
    limit : int, default 100
        Value sent as the ``limit`` query parameter. This used to be read from
        a global ``LIMIT`` that the notebook never defines.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Borough',
        'Borough Latitude', 'Borough Longitude', 'Venue', 'Venue Latitude',
        'Venue Longitude' and 'Venue Category'. The frame is empty (but still
        has these columns) when no venue is returned.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an HTTP error status.
    """
    columns = ['Borough',
               'Borough Latitude',
               'Borough Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build the query string from a parameter dict.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': limit,
            },
            timeout=30)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue.get('categories') or []
            # A venue without categories is kept with a missing category
            # instead of aborting the whole download with an IndexError.
            category = categories[0]['name'] if categories else None
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category))

    # Passing the column names to the constructor also works for zero rows.
    return pd.DataFrame(venue_rows, columns=columns)

In [31]:
# Fetch nearby venues for every borough, using the function's default radius.
df_venues = getNearbyVenues(
    df_boroughs['Borough'],
    df_boroughs['Latitude'],
    df_boroughs['Longitude'],
)

In [32]:
# Preview the first venues returned.
df_venues.head()

Unnamed: 0,Borough,Borough Latitude,Borough Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Barnet,51.6252,-0.1517,The Atrium,51.624726,-0.151933,Café
1,Brent,51.5588,-0.2817,Starbucks,51.558689,-0.281994,Coffee Shop
2,Bromley,51.4039,0.0198,Adventure Kingdom,51.404121,0.020271,Playground
3,Croydon,51.3714,-0.0977,Queen's Gardens,51.371861,-0.097697,Park
4,Enfield,51.6538,-0.0799,Ada,51.653614,-0.080456,Turkish Restaurant


## 3. Analyze Each Borough

In [33]:
# one hot encoding
df_onehot = pd.get_dummies(df_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
df_onehot['Borough'] = df_venues['Borough'] 

# move neighborhood column to the first column
fixed_columns = [df_onehot.columns[-1]] + list(df_onehot.columns[:-1])
df_onehot = df_onehot[fixed_columns]

df_onehot.head()

Unnamed: 0,Borough,Asian Restaurant,Bakery,Bus Stop,Café,Caribbean Restaurant,Chinese Restaurant,Coffee Shop,Diner,Fast Food Restaurant,...,Playground,Pub,Restaurant,Sandwich Place,South American Restaurant,Supermarket,Theater,Turkish Restaurant,Vietnamese Restaurant,Warehouse Store
0,Barnet,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Brent,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Bromley,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,Croydon,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Enfield,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [34]:
# (rows, columns) of the one-hot frame.
df_onehot.shape

(45, 33)

In [35]:
# Average the indicator columns per borough, i.e. the share of each
# borough's venues that falls in each category.
borough_groups = df_onehot.groupby('Borough')
df_grouped = borough_groups.mean().reset_index()
df_grouped

Unnamed: 0,Borough,Asian Restaurant,Bakery,Bus Stop,Café,Caribbean Restaurant,Chinese Restaurant,Coffee Shop,Diner,Fast Food Restaurant,...,Playground,Pub,Restaurant,Sandwich Place,South American Restaurant,Supermarket,Theater,Turkish Restaurant,Vietnamese Restaurant,Warehouse Store
0,Barnet,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Brent,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Bromley,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Croydon,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Enfield,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5,Hackney,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Hammersmith and Fulham,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,...,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Islington,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,...,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.142857,0.0,0.0
8,Kingston upon Thames,0.2,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0
9,Lambeth,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest values in ``row``, skipping its first entry.

    The first entry of ``row`` is ignored; the remaining entries are sorted in
    descending order and the index labels of the first ``num_top_venues`` of
    them are returned as an array. Entries with equal values come out in
    whatever order ``sort_values`` leaves them.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [46]:
num_top_venues = 11

indicators = ['st', 'nd', 'rd']

# Column names: 'Borough', then '1st Most Common Venue', '2nd ...', '3rd ...',
# '4th ...' and so on. The suffix is chosen with an explicit bounds check
# rather than by catching every exception from indicators[ind].
columns = ['Borough']
for ind in range(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# Build all rows first and create the frame once: the borough name followed
# by its top categories as ranked by return_most_common_venues.
top_venue_rows = [
    [row['Borough']] + list(return_most_common_venues(row, num_top_venues))
    for _, row in df_grouped.iterrows()
]
df_venues_sorted = pd.DataFrame(top_venue_rows, columns=columns)

df_venues_sorted.head()

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue
0,Barnet,Café,Warehouse Store,Vietnamese Restaurant,Bakery,Bus Stop,Caribbean Restaurant,Chinese Restaurant,Coffee Shop,Diner,Fast Food Restaurant,Flower Shop
1,Brent,Coffee Shop,Warehouse Store,Vietnamese Restaurant,Bakery,Bus Stop,Café,Caribbean Restaurant,Chinese Restaurant,Diner,Fast Food Restaurant,Flower Shop
2,Bromley,Playground,Warehouse Store,Mediterranean Restaurant,Bakery,Bus Stop,Café,Caribbean Restaurant,Chinese Restaurant,Coffee Shop,Diner,Fast Food Restaurant
3,Croydon,Park,Warehouse Store,Mediterranean Restaurant,Bakery,Bus Stop,Café,Caribbean Restaurant,Chinese Restaurant,Coffee Shop,Diner,Fast Food Restaurant
4,Enfield,Turkish Restaurant,Warehouse Store,Mediterranean Restaurant,Bakery,Bus Stop,Café,Caribbean Restaurant,Chinese Restaurant,Coffee Shop,Diner,Fast Food Restaurant


## 4. Cluster Boroughs

In [47]:
# set number of clusters
kclusters = 5

df_grouped_clustering = df_grouped.drop('Borough', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 
kmeans.labels_.shape

(17,)

In [48]:
# add clustering labels
df_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

df_merged = df_boroughs

# merge 
df_merged = df_merged.join(df_venues_sorted.set_index('Borough'), on='Borough')

df_merged.head() 

Unnamed: 0,Borough,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue
0,Barking and Dagenham,51.5607,0.1557,,,,,,,,,,,,
1,Barnet,51.6252,-0.1517,2.0,Café,Warehouse Store,Vietnamese Restaurant,Bakery,Bus Stop,Caribbean Restaurant,Chinese Restaurant,Coffee Shop,Diner,Fast Food Restaurant,Flower Shop
2,Bexley,51.4549,0.1505,,,,,,,,,,,,
3,Brent,51.5588,-0.2817,4.0,Coffee Shop,Warehouse Store,Vietnamese Restaurant,Bakery,Bus Stop,Café,Caribbean Restaurant,Chinese Restaurant,Diner,Fast Food Restaurant,Flower Shop
4,Bromley,51.4039,0.0198,1.0,Playground,Warehouse Store,Mediterranean Restaurant,Bakery,Bus Stop,Café,Caribbean Restaurant,Chinese Restaurant,Coffee Shop,Diner,Fast Food Restaurant


## 5. Examine Clusters

In [51]:
# Boroughs assigned to cluster 0.
cluster_0_mask = df_merged['Cluster Labels'] == 0
df_merged.loc[cluster_0_mask]

Unnamed: 0,Borough,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue
8,Enfield,51.6538,-0.0799,0.0,Turkish Restaurant,Warehouse Store,Mediterranean Restaurant,Bakery,Bus Stop,Café,Caribbean Restaurant,Chinese Restaurant,Coffee Shop,Diner,Fast Food Restaurant


In [53]:
# Boroughs assigned to cluster 1.
cluster_1_mask = df_merged['Cluster Labels'] == 1
df_merged.loc[cluster_1_mask]

Unnamed: 0,Borough,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue
4,Bromley,51.4039,0.0198,1.0,Playground,Warehouse Store,Mediterranean Restaurant,Bakery,Bus Stop,Café,Caribbean Restaurant,Chinese Restaurant,Coffee Shop,Diner,Fast Food Restaurant
10,Hackney,51.545,-0.0553,1.0,Movie Theater,Museum,Park,Performing Arts Venue,Pub,Warehouse Store,Fast Food Restaurant,Grocery Store,Flower Shop,Diner,Italian Restaurant
11,Hammersmith and Fulham,51.4927,-0.2339,1.0,Chinese Restaurant,Pub,Flower Shop,Warehouse Store,Mediterranean Restaurant,Bakery,Bus Stop,Café,Caribbean Restaurant,Coffee Shop,Diner
17,Islington,51.5416,-0.1022,1.0,Mediterranean Restaurant,Turkish Restaurant,Bakery,South American Restaurant,Music Venue,Karaoke Bar,Diner,Flower Shop,Italian Restaurant,Gym,Grocery Store
19,Kingston upon Thames,51.4085,-0.3064,1.0,Italian Restaurant,Asian Restaurant,Vietnamese Restaurant,Café,Bakery,Bus Stop,Caribbean Restaurant,Chinese Restaurant,Coffee Shop,Diner,Fast Food Restaurant
20,Lambeth,51.4607,-0.1163,1.0,Caribbean Restaurant,Warehouse Store,Vietnamese Restaurant,Bakery,Bus Stop,Café,Chinese Restaurant,Coffee Shop,Diner,Fast Food Restaurant,Flower Shop
21,Lewisham,51.4452,-0.0209,1.0,Coffee Shop,Theater,Café,Vietnamese Restaurant,Bus Stop,Bakery,Caribbean Restaurant,Chinese Restaurant,Mediterranean Restaurant,Diner,Fast Food Restaurant
22,Merton,51.4014,-0.1958,1.0,Bus Stop,Coffee Shop,Italian Restaurant,Warehouse Store,Vietnamese Restaurant,Bakery,Café,Caribbean Restaurant,Chinese Restaurant,Diner,Fast Food Restaurant
24,Redbridge,51.559,0.0741,1.0,Warehouse Store,Supermarket,Mediterranean Restaurant,Bakery,Bus Stop,Café,Caribbean Restaurant,Chinese Restaurant,Coffee Shop,Diner,Fast Food Restaurant
25,Richmond upon Thames,51.4479,-0.326,1.0,Vietnamese Restaurant,Pharmacy,Fast Food Restaurant,Karaoke Bar,Italian Restaurant,Gym,Grocery Store,Flower Shop,Warehouse Store,Mediterranean Restaurant,Coffee Shop


In [55]:
# Boroughs assigned to cluster 2.
cluster_2_mask = df_merged['Cluster Labels'] == 2
df_merged.loc[cluster_2_mask]

Unnamed: 0,Borough,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue
1,Barnet,51.6252,-0.1517,2.0,Café,Warehouse Store,Vietnamese Restaurant,Bakery,Bus Stop,Caribbean Restaurant,Chinese Restaurant,Coffee Shop,Diner,Fast Food Restaurant,Flower Shop


In [56]:
# Boroughs assigned to cluster 3.
cluster_3_mask = df_merged['Cluster Labels'] == 3
df_merged.loc[cluster_3_mask]

Unnamed: 0,Borough,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue
6,Croydon,51.3714,-0.0977,3.0,Park,Warehouse Store,Mediterranean Restaurant,Bakery,Bus Stop,Café,Caribbean Restaurant,Chinese Restaurant,Coffee Shop,Diner,Fast Food Restaurant


In [57]:
# Boroughs assigned to cluster 4.
cluster_4_mask = df_merged['Cluster Labels'] == 4
df_merged.loc[cluster_4_mask]

Unnamed: 0,Borough,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue
3,Brent,51.5588,-0.2817,4.0,Coffee Shop,Warehouse Store,Vietnamese Restaurant,Bakery,Bus Stop,Café,Caribbean Restaurant,Chinese Restaurant,Diner,Fast Food Restaurant,Flower Shop



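## Conclusion

After examining the five clusters above, we recommend to our stakeholders that Islington is the best borough in which to open a sports shop, because Gym appears among its top venues. Note, however, that the cluster 1 table above also lists Gym among the top venues for Richmond upon Thames, so Islington is not the only such borough and the two should be compared before a final decision.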