# Data Wrangling
## Import Libraries

In [1]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analysis
import requests
!pip install folium # map rendering library
import folium # map rendering library
!pip install geocoder
import geocoder

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt

# import k-means from clustering stage
from sklearn.cluster import KMeans

# import BeautifulSoup 4
from bs4 import BeautifulSoup

print('Libraries imported.')

Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/72/ff/004bfe344150a064e558cb2aedeaa02ecbf75e60e148a55a9198f0c41765/folium-0.10.0-py2.py3-none-any.whl (91kB)
[K     |████████████████████████████████| 92kB 4.7MB/s eta 0:00:01
Collecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/63/36/1c93318e9653f4e414a2e0c3b98fc898b4970e939afeedeee6075dd3b703/branca-0.3.1-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.3.1 folium-0.10.0
Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K     |████████████████████████████████| 102kB 5.1MB/s ta 0:00:01
Collecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Installin

## Scrape the Wikipedia page for district data

In [2]:
# Wikipedia articles for the WC and EC postcode areas; each is passed to
# getLondonDistrict below to scrape its district table.
WC_URL = 'https://en.wikipedia.org/wiki/WC_postcode_area'
EC_URL = 'https://en.wikipedia.org/wiki/EC_postcode_area'

In [3]:
def getLondonDistrict(url):
    """Scrape a postcode-area page into a district/neighbourhood table.

    Parameters
    ----------
    url : str
        Address of a page containing a ``<table class="wikitable sortable">``
        whose district rows hold a ``<th>`` cell followed by three ``<td>`` cells.

    Returns
    -------
    pandas.DataFrame
        One row per district row found, with columns ``District`` (the text of
        the row's ``<th>`` cell) and ``Neighbourhood`` (the link texts found in
        the second ``<td>`` cell, joined with ``', '``).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page contains no ``wikitable sortable`` table.
    """
    # A timeout keeps a stalled connection from hanging the notebook forever.
    req = requests.get(url, timeout=30)
    # Fail loudly instead of trying to parse an error page.
    req.raise_for_status()
    soup = BeautifulSoup(req.content, 'lxml')
    district_table = soup.find('table', class_='wikitable sortable')
    if district_table is None:
        raise ValueError('No "wikitable sortable" table found at {}'.format(url))

    records = []
    for row in district_table.find_all('tr'):
        cells = row.find_all('td')
        header = row.find('th')
        # Only rows with exactly three data cells are treated as district rows;
        # a row lacking a header cell is skipped rather than crashing on None.
        if len(cells) != 3 or header is None:
            continue
        records.append({
            'District': header.get_text(strip=True),
            'Neighbourhood': ', '.join(a.get_text() for a in cells[1].find_all('a')),
        })

    # Passing columns explicitly keeps the schema stable even when no row matched.
    london_data = pd.DataFrame(records, columns=['District', 'Neighbourhood'])
    return london_data

In [4]:
# Scrape both postcode areas and stack them into a single frame with a fresh
# 0..n-1 index. pd.concat is used because DataFrame.append was deprecated in
# pandas 1.4 and removed in pandas 2.0.
london_data = pd.concat(
    [getLondonDistrict(WC_URL), getLondonDistrict(EC_URL)],
    ignore_index=True,
)
london_data

Unnamed: 0,District,Neighbourhood
0,WC1A,New Oxford Street
1,WC1B,"Bloomsbury, British Museum, Southampton Row"
2,WC1E,"University College London, SOAS"
3,WC1H,"St Pancras, UCL Institute of Education"
4,WC1N,"Russell Square, Great Ormond Street"
5,WC1R,Gray's Inn
6,WC1V,High Holborn
7,WC1X,"Kings Cross, Finsbury, Clerkenwell"
8,WC2A,"Lincoln's Inn Fields, Royal Courts of Justice,..."
9,WC2B,"Drury Lane, Kingsway, Aldwych"


## Gather district location data

In [5]:
# Look up a latitude/longitude pair for every district code.
Latitude = []
Longitude = []

for curr_district in london_data['District']:
    # Gather latitude/longitude data using the Geocoder API
    g = geocoder.arcgis('{}, London, England'.format(curr_district))
    latlng = g.latlng
    # A falsy latlng (None or empty) would otherwise fail on the indexing below
    # with an opaque TypeError/IndexError that does not name the district.
    if not latlng:
        raise ValueError('Geocoding failed for district {}'.format(curr_district))
    Latitude.append(latlng[0])
    Longitude.append(latlng[1])

london_data['Latitude'] = Latitude
london_data['Longitude'] = Longitude
london_data

Unnamed: 0,District,Neighbourhood,Latitude,Longitude
0,WC1A,New Oxford Street,51.517165,-0.126811
1,WC1B,"Bloomsbury, British Museum, Southampton Row",51.51914,-0.127759
2,WC1E,"University College London, SOAS",51.52242,-0.133671
3,WC1H,"St Pancras, UCL Institute of Education",51.524755,-0.13139
4,WC1N,"Russell Square, Great Ormond Street",51.52446,-0.123583
5,WC1R,Gray's Inn,51.519375,-0.117611
6,WC1V,High Holborn,51.51866,-0.112101
7,WC1X,"Kings Cross, Finsbury, Clerkenwell",51.530185,-0.121485
8,WC2A,"Lincoln's Inn Fields, Royal Courts of Justice,...",51.516615,-0.116091
9,WC2B,"Drury Lane, Kingsway, Aldwych",51.5148,-0.121157


## Map each of the districts

In [6]:
# Geocode London itself to get the point the map is centred on
g = geocoder.arcgis('London, England')
london_lat, london_long = g.latlng[0], g.latlng[1]

# Base map of London
map_london = folium.Map(location=[london_lat, london_long], zoom_start=13)

# Appearance shared by every district marker
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# One circle per district; clicking it shows the district code
for lat, lng, dist in zip(london_data['Latitude'], london_data['Longitude'], london_data['District']):
    label = folium.Popup(dist, parse_html=True)
    marker = folium.CircleMarker([lat, lng], popup=label, **marker_style)
    marker.add_to(map_london)

map_london

## Use the Foursquare API to explore the neighbourhoods

In [7]:
# The code was removed by Watson Studio for sharing.

Foursquare credentails loaded


In [8]:
# A function that returns nearby Bike Shop venues for a given latitude/longitude
def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=100, categoryId='4bf58dd8d48988d115951735'):
    """Query the Foursquare ``venues/explore`` endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences; element ``i`` of each describes one location.
        They are consumed with ``zip``, so iteration stops at the shortest.
    radius : int, optional
        Sent to the API as its ``radius`` parameter.
    limit : int, optional
        Sent to the API as its ``limit`` parameter.
    categoryId : str, optional
        Sent to the API as its ``categoryId`` parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns ``District``,
        ``District Latitude``, ``District Longitude``, ``Venue``,
        ``Venue Latitude``, ``Venue Longitude`` and ``Venue Category``.
        The frame has these columns even when no venue was returned.

    Notes
    -----
    Reads the module-level ``CLIENT_ID``, ``CLIENT_SECRET`` and ``VERSION``
    and prints each location name as it is processed.
    """
    columns = ['District',
               'District Latitude',
               'District Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and URL-encode the query string instead of
        # formatting it by hand.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': limit,
            'categoryId': categoryId,
        }

        # make the GET request
        response = requests.get('https://api.foursquare.com/v2/venues/explore', params=params)
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # Guard against a venue that carries an empty category list.
                categories[0]['name'] if categories else None))

    # Naming the columns in the constructor (rather than assigning .columns
    # afterwards) keeps this from raising when no venue was found at all.
    nearby_venues = pd.DataFrame(venue_rows, columns=columns)

    return(nearby_venues)

In [9]:
# Fetch the venues around every district (each district code is printed as it is queried)
london_venues = getNearbyVenues(
    london_data['District'],
    london_data['Latitude'],
    london_data['Longitude'],
)

WC1A
WC1B
WC1E
WC1H
WC1N
WC1R
WC1V
WC1X
WC2A
WC2B
WC2E
WC2H
WC2N
WC2R
EC1A
EC1M
EC1N
EC1P
EC1R
EC1V
EC1Y
EC2A
EC2M
EC2N
EC2P
EC2R
EC2V
EC2Y
EC3A
EC3M
EC3N
EC3P
EC3R
EC3V
EC4A
EC4M
EC4N
EC4P
EC4R
EC4V
EC4Y
EC50


In [10]:
# Display the venues returned for the districts
london_venues

Unnamed: 0,District,District Latitude,District Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,WC1A,51.517165,-0.126811,Brompton Junction,51.514481,-0.122230,Bike Shop
1,WC1A,51.517165,-0.126811,B1866,51.513732,-0.126321,Bike Shop
2,WC1A,51.517165,-0.126811,Cloud 9 Cycles,51.520209,-0.131023,Bike Shop
3,WC1A,51.517165,-0.126811,Cycle Republic,51.517525,-0.124207,Bike Shop
4,WC1A,51.517165,-0.126811,Vanmoof,51.514126,-0.126456,Bike Shop
5,WC1A,51.517165,-0.126811,Specialized Concept Store,51.512979,-0.125918,Bike Shop
6,WC1B,51.519140,-0.127759,Cloud 9 Cycles,51.520209,-0.131023,Bike Shop
7,WC1B,51.519140,-0.127759,Cycle Republic,51.517525,-0.124207,Bike Shop
8,WC1E,51.522420,-0.133671,Cloud 9 Cycles,51.520209,-0.131023,Bike Shop
9,WC1E,51.522420,-0.133671,Central Bikes,51.520353,-0.136432,Motorcycle Shop


In [11]:
# Number of venue rows per district (count() reports the non-null entries of each column)
london_venues.groupby('District').count()

Unnamed: 0_level_0,District Latitude,District Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
District,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
EC1A,5,5,5,5,5,5
EC1M,3,3,3,3,3,3
EC1N,4,4,4,4,4,4
EC1P,1,1,1,1,1,1
EC1R,4,4,4,4,4,4
EC1V,2,2,2,2,2,2
EC1Y,3,3,3,3,3,3
EC2A,4,4,4,4,4,4
EC2M,4,4,4,4,4,4
EC2N,1,1,1,1,1,1


In [12]:
# One hot encode the venue category; the empty prefix and separator leave the
# bare category names as column names
venue_dummies = pd.get_dummies(london_venues[['Venue Category']], prefix="", prefix_sep="")

# Attach the district each venue row belongs to
venue_dummies['District'] = london_venues['District']

# Put District first, followed by the category columns in their existing order
category_cols = [name for name in venue_dummies.columns if name != 'District']
london_onehot = venue_dummies[['District'] + category_cols]
london_onehot.head()

Unnamed: 0,District,Bike Shop,Café,Juice Bar,Motorcycle Shop
0,WC1A,1,0,0,0
1,WC1A,1,0,0,0
2,WC1A,1,0,0,0
3,WC1A,1,0,0,0
4,WC1A,1,0,0,0


In [13]:
# Drop non Bike Shop related venues (these are incorrectly returned from Foursquare).
# Selecting the two wanted columns, rather than dropping a hard-coded list of
# unwanted ones, keeps this cell working whatever other categories the API
# happens to return, and makes it safe to re-run.
london_onehot = london_onehot[['District', 'Bike Shop']]
london_onehot

Unnamed: 0,District,Bike Shop
0,WC1A,1
1,WC1A,1
2,WC1A,1
3,WC1A,1
4,WC1A,1
5,WC1A,1
6,WC1B,1
7,WC1B,1
8,WC1E,1
9,WC1E,0


In [14]:
# Outer-merge with the full district list so that districts without any venue
# still get a row; the missing values this creates are then set to 0
london_onehot = (
    london_onehot
    .merge(london_data['District'], on='District', how='outer')
    .fillna(value=0)
)
london_onehot

Unnamed: 0,District,Bike Shop
0,WC1A,1.0
1,WC1A,1.0
2,WC1A,1.0
3,WC1A,1.0
4,WC1A,1.0
5,WC1A,1.0
6,WC1B,1.0
7,WC1B,1.0
8,WC1E,1.0
9,WC1E,0.0


In [15]:
# Prepare the data for clustering: one row per district holding the mean of
# its rows' indicator values
london_grouped = london_onehot.groupby('District', as_index=False).mean()
london_grouped

Unnamed: 0,District,Bike Shop
0,EC1A,1.0
1,EC1M,1.0
2,EC1N,1.0
3,EC1P,1.0
4,EC1R,1.0
5,EC1V,1.0
6,EC1Y,0.666667
7,EC2A,0.5
8,EC2M,1.0
9,EC2N,1.0


## Cluster Bike Shops
Run _k_-means to cluster the districts into 3 clusters.

In [16]:
# Set number of clusters
kclusters = 3

# Cluster on the feature columns only. The keyword form of drop is required on
# pandas >= 2.0 (the positional axis argument was removed), and errors='ignore'
# also discards a 'Cluster Labels' column left by an earlier run of the next
# cell, so re-running this cell never clusters on its own output.
london_grouped_clustering = london_grouped.drop(columns=['District', 'Cluster Labels'], errors='ignore')

# Run k-means clustering. n_init is pinned to 10 so the result does not depend
# on which scikit-learn version's default is in effect.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(london_grouped_clustering)

# Check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 0, 0, 0, 0, 0, 2, 2, 0, 0], dtype=int32)

In [17]:
# Add clustering labels as the second column. Labels left by an earlier run are
# removed first, because DataFrame.insert raises ValueError when the column
# already exists; this makes the cell safe to re-run.
london_grouped = london_grouped.drop(columns='Cluster Labels', errors='ignore')
london_grouped.insert(1, 'Cluster Labels', kmeans.labels_)
london_grouped

Unnamed: 0,District,Cluster Labels,Bike Shop
0,EC1A,0,1.0
1,EC1M,0,1.0
2,EC1N,0,1.0
3,EC1P,0,1.0
4,EC1R,0,1.0
5,EC1V,0,1.0
6,EC1Y,2,0.666667
7,EC2A,2,0.5
8,EC2M,0,1.0
9,EC2N,0,1.0


Create a new dataframe that combines the cluster labels with the original district data.

In [18]:
# Attach each district's cluster label and Bike Shop value to its row of
# london_data (which carries the neighbourhood names and latitude/longitude),
# discard any row that ends up with a missing value, and store the cluster
# label as an integer.
london_merged = (
    london_data
    .join(london_grouped.set_index('District'), on='District')
    .dropna()
    .astype({"Cluster Labels": int})
)

london_merged.head()

Unnamed: 0,District,Neighbourhood,Latitude,Longitude,Cluster Labels,Bike Shop
0,WC1A,New Oxford Street,51.517165,-0.126811,0,1.0
1,WC1B,"Bloomsbury, British Museum, Southampton Row",51.51914,-0.127759,0,1.0
2,WC1E,"University College London, SOAS",51.52242,-0.133671,2,0.666667
3,WC1H,"St Pancras, UCL Institute of Education",51.524755,-0.13139,1,0.0
4,WC1N,"Russell Square, Great Ormond Street",51.52446,-0.123583,0,1.0


## Map the resulting clusters

In [19]:
# Create map
map_clusters = folium.Map(location=[london_lat, london_long], zoom_start=13)

# Set color scheme for the clusters: one evenly spaced rainbow colour per
# cluster, converted to hex strings
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]

# Add markers to the map
for lat, lon, poi, cluster in zip(london_merged['Latitude'], london_merged['Longitude'], london_merged['District'], london_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # The cluster - 1 offset is kept so the colours match earlier renderings:
    # label 0 picks the last palette entry through negative indexing, so every
    # label in 0..kclusters-1 still gets its own colour.
    cluster_colour = rainbow[cluster - 1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_colour,
        fill=True,
        fill_color=cluster_colour,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

# Examine the clusters
## First Cluster

In [20]:
# Rows of london_merged whose cluster label is 0
london_merged.loc[london_merged['Cluster Labels'] == 0]

Unnamed: 0,District,Neighbourhood,Latitude,Longitude,Cluster Labels,Bike Shop
0,WC1A,New Oxford Street,51.517165,-0.126811,0,1.0
1,WC1B,"Bloomsbury, British Museum, Southampton Row",51.51914,-0.127759,0,1.0
4,WC1N,"Russell Square, Great Ormond Street",51.52446,-0.123583,0,1.0
5,WC1R,Gray's Inn,51.519375,-0.117611,0,1.0
6,WC1V,High Holborn,51.51866,-0.112101,0,1.0
7,WC1X,"Kings Cross, Finsbury, Clerkenwell",51.530185,-0.121485,0,1.0
8,WC2A,"Lincoln's Inn Fields, Royal Courts of Justice,...",51.516615,-0.116091,0,1.0
9,WC2B,"Drury Lane, Kingsway, Aldwych",51.5148,-0.121157,0,1.0
10,WC2E,Covent Garden,51.511185,-0.121374,0,1.0
11,WC2H,"Leicester Square, St. Giles",51.51396,-0.129275,0,1.0


## Second Cluster

In [21]:
# Rows of london_merged whose cluster label is 1
london_merged.loc[london_merged['Cluster Labels'] == 1]

Unnamed: 0,District,Neighbourhood,Latitude,Longitude,Cluster Labels,Bike Shop
3,WC1H,"St Pancras, UCL Institute of Education",51.524755,-0.13139,1,0.0
30,EC3N,"Tower Hill, Tower of London",51.508825,-0.076407,1,0.0


## Third Cluster

In [22]:
# Rows of london_merged whose cluster label is 2
london_merged.loc[london_merged['Cluster Labels'] == 2]

Unnamed: 0,District,Neighbourhood,Latitude,Longitude,Cluster Labels,Bike Shop
2,WC1E,"University College London, SOAS",51.52242,-0.133671,2,0.666667
20,EC1Y,"St Luke's, Bunhill Fields",51.52563,-0.086289,2,0.666667
21,EC2A,Shoreditch,51.524365,-0.079623,2,0.5
