# Capstone Project – The Battle of Neighborhoods (Week 2)

## For problem description, data sources and methodology please visit:
- https://github.com/biancovic/Coursera_Capstone/blob/master/BattleOfNeighbourhoods/doc/week-1-intro.pdf
- https://github.com/biancovic/Coursera_Capstone/blob/master/BattleOfNeighbourhoods/doc/week-1-data.pdf

## Part 1: scrap the web for information about London boroughs


Get the list of London boroughs from Wikipedia

In [2]:
import pandas as pd
import requests 

#Interrogate the target url
website_url = 'https://en.wikipedia.org/wiki/London_boroughs'
result = requests.get(website_url).text

#Define the class of the element we are looking for [wikipedia table]
class_to_search = 'wikitable sortable'

#Retrieve all the tables in the document. Return an exception if no table is found.
tables = pd.read_html(website_url, attrs = {'class': class_to_search})
if not len(tables) > 0:
    raise Exception('The source page contains no tables')

#Get the first table (there is only one in the source page)
df = tables[0]

#Keep the borough column only
df = df[['London borough']]

#Clean the data
pd.set_option('mode.chained_assignment', None) #Switches off annoying warning
replacements = {'Hammersmith[notes 2]' : 'Hammersmith and Fulham', 
                'Barking[notes 3]' : 'Barking'}
for key in replacements.keys():
    df.replace(to_replace = key, value = replacements[key], inplace = True)
df


Unnamed: 0,London borough
0,Camden
1,Greenwich
2,Hackney
3,Hammersmith and Fulham
4,Islington
5,Kensington and Chelsea
6,Lambeth
7,Lewisham
8,Southwark
9,Tower Hamlets


Get the boroughs coordinates through GeoPy and add them to the dataframe

In [3]:
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent = "courser_capstone_battle_of_neighbourhoods_fb")
boroughs = df['London borough'].tolist()

#Get the locations
locations = list()
for borough in boroughs:
    locations.append(geolocator.geocode(borough + ', London'))

In [4]:
#Add geolocation to the dataframe
df['Latitude'] = [location.latitude for location in locations]
df['Longitude'] = [location.longitude for location in locations]

Get average income data and add them to the dataframe. Data are referred to year 2018; values are in £/week. Source: https://data.london.gov.uk/dataset/earnings-place-residence-borough

In [5]:
#Get the data from the repo
source = 'https://raw.githubusercontent.com/biancovic/Coursera_Capstone/master/BattleOfNeighbourhoods/data/London_weekly_income_2018_by_borough.csv'
df_income = pd.read_csv(source)

#Clean the income data
pd.set_option('mode.chained_assignment', None) #Switches off annoying warning
replacements = {'Barking and Dagenham' : 'Barking'}
for key in replacements.keys():
    df_income.replace(to_replace = key, value = replacements[key], inplace = True)

#Merge with the original dataset
df = pd.merge(df, df_income, how = 'inner', on = ['London borough'])

Get the population by borough. Data are referred to year 2018; values are in units. Source: https://www.citypopulation.de/en/uk/greaterlondon/

In [6]:
#Get the data from the repo
source = 'https://raw.githubusercontent.com/biancovic/Coursera_Capstone/master/BattleOfNeighbourhoods/data/London_population_2018_by_borough.csv'
df_population = pd.read_csv(source)

#Merge with the original dataset
df = pd.merge(df, df_population, how = 'inner', on = ['London borough'])

In [7]:
#Show the dataframe sorted by borough
df.sort_values(by = 'London borough', axis = 0, ascending = True)

Unnamed: 0,London borough,Latitude,Longitude,Income,Population
11,Barking,51.538992,0.080424,479.1,211998
12,Barnet,51.65309,-0.200226,536.6,392140
13,Bexley,51.441679,0.150488,513.8,247258
14,Brent,51.563826,-0.27576,480.0,330795
15,Bromley,51.402805,0.014814,632.5,331096
0,Camden,51.542305,-0.13956,634.7,262226
16,Croydon,51.371305,-0.101957,552.0,385346
17,Ealing,51.512655,-0.305195,523.0,341982
18,Enfield,51.652085,-0.081018,479.1,333869
1,Greenwich,51.482084,-0.004542,573.7,286186


## Part 2: use Foursquare API to get the points of interest around the centre of each borough

For each borough get the number of each of the following facilities within one mile from the centre:
- Italian restaurants;
- Cinemas;
- Theatres;
- Universities.

In [8]:
#Set the credentials for Foursquare (change with your own settings here)
CLIENT_ID = 'DGYMLBVGJRPOYU41OYW0ZOPSOS5BF5NXKQ1OUQZGP50IE0LV' # your Foursquare ID
CLIENT_SECRET = 'EPYV4BBYSUJAI4TRM0D1I3QOSYMRAXIJJMEJGEBE5WLSGMV3' # your Foursquare Secret

In [9]:
#*******************************************************
#********* This cell is for debug purpose only ********* 
#*******************************************************
# latitude = df.loc[0, 'Latitude']
# longitude = df.loc[0, 'Longitude']
# latitude, longitude
# #url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&query={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, row['Latitude'], row['Longitude'], VERSION, search_query, radius, LIMIT)
# endpoint = 'search'
# query = 'University'
# radius = 1.0 * 1609.34
# limit = 50
# version = '20200616'
# url = 'https://api.foursquare.com/v2/venues/{}?client_id={}&client_secret={}&ll={},{}&query={}&radius={}&v={}&limit={}'.format(endpoint, CLIENT_ID, CLIENT_SECRET, latitude, longitude, query, radius, version, limit)
# url

# results = requests.get(url).json()
# len(results['response']['venues'])

In [10]:
#List of amenities/places to look for
search_queries = ['Italian Restaurant', 'Theatre', 'Cinema', 'University']

#Add the corresponding empty columns to the dataframe
df = pd.concat([df, pd.DataFrame(columns = search_queries)], sort = True)

#Get the number of amenities/places within 1.0 miles from the center of each borough
radius = 1.0 * 1609.34

#Number of results to return
limit = 50

#Foursquare version
version = 20200617

#Update the dataframe
for i, row in df.iterrows():
    for search_query in search_queries:
        url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&query={}&radius={}&v={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, row['Latitude'], row['Longitude'], search_query, radius, version, limit)
        results = requests.get(url).json()
        if 'venues' in results['response'].keys():
            value = len(results['response']['venues'])
        else:
            value = 0
        df.loc[i, search_query] = value

df.head()

Unnamed: 0,Cinema,Income,Italian Restaurant,Latitude,London borough,Longitude,Population,Theatre,University
0,5,634.7,50,51.542305,Camden,-0.13956,262226.0,25,10
1,3,573.7,29,51.482084,Greenwich,-0.004542,286186.0,4,25
2,4,555.6,34,51.54324,Hackney,-0.049362,279665.0,4,5
3,4,681.3,0,51.492038,Hammersmith and Fulham,-0.22364,185426.0,10,1
4,9,687.6,50,51.538429,Islington,-0.099905,239142.0,28,50


## Part 3: rank the boroughs from the most to the least suitable

In [11]:
#Create a new dataframe to store the ranks
df_ranks = pd.DataFrame()
df_ranks['London borough'] = df['London borough']

#Select the features to rank and the way they are ranked (True = the higher the better, 
#False = the lower the better)
features_to_rank = [('Population', True), 
                    ('Income', True), 
                    ('Italian Restaurant', False),
                    ('Cinema', True), 
                    ('Theatre', True),
                    ('University', True)]

#Compute the rankings by feature
for feature in features_to_rank:
    column_name = feature[0] + '_rnk'
    df_ranks[column_name] = df[feature[0]].rank(ascending = feature[1])
    
#Add the overall rank as the average of the by-feature ranks
cols = list(df_ranks.columns)
cols.remove('London borough')
df_ranks['Overall_rnk'] = (df_ranks.loc[: , cols]).mean(axis = 1)

#Show the top five boroughs
df_ranks.sort_values(by = 'Overall_rnk', ascending = False, inplace = True)
df_ranks.head()
    

Unnamed: 0,London borough,Population_rnk,Income_rnk,Italian Restaurant_rnk,Cinema_rnk,Theatre_rnk,University_rnk,Overall_rnk
6,Lambeth,23.0,22.0,3.5,31.0,30.5,28.0,23.0
8,Southwark,21.0,20.0,3.5,30.0,30.5,27.0,22.0
4,Islington,8.0,30.0,3.5,29.0,29.0,31.0,21.75
9,Tower Hamlets,22.0,24.0,8.0,27.0,26.0,23.0,21.666667
3,Hammersmith and Fulham,3.0,29.0,31.0,21.5,24.0,13.5,20.333333


# Part 4: show the results on a map

In [12]:
import folium
from folium.features import DivIcon
from matplotlib import cm
from matplotlib.colors import to_hex

#Create an empty map of London
london_coordinates = (51.507222, -0.1275)
zoom = 12
m = folium.Map(location = london_coordinates,
               zoom_start = zoom,
               prefer_canvas = True)

#Define the colormap
colormap = cm.get_cmap('coolwarm')


#Add the markers to the map
standing = 1
for i, row in df_ranks.iterrows():
    index_in_df = df[df['London borough'] == row['London borough']].index.tolist()[0]
    fill_colour = to_hex(colormap((standing - 1)/df_ranks.shape[0]))
    coordinates = [df.loc[index_in_df, 'Latitude'], df.loc[index_in_df, 'Longitude']]
    
    #Add a circle to mark the center of each neigbourhood
    folium.CircleMarker(
         location = coordinates,
         radius = 15,
         color = 'Black',
         weight = 1.0,   
         fill = True,
         fill_color = fill_colour,
         fill_opacity = 0.8,
    ).add_to(m)
    
    #Write the standing with the circle mark
    folium.Marker(
        location = coordinates,
        popup = row['London borough'],
        icon = DivIcon(
            icon_size=(50,50),
            icon_anchor=(12,14),
            html='<div style="font-size: 16pt">%s</div>' % '{:02d}'.format(standing),
        )
   ).add_to(m) 
    standing = standing + 1
    
m.save('output/results.html')
m