In [1]:
import pandas as pd
import requests 

Get the list of London boroughs from Wikipedia

In [2]:
#Interrogate the target url
website_url = 'https://en.wikipedia.org/wiki/London_boroughs'
result = requests.get(website_url).text

#Define the class of the elment we're looking for [wikipedia table]
class_to_search = 'wikitable sortable'

#Retrieve all the tables in the document. Return an exception if no table is found.
tables = pd.read_html(website_url, attrs = {'class': class_to_search})
if not len(tables) > 0:
    raise Exception('The source page contains no tables')

#Get the first table (there is only one in the source page)
df = tables[0]

#Keep the borough column only
df = df[['London borough']]

#Clean the data
pd.set_option('mode.chained_assignment', None) #Switches off annoying warning
replacements = {'Hammersmith[notes 2]' : 'Hammersmith and Fulham', 
                'Barking[notes 3]' : 'Barking'}
for key in replacements.keys():
    df.replace(to_replace = key, value = replacements[key], inplace = True)
df


Unnamed: 0,London borough
0,Camden
1,Greenwich
2,Hackney
3,Hammersmith and Fulham
4,Islington
5,Kensington and Chelsea
6,Lambeth
7,Lewisham
8,Southwark
9,Tower Hamlets


Get the boroughs coordinates through GeoPy and add them to the dataframe

In [3]:
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent = "courser_capstone_battle_of_neighbourhoods_fb")
boroughs = df['London borough'].tolist()

#Get the locations
locations = list()
for borough in boroughs:
    locations.append(geolocator.geocode(borough + ', London'))

In [4]:
#Add geolocation to the dataframe
df['Latitude'] = [location.longitude for location in locations]
df['Longitude'] = [location.longitude for location in locations]
df

Unnamed: 0,London borough,Latitude,Longitude
0,Camden,-0.13956,-0.13956
1,Greenwich,-0.004542,-0.004542
2,Hackney,-0.049362,-0.049362
3,Hammersmith and Fulham,-0.22364,-0.22364
4,Islington,-0.099905,-0.099905
5,Kensington and Chelsea,-0.199123,-0.199123
6,Lambeth,-0.117287,-0.117287
7,Lewisham,-0.010133,-0.010133
8,Southwark,-0.103458,-0.103458
9,Tower Hamlets,-0.033585,-0.033585


Get average income data and add them to the dataframe. Data are referred to year 2018; values are in £/week. Source: https://data.london.gov.uk/dataset/earnings-place-residence-borough

In [5]:
#Get the data from the repo
source = 'https://raw.githubusercontent.com/biancovic/Coursera_Capstone/master/BattleOfNeighbourhoods/data/London_weekly_income_2018_by_borough.csv'
df_income = pd.read_csv(source)

#Clean the income data
pd.set_option('mode.chained_assignment', None) #Switches off annoying warning
replacements = {'Barking and Dagenham' : 'Barking'}
for key in replacements.keys():
    df_income.replace(to_replace = key, value = replacements[key], inplace = True)

#Merge with the original dataset
df = pd.merge(df, df_income, how = 'inner', on = ['London borough'])
df

Unnamed: 0,London borough,Latitude,Longitude,Income
0,Camden,-0.13956,-0.13956,634.7
1,Greenwich,-0.004542,-0.004542,573.7
2,Hackney,-0.049362,-0.049362,555.6
3,Hammersmith and Fulham,-0.22364,-0.22364,681.3
4,Islington,-0.099905,-0.099905,687.6
5,Kensington and Chelsea,-0.199123,-0.199123,669.3
6,Lambeth,-0.117287,-0.117287,620.4
7,Lewisham,-0.010133,-0.010133,551.4
8,Southwark,-0.103458,-0.103458,589.4
9,Tower Hamlets,-0.033585,-0.033585,627.9
