## Installing Libraries

In [1]:
#!pip install pandas
#!pip install requests
#!pip install bs4
#!pip install plotly

## Importing the required libraries

In [2]:
# Pandas for data analysis
import pandas as pd
print('pandas imported...')

# K-means from ScikitLearn
from sklearn.cluster import KMeans
print('Kmeans imported...')

# Folium for map visualisation
import folium
print('Folium imported...')

# import geocoder to get latitude and longitude coordinates
import geocoder
print('Geocoder imported...')

# to convert an address into latitude and longitude values
from geopy.geocoders import Nominatim
print('Nominatim imported...')


# Requests and BeautifulSoup for web scraping
import requests
print('requests imported...')

from bs4 import BeautifulSoup
print('BeautifulSoup imported...')

pandas imported...
Kmeans imported...
Folium imported...
Geocoder imported...
Nominatim imported...
requests imported...
BeautifulSoup imported...


---
## Preparing the Data

The Greater London Area will be covered in this project. The data will be collected and organised from the Wikipedia <code>List of areas of London</code> page, which can be found at:
https://en.wikipedia.org/wiki/List_of_areas_of_London

### Web scraping
We start by scraping the Wikipedia page. 

In [3]:
# Wikipedia page that lists the areas of London; the tables are read from this address
url = 'https://en.wikipedia.org/wiki/List_of_areas_of_London'

The data is already organised in a tabular format, so it can be read directly with pandas.

In [4]:
# read_html downloads the page and returns a list with one DataFrame per HTML table it finds
dfs = pd.read_html(url)

The necessary dataframe can now be extracted. 

In [5]:
# Pick the table of London areas, taken here as the second table on the page (index 1).
# NOTE(review): the position is hard-coded — confirm it still points at the areas
# table if the layout of the Wikipedia page changes.
london_df = dfs[1]

# checking the data
london_df.head()

Unnamed: 0,Location,London borough,Post town,Postcode district,Dial code,OS grid ref
0,Abbey Wood,"Bexley, Greenwich [7]",LONDON,SE2,20,TQ465785
1,Acton,"Ealing, Hammersmith and Fulham[8]",LONDON,"W3, W4",20,TQ205805
2,Addington,Croydon[8],CROYDON,CR0,20,TQ375645
3,Addiscombe,Croydon[8],CROYDON,CR0,20,TQ345665
4,Albany Park,Bexley,"BEXLEY, SIDCUP","DA5, DA14",20,TQ478728


It is now possible to start cleaning the data. First, the columns are renamed.

In [6]:
# Short, code-friendly labels, listed in the same order as the scraped columns
short_column_names = [
    'Neighborhood',
    'Borough',
    'Town',
    'Postcode',
    'Dialcode',
    'OSgridRef',
]
london_df.columns = short_column_names
london_df.head()

Unnamed: 0,Neighborhood,Borough,Town,Postcode,Dialcode,OSgridRef
0,Abbey Wood,"Bexley, Greenwich [7]",LONDON,SE2,20,TQ465785
1,Acton,"Ealing, Hammersmith and Fulham[8]",LONDON,"W3, W4",20,TQ205805
2,Addington,Croydon[8],CROYDON,CR0,20,TQ375645
3,Addiscombe,Croydon[8],CROYDON,CR0,20,TQ345665
4,Albany Park,Bexley,"BEXLEY, SIDCUP","DA5, DA14",20,TQ478728


Then the unnecessary information (the bracketed footnote markers) is removed from the <code>Borough</code> column.

In [7]:
# Strip Wikipedia footnote markers such as "[7]" or "[8]" from the borough names.
# One explicit regular expression replaces the former chain of literal replacements, which
#   * relied on the default value of `regex`, which is not the same in every pandas
#     version ("[" and "]" are regex metacharacters),
#   * deleted every digit anywhere in the name, not only digits inside brackets, and
#   * left a trailing space behind for entries written like "Greenwich [7]".
# The pattern removes a bracketed marker together with any whitespace in front of it;
# the final strip() guards against leading/trailing spaces in the scraped text.
london_df['Borough'] = (
    london_df['Borough']
    .str.replace(r'\s*\[[^\]]*\]', '', regex=True)
    .str.strip()
)

  london_df['Borough'] = london_df['Borough'].str.replace("[","")
  london_df['Borough'] = london_df['Borough'].str.replace("]","")


In [8]:
# check that the unnecessary information has been removed from the data as required
london_df.head()

Unnamed: 0,Neighborhood,Borough,Town,Postcode,Dialcode,OSgridRef
0,Abbey Wood,"Bexley, Greenwich",LONDON,SE2,20,TQ465785
1,Acton,"Ealing, Hammersmith and Fulham",LONDON,"W3, W4",20,TQ205805
2,Addington,Croydon,CROYDON,CR0,20,TQ375645
3,Addiscombe,Croydon,CROYDON,CR0,20,TQ345665
4,Albany Park,Bexley,"BEXLEY, SIDCUP","DA5, DA14",20,TQ478728


In [9]:
# examine the dimensions of the data as (rows, columns)
london_df.shape

(531, 6)

Only the <code>Neighborhood</code> (originally <code>Location</code>), <code>Borough</code>, <code>Town</code> and <code>Postcode</code> columns are required for the analysis, so <code>Dialcode</code> and <code>OSgridRef</code> are removed.

In [10]:
# Keep only the columns used in the analysis and renumber the rows from 0
columns_to_keep = ['Neighborhood', 'Borough', 'Town', 'Postcode']
london_df = london_df.loc[:, columns_to_keep].reset_index(drop=True)
london_df.head()

Unnamed: 0,Neighborhood,Borough,Town,Postcode
0,Abbey Wood,"Bexley, Greenwich",LONDON,SE2
1,Acton,"Ealing, Hammersmith and Fulham",LONDON,"W3, W4"
2,Addington,Croydon,CROYDON,CR0
3,Addiscombe,Croydon,CROYDON,CR0
4,Albany Park,Bexley,"BEXLEY, SIDCUP","DA5, DA14"


In [11]:
# examine the dimensions of the data after dropping the unused columns
london_df.shape

(531, 4)

Rows with multiple <code>Postcode</code> values, such as <code>Acton W3, W4</code>, are split into multiple rows with the same values for the remaining columns.

In [12]:
# Turn each comma-separated Postcode entry into a list, then give every list
# element a row of its own; the other columns are repeated on each new row.
postcode_lists = london_df['Postcode'].str.split(',')
london_df = (
    london_df.assign(Postcode=postcode_lists)
    .explode('Postcode')
    .reset_index(drop=True)
)

london_df.head()

Unnamed: 0,Neighborhood,Borough,Town,Postcode
0,Abbey Wood,"Bexley, Greenwich",LONDON,SE2
1,Acton,"Ealing, Hammersmith and Fulham",LONDON,W3
2,Acton,"Ealing, Hammersmith and Fulham",LONDON,W4
3,Addington,Croydon,CROYDON,CR0
4,Addiscombe,Croydon,CROYDON,CR0


In [13]:
# examine the dimensions of the data after splitting the multi-postcode rows
london_df.shape

(636, 4)

Remove white space from <code>Postcode</code> and convert <code>Town</code> to title case

In [14]:
# Tidy the text columns: trim surrounding white space from the postcodes and
# switch the town names to title case (e.g. "LONDON" -> "London").
london_df = london_df.assign(
    Postcode=london_df['Postcode'].str.strip(),
    Town=london_df['Town'].str.title(),
)
london_df.head()

Unnamed: 0,Neighborhood,Borough,Town,Postcode
0,Abbey Wood,"Bexley, Greenwich",London,SE2
1,Acton,"Ealing, Hammersmith and Fulham",London,W3
2,Acton,"Ealing, Hammersmith and Fulham",London,W4
3,Addington,Croydon,Croydon,CR0
4,Addiscombe,Croydon,Croydon,CR0


A subset is taken to include only postcodes with London as <code>Town</code>.

In [15]:
# Keep only the rows whose post town is London and renumber the rows from 0
is_london = london_df['Town'] == 'London'
london_df = london_df.loc[is_london].reset_index(drop=True)
# look at the first rows of the filtered data
london_df.head()

Unnamed: 0,Neighborhood,Borough,Town,Postcode
0,Abbey Wood,"Bexley, Greenwich",London,SE2
1,Acton,"Ealing, Hammersmith and Fulham",London,W3
2,Acton,"Ealing, Hammersmith and Fulham",London,W4
3,Aldgate,City,London,EC3
4,Aldwych,Westminster,London,WC2


In [16]:
# look at the last rows of the filtered data
london_df.tail()

Unnamed: 0,Neighborhood,Borough,Town,Postcode
350,Woodford,Redbridge,London,IG8
351,Woodford,Redbridge,London,E18
352,Woodside Park,Barnet,London,N12
353,Woolwich,Greenwich,London,SE18
354,Wormwood Scrubs,Hammersmith and Fulham,London,W12


In [17]:
# examine the dimensions of the data after keeping only the London rows
london_df.shape

(355, 4)

The coordinates for the different postcodes are collected using <code>geocoder</code> and <code>ArcGIS</code>

In [18]:
#Function to get coordinates
def get_coord(arcgis_geocoder, max_retries=5, pause=1.0, geocode=None):
    """Look up the ``[latitude, longitude]`` pair for a London postcode.

    The text sent to the geocoder is ``'<arcgis_geocoder>, London, United Kingdom'``.
    The lookup is retried a bounded number of times with a pause in between, so a
    postcode that cannot be resolved (or a service that stops answering) no longer
    blocks the notebook in an endless loop of back-to-back requests.

    Parameters
    ----------
    arcgis_geocoder : str
        The postcode (or other place text) to look up. The parameter name is kept
        unchanged so existing callers keep working.
    max_retries : int, optional
        Maximum number of lookups attempted before giving up (at least one lookup
        is always made). Default is 5.
    pause : float, optional
        Seconds to wait between two failed attempts. Default is 1.0.
    geocode : callable, optional
        Function that takes the query string and returns an object exposing a
        ``latlng`` attribute. Defaults to ``geocoder.arcgis``.

    Returns
    -------
    list
        The ``latlng`` value reported by the geocoder, or ``[None, None]`` when no
        attempt produced coordinates. The placeholder keeps the result usable as a
        two-column DataFrame row, where it shows up as missing values.
    """
    import time  # local import: only needed for the pause between retries

    if geocode is None:
        geocode = geocoder.arcgis

    query = '{}, London, United Kingdom'.format(arcgis_geocoder)
    attempts = max(1, int(max_retries))

    for attempt in range(attempts):
        lat_lng_coords = geocode(query).latlng
        # Both None and an empty result mean "no coordinates this time"
        if lat_lng_coords:
            return lat_lng_coords
        # Wait before asking again, but not after the final attempt
        if attempt < attempts - 1:
            time.sleep(pause)

    return [None, None]

Getting the coordinates and preparing the <code>geo_tag_coord</code> dataframe.

In [19]:
postal_codes = london_df['Postcode']

# Many rows share a postcode, so each distinct postcode is geocoded only once and the
# answer is reused for every row carrying it. This keeps the number of requests sent
# to the geocoding service as low as possible.
postcode_list = postal_codes.tolist()
coords_by_postcode = {
    postal_code: get_coord(postal_code)
    for postal_code in dict.fromkeys(postcode_list)  # distinct values, first-seen order
}

# One [lat, lng] entry per row, in the same order as london_df
coordinates = [coords_by_postcode[postal_code] for postal_code in postcode_list]

In [20]:
# One row per entry of `coordinates`, split into a latitude and a longitude column
coordinate_columns = ['Latitude', 'Longitude']
geo_tag_coord = pd.DataFrame(coordinates, columns=coordinate_columns)
geo_tag_coord.head()

Unnamed: 0,Latitude,Longitude
0,51.49245,0.12127
1,51.51324,-0.26746
2,51.48944,-0.26194
3,51.512,-0.08058
4,51.51651,-0.11968


Adding Latitude and Longitude to <code>london_df</code>

In [21]:
# Attach the coordinates to the neighbourhood rows; values are matched on the row index
london_df = london_df.assign(
    Latitude=geo_tag_coord['Latitude'],
    Longitude=geo_tag_coord['Longitude'],
)
london_df.head()

Unnamed: 0,Neighborhood,Borough,Town,Postcode,Latitude,Longitude
0,Abbey Wood,"Bexley, Greenwich",London,SE2,51.49245,0.12127
1,Acton,"Ealing, Hammersmith and Fulham",London,W3,51.51324,-0.26746
2,Acton,"Ealing, Hammersmith and Fulham",London,W4,51.48944,-0.26194
3,Aldgate,City,London,EC3,51.512,-0.08058
4,Aldwych,Westminster,London,WC2,51.51651,-0.11968


The geocoder limits the number of calls, so the coordinates may not be fully coded.

In [23]:
# renumber the rows from 0, discarding the previous index
london_df = london_df.reset_index(drop=True)
london_df.head()

Unnamed: 0,Neighborhood,Borough,Town,Postcode,Latitude,Longitude
350,Woodford,Redbridge,London,IG8,51.50642,-0.12721
351,Woodford,Redbridge,London,E18,51.58977,0.03052
352,Woodside Park,Barnet,London,N12,51.61592,-0.17674
353,Woolwich,Greenwich,London,SE18,51.48207,0.07143
354,Wormwood Scrubs,Hammersmith and Fulham,London,W12,51.50645,-0.23691


In [24]:
# True if at least one row has no latitude, i.e. a postcode was not geocoded
london_df['Latitude'].isnull().values.any()

False

In [25]:
# number of rows that have no latitude value
london_df['Latitude'].isnull().values.sum()

0

In [27]:
# Save the geocoded dataframe to a CSV file in the current working directory.
# NOTE(review): to_csv writes the row index as an extra unnamed first column by
# default — confirm that this is wanted when the file is read back.

london_df.to_csv('london_df.csv')

Alternatively, the required data can be found [at the following link](https://www.freemaptools.com/download-uk-postcode-lat-lng.htm)

In [None]:
# the csv is downloaded and imported
# geo_tag = pd.read_csv('ukpostcodes.csv')

## Use geopy library to get the latitude and longitude values of London.

In [28]:
# Resolve the centre of London with Nominatim; the coordinates are used to centre the map
address = 'London, UK'

geolocator = Nominatim(user_agent="london_explorer")
location = geolocator.geocode(address)
# geocode() gives back None when the address cannot be resolved; stop with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of London are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of London are 51.5073219, -0.1276474.


Create a map of London with neighborhoods superimposed on top.

In [29]:
# Base map centred on the London coordinates found above
map_london = folium.Map(location=[latitude, longitude], zoom_start=10)

# Draw one circle marker per row, with a popup reading "<neighborhood>, <borough>"
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighborhood']
for lat, lng, borough, neighborhood in london_df[marker_columns].itertuples(index=False, name=None):
    popup_text = '{}, {}'.format(neighborhood, borough)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_london)

# Last expression of the cell: renders the map in the notebook
map_london

Define Foursquare credentials

In [None]:
# Foursquare API credentials.
# Secrets must not be stored in the notebook: each value is read from an environment
# variable and, if that variable is not set, requested interactively without echo.
# NOTE(review): credentials that were previously committed in this cell should be
# treated as exposed and regenerated in the Foursquare developer console.
import os
from getpass import getpass

CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID') or getpass('Foursquare client ID: ') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET') or getpass('Foursquare client secret: ') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
# Only confirm that a secret is present; its value must never end up in the saved output
print('CLIENT_SECRET:' + '*' * len(CLIENT_SECRET))