In [1]:
import numpy as np 

import pandas as pd 
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import json 
from geopy.geocoders import Nominatim 

import geocoder 

import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML and XML documents

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

print("Libraries imported.")

Libraries imported.


# 2. Scrap data from Wikipedia page into a DataFrame

In [2]:
import re
# send the GET request
data = requests.get('https://en.wikipedia.org/wiki/Boroughs_of_Amsterdam').text
# parse data from the html into a beautifulsoup object
soup = BeautifulSoup(data, 'html.parser')
# create a list to store neighborhood data
neighborhoodList = list()
# parse the data from the page
table_res = soup.find_all("table", class_=re.compile(r".*wikitable.*"))
tbody_res = table_res[0].find_all('tbody')
nelem_res = tbody_res[0].find_all('a', lambda c_: not c_)
# append the data into the list
for nelem in nelem_res:
    neighborhoodList.append(nelem.getText())

# create a new DataFrame from the list
df = pd.DataFrame({"Neighborhood": neighborhoodList})
df.head()

Unnamed: 0,Neighborhood
0,Centrum
1,Noord
2,Nieuw-West
3,Oost
4,West


In [3]:
df.shape

(7, 1)

In [4]:
address = 'amsterdam , netherlands'

geolocator = Nominatim(user_agent="can_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Amsterdam are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Amsterdam are 52.3727598, 4.8936041.


# 3. Get the geographical coordinates for each neighborhood

In [5]:
# define a function to get coordinates
def get_latlng(neighborhood):
    # initialize your variable to None
    lat_lng_coords = None
    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, amsterdam , netherlands'.format(neighborhood))
        lat_lng_coords = g.latlng
    return lat_lng_coords

In [6]:
coords = [ get_latlng(neighborhood) for neighborhood in df["Neighborhood"].tolist() ]

In [7]:
coords

[[52.375950000000046, 4.89915000000002],
 [52.38150092518048, 4.916137875513465],
 [52.37241593355806, 4.900627489953397],
 [52.36013000000003, 4.925320000000056],
 [52.37280000000004, 4.889760000000024],
 [52.37638000000004, 4.937500000000057],
 [52.31619000000006, 4.951370000000054]]

In [8]:
# create temporary dataframe to populate the coordinates into Latitude and Longitude
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])

In [9]:
# merge the coordinates into the original dataframe
df['Latitude'] = df_coords['Latitude']
df['Longitude'] = df_coords['Longitude']

In [10]:
df

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Centrum,52.37595,4.89915
1,Noord,52.381501,4.916138
2,Nieuw-West,52.372416,4.900627
3,Oost,52.36013,4.92532
4,West,52.3728,4.88976
5,Zuid,52.37638,4.9375
6,Zuidoost,52.31619,4.95137


In [11]:
df.shape

(7, 3)

# 4. Create a map of Amsterdam with neighborhoods

In [12]:
# create map of amsterdam using latitude and longitude values
map_da = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, neighborhood in zip(df['Latitude'],df['Longitude'], df['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='orange',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_da)  
    
map_da

# 5. Use the Foursquare API to explore the neighborhoods

In [13]:
# Define Foursquare Credentials and Version
LIMIT = 100

CLIENT_ID = '4510O2EFHUUWKW4WLHQJT2BUYYKD10YZ53DSL1XLQH2IIZES' # your Foursquare ID
CLIENT_SECRET = 'RTMAUAZW4Y0XDJA4PAUAHH32T5D5EHKWVT3VHTB0KG14M22O' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 4510O2EFHUUWKW4WLHQJT2BUYYKD10YZ53DSL1XLQH2IIZES
CLIENT_SECRET:RTMAUAZW4Y0XDJA4PAUAHH32T5D5EHKWVT3VHTB0KG14M22O


 *Let's get the top 20 venues that are within a radius of 500 meters.*

In [14]:
radius = 500
LIMIT = 20

venues = []

for lat, long, neighborhood in zip(df['Latitude'], df['Longitude'], df['Neighborhood']):
    
    # create the API request URL
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    
    # make the GET request
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    # return only relevant information for each nearby venue
    for venue in results:
        venues.append((
            neighborhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))

In [18]:
# convert the venues list into a new DataFrame
venues_df = pd.DataFrame(venues)

# define the column names
venues_df.columns = ['Neighborhood', 'Latitude', 'Longitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']

print(venues_df.shape)
venues_df.head()


#Creating a csv file of the data 
venues_df.to_csv(r'amsterdam_data.csv', index=False)

(140, 7)


In [19]:
venues_df.groupby(["Neighborhood"]).count()

Unnamed: 0_level_0,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Centrum,20,20,20,20,20,20
Nieuw-West,20,20,20,20,20,20
Noord,20,20,20,20,20,20
Oost,20,20,20,20,20,20
West,20,20,20,20,20,20
Zuid,20,20,20,20,20,20
Zuidoost,20,20,20,20,20,20


*Lets see how many unique categories are there*

In [20]:
print('There are {} uniques categories.'.format(len(venues_df['VenueCategory'].unique())))

There are 73 uniques categories.


In [21]:
# print out the list of categories
venues_df['VenueCategory'].unique()[:20]

array(['Breakfast Spot', 'Coffee Shop', 'French Restaurant', 'Brewery',
       'Burger Joint', 'Pub', 'Church', 'Marijuana Dispensary', 'Museum',
       'Cheese Shop', 'Music Venue', 'Gay Bar', 'Bar', 'Hotel',
       'Chocolate Shop', 'Bakery', 'Whisky Bar', 'Smoke Shop',
       'Scenic Lookout', 'Supermarket'], dtype=object)

# 6. Analyze Each Neighborhood

In [22]:
# one hot encoding
kl_onehot = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
kl_onehot['Neighborhoods'] = venues_df['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [kl_onehot.columns[-1]] + list(kl_onehot.columns[:-1])
kl_onehot = kl_onehot[fixed_columns]

print(kl_onehot.shape)
kl_onehot.head()


(140, 74)


Unnamed: 0,Neighborhoods,Art Gallery,Arts & Crafts Store,Asian Restaurant,BBQ Joint,Bakery,Bar,Beer Bar,Beer Garden,Beer Store,Bookstore,Breakfast Spot,Brewery,Burger Joint,Butcher,Café,Caribbean Restaurant,Cheese Shop,Chinese Restaurant,Chocolate Shop,Church,Clothing Store,Cocktail Bar,Coffee Shop,Comfort Food Restaurant,Concert Hall,Deli / Bodega,Department Store,Dessert Shop,Drugstore,Falafel Restaurant,Fish Market,French Restaurant,Friterie,Furniture / Home Store,Gastropub,Gay Bar,Gift Shop,Grocery Store,Gym,Gymnastics Gym,Hostel,Hotel,Hotel Bar,Indonesian Restaurant,Italian Restaurant,Jazz Club,Lounge,Marijuana Dispensary,Market,Museum,Music Venue,North Indian Restaurant,Organic Grocery,Palace,Park,Plaza,Pub,Restaurant,Salad Place,Sandwich Place,Scenic Lookout,Seafood Restaurant,Shopping Mall,Smoke Shop,South American Restaurant,Spanish Restaurant,Spiritual Center,Supermarket,Swiss Restaurant,Tea Room,Thai Restaurant,Turkish Restaurant,Whisky Bar
0,Centrum,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Centrum,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Centrum,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Centrum,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Centrum,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


*Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category*

In [23]:
kl_grouped = kl_onehot.groupby(["Neighborhoods"]).mean().reset_index()

print(kl_grouped.shape)
kl_grouped

(7, 74)


Unnamed: 0,Neighborhoods,Art Gallery,Arts & Crafts Store,Asian Restaurant,BBQ Joint,Bakery,Bar,Beer Bar,Beer Garden,Beer Store,Bookstore,Breakfast Spot,Brewery,Burger Joint,Butcher,Café,Caribbean Restaurant,Cheese Shop,Chinese Restaurant,Chocolate Shop,Church,Clothing Store,Cocktail Bar,Coffee Shop,Comfort Food Restaurant,Concert Hall,Deli / Bodega,Department Store,Dessert Shop,Drugstore,Falafel Restaurant,Fish Market,French Restaurant,Friterie,Furniture / Home Store,Gastropub,Gay Bar,Gift Shop,Grocery Store,Gym,Gymnastics Gym,Hostel,Hotel,Hotel Bar,Indonesian Restaurant,Italian Restaurant,Jazz Club,Lounge,Marijuana Dispensary,Market,Museum,Music Venue,North Indian Restaurant,Organic Grocery,Palace,Park,Plaza,Pub,Restaurant,Salad Place,Sandwich Place,Scenic Lookout,Seafood Restaurant,Shopping Mall,Smoke Shop,South American Restaurant,Spanish Restaurant,Spiritual Center,Supermarket,Swiss Restaurant,Tea Room,Thai Restaurant,Turkish Restaurant,Whisky Bar
0,Centrum,0.0,0.0,0.0,0.0,0.05,0.05,0.0,0.0,0.0,0.0,0.05,0.05,0.05,0.0,0.0,0.0,0.05,0.0,0.05,0.05,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.05,0.05,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05
1,Nieuw-West,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.05,0.0,0.0,0.05,0.0,0.05,0.0,0.05,0.15,0.0,0.0,0.1,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.05,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.05,0.05,0.0,0.0
2,Noord,0.05,0.0,0.05,0.0,0.05,0.05,0.0,0.0,0.0,0.0,0.0,0.05,0.05,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.05,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.1,0.05,0.0,0.0,0.05,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.05,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0
3,Oost,0.0,0.0,0.0,0.0,0.05,0.05,0.05,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.05,0.05,0.0,0.05,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.05,0.05,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.05,0.0
4,West,0.0,0.05,0.0,0.0,0.0,0.1,0.05,0.0,0.05,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.05,0.0,0.05,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.05,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Zuid,0.0,0.0,0.0,0.05,0.05,0.05,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.05,0.0,0.0,0.05,0.0,0.05,0.0,0.0,0.05,0.05,0.0,0.0,0.05,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.05,0.0,0.05,0.0,0.0,0.0,0.0,0.0
6,Zuidoost,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.05,0.0,0.0,0.1,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.05,0.05,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.05,0.05,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.05,0.05,0.05,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
len(kl_grouped[kl_grouped["Bakery"] > 0])

5

In [25]:
kl_mall = kl_grouped[["Neighborhoods","Bakery"]]

In [26]:
kl_mall.head()

Unnamed: 0,Neighborhoods,Bakery
0,Centrum,0.05
1,Nieuw-West,0.0
2,Noord,0.05
3,Oost,0.05
4,West,0.0


# 7. Cluster Neighborhoods

In [27]:
# set number of clusters
kclusters = 3

kl_clustering = kl_mall.drop(["Neighborhoods"], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(kl_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([2, 0, 2, 2, 0, 2, 1])

In [28]:
# create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.
kl_merged = kl_mall.copy()

# add clustering labels
kl_merged["Cluster Labels"] = kmeans.labels_

In [29]:
kl_merged.rename(columns={"Neighborhoods": "Neighborhood"}, inplace=True)
kl_merged.head()

Unnamed: 0,Neighborhood,Bakery,Cluster Labels
0,Centrum,0.05,2
1,Nieuw-West,0.0,0
2,Noord,0.05,2
3,Oost,0.05,2
4,West,0.0,0


In [30]:
# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
kl_merged = kl_merged.join(df.set_index("Neighborhood"), on="Neighborhood")

print(kl_merged.shape)
kl_merged.head() # check the last columns

(7, 5)


Unnamed: 0,Neighborhood,Bakery,Cluster Labels,Latitude,Longitude
0,Centrum,0.05,2,52.37595,4.89915
1,Nieuw-West,0.0,0,52.372416,4.900627
2,Noord,0.05,2,52.381501,4.916138
3,Oost,0.05,2,52.36013,4.92532
4,West,0.0,0,52.3728,4.88976


In [31]:
# sort the results by Cluster Labels
print(kl_merged.shape)
kl_merged.sort_values(["Cluster Labels"], inplace=True)
kl_merged

(7, 5)


Unnamed: 0,Neighborhood,Bakery,Cluster Labels,Latitude,Longitude
1,Nieuw-West,0.0,0,52.372416,4.900627
4,West,0.0,0,52.3728,4.88976
6,Zuidoost,0.1,1,52.31619,4.95137
0,Centrum,0.05,2,52.37595,4.89915
2,Noord,0.05,2,52.381501,4.916138
3,Oost,0.05,2,52.36013,4.92532
5,Zuid,0.05,2,52.37638,4.9375


In [32]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(kl_merged['Latitude'], kl_merged['Longitude'], kl_merged['Neighborhood'], kl_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [33]:
# regions with no bakeries (most prominent for new bakery)
kl_merged.loc[kl_merged['Cluster Labels'] == 0]

Unnamed: 0,Neighborhood,Bakery,Cluster Labels,Latitude,Longitude
1,Nieuw-West,0.0,0,52.372416,4.900627
4,West,0.0,0,52.3728,4.88976


In [34]:
kl_merged.loc[kl_merged['Cluster Labels'] == 1]

Unnamed: 0,Neighborhood,Bakery,Cluster Labels,Latitude,Longitude
6,Zuidoost,0.1,1,52.31619,4.95137


In [35]:
kl_merged.loc[kl_merged['Cluster Labels'] == 2]

Unnamed: 0,Neighborhood,Bakery,Cluster Labels,Latitude,Longitude
0,Centrum,0.05,2,52.37595,4.89915
2,Noord,0.05,2,52.381501,4.916138
3,Oost,0.05,2,52.36013,4.92532
5,Zuid,0.05,2,52.37638,4.9375
