In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from sklearn.cluster import KMeans
import numpy as np
from pandas.io.json import json_normalize
import airbnb
import ezhc as hc


#### Fetch the latitude & longitude of the cities compared

In [2]:
# Nominatim geocoder (created with a custom user agent); the next cell uses it
# to convert each city name into latitude and longitude values.
geolocator = Nominatim(user_agent="blabla_explorer")

In [145]:
# Map every city name to its (latitude, longitude) pair.
res = {}
addresses = ['Bayonne', 'Biarritz', 'Anglet', 'Bordeaux', 'Istanbul', 'London', 'Ankara',
             'Bilbao', 'Barcelone', 'Birmingham', 'Dublin', 'Abu Dhabi', 'Dubai', 'Roma',
             'Milan', 'Berlin', 'Zagreb', 'Budapest']
for address in addresses:
    location = geolocator.geocode(address)
    # geocode() returns None when the query has no match; fail with a clear
    # message instead of an AttributeError on `None.latitude`.
    if location is None:
        raise ValueError('No geocoding result for {!r}'.format(address))
    latitude, longitude = location.latitude, location.longitude
    res[address] = (latitude, longitude)
    print('The geograpical coordinate of {} are {}, {}.'.format(address, latitude, longitude))

The geograpical coordinate of Bayonne are 43.4933379, -1.475099.
The geograpical coordinate of Biarritz are 43.4832523, -1.5592776.
The geograpical coordinate of Anglet are 43.4897459, -1.5197091473862367.
The geograpical coordinate of Bordeaux are 44.841225, -0.5800364.
The geograpical coordinate of Istanbul are 41.0766019, 29.052495.
The geograpical coordinate of London are 51.5073219, -0.1276474.
The geograpical coordinate of Ankara are 39.7160439, 32.7059948.
The geograpical coordinate of Bilbao are 43.2630051, -2.9349915.
The geograpical coordinate of Barcelone are 41.3828939, 2.1774322.
The geograpical coordinate of Birmingham are 52.4796992, -1.9026911.
The geograpical coordinate of Dublin are 53.3497645, -6.2602732.
The geograpical coordinate of Abu Dhabi are 24.39647445, 54.536663086435915.
The geograpical coordinate of Dubai are 25.0657, 55.1713.
The geograpical coordinate of Roma are 41.8933203, 12.4829321.
The geograpical coordinate of Milan are 45.4668, 9.1905.
The geograp

#### Set up the Foursquare API access used to get the venues of each city

In [146]:
# Foursquare credentials are kept out of the notebook and read from local files.
# `with` closes each file handle, and strip() removes a trailing newline that
# would otherwise be sent as part of the credential.
with open("C:\\temp\\foursquare_id", "r") as id_file:
    CLIENT_ID = id_file.read().strip()
with open("C:\\temp\\foursquare_key", "r") as key_file:
    CLIENT_SECRET = key_file.read().strip()
VERSION = '20180605' # Foursquare API version

In [147]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=500):
    """Collect the Foursquare venues found around each named location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Walked in parallel; each triple describes one location to explore.
    radius : int, optional
        Value sent as the ``radius`` query parameter (default 500).
    limit : int, optional
        Value sent as the ``limit`` query parameter. Defaults to 500, the
        value that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood', 'Latitude',
        'Longitude', 'Venue' and 'Venue Category'. The frame is empty (but
        keeps these columns) when no venue is returned at all.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    columns = ['Neighborhood',
               'Latitude',
               'Longitude',
               'Venue',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string instead of
        # formatting the credentials into the URL by hand.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': limit,
            },
            timeout=30,  # never hang the notebook on a stalled connection
        )
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            categories = v['venue']['categories']
            rows.append((
                name,
                lat,
                lng,
                v['venue']['name'],
                # a venue without any category must not abort the whole fetch
                categories[0]['name'] if categories else None))

    # Passing the column names here keeps the schema even when `rows` is empty
    # (assigning `.columns` afterwards fails on an empty frame).
    return pd.DataFrame(rows, columns=columns)

#### Build the lists of names, latitudes and longitudes to pass as parameters to the newly created function

In [148]:
# Split the {city: (latitude, longitude)} mapping into three parallel lists,
# all following the dictionary's iteration order.
names = list(res.keys())
latitudes = [coordinates[0] for coordinates in res.values()]
longitudes = [coordinates[1] for coordinates in res.values()]

for parameter_list in (names, latitudes, longitudes):
    print(parameter_list)

['Bayonne', 'Biarritz', 'Anglet', 'Bordeaux', 'Istanbul', 'London', 'Ankara', 'Bilbao', 'Barcelone', 'Birmingham', 'Dublin', 'Abu Dhabi', 'Dubai', 'Roma', 'Milan', 'Berlin', 'Zagreb', 'Budapest']
[43.4933379, 43.4832523, 43.4897459, 44.841225, 41.0766019, 51.5073219, 39.7160439, 43.2630051, 41.3828939, 52.4796992, 53.3497645, 24.39647445, 25.0657, 41.8933203, 45.4668, 52.5170365, 45.813177, 47.48138955]
[-1.475099, -1.5592776, -1.5197091473862367, -0.5800364, 29.052495, -0.1276474, 32.7059948, -2.9349915, 2.1774322, -1.9026911, -6.2602732, 54.536663086435915, 55.1713, 12.4829321, 9.1905, 13.3888599, 15.977048, 19.14607278448202]


#### Fetch the venues for all the cities

In [149]:
# Fetch the venues around every city; getNearbyVenues prints each city name
# as it is processed, which produces the list shown below.
all_venues = getNearbyVenues(names=names, latitudes=latitudes, longitudes=longitudes)

Bayonne
Biarritz
Anglet
Bordeaux
Istanbul
London
Ankara
Bilbao
Barcelone
Birmingham
Dublin
Abu Dhabi
Dubai
Roma
Milan
Berlin
Zagreb
Budapest


In [150]:
# Display the collected venues: one row per venue, tagged with the city it
# was fetched for and that city's coordinates.
all_venues

Unnamed: 0,Neighborhood,Latitude,Longitude,Venue,Venue Category
0,Bayonne,43.493338,-1.475099,Chocolat Cazenave,Tea Room
1,Bayonne,43.493338,-1.475099,Place Charles de Gaulle,Plaza
2,Bayonne,43.493338,-1.475099,Le Chistera,Southwestern French Restaurant
3,Bayonne,43.493338,-1.475099,À La Bolée,Creperie
4,Bayonne,43.493338,-1.475099,Katie Daly's,Pub
...,...,...,...,...,...
1084,Budapest,47.481390,19.146073,Csősztorony Bisztró,Bistro
1085,Budapest,47.481390,19.146073,Óhegy park futókör,Track
1086,Budapest,47.481390,19.146073,Barátság park,Park
1087,Budapest,47.481390,19.146073,"Csősztorony (185, 85)",Bus Stop


#### Discovery of the data

In [151]:
# Group by (city, venue category); count() gives, for each remaining column,
# the number of non-null values in the group.
df = all_venues.groupby(['Neighborhood', 'Venue Category']).count()
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Latitude,Longitude,Venue
Neighborhood,Venue Category,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Abu Dhabi,Convenience Store,1,1,1
Abu Dhabi,Cricket Ground,1,1,1
Abu Dhabi,Shopping Mall,1,1,1
Anglet,Cafeteria,1,1,1
Anglet,Gym,1,1,1
...,...,...,...,...
Zagreb,Steakhouse,1,1,1
Zagreb,Supermarket,1,1,1
Zagreb,Theater,3,3,3
Zagreb,Theme Park Ride / Attraction,1,1,1


In [152]:
# Colour palette handed to the tree builder (ten hex colours).
colors_2 = ['#7cb5ec', '#434348', '#90ed7d', '#f7a35c', '#8085e9',
          '#f15c80', '#e4d354', '#2b908f', '#f45b5b', '#91e8e1']

# Turn the (city, category) venue counts into the list of treemap points;
# judging by the preview, each point is a dict with name/value/id and a
# parent id for the category entries.
points = hc.build.series_tree(df[['Venue']], set_color=True, colors=colors_2, set_value=True, precision=2)
# Preview the first five points.
points[:5]

[{'name': 'Convenience Store', 'value': 1, 'id': '0.0.0', 'parent': '0.0'},
 {'name': 'Cricket Ground', 'value': 1, 'id': '0.0.1', 'parent': '0.0'},
 {'name': 'Shopping Mall', 'value': 1, 'id': '0.0.2', 'parent': '0.0'},
 {'name': 'Abu Dhabi', 'id': '0.0', 'color': '#7cb5ec', 'value': 3},
 {'name': 'Cafeteria', 'value': 1, 'id': '0.1.0', 'parent': '0.1'}]

#### Treemap (you can click to zoom) to explore the data

In [153]:
# Options for the first tree level: labelled, with a thicker border.
top_level_style = {
    'level': 1,
    'dataLabels': {'enabled': True},
    'borderWidth': 3,
}

# Single treemap series fed with the points built above; labels are off by
# default and drilling into a node is allowed.
treemap_series = {
    'type': "treemap",
    'layoutAlgorithm': 'squarified',
    'allowDrillToNode': True,
    'dataLabels': {'enabled': False},
    'levelIsConstant': False,
    'levels': [top_level_style],
    'data': points,
}

# Chart container: type, size, titles and export setting.
g = hc.Highcharts()
g.chart.type = 'treemap'
g.chart.width = 900
g.chart.height = 600
g.title.text = 'Venues per city and category'
g.subtitle.text = 'Click points to drill down'
g.exporting = False
g.series = [treemap_series]

g.plot(version='7.0.3')

In [154]:
# Reminder of the raw venue table layout before it is encoded.
all_venues.head()

Unnamed: 0,Neighborhood,Latitude,Longitude,Venue,Venue Category
0,Bayonne,43.493338,-1.475099,Chocolat Cazenave,Tea Room
1,Bayonne,43.493338,-1.475099,Place Charles de Gaulle,Plaza
2,Bayonne,43.493338,-1.475099,Le Chistera,Southwestern French Restaurant
3,Bayonne,43.493338,-1.475099,À La Bolée,Creperie
4,Bayonne,43.493338,-1.475099,Katie Daly's,Pub


#### One-hot encoding to feed the ML model (k-means)

In [155]:
# One indicator column per venue category, named after the category itself
# (empty prefix and separator).
df_onehot = pd.get_dummies(all_venues[['Venue Category']], prefix="", prefix_sep="")

# Attach the city each venue belongs to.
df_onehot['Neighborhood'] = all_venues['Neighborhood']

# Move the last column to the front and keep the others in their order.
*leading_columns, last_column = df_onehot.columns
fixed_columns = [last_column] + leading_columns
df_onehot = df_onehot[fixed_columns]

df_onehot.head()

Unnamed: 0,Women's Store,African Restaurant,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,BBQ Joint,Baby Store,Bagel Shop,...,Toy / Game Store,Track,Trail,Tram Station,Trattoria/Osteria,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Waterfront,Wine Bar,Wine Shop
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [156]:
# Average the one-hot indicators per city, then turn the city index back into
# a regular 'Neighborhood' column.
df_grouped = df_onehot.groupby('Neighborhood').mean().reset_index()
df_grouped

Unnamed: 0,Neighborhood,Women's Store,African Restaurant,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,BBQ Joint,Baby Store,...,Toy / Game Store,Track,Trail,Tram Station,Trattoria/Osteria,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Waterfront,Wine Bar,Wine Shop
0,Abu Dhabi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Anglet,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Barcelone,0.01,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.01,...,0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.0,0.04,0.01
3,Bayonne,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Berlin,0.0,0.0,0.0,0.011628,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.034884,0.0,0.0,0.034884,0.0
5,Biarritz,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012346,0.0,0.0
6,Bilbao,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0
7,Birmingham,0.0,0.0,0.0,0.0,0.0,0.0,0.010989,0.0,0.0,...,0.010989,0.0,0.010989,0.0,0.0,0.0,0.0,0.0,0.0,0.010989
8,Bordeaux,0.0,0.0,0.010638,0.0,0.021277,0.0,0.010638,0.0,0.0,...,0.010638,0.0,0.0,0.042553,0.0,0.0,0.0,0.0,0.010638,0.0
9,Budapest,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Data transformation to fetch the most common venues by city

In [157]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of a row.

    The first entry of ``row`` is skipped, the remaining entries are sorted
    by value in descending order, and the index labels of the first
    ``num_top_venues`` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index[:num_top_venues].values

In [158]:
num_top_venues = 10

# ordinal suffixes for the first three ranks; every later rank uses 'th'
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # Explicit bounds check instead of a bare `except:`, which would also hide
    # unrelated errors (including KeyboardInterrupt).
    if ind < len(indicators):
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    else:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe with one row per city of df_grouped
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = df_grouped['Neighborhood']

# fill the rank columns of each row with that city's top categories
for ind in np.arange(df_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(df_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Abu Dhabi,Convenience Store,Cricket Ground,Shopping Mall,Electronics Store,Food,Flea Market,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Fabric Shop
1,Anglet,Pharmacy,Shopping Mall,Gym,Cafeteria,Wine Shop,Electronics Store,Flea Market,Fast Food Restaurant,Farmers Market,Falafel Restaurant
2,Barcelone,Tapas Restaurant,Plaza,Bar,Cocktail Bar,Wine Bar,Hotel,Coffee Shop,Dessert Shop,Spanish Restaurant,Ice Cream Shop
3,Bayonne,French Restaurant,Hotel,Restaurant,Tapas Restaurant,Historic Site,Tea Room,Bus Stop,Market,Southwestern French Restaurant,Spanish Restaurant
4,Berlin,Hotel,German Restaurant,Coffee Shop,Café,Italian Restaurant,Clothing Store,Wine Bar,Vegetarian / Vegan Restaurant,Plaza,Department Store
5,Biarritz,French Restaurant,Tapas Restaurant,Hotel,Bar,Nightclub,Café,Dessert Shop,Restaurant,Plaza,Spanish Restaurant
6,Bilbao,Restaurant,Spanish Restaurant,Tapas Restaurant,Seafood Restaurant,Plaza,Café,Bakery,Wine Bar,Cocktail Bar,Japanese Restaurant
7,Birmingham,Coffee Shop,Hotel,Pub,Bar,Indian Restaurant,Italian Restaurant,Bistro,Shopping Mall,Plaza,Cocktail Bar
8,Bordeaux,French Restaurant,Plaza,Coffee Shop,Hotel,Tram Station,Pedestrian Plaza,Shopping Mall,Tea Room,Bistro,Dessert Shop
9,Budapest,Park,Bus Stop,Bistro,Track,Grocery Store,Wine Shop,Electronics Store,Flea Market,Fast Food Restaurant,Farmers Market


#### We are going to cluster the cities according to their most common venues and see whether the result is in line with their cultural differences

In [167]:
# set number of clusters
# set number of clusters
kclusters = 10

# Feature matrix: every column of df_grouped except the city name.
# `columns=` is used because newer pandas versions no longer accept the axis
# as a second positional argument of drop().
grouped_clustering = df_grouped.drop(columns='Neighborhood')

# run k-means clustering (fixed random_state for reproducible labels)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(grouped_clustering)

# One label per row of df_grouped, in the same order. All labels are kept so
# the table stays aligned with the cities even with more than 20 of them.
clusters = pd.DataFrame(kmeans.labels_, columns=['cluster'])

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:20]



array([5, 2, 9, 8, 3, 8, 9, 1, 3, 6, 4, 1, 0, 3, 3, 7, 9])

In [168]:
# Display the numeric feature matrix that was passed to k-means.
grouped_clustering

Unnamed: 0,Women's Store,African Restaurant,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,BBQ Joint,Baby Store,Bagel Shop,...,Toy / Game Store,Track,Trail,Tram Station,Trattoria/Osteria,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Waterfront,Wine Bar,Wine Shop
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.01,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.01,0.0,...,0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.0,0.04,0.01
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.011628,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.034884,0.0,0.0,0.034884,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012346,0.0,0.0
6,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.010989,0.0,0.0,0.0,...,0.010989,0.0,0.010989,0.0,0.0,0.0,0.0,0.0,0.0,0.010989
8,0.0,0.0,0.010638,0.0,0.021277,0.0,0.010638,0.0,0.0,0.0,...,0.010638,0.0,0.0,0.042553,0.0,0.0,0.0,0.0,0.010638,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [169]:
# Display the cluster label table built from kmeans.labels_.
clusters

Unnamed: 0,cluster
0,5
1,2
2,9
3,8
4,3
5,8
6,9
7,1
8,3
9,6


In [170]:
# Put each city's cluster label next to its top-venue row (aligned on the row
# index), join the venue table on the city name to bring in the coordinates,
# and keep only the columns needed for the map.
res = (
    pd.concat([neighborhoods_venues_sorted, clusters], axis=1)
    .merge(all_venues, on='Neighborhood')
    .loc[:, ['Latitude', 'Longitude', 'Neighborhood', 'cluster']]
)
res

Unnamed: 0,Latitude,Longitude,Neighborhood,cluster
0,24.396474,54.536663,Abu Dhabi,5
1,24.396474,54.536663,Abu Dhabi,5
2,24.396474,54.536663,Abu Dhabi,5
3,43.489746,-1.519709,Anglet,2
4,43.489746,-1.519709,Anglet,2
...,...,...,...,...
1084,45.813177,15.977048,Zagreb,9
1085,45.813177,15.977048,Zagreb,9
1086,45.813177,15.977048,Zagreb,9
1087,45.813177,15.977048,Zagreb,9


In [177]:
# One (city, cluster) pair per city, joined with the ranked venue categories.
city_clusters = res[['Neighborhood', 'cluster']].drop_duplicates()
df = city_clusters.merge(neighborhoods_venues_sorted, on='Neighborhood')

# Show the three most common categories of every city, ordered by cluster.
summary_columns = ['Neighborhood', 'cluster',
                   '1st Most Common Venue', '2nd Most Common Venue', '3rd Most Common Venue']
df[summary_columns].sort_values('cluster')

Unnamed: 0,Neighborhood,cluster,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
12,Istanbul,0,Lighthouse,Pool,Café
11,Dublin,1,Pub,Coffee Shop,Café
7,Birmingham,1,Coffee Shop,Hotel,Pub
1,Anglet,2,Pharmacy,Shopping Mall,Gym
8,Bordeaux,3,French Restaurant,Plaza,Coffee Shop
14,Milan,3,Italian Restaurant,Boutique,Hotel
13,London,3,Hotel,Theater,Cocktail Bar
4,Berlin,3,Hotel,German Restaurant,Coffee Shop
10,Dubai,4,Medical Supply Store,Fast Food Restaurant,Frozen Yogurt Shop
0,Abu Dhabi,5,Convenience Store,Cricket Ground,Shopping Mall


In [172]:
# create map
map_clusters = folium.Map(location=[43.6044622, 1.4442469], zoom_start=3)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(res['Latitude'], res['Longitude'], res['Neighborhood'], res['cluster']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)],
        fill=True,
        fill_color=rainbow[int(cluster)],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters