# Analyzing the similarity of major German Cities

## Setup

In [1]:
#pip install geopy
#pip install folium
#pip install shapely
#pip install pyproj

In [2]:
# Standard library
import math
import os # for creating the output directory
import re # for regular expressions
import warnings
from collections import Counter # to count repeated items in list

# Third-party
import numpy as np
import pandas as pd

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import folium # map rendering library

# for transforming geocoordinates
import shapely.geometry
import pyproj

import requests # library to handle requests

from sklearn.cluster import KMeans # for clustering

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
warnings.simplefilter('ignore')

print('Libraries imported.')

Libraries imported.


### Generate neighborhoods

In [3]:
# Geocode a central Berlin landmark; its coordinates are used as the city center.
address = 'Brandenburg Gate, Berlin, Germany'
geolocator = Nominatim(user_agent="hamburg_explorer")

location = geolocator.geocode(address)
berlin_lat, berlin_lon = location.latitude, location.longitude

print('The geograpical coordinates of Berlin are {}, {}.'.format(berlin_lat, berlin_lon))

The geograpical coordinates of Berlin are 52.51628045, 13.37770188288172.


In [4]:
# Geocode a central Hamburg landmark; its coordinates are used as the city center.
address = 'Außenalster, Hamburg, Germany'
geolocator = Nominatim(user_agent="hamburg_explorer")

location = geolocator.geocode(address)
hamburg_lat, hamburg_lon = location.latitude, location.longitude

print('The geograpical coordinates of Hamburg are {}, {}.'.format(hamburg_lat, hamburg_lon))

The geograpical coordinates of Hamburg are 53.5689488, 10.007305547125247.


In [5]:
def lonlat_to_xy(lon, lat, zone=33):
    """Project WGS84 longitude/latitude to UTM x/y coordinates.

    Parameters
    ----------
    lon, lat : float
        Longitude and latitude in degrees (WGS84).
    zone : int, optional
        UTM zone used for the projection. Defaults to 33, the value that was
        previously hard-coded. Zone 33 spans 12°E-18°E, so it fits Berlin;
        Hamburg (about 10°E) lies in zone 32 and is projected with extra
        distortion unless ``zone=32`` is passed. Use the same zone for the
        inverse transform.

    Returns
    -------
    tuple
        ``(x, y)`` in the UTM projection.
    """
    proj_latlon = pyproj.Proj(proj='latlong', datum='WGS84')
    proj_xy = pyproj.Proj(proj="utm", zone=zone, datum='WGS84')
    # NOTE(review): pyproj.transform is deprecated in recent pyproj releases in
    # favour of pyproj.Transformer - kept here to avoid requiring a newer pyproj.
    xy = pyproj.transform(proj_latlon, proj_xy, lon, lat)
    return xy[0], xy[1]

def xy_to_lonlat(x, y, zone=33):
    """Convert UTM x/y coordinates back to WGS84 longitude/latitude.

    Parameters
    ----------
    x, y : float
        Coordinates in the UTM projection.
    zone : int, optional
        UTM zone the coordinates were projected with. Defaults to 33, the
        value that was previously hard-coded; it must match the zone used for
        the forward projection.

    Returns
    -------
    tuple
        ``(lon, lat)`` in degrees (WGS84).
    """
    proj_latlon = pyproj.Proj(proj='latlong', datum='WGS84')
    proj_xy = pyproj.Proj(proj="utm", zone=zone, datum='WGS84')
    # NOTE(review): pyproj.transform is deprecated in recent pyproj releases in
    # favour of pyproj.Transformer - kept here to avoid requiring a newer pyproj.
    lonlat = pyproj.transform(proj_xy, proj_latlon, x, y)
    return lonlat[0], lonlat[1]

def calc_xy_distance(x1, y1, x2, y2):
    """Return the straight-line (Euclidean) distance between (x1, y1) and (x2, y2)."""
    delta_x, delta_y = x2 - x1, y2 - y1
    squared_length = delta_x*delta_x + delta_y*delta_y
    return math.sqrt(squared_length)

# Round-trip sanity check: project the Hamburg center to UTM and back again;
# the final line should reproduce the first one up to floating-point error.
print('Coordinate transformation check')
print('-------------------------------')
print('Hamburg center longitude={}, latitude={}'.format(hamburg_lon, hamburg_lat))
x, y = lonlat_to_xy(hamburg_lon, hamburg_lat)
print('Hamburg center UTM X={}, Y={}'.format(x, y))
lo, la = xy_to_lonlat(x, y)
print('Hamburg center longitude={}, latitude={}'.format(lo, la))

Coordinate transformation check
-------------------------------
Hamburg center longitude=10.007305547125247, latitude=53.5689488
Hamburg center UTM X=169483.03662988317, Y=5947163.190106782
Hamburg center longitude=10.007305547125249, latitude=53.568948799999994


In [6]:
# Project both city centers to x/y so the grid below can be laid out with plain arithmetic.
berlin_center_x, berlin_center_y = lonlat_to_xy(berlin_lon, berlin_lat) # City center in Cartesian coordinates
hamburg_center_x, hamburg_center_y = lonlat_to_xy(hamburg_lon, hamburg_lat)

k = math.sqrt(3) / 2 # Vertical offset for hexagonal grid cells
# Grid points farther than square_width/2 from the city center are discarded below.
square_width = 10000
# Grid spacing; the venue search radius defined later uses the same value.
neigborhood_radius = 1500
x_step = neigborhood_radius # horizontal distance between grid points in one row
y_step = neigborhood_radius * k # vertical distance between consecutive rows

def _hex_grid_centers(center_x, center_y, n_cols=21):
    """Generate hexagonally packed neighborhood centers around one city center.

    Lays out ``int(n_cols / k)`` rows of ``n_cols`` candidate points (every
    other row shifted by half a step) and keeps those within
    ``square_width/2`` (+1 tolerance) of the center. Reads the grid settings
    ``k``, ``square_width``, ``neigborhood_radius``, ``x_step`` and ``y_step``
    from the notebook namespace.

    Returns
    -------
    tuple of lists
        ``(latitudes, longitudes, distances_from_center, xs, ys)``, all in the
        same row-by-row order.
    """
    n_rows = int(n_cols / k)
    x_min = center_x - square_width/2
    y_min = center_y - square_width/2 - (n_rows*k*neigborhood_radius - square_width)/2

    latitudes, longitudes, distances, xs, ys = [], [], [], [], []
    for i in range(0, n_rows):
        y = y_min + i * y_step
        # Shift every other row by half a step to get the hexagonal packing.
        x_offset = neigborhood_radius/2 if i % 2 == 0 else 0
        for j in range(0, n_cols):
            x = x_min + j * x_step + x_offset
            distance_from_center = calc_xy_distance(center_x, center_y, x, y)
            if distance_from_center <= square_width/2 + 1:
                lon, lat = xy_to_lonlat(x, y)
                latitudes.append(lat)
                longitudes.append(lon)
                distances.append(distance_from_center)
                xs.append(x)
                ys.append(y)
    return latitudes, longitudes, distances, xs, ys


(berlin_latitudes, berlin_longitudes, berlin_distances_from_center,
 xs, ys) = _hex_grid_centers(berlin_center_x, berlin_center_y)

# xs / ys end up holding the Hamburg grid, as they did before the refactoring.
(hamburg_latitudes, hamburg_longitudes, hamburg_distances_from_center,
 xs, ys) = _hex_grid_centers(hamburg_center_x, hamburg_center_y)

print(len(berlin_latitudes), 'Berlin neighborhood centers generated.')
print(len(hamburg_latitudes), 'Hamburg neighborhood centers generated.')

39 Berlin neighborhood centers generated.
39 Hamburg neighborhood centers generated.


In [7]:
# Draw every generated Berlin neighborhood center as a circle on an interactive map.
map_berlin = folium.Map(location=[berlin_lat, berlin_lon], zoom_start=12)

for lat, lng in zip(berlin_latitudes, berlin_longitudes):
    # The popup shows the coordinates of the grid point.
    marker = folium.CircleMarker(
        [lat, lng],
        radius=neigborhood_radius/40,
        popup=folium.Popup('{}, {}'.format(lat, lng), parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.5,
        parse_html=False)
    marker.add_to(map_berlin)

map_berlin

In [8]:
# Draw every generated Hamburg neighborhood center as a circle on an interactive map.
map_hamburg = folium.Map(location=[hamburg_lat, hamburg_lon], zoom_start=12)

for lat, lng in zip(hamburg_latitudes, hamburg_longitudes):
    # The popup shows the coordinates of the grid point.
    marker = folium.CircleMarker(
        [lat, lng],
        radius=neigborhood_radius/40,
        popup=folium.Popup('{}, {}'.format(lat, lng), parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.5,
        parse_html=False)
    marker.add_to(map_hamburg)

map_hamburg

In [9]:
# Reverse-geocode every Hamburg grid point into an address and derive a
# borough label from the address components.
hamburg_rows = []

for grid_lat, grid_lon in zip(hamburg_latitudes, hamburg_longitudes):
    reverse = geolocator.reverse((grid_lat, grid_lon))
    address = reverse[0]
    # Take the 4th component from the end; for short addresses fall back to
    # everything before the last three components (same rule as for Berlin).
    matches = (re.findall(".*, (.*),.*,.*,.*", address)
               or re.findall("(.*),.*,.*,.*", address))
    address_n = matches[0]
    geo_lat = reverse[1][0]
    geo_lon = reverse[1][1]
    city = "Hamburg"
    hamburg_rows.append([address, geo_lat, geo_lon, address_n, city])

# Name the columns at construction time so an empty result keeps its schema.
hamburg_neighborhoods = pd.DataFrame(
    hamburg_rows,
    columns=["Neighborhood", "Latitude", "Longitude", "Borough", "City"])

In [10]:
# Reverse-geocode every Berlin grid point into an address and derive a
# borough label from the address components.
berlin_rows = []

for grid_lat, grid_lon in zip(berlin_latitudes, berlin_longitudes):
    reverse = geolocator.reverse((grid_lat, grid_lon))
    address = reverse[0]
    # Take the 4th component from the end; for short addresses fall back to
    # everything before the last three components. An explicit fallback
    # replaces the former bare `except:`, which would also have hidden
    # unrelated errors.
    matches = (re.findall(".*, (.*),.*,.*,.*", address)
               or re.findall("(.*),.*,.*,.*", address))
    address_n = matches[0]

    geo_lat = reverse[1][0]
    geo_lon = reverse[1][1]
    city = "Berlin"
    berlin_rows.append([address, geo_lat, geo_lon, address_n, city])

# Name the columns at construction time so an empty result keeps its schema.
berlin_neighborhoods = pd.DataFrame(
    berlin_rows,
    columns=["Neighborhood", "Latitude", "Longitude", "Borough", "City"])

In [11]:
# Preview the first rows of the reverse-geocoded Hamburg grid points.
hamburg_neighborhoods.head()

Unnamed: 0,Neighborhood,Latitude,Longitude,Borough,City
0,"Zweite Querkanalbrücke, Worthdamm, Steinwerder...",53.532424,9.981784,Steinwerder,Hamburg
1,"37, Chicagokai, Quartier Elbtorquartier, Hafen...",53.534379,10.002156,HafenCity,Hamburg
2,"12, Zweibrückenstraße, Quartier Elbbrücken, Ha...",53.534061,10.025496,HafenCity,Hamburg
3,"129e, Marckmannstraße, Rothenburgsort, Hamburg...",53.534455,10.048362,Hamburg-Mitte,Hamburg
4,"Am Altonaer Holzhafen, Altona-Altstadt, Altona...",53.54422,9.945906,Altona,Hamburg


In [12]:
# Preview the first rows of the reverse-geocoded Berlin grid points.
berlin_neighborhoods.head()

Unnamed: 0,Neighborhood,Latitude,Longitude,Borough,City
0,"Pasta Bar, 1, Fritz-Reuter-Straße, Schöneberg,...",52.480966,13.349539,Tempelhof-Schöneberg,Berlin
1,"Rosenpromenade, KGA Papestraße, Tempelhof, Tem...",52.481159,13.371573,Tempelhof-Schöneberg,Berlin
2,"Flughafen Tempelhof, Werner-Loebermann-Weg, Ga...",52.480649,13.388919,Tempelhof-Schöneberg,Berlin
3,"Hasenschänke, Columbiadamm, Tempelhof, Tempelh...",52.482636,13.416267,Tempelhof-Schöneberg,Berlin
4,"24, Pommersche Straße, Wilmersdorf, Charlotten...",52.492595,13.315906,Wilmersdorf,Berlin


In [13]:
# Stack both cities into one table; ignore_index gives every row a unique
# label instead of repeating each city's own 0..n-1 index.
neighborhoods = pd.concat([hamburg_neighborhoods, berlin_neighborhoods], ignore_index=True)

In [14]:
# to_csv does not create missing directories, so make sure "Data" exists
# before the first export of the notebook.
os.makedirs("Data", exist_ok=True)

hamburg_neighborhoods.to_csv("Data/hamburg_neighborhoods.csv", index=False)
berlin_neighborhoods.to_csv("Data/berlin_neighborhoods.csv", index=False)
neighborhoods.to_csv("Data/neighborhoods.csv", index=False)

### Get venue data

In [15]:
# Reload the Hamburg neighborhoods from disk so this section can start without re-geocoding.
hamburg_neighborhoods = pd.read_csv("Data/hamburg_neighborhoods.csv")
hamburg_neighborhoods.head()

Unnamed: 0,Neighborhood,Latitude,Longitude,Borough,City
0,"Zweite Querkanalbrücke, Worthdamm, Steinwerder...",53.532424,9.981784,Steinwerder,Hamburg
1,"37, Chicagokai, Quartier Elbtorquartier, Hafen...",53.534379,10.002156,HafenCity,Hamburg
2,"12, Zweibrückenstraße, Quartier Elbbrücken, Ha...",53.534061,10.025496,HafenCity,Hamburg
3,"129e, Marckmannstraße, Rothenburgsort, Hamburg...",53.534455,10.048362,Hamburg-Mitte,Hamburg
4,"Am Altonaer Holzhafen, Altona-Altstadt, Altona...",53.54422,9.945906,Altona,Hamburg


In [16]:
# Reload the Berlin neighborhoods from disk so this section can start without re-geocoding.
berlin_neighborhoods = pd.read_csv("Data/berlin_neighborhoods.csv")
berlin_neighborhoods.head()

Unnamed: 0,Neighborhood,Latitude,Longitude,Borough,City
0,"Pasta Bar, 1, Fritz-Reuter-Straße, Schöneberg,...",52.480966,13.349539,Tempelhof-Schöneberg,Berlin
1,"Rosenpromenade, KGA Papestraße, Tempelhof, Tem...",52.481159,13.371573,Tempelhof-Schöneberg,Berlin
2,"Flughafen Tempelhof, Werner-Loebermann-Weg, Ga...",52.480649,13.388919,Tempelhof-Schöneberg,Berlin
3,"Hasenschänke, Columbiadamm, Tempelhof, Tempelh...",52.482636,13.416267,Tempelhof-Schöneberg,Berlin
4,"24, Pommersche Straße, Wilmersdorf, Charlotten...",52.492595,13.315906,Wilmersdorf,Berlin


In [17]:
# Summarize how many neighborhoods and distinct borough labels each city has.
berlin_total = len(berlin_neighborhoods)
hamburg_total = len(hamburg_neighborhoods)
berlin_borough_total = len(berlin_neighborhoods.Borough.unique())
hamburg_borough_total = len(hamburg_neighborhoods.Borough.unique())

print("There are", berlin_total, "neighborhoods in Berlin and", hamburg_total, "in Hamburg.")
print("They belong to", berlin_borough_total, "and", hamburg_borough_total, "boroughs respectively.")

There are 39 neighborhoods in Berlin and 39 in Hamburg.
They belong to 7 and 13 boroughs respectively.


In [18]:
# Request settings that getNearbyVenues reads from the notebook namespace
# when it builds each Foursquare request.
limit = 100
radius = 1500 # see neighbourhood radius above

VERSION = '20180605' # Foursquare API version

In [19]:
%run credentials.py # client_id and client_secret for Foursquare

In [44]:
def getNearbyVenues(neighborhoods, latitudes, longitudes, boroughs, cities):
    """Query the Foursquare "explore" endpoint around each neighborhood center.

    The five arguments are parallel iterables (one entry per neighborhood).
    The request settings ``CLIENT_ID``, ``CLIENT_SECRET``, ``VERSION``,
    ``radius`` and ``limit`` are read from the notebook namespace.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the neighborhood's own attributes
        repeated next to the venue name, coordinates and first category.
        Neighborhoods whose request fails contribute no rows; an empty result
        still carries the full set of columns. Venues without any category
        get ``None`` as 'Venue Category'.
    """
    columns = ['Neighborhood',
               'Address Latitude',
               'Address Longitude',
               'Borough',
               'City',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for neigh, lat, lng, bor, city in zip(neighborhoods, latitudes, longitudes, boroughs, cities):
        # Let requests build and encode the query string.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': limit,
        }

        try:
            response = requests.get('https://api.foursquare.com/v2/venues/explore',
                                    params=params, timeout=30)
            results = response.json()["response"]['groups'][0]['items']
        except (requests.RequestException, ValueError, KeyError, IndexError, TypeError) as err:
            # Best effort: skip this neighborhood, but say so. Only the error
            # type is printed because the full message can contain the request
            # URL, which includes the API credentials.
            print('No venues retrieved for {!r} ({}).'.format(neigh, type(err).__name__))
            results = []

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue.get('categories') or []
            rows.append((
                neigh,
                lat,
                lng,
                bor,
                city,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing the columns here (instead of assigning them afterwards) keeps an
    # empty result from failing with a column-length mismatch.
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return(nearby_venues)

In [45]:
# Collect the Foursquare venues around every Hamburg neighborhood center.
# Arguments are passed positionally in the order the function declares them:
# neighborhoods, latitudes, longitudes, boroughs, cities.
hamburg_venues = getNearbyVenues(
    hamburg_neighborhoods['Neighborhood'],
    hamburg_neighborhoods['Latitude'],
    hamburg_neighborhoods['Longitude'],
    hamburg_neighborhoods['Borough'],
    hamburg_neighborhoods['City'],
)

In [46]:
# Collect the Foursquare venues around every Berlin neighborhood center.
# Arguments are passed positionally in the order the function declares them:
# neighborhoods, latitudes, longitudes, boroughs, cities.
berlin_venues = getNearbyVenues(
    berlin_neighborhoods['Neighborhood'],
    berlin_neighborhoods['Latitude'],
    berlin_neighborhoods['Longitude'],
    berlin_neighborhoods['Borough'],
    berlin_neighborhoods['City'],
)

In [47]:
# Combine both cities' venues into one table with a fresh 0..n-1 index.
all_venues = pd.concat([berlin_venues, hamburg_venues], ignore_index=True)

In [48]:
# (rows, columns) of the combined venue table before filtering.
all_venues.shape

(7056, 9)

In [49]:
# exclude neighborhoods that are too small or too big
a_count = all_venues.groupby("Neighborhood").count()
a_incl = a_count[a_count["Venue"] >= 100].reset_index().Neighborhood
all_venues = all_venues[all_venues.Neighborhood.isin(a_incl)]

all_venues.shape

(5800, 9)

In [50]:
# Preview the first rows of the filtered venue table.
all_venues.head()

Unnamed: 0,Neighborhood,Address Latitude,Address Longitude,Borough,City,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Pasta Bar, 1, Fritz-Reuter-Straße, Schöneberg,...",52.480966,13.349539,Tempelhof-Schöneberg,Berlin,Café de Enrico,52.481014,13.349788,Café
1,"Pasta Bar, 1, Fritz-Reuter-Straße, Schöneberg,...",52.480966,13.349539,Tempelhof-Schöneberg,Berlin,Osbili,52.479532,13.349973,Bistro
2,"Pasta Bar, 1, Fritz-Reuter-Straße, Schöneberg,...",52.480966,13.349539,Tempelhof-Schöneberg,Berlin,Odeon,52.482086,13.349483,Indie Movie Theater
3,"Pasta Bar, 1, Fritz-Reuter-Straße, Schöneberg,...",52.480966,13.349539,Tempelhof-Schöneberg,Berlin,Rüyam Gemüse Kebab,52.484807,13.353681,Doner Restaurant
4,"Pasta Bar, 1, Fritz-Reuter-Straße, Schöneberg,...",52.480966,13.349539,Tempelhof-Schöneberg,Berlin,Brunnen Goldener Hirsch,52.483355,13.344001,Fountain


In [51]:
# Persist the filtered venues so later sections can run without calling the API again.
all_venues.to_csv("Data/all_venues.csv", index=False)

### Find Top 10 venue types for each neighborhood and borough

In [52]:
# Reload the venue table from disk so this section can start without the API calls above.
all_venues = pd.read_csv("Data/all_venues.csv")
all_venues.head()

Unnamed: 0,Neighborhood,Address Latitude,Address Longitude,Borough,City,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Pasta Bar, 1, Fritz-Reuter-Straße, Schöneberg,...",52.480966,13.349539,Tempelhof-Schöneberg,Berlin,Café de Enrico,52.481014,13.349788,Café
1,"Pasta Bar, 1, Fritz-Reuter-Straße, Schöneberg,...",52.480966,13.349539,Tempelhof-Schöneberg,Berlin,Osbili,52.479532,13.349973,Bistro
2,"Pasta Bar, 1, Fritz-Reuter-Straße, Schöneberg,...",52.480966,13.349539,Tempelhof-Schöneberg,Berlin,Odeon,52.482086,13.349483,Indie Movie Theater
3,"Pasta Bar, 1, Fritz-Reuter-Straße, Schöneberg,...",52.480966,13.349539,Tempelhof-Schöneberg,Berlin,Rüyam Gemüse Kebab,52.484807,13.353681,Doner Restaurant
4,"Pasta Bar, 1, Fritz-Reuter-Straße, Schöneberg,...",52.480966,13.349539,Tempelhof-Schöneberg,Berlin,Brunnen Goldener Hirsch,52.483355,13.344001,Fountain


In [53]:
# One indicator column per venue category, with the neighborhood and borough
# of each venue attached as grouping keys.
venue_cat_onehot = pd.get_dummies(
    all_venues[['Venue Category']], prefix="", prefix_sep=""
).assign(
    Neighborhood=all_venues['Neighborhood'],
    Borough=all_venues['Borough'],
)

In [54]:
# Share of each venue category per borough. The text column 'Neighborhood' is
# dropped first: pandas >= 2.0 raises a TypeError when mean() meets a
# non-numeric column instead of silently skipping it.
boroughs_grouped = (
    venue_cat_onehot
    .drop(columns='Neighborhood')
    .groupby('Borough')
    .mean()
    .reset_index()
)
boroughs_grouped.head()

Unnamed: 0,Borough,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,American Restaurant,Aquarium,Argentinian Restaurant,Art Gallery,Art Museum,...,Waterfall,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Women's Store,Yemeni Restaurant,Yoga Studio,Zoo,Zoo Exhibit
0,Altona,0.006667,0.0,0.0,0.0,0.0,0.0,0.0,0.006667,0.0,...,0.0,0.0,0.006667,0.016667,0.003333,0.0,0.0,0.0,0.0,0.0
1,Altstadt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Charlottenburg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005,0.015,...,0.0,0.0,0.01,0.0,0.01,0.005,0.0,0.005,0.0,0.025
3,Eimsbüttel,0.0,0.0,0.004286,0.0,0.0,0.001429,0.0,0.0,0.0,...,0.0,0.0,0.0,0.015714,0.001429,0.0,0.0,0.0,0.001429,0.005714
4,Friedrichshain-Kreuzberg,0.0,0.0,0.0,0.0075,0.0,0.0,0.00125,0.0175,0.005,...,0.00125,0.005,0.0,0.00625,0.0075,0.0,0.0,0.01,0.0,0.00125


In [55]:
# Share of each venue category per neighborhood. The text column 'Borough' is
# dropped first: pandas >= 2.0 raises a TypeError when mean() meets a
# non-numeric column instead of silently skipping it.
neighborhoods_grouped = (
    venue_cat_onehot
    .drop(columns='Borough')
    .groupby('Neighborhood')
    .mean()
    .reset_index()
)
neighborhoods_grouped.head()

Unnamed: 0,Neighborhood,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,American Restaurant,Aquarium,Argentinian Restaurant,Art Gallery,Art Museum,...,Waterfall,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Women's Store,Yemeni Restaurant,Yoga Studio,Zoo,Zoo Exhibit
0,"10, Torstraße, Spandauer Vorstadt, Mitte, Berl...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.0,...,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.0,0.0
1,"104, Ackerstraße, Gesundbrunnen, Mitte, Berlin...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.01,...,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.01,0.0,0.0
2,"108, Schönhauser Allee, Arnimkiez, Prenzlauer ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,...,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0
3,"121, Karl-Marx-Allee, Hausburgviertel, Friedri...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,...,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0
4,"143, Weidestraße, Barmbek-Süd, Hamburg-Nord, H...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [56]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the `num_top_venues` largest entries of `row`.

    The first element of `row` is skipped (it holds the row's name, not a
    frequency); the remaining values are ranked in descending order.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [57]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # 1st/2nd/3rd take their own suffix, every later rank uses 'th'. An
    # explicit bounds check replaces the former bare `except:`.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = neighborhoods_grouped['Neighborhood']

# fill in the top categories row by row
for ind in np.arange(neighborhoods_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(neighborhoods_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"10, Torstraße, Spandauer Vorstadt, Mitte, Berl...",Italian Restaurant,Coffee Shop,Ice Cream Shop,Café,Art Gallery,Bookstore,Park,Sandwich Place,Beer Bar,Pub
1,"104, Ackerstraße, Gesundbrunnen, Mitte, Berlin...",Café,Coffee Shop,Hotel,Gym / Fitness Center,Restaurant,Bar,Dance Studio,History Museum,Trail,Supermarket
2,"108, Schönhauser Allee, Arnimkiez, Prenzlauer ...",Café,Vietnamese Restaurant,Bakery,Coffee Shop,Bar,Organic Grocery,Playground,Italian Restaurant,Ice Cream Shop,Bistro
3,"121, Karl-Marx-Allee, Hausburgviertel, Friedri...",Café,Coffee Shop,Ice Cream Shop,Bar,Vegetarian / Vegan Restaurant,Pizza Place,Falafel Restaurant,Pub,Middle Eastern Restaurant,Thai Restaurant
4,"143, Weidestraße, Barmbek-Süd, Hamburg-Nord, H...",Italian Restaurant,Supermarket,Café,Burger Joint,Ice Cream Shop,German Restaurant,Bakery,Restaurant,Drugstore,Coffee Shop


In [58]:
# create columns according to number of top venues
columns = ['Borough']
for ind in np.arange(num_top_venues):
    # 1st/2nd/3rd take their own suffix, every later rank uses 'th'. An
    # explicit bounds check replaces the former bare `except:`.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe
boroughs_venues_sorted = pd.DataFrame(columns=columns)
boroughs_venues_sorted['Borough'] = boroughs_grouped['Borough']

# fill in the top categories row by row
for ind in np.arange(boroughs_grouped.shape[0]):
    boroughs_venues_sorted.iloc[ind, 1:] = return_most_common_venues(boroughs_grouped.iloc[ind, :], num_top_venues)

boroughs_venues_sorted.head()

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Altona,Café,Bar,Seafood Restaurant,Coffee Shop,Cocktail Bar,Bakery,Pizza Place,Park,Nightclub,Ice Cream Shop
1,Altstadt,Hotel,Coffee Shop,Café,Plaza,Ice Cream Shop,Seafood Restaurant,Burger Joint,Middle Eastern Restaurant,Museum,Concert Hall
2,Charlottenburg,Italian Restaurant,Hotel,Café,Gourmet Shop,German Restaurant,Asian Restaurant,Zoo Exhibit,Bakery,Korean Restaurant,Coffee Shop
3,Eimsbüttel,Café,Italian Restaurant,Supermarket,Bakery,Ice Cream Shop,Park,Coffee Shop,Gym / Fitness Center,German Restaurant,Bar
4,Friedrichshain-Kreuzberg,Coffee Shop,Bar,Café,Italian Restaurant,Hotel,Bakery,Turkish Restaurant,Vietnamese Restaurant,Ice Cream Shop,Vegetarian / Vegan Restaurant


In [59]:
# Distinct (borough, city) pairs, renumbered 0..n-1. Built as one chain so no
# in-place operation runs on a slice of all_venues.
boroughs = (
    all_venues[["Borough", "City"]]
    .drop_duplicates()
    .reset_index(drop=True)
)
boroughs.head()

Unnamed: 0,Borough,City
0,Tempelhof-Schöneberg,Berlin
1,Wilmersdorf,Berlin
2,Friedrichshain-Kreuzberg,Berlin
3,Charlottenburg,Berlin
4,Mitte,Berlin


In [60]:
# Geocode every borough by name to get one representative coordinate for it.
borough_rows = []
geolocator = Nominatim(user_agent="hamburg_explorer")

# Iterating over the values keeps the loop independent of the frame's index labels.
for borough, city in zip(boroughs["Borough"], boroughs["City"]):
    address = '{}, {}'.format(borough, city)
    location = geolocator.geocode(address)
    if location is None:
        # The geocoder found nothing for this name: report it and move on
        # instead of failing with an AttributeError on None.
        print('Could not geocode {!r}; skipping it.'.format(address))
        continue
    borough_rows.append([borough, city, location.latitude, location.longitude])

boroughs_loc = pd.DataFrame(borough_rows, columns=["Borough", "City", "Latitude", "Longitude"])
boroughs_loc.head()

Unnamed: 0,Borough,City,Latitude,Longitude
0,Tempelhof-Schöneberg,Berlin,52.440603,13.373703
1,Wilmersdorf,Berlin,52.487115,13.32033
2,Friedrichshain-Kreuzberg,Berlin,52.515306,13.461612
3,Charlottenburg,Berlin,52.515747,13.309683
4,Mitte,Berlin,52.51769,13.402376


In [61]:
# One row per neighborhood with its coordinates, borough and city, renumbered 0..n-1.
venues_loc = (
    all_venues[["Neighborhood","Address Latitude","Address Longitude","Borough","City"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [62]:
# Persist every intermediate table so the clustering section can start from disk.
for frame, path in [
    (neighborhoods_grouped, "Data/neighborhoods_grouped.csv"),
    (boroughs_grouped, "Data/boroughs_grouped.csv"),
    (neighborhoods_venues_sorted, "Data/neighborhoods_venues_sorted.csv"),
    (boroughs_venues_sorted, "Data/boroughs_venues_sorted.csv"),
    (boroughs_loc, "Data/boroughs_loc.csv"),
    (venues_loc, "Data/venues_loc.csv"),
]:
    frame.to_csv(path, index=False)

### Cluster neighborhoods and boroughs

I want relatively small clusters, with an average of 4 neighborhoods or boroughs per cluster.

In [63]:
# Reload all intermediate tables from disk so this section can run on its own.
neighborhoods_grouped = pd.read_csv("Data/neighborhoods_grouped.csv")
boroughs_grouped = pd.read_csv("Data/boroughs_grouped.csv")
neighborhoods_venues_sorted = pd.read_csv("Data/neighborhoods_venues_sorted.csv")
boroughs_venues_sorted = pd.read_csv("Data/boroughs_venues_sorted.csv")
all_venues = pd.read_csv("Data/all_venues.csv")
boroughs_loc = pd.read_csv("Data/boroughs_loc.csv")
venues_loc = pd.read_csv("Data/venues_loc.csv")

In [64]:
# Aim for an average of four members per cluster, but never ask for zero
# clusters when a table has fewer than two rows.
kclusters_n = max(1, round(neighborhoods_grouped.shape[0]/4))
kclusters_b = max(1, round(boroughs_grouped.shape[0]/4))

# Feature matrices for clustering: everything except the label column.
# `columns=` replaces the positional axis argument, which pandas 2.0 removed.
neighborhood_clustering = neighborhoods_grouped.drop(columns='Neighborhood')
borough_clustering = boroughs_grouped.drop(columns='Borough')

In [65]:
# Fit one k-means model per level; the fixed random_state makes the labels reproducible.
kmeans_n = KMeans(random_state=0, n_clusters=kclusters_n)
kmeans_n = kmeans_n.fit(neighborhood_clustering)

kmeans_b = KMeans(random_state=0, n_clusters=kclusters_b)
kmeans_b = kmeans_b.fit(borough_clustering)

In [66]:
# Put the cluster label in front of each top-venue table. A label column left
# over from a previous run is dropped first, so the cell can be re-run without
# insert() failing on the existing column.
for frame, model in ((neighborhoods_venues_sorted, kmeans_n),
                     (boroughs_venues_sorted, kmeans_b)):
    if 'Cluster Labels' in frame.columns:
        frame.drop(columns='Cluster Labels', inplace=True)
    frame.insert(0, 'Cluster Labels', model.labels_)

In [67]:
# Attach cluster label and top venues to each neighborhood's location data.
neighborhoods_merged = venues_loc.merge(
    neighborhoods_venues_sorted.set_index('Neighborhood'),
    left_on='Neighborhood',
    right_on="Neighborhood",
)
neighborhoods_merged.head()

Unnamed: 0,Neighborhood,Address Latitude,Address Longitude,Borough,City,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Pasta Bar, 1, Fritz-Reuter-Straße, Schöneberg,...",52.480966,13.349539,Tempelhof-Schöneberg,Berlin,7,Café,Vietnamese Restaurant,Pizza Place,Bakery,Bistro,Ice Cream Shop,French Restaurant,German Restaurant,Italian Restaurant,Gym / Fitness Center
1,"Rosenpromenade, KGA Papestraße, Tempelhof, Tem...",52.481159,13.371573,Tempelhof-Schöneberg,Berlin,7,Café,Italian Restaurant,Korean Restaurant,Historic Site,Bistro,Pizza Place,Grocery Store,French Restaurant,Gym / Fitness Center,Supermarket
2,"Flughafen Tempelhof, Werner-Loebermann-Weg, Ga...",52.480649,13.388919,Tempelhof-Schöneberg,Berlin,0,Italian Restaurant,Café,Bar,Coffee Shop,Cocktail Bar,Music Venue,Historic Site,Park,Austrian Restaurant,Greek Restaurant
3,"Hasenschänke, Columbiadamm, Tempelhof, Tempelh...",52.482636,13.416267,Tempelhof-Schöneberg,Berlin,11,Café,Bar,Pizza Place,Coffee Shop,Indie Movie Theater,Italian Restaurant,Restaurant,Vegetarian / Vegan Restaurant,Beer Garden,Turkish Restaurant
4,"24, Pommersche Straße, Wilmersdorf, Charlotten...",52.492595,13.315906,Wilmersdorf,Berlin,13,Bakery,Italian Restaurant,Dessert Shop,Café,German Restaurant,Japanese Restaurant,Supermarket,Asian Restaurant,Vietnamese Restaurant,Seafood Restaurant


In [68]:
# Attach cluster label and top venues to each borough's location data.
boroughs_merged = boroughs_loc.merge(
    boroughs_venues_sorted.set_index('Borough'),
    left_on='Borough',
    right_on="Borough",
)
boroughs_merged.head()

Unnamed: 0,Borough,City,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Tempelhof-Schöneberg,Berlin,52.440603,13.373703,1,Café,Italian Restaurant,Bar,Pizza Place,Coffee Shop,Park,Cocktail Bar,Bakery,Korean Restaurant,Ice Cream Shop
1,Wilmersdorf,Berlin,52.487115,13.32033,2,Bakery,Italian Restaurant,Dessert Shop,Café,German Restaurant,Japanese Restaurant,Supermarket,Asian Restaurant,Vietnamese Restaurant,Seafood Restaurant
2,Friedrichshain-Kreuzberg,Berlin,52.515306,13.461612,1,Coffee Shop,Bar,Café,Italian Restaurant,Hotel,Bakery,Turkish Restaurant,Vietnamese Restaurant,Ice Cream Shop,Vegetarian / Vegan Restaurant
3,Charlottenburg,Berlin,52.515747,13.309683,2,Italian Restaurant,Hotel,Café,Gourmet Shop,German Restaurant,Asian Restaurant,Zoo Exhibit,Bakery,Korean Restaurant,Coffee Shop
4,Mitte,Berlin,52.51769,13.402376,1,Café,Hotel,Coffee Shop,Bar,Italian Restaurant,Park,Gym / Fitness Center,Ice Cream Shop,Restaurant,Plaza


In [69]:
# Persist the clustered tables for later use.
neighborhoods_merged.to_csv("Data/neighborhoods_merged.csv", index=False)
boroughs_merged.to_csv("Data/boroughs_merged.csv", index=False)

### Map neighborhood clusters

In [70]:
# set color scheme for the clusters
x = np.arange(kclusters_n)
ys = [i + x + (i*x)**2 for i in range(kclusters_n)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# create map
map_clusters = folium.Map(location=[hamburg_lat, hamburg_lon], zoom_start=12)

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(neighborhoods_merged['Address Latitude'],
                                  neighborhoods_merged['Address Longitude'],
                                  neighborhoods_merged['Neighborhood'],
                                  neighborhoods_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=35, # neighborhood_radius/40
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.5,
        parse_html=False
    ).add_to(map_clusters)
       
map_clusters

In [71]:
# Display the neighborhood cluster map again.
map_clusters

### Map borough clusters

In [72]:
# set color scheme for the clusters
x = np.arange(kclusters_b)
ys = [i + x + (i*x)**2 for i in range(kclusters_b)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# create map
map_clusters_b = folium.Map(location=[hamburg_lat, hamburg_lon], zoom_start=12)

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(boroughs_merged['Latitude'],
                                  boroughs_merged['Longitude'],
                                  boroughs_merged['Borough'],
                                  boroughs_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=35, # neighborhood_radius/40
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.5,
        parse_html=False
    ).add_to(map_clusters_b)
       
map_clusters_b

In [73]:
# Display the borough cluster map again.
map_clusters_b

### Explore neighborhood clusters

The boroughs are clearly too large to be in any way distinct. I will therefore continue exploring the neighborhoods.

In [74]:
# Count, for every cluster, how many of its neighborhoods lie in each city.
cluster_neigh_num = []
for i in range(0, kclusters_n):
    in_cluster = neighborhoods_merged[neighborhoods_merged["Cluster Labels"] == i]
    per_city = in_cluster.groupby("City")["Neighborhood"].count()

    # Look the cities up by name with a default of 0. This replaces nested
    # bare `except:` blocks that guessed the row position of each city.
    hamburg = int(per_city.get("Hamburg", 0))
    berlin = int(per_city.get("Berlin", 0))

    cluster_neigh_num.append({"Cluster Labels": i, "Hamburg": hamburg, "Berlin": berlin})
    print("Cluster", i, "has", hamburg, "Hamburg neighborhoods and", berlin, "Berlin neighborhoods.")

cluster_neigh_num = pd.DataFrame(cluster_neigh_num)

Cluster 0 has 0 Hamburg neighborhoods and 3 Berlin neighborhoods.
Cluster 1 has 6 Hamburg neighborhoods and 1 Berlin neighborhoods.
Cluster 2 has 0 Hamburg neighborhoods and 3 Berlin neighborhoods.
Cluster 3 has 6 Hamburg neighborhoods and 2 Berlin neighborhoods.
Cluster 4 has 0 Hamburg neighborhoods and 6 Berlin neighborhoods.
Cluster 5 has 1 Hamburg neighborhoods and 4 Berlin neighborhoods.
Cluster 6 has 0 Hamburg neighborhoods and 3 Berlin neighborhoods.
Cluster 7 has 0 Hamburg neighborhoods and 3 Berlin neighborhoods.
Cluster 8 has 4 Hamburg neighborhoods and 0 Berlin neighborhoods.
Cluster 9 has 4 Hamburg neighborhoods and 0 Berlin neighborhoods.
Cluster 10 has 0 Hamburg neighborhoods and 3 Berlin neighborhoods.
Cluster 11 has 0 Hamburg neighborhoods and 3 Berlin neighborhoods.
Cluster 12 has 1 Hamburg neighborhoods and 3 Berlin neighborhoods.
Cluster 13 has 0 Hamburg neighborhoods and 2 Berlin neighborhoods.


I'm only interested in those clusters that contain both Hamburg and Berlin neighborhoods.

In [75]:
# Clusters that contain at least one neighborhood from each city.
cluster_neigh_num[(cluster_neigh_num.Hamburg > 0) & (cluster_neigh_num.Berlin > 0)]

Unnamed: 0,Cluster Labels,Hamburg,Berlin
1,1,6,1
3,3,6,2
5,5,1,4
12,12,1,3


In [76]:
# Labels of the clusters that contain neighborhoods from both cities.
in_both_cities = (cluster_neigh_num.Hamburg > 0) & (cluster_neigh_num.Berlin > 0)
cluster_list = cluster_neigh_num[in_both_cities]["Cluster Labels"].tolist()

# Select the top-venue columns by name rather than by the fixed positions
# 6:16, so the selection still fits if the number of ranked venues changes.
top_venue_columns = [col for col in neighborhoods_merged.columns
                     if col.endswith('Most Common Venue')]

# One flat list of top-venue categories per selected cluster, in the order of
# cluster_list. Appending fresh lists avoids the shared-reference pitfall of
# `[[]] * n`.
venues_list = []
for j in cluster_list:
    venues = neighborhoods_merged.loc[
        neighborhoods_merged["Cluster Labels"] == j, top_venue_columns
    ].values.tolist()
    venues_list.append([item for sublist in venues for item in sublist])

In [77]:
# For each mixed cluster, show the categories that appear most often among its
# members' top venues, followed by the member neighborhoods.
for i, j in enumerate(cluster_list):
    venues_count = pd.DataFrame.from_dict(Counter(venues_list[i]), orient='index').reset_index()
    most_frequent = venues_count.sort_values(0, ascending=False).head()
    members = neighborhoods_merged[neighborhoods_merged["Cluster Labels"] == j].Neighborhood.values.tolist()

    print("Top 5 venues types in Cluster", j)
    display(most_frequent)
    display(members)
    print("\n")

Top 5 venues types in Cluster 1


Unnamed: 0,index,0
0,Hotel,7
8,Coffee Shop,6
10,Café,6
3,Italian Restaurant,4
5,Vietnamese Restaurant,4


['Singer109, 109, Singerstraße, Luisenstadt, Mitte, Friedrichshain-Kreuzberg, Berlin, 10179, Deutschland',
 '15, Reimerstwiete, Altstadt, Hamburg, 20457, Deutschland',
 'Prizeotel Hamburg City, 28, Högerdamm, Hammerbrook, Hamburg, 20097, Deutschland',
 'Wassertreppen, Sievekingplatz, Neustadt, Hamburg, 20355, Deutschland',
 'Kennedybrücke, St. Georg, Hamburg, 20099, Deutschland',
 'Staatliche Handelsschule Am Lämmermarkt H2, 2, Wallstraße, Hohenfelde, Hamburg-Nord, Hamburg, 22087, Deutschland',
 '55, Rothenbaumchaussee, Rotherbaum, Eimsbüttel, Hamburg, 20148, Deutschland']



Top 5 venues types in Cluster 3


Unnamed: 0,index,0
0,Café,8
7,Italian Restaurant,7
5,Ice Cream Shop,6
4,Bakery,6
9,Vietnamese Restaurant,5


['191, Prenzlauer Allee, Winsviertel, Prenzlauer Berg, Pankow, Berlin, 10405, Deutschland',
 '108, Schönhauser Allee, Arnimkiez, Prenzlauer Berg, Pankow, Berlin, 10439, Deutschland',
 'Schule Telemannstraße, Heußweg, Eimsbüttel, Hamburg, 20255, Deutschland',
 '2, Hoheluftchaussee, Hoheluft-West, Eimsbüttel, Hamburg, 20253, Deutschland',
 'Anleger Krugkoppelbrücke, Krugkoppel, Harvestehude, Eimsbüttel, Hamburg, 20149, Deutschland',
 '67c, Grandweg, Lokstedt, Eimsbüttel, Hamburg, 22529, Deutschland',
 '24, Schottmüllerstraße, Eppendorf, Hamburg-Nord, Hamburg, 20251, Deutschland',
 '144d, Maria-Louisen-Straße, Jarrestadt, Winterhude, Hamburg-Nord, Hamburg, 22301, Deutschland']



Top 5 venues types in Cluster 5


Unnamed: 0,index,0
0,Café,5
1,Bar,5
5,Coffee Shop,5
3,Turkish Restaurant,4
10,Park,4


['27, Lynarstraße, Sprengelkiez, Wedding, Mitte, Berlin, 13353, Deutschland',
 'Anna Kiryakova, 8, Kameruner Straße, Brüsseler Kiez, Wedding, Mitte, Berlin, 13351, Deutschland',
 '41, Gottschedstraße, Gesundbrunnen, Mitte, Berlin, 13357, Deutschland',
 '9, Spanheimstraße, Gesundbrunnen, Mitte, Berlin, 13357, Deutschland',
 'Agentur für Arbeit Hamburg, Augustenburger Straße, Altona-Nord, Altona, Hamburg, 22769, Deutschland']



Top 5 venues types in Cluster 12


Unnamed: 0,index,0
0,Italian Restaurant,4
2,Ice Cream Shop,4
3,Café,4
6,Park,3
1,Coffee Shop,3


['10, Torstraße, Spandauer Vorstadt, Mitte, Berlin, 10115, Deutschland',
 'La Vie, 7, Straßburger Straße, Kollwitzkiez, Prenzlauer Berg, Mitte, Berlin, 10405, Deutschland',
 'Am Friedrichshain, Bötzowviertel, Prenzlauer Berg, Pankow, Berlin, 10407, Deutschland',
 'Außenalster, Schöne Aussicht, Uhlenhorst, Hamburg-Nord, Hamburg, 22085, Deutschland']



