# Frankfurt Neighborhood mining

This notebook combines data sources (wikipedia and Foursquare API) to assemble a list of neighborhoods and map them to their coordinates for futher processing.

In [1]:
import pandas as pd
import requests

from bs4 import BeautifulSoup

from foursquare import fetch_venues, rank_venues_by_frequency
from geocoder import enrich_neighborhoods_with_geocoder, map_neighborhoods

### Scrape wikipedia to compile Frankfurt neighborhood list

In [42]:
wiki_url = "https://en.wikipedia.org/wiki/Category:Districts_of_Frankfurt"
r  = requests.get(wiki_url)
data = r.text
soup = BeautifulSoup(data)

results = []
for category_group in soup.find_all("div", class_="mw-category-group"):
    links = category_group.find_all('a')
    for link in links:
        if link and link.get('title'):
            stripped = link.get('title').replace(' (Frankfurt am Main)', '')
            results.append(stripped)

# drop first item, a list of administrative divisions
results = results[1:]
print(len(results))
print(results)

46
['Altstadt', 'Bahnhofsviertel', 'Bankenviertel', 'Bergen-Enkheim', 'Berkersheim', 'Bockenheim', 'Bonames', 'Bornheim', 'Dornbusch', 'Eckenheim', 'Eschersheim', 'Europaviertel', 'Fechenheim', 'Flughafen', 'Frankfurter Berg', 'Gallus', 'Ginnheim', 'Griesheim', 'Gutleutviertel', 'Harheim', 'Hausen', 'Heddernheim', 'Höchst', 'Innenstadt', 'Kalbach-Riedberg', 'Mainhattan', 'Nied', 'Nieder-Erlenbach', 'Nieder-Eschbach', 'Niederrad', 'Niederursel', 'Nordend', 'Oberrad', 'Ostend', 'Praunheim', 'Preungesheim', 'Riederwald', 'Rödelheim', 'Sachsenhausen', 'Schwanheim', 'Seckbach', 'Sindlingen', 'Sossenheim', 'Unterliederbach', 'Westend', 'Zeilsheim']


#### Drop duplicates

In [43]:
results_set = set(results)
neighborhoods_list = list(results_set)
print(len(neighborhoods_list))

46


In [44]:
df = pd.DataFrame(neighborhoods_list)
df.columns = ['Neighborhood']
df.head()

Unnamed: 0,Neighborhood
0,Ostend
1,Gallus
2,Schwanheim
3,Harheim
4,Bahnhofsviertel


### Combine wikipedia data with geocoder data

Since Wikipedia dataset does not include zip code coordinates, we should hydrate dataset with longitude and latitude from Geocoder in order to access Foursquare data.

In [48]:
enrich_neighborhoods_with_geocoder(df, "Frankfurt, Germany")
df.head()

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Ostend,50.115935,8.720546
1,Gallus,50.103658,8.636706
2,Schwanheim,50.082955,8.57868
3,Harheim,50.185589,8.690445
4,Bahnhofsviertel,50.107741,8.668736


### Handle missing values

Spot-fix singular missing value from https://travel.sygic.com/en/poi/bankenviertel-poi:42460

In [49]:
df.loc[df['Neighborhood'] == 'Bankenviertel', 'Latitude'] = 50.110589
df.loc[df['Neighborhood'] == 'Bankenviertel', 'Longitude'] = 8.672952
df

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Ostend,50.115935,8.720546
1,Gallus,50.103658,8.636706
2,Schwanheim,50.082955,8.57868
3,Harheim,50.185589,8.690445
4,Bahnhofsviertel,50.107741,8.668736
5,Fechenheim,50.12562,8.75765
6,Mainhattan,50.110644,8.682092
7,Sachsenhausen,50.100262,8.683599
8,Riederwald,50.126372,8.733924
9,Ginnheim,50.145069,8.649153


### Visualize Frankfurt neighborhoods
<a id="vis-neighborhoods"></a>

In [3]:
m = map_neighborhoods(df, "Frankfurt, Germany")
m

Frankfurt, Germany
50.1106444
8.6820917
<folium.folium.Map object at 0x123f6c650>


### Save neighborhood coordinates dataset

In [None]:
df.to_csv('data/frankfurt_neighborhood_coords.csv')