# Tokyo Neighborhood mining

This notebook combines data sources (wikipedia and Foursquare API) to assemble a list of neighborhoods and map them to their coordinates for futher processing.

In [10]:
import pandas as pd
import requests

from bs4 import BeautifulSoup

from geocoder import enrich_neighborhoods_with_geocoder, map_neighborhoods

### Scrape wikipedia to compile Tokyo neighborhood list

In [33]:
wiki_url = "https://en.wikipedia.org/wiki/Category:Neighborhoods_of_Tokyo"
r  = requests.get(wiki_url)
data = r.text
soup = BeautifulSoup(data)

results = []
for category_group in soup.find_all("div", class_="mw-category-group"):
    links = category_group.find_all('a')
    for link in links:
        if link and link.get('title'):
            stripped = link.get('title').replace('Category:', '')
            results.append(stripped)

print(len(results))
print(results)

95
['Akasaka, Tokyo', 'Akihabara', 'Asakusa', 'Harajuku', 'Ikebukuro', 'Kanda, Tokyo', 'Marunouchi', 'Nihonbashi, Tokyo', 'Roppongi', 'Shiodome', 'Areas of Tokyo', 'Ueno, Tokyo', 'Agariyashiki', 'Akihabara', 'Aoyama, Minato, Tokyo', 'Asagaya', 'Banchō', 'Daikanyamachō, Shibuya', 'Ebisu, Shibuya', 'Ebisuminami, Shibuya', 'Ebisunishi, Shibuya, Tokyo', 'Harajuku', 'Hatsudai', 'Hibiya', 'Higashi, Shibuya', 'Higashiōizumi, Nerima, Tokyo', 'Hiroo, Shibuya', 'Hitotsubashi, Chiyoda', 'Honjo, Tokyo', 'Honmachi, Shibuya', 'Ichigaya', 'Ikebukuro', 'Imado', 'Jingūmae', 'Kabukichō, Tokyo', 'Kagurazaka', 'Kami-ikebukuro', 'Kamiyamachō, Shibuya, Tokyo', 'Kanda, Tokyo', 'Kichijōji', 'Kōjimachi', 'Kyōbashi', 'Marunouchi', 'Mejiro', 'Minamiōizumi, Nerima, Tokyo', 'Mita, Meguro, Tokyo', 'Miyamoto-cho, Tokyo', 'Motoyoyogichō', 'Mount Jinba', 'Nanpeidaichō, Shibuya', 'Nihonbashi', 'Nishi-Shinjuku', 'Nishihara, Shibuya', 'Nishiogikubo', 'Nishiōizumi, Nerima, Tokyo', 'Nishiōizumimachi, Nerima, Tokyo', 'Ochan

#### Drop duplicates

In [3]:
results_set = set(results)
neighborhoods_list = list(results_set)
print(len(neighborhoods_list))

89


In [34]:
df = pd.DataFrame(neighborhoods_list)
df.columns = ['Neighborhood']
df['Neighborhood'] = df['Neighborhood'].str.replace(', Tokyo', '')
df.head()

Unnamed: 0,Neighborhood
0,Kami-ikebukuro
1,Kagurazaka
2,Yanaka Cemetery
3,Sugamo
4,"Minamiōizumi, Nerima"


### Combine wikipedia data with geocoder data

Since Wikipedia dataset does not include zip code coordinates, we should hydrate dataset with longitude and latitude from Geocoder in order to access Foursquare data.

In [7]:
address = 'Tokyo, Japan'

In [35]:
enrich_neighborhoods_with_geocoder(df, address)
df.head()

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Kami-ikebukuro,35.738525,139.717028
1,Kagurazaka,35.703889,139.734222
2,Yanaka Cemetery,35.725211,139.771596
3,Sugamo,35.733412,139.739427
4,"Minamiōizumi, Nerima",,


In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89 entries, 0 to 88
Data columns (total 3 columns):
Neighborhood    89 non-null object
Latitude        74 non-null float64
Longitude       74 non-null float64
dtypes: float64(2), object(1)
memory usage: 2.2+ KB


In [37]:
# dropping but should address missing values
df.dropna(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 74 entries, 0 to 88
Data columns (total 3 columns):
Neighborhood    74 non-null object
Latitude        74 non-null float64
Longitude       74 non-null float64
dtypes: float64(2), object(1)
memory usage: 2.3+ KB


### Visualize Tokyo neighborhoods
<a id="vis-neighborhoods"></a>

In [38]:
m = map_neighborhoods(df.dropna(), address)
m

In [39]:
df.sort_values('Longitude')

Unnamed: 0,Neighborhood,Latitude,Longitude
36,Honjo,36.243338,139.190533
26,Mount Jinba,35.661360,139.315477
52,Kichijōji,35.703141,139.580308
35,Nishiogikubo,35.704034,139.600448
74,Yōga,35.626648,139.634172
...,...,...,...
40,San'ya,35.732714,139.797940
37,Imado,35.719345,139.803548
69,Shin-Kiba,35.646183,139.828043
55,Tateishi,35.738182,139.848055


In [40]:
df.drop(26, inplace=True)

In [41]:
df.reset_index(inplace=True, drop=True)
df

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Kami-ikebukuro,35.738525,139.717028
1,Kagurazaka,35.703889,139.734222
2,Yanaka Cemetery,35.725211,139.771596
3,Sugamo,35.733412,139.739427
4,Akasaka,35.671679,139.735622
...,...,...,...
68,Asagaya,35.703164,139.636247
69,"Mita, Meguro",35.648839,139.742771
70,Mejiro,35.721186,139.706482
71,Asakusa,35.717528,139.797635


### Save neighborhood coordinates dataset

In [3]:
# df.to_csv('data/tokyo_neighborhood_coords.csv')

# Load instead
df = pd.read_csv('data/tokyo_neighborhood_coords.csv', index_col=0)
df

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Kami-ikebukuro,35.738525,139.717028
1,Kagurazaka,35.703889,139.734222
2,Yanaka Cemetery,35.725211,139.771596
3,Sugamo,35.733412,139.739427
4,Akasaka,35.671679,139.735622
...,...,...,...
68,Asagaya,35.703164,139.636247
69,"Mita, Meguro",35.648839,139.742771
70,Mejiro,35.721186,139.706482
71,Asakusa,35.717528,139.797635
