# Beijing Neighborhood mining

This notebook combines data sources (wikipedia and Foursquare API) to assemble a list of neighborhoods and map them to their coordinates for futher processing.

In [2]:
import pandas as pd
import numpy as np
import re
import requests

from bs4 import BeautifulSoup

from foursquare import fetch_venues, venue_frequency, rank_venues_by_frequency
from geocoder import enrich_neighborhoods_with_geocoder, map_neighborhoods

### Scrape wikipedia to compile Beijing neighborhood list

In [11]:
wiki_url = "https://en.wikipedia.org/wiki/List_of_township-level_divisions_of_Beijing"
r  = requests.get(wiki_url)
data = r.text

soup = BeautifulSoup(data)

areas = ['Changping District', 'Chaoyang District', 'Daxing District', 'Dongcheng District', 'Fangshan District', 'Fengtai District', 'Haidian District', 'Huairou District', 'Mentougou District', 'Pinggu District', 'Shijingshan District', 'Shunyi District', 'Tongzhou District', 'Xicheng District', 'Miyun District', 'Yanqing District']
results = []
for area in soup('h2'):
    if area and area.contents and area.getText().replace('[edit]', '') in areas:
        sib = area.findNext('ul')
        for district in sib.findAll('li'):
            subdistricts = district.getText().split(", ")
            [results.append(i) for i in subdistricts]

print(results[0:20])

['Chengbei Subdistrict (城北街道)', 'Chengnan Subdistrict (城南街道)', 'Huilongguan Subdistrict (回龙观街道)', 'Longzeyuan Subdistrict (龙泽园街道)', 'Shigezhuang Subdistrict (史各庄街道)', 'Tiantongyuanbei Subdistrict (天通苑北街道)', 'Tiantongyuannan Subdistrict (天通苑南街道)', 'Huoying Subdistrict (霍营街道)', 'Jianwai Subdistrict (建外街道)', 'Chaowai Subdistrict (朝外街道)', 'Hujialou Subdistrict (呼家楼街道)', 'Sanlitun Subdistrict (三里屯街道)', 'Zuojiazhuang Subdistrict (左家庄街道)', 'Xiangheyuan Subdistrict (香河园街道)', 'Heping Avenue Subdistrict (和平街街道)', 'Anzhen Subdistrict (安贞街道)', 'Yayuncun Subdistrict (亚运村街道)', 'Xiaoguan Subdistrict (小关街道)', 'Jiuxianqiao Subdistrict (酒仙桥街道)', 'Maizidian Subdistrict (麦子店街道)']


In [12]:
print(len(results))

143


#### Drop duplicates

In [13]:
results_set = set(results)
neighborhoods_list = list(results_set)
print(len(neighborhoods_list))

142


In [14]:
neighborhoods_list[0:5]

['Yuqiao Subdistrict (玉桥街道)',
 'Shigezhuang Subdistrict (史各庄街道)',
 'Beitaipingzhuang Subdistrict (北太平庄街道)',
 'Yongdingmenwai Subdistrict (永定门外街道)',
 'Jinding Avenue Subdistrict (金顶街街道)']

In [21]:
# neighborhoods_en = [re.sub(r' Subdistrict \(.*\)', '', n) for n in neighborhoods_list]
neighborhoods_en = [re.sub(r' Subdistrict.*$', '', n) for n in neighborhoods_list]
neighborhoods_en

['Yuqiao',
 'Shigezhuang',
 'Beitaipingzhuang',
 'Yongdingmenwai',
 'Jinding Avenue',
 'Guangming',
 'Xinhua',
 'Xincun',
 'Andingmen',
 'Xiluoyuan',
 'Jingshan',
 'Binhe',
 "West Chang'an Avenue",
 'Chaowai',
 'Sanlitun',
 'Chengguan',
 'Dongsi',
 'Fengtai',
 'Xueyuan Road',
 'Xisanqi',
 'Zuojiazhuang',
 'Shichahai',
 'Taoranting',
 'Guoyuan',
 'Guangning',
 'Konggang',
 'Haidian',
 'Xinzhen',
 'Longshan',
 'Aoyuncun',
 'Shuangjing',
 'Huayuan Road',
 'Taipingqiao',
 'Malianwa',
 'Beixinqiao',
 'Chunshu',
 'Gongchen',
 'Xiaoguan',
 'Qinghe',
 'Qingsong',
 'Xingfeng',
 'Datun',
 'Nanyuan',
 'Tiancun Road',
 'Dayu',
 'Anzhen',
 'Dongzhimen',
 'Niujie',
 'Baizhifang',
 'Baiquan',
 'Bajiao',
 'Rulin',
 'Zhongcang',
 'Shiyuan',
 'Dongfeng',
 'Desheng',
 'Tiantan',
 'Donghuashi',
 'Qingyuan',
 'Yuetan',
 'Yanyuan',
 'Xiangheyuan',
 'Liulitun',
 'Shengli',
 'Heping Avenue',
 'Pingguoyuan',
 'Lugouqiao',
 'Tiangongyuan',
 'Shuguang',
 'Huoying',
 'Fatou',
 'Yingfeng (迎风街道)',
 'Xiangshan',
 "Y

In [24]:
df = pd.DataFrame(neighborhoods_en)
df.columns = ['Neighborhood']
df.head()

Unnamed: 0,Neighborhood
0,Yuqiao
1,Shigezhuang
2,Beitaipingzhuang
3,Yongdingmenwai
4,Jinding Avenue


In [25]:
address = 'Beijing, China'

enrich_neighborhoods_with_geocoder(df, address)
df

RateLimiter caught an error, retrying (0/2 tries). Called with (*('Dahongmen, Beijing, China',), **{}).
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/geopy/geocoders/base.py", line 355, in _call_geocoder
    page = requester(req, timeout=timeout, **kwargs)
  File "/opt/anaconda3/lib/python3.7/urllib/request.py", line 525, in open
    response = self._open(req, data)
  File "/opt/anaconda3/lib/python3.7/urllib/request.py", line 543, in _open
    '_open', req)
  File "/opt/anaconda3/lib/python3.7/urllib/request.py", line 503, in _call_chain
    result = func(*args)
  File "/opt/anaconda3/lib/python3.7/urllib/request.py", line 1360, in https_open
    context=self._context, check_hostname=self._check_hostname)
  File "/opt/anaconda3/lib/python3.7/urllib/request.py", line 1320, in do_open
    r = h.getresponse()
  File "/opt/anaconda3/lib/python3.7/http/client.py", line 1336, in getresponse
    response.begin()
  File "/opt/anaconda3/lib/python3.7/htt

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Yuqiao,39.898820,116.670814
1,Shigezhuang,39.931943,116.557680
2,Beitaipingzhuang,39.973639,116.363844
3,Yongdingmenwai,39.866794,116.392355
4,Jinding Avenue,39.934977,116.389006
...,...,...,...
137,Guanyinsi,39.725189,116.342683
138,Dashilan,39.894498,116.390172
139,Beiyuan,40.041644,116.428315
140,Tiyuguan Road,39.882695,116.420461


In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142 entries, 0 to 141
Data columns (total 3 columns):
Neighborhood    142 non-null object
Latitude        111 non-null float64
Longitude       111 non-null float64
dtypes: float64(2), object(1)
memory usage: 3.5+ KB


In [28]:
# drop neighborhoods with missing coordinates
df.dropna(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 111 entries, 0 to 140
Data columns (total 3 columns):
Neighborhood    111 non-null object
Latitude        111 non-null float64
Longitude       111 non-null float64
dtypes: float64(2), object(1)
memory usage: 3.5+ KB


### Save neighborhood coordinates dataset

In [2]:
# df.to_csv('data/beijing_neighborhood_coords.csv')

#load saved dataset
df = pd.read_csv('data/beijing_neighborhood_coords.csv', index_col=0)
df

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Yuqiao,39.898820,116.670814
1,Shigezhuang,39.931943,116.557680
2,Beitaipingzhuang,39.973640,116.363844
3,Yongdingmenwai,39.866794,116.392355
4,Jinding Avenue,39.934977,116.389006
...,...,...,...
134,Xiangshuiyuan,40.465203,115.987435
137,Guanyinsi,39.725189,116.342683
138,Dashilan,39.894498,116.390172
139,Beiyuan,40.041644,116.428315


### Visualize Beijing neighborhoods
<a id="vis-neighborhoods"></a>

In [31]:
m = map_neighborhoods(df, address)
m