# Shanghai Neighborhood mining

This notebook combines data sources (wikipedia and Foursquare API) to assemble a list of neighborhoods and map them to their coordinates for futher processing.

In [33]:
import pandas as pd
import re

from geocoder import enrich_neighborhoods_with_geocoder, map_neighborhoods

### Scrape wikipedia to compile Shanghai neighborhood list

'Neighborhood' refers to any town, township, or subdistrict.

In [3]:
tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_township-level_divisions_of_Shanghai')
tables[0].head()

Unnamed: 0,0,1
0,,This article may contain excessive or inapprop...


In [4]:
tables[1][('Hanyu Pinyin', 'Subdistricts')]

0                        Wúsōng Jiēdào
1                       Yǒuyìlù Jiēdào
2                     Zhāngmiào Jiēdào
3                                Towns
4                         Dàchǎng Zhèn
5                         Gāojìng Zhèn
6                           Gùcūn Zhèn
7                         Luōdiàn Zhèn
8                         Luōjīng Zhèn
9                        Miàoxíng Zhèn
10                        Sōngnán Zhèn
11                       Yángxíng Zhèn
12                          Yuèpǔ Zhèn
13    Special Township-Level Divisions
14      Bǎoshān Chéngshì Gōngyè Yuánqū
Name: (Hanyu Pinyin, Subdistricts), dtype: object

In [5]:
# 'Subdistricts', 'Towns', and 'Special Township-Level Divisions'
# have become rows. we will work around the multi-index and drop
# incorrect rows with duplicates

In [5]:
subdistricts = pd.concat([tables[i][('Hanyu Pinyin', 'Subdistricts')] for i in range(1, 16)])
subdistricts.head(20)

0                        Wúsōng Jiēdào
1                       Yǒuyìlù Jiēdào
2                     Zhāngmiào Jiēdào
3                                Towns
4                         Dàchǎng Zhèn
5                         Gāojìng Zhèn
6                           Gùcūn Zhèn
7                         Luōdiàn Zhèn
8                         Luōjīng Zhèn
9                        Miàoxíng Zhèn
10                        Sōngnán Zhèn
11                       Yángxíng Zhèn
12                          Yuèpǔ Zhèn
13    Special Township-Level Divisions
14      Bǎoshān Chéngshì Gōngyè Yuánqū
0                    Běixīnjīng Jiēdào
1                  Chéngjiāqiáo Jiēdào
2                      Hóngqiáo Jiēdào
3                     Huáyánglù Jiēdào
4                     Jiāngsūlù Jiēdào
Name: (Hanyu Pinyin, Subdistricts), dtype: object

In [6]:
len(subdistricts)

235

#### Drop duplicates

In [7]:
results_set = set(subdistricts)
results_set.remove('Towns')
results_set.remove('Special Township-Level Divisions')
neighborhoods_list = list(results_set)
print(len(neighborhoods_list))

214


In [8]:
df = pd.DataFrame(neighborhoods_list)
df.columns = ['Neighborhood']
df.head()

Unnamed: 0,Neighborhood
0,Jiāngnínglù Jiēdào
1,Zhōujiāqiáo Jiēdào
2,Huìnán Zhèn
3,Tángzhèn
4,Xiàyáng Jiēdào


### Combine wikipedia data with geocoder data

Since Wikipedia dataset does not include zip code coordinates, we should hydrate dataset with longitude and latitude from Geocoder in order to access Foursquare data.

In [9]:
address = 'Shanghai, China'

enrich_neighborhoods_with_geocoder(df, address)
df.head()

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Jiāngnínglù Jiēdào,,
1,Zhōujiāqiáo Jiēdào,,
2,Huìnán Zhèn,31.056003,121.757317
3,Tángzhèn,31.21036,121.651043
4,Xiàyáng Jiēdào,,


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214 entries, 0 to 213
Data columns (total 3 columns):
Neighborhood    214 non-null object
Latitude        83 non-null float64
Longitude       83 non-null float64
dtypes: float64(2), object(1)
memory usage: 5.1+ KB


In [12]:
df['Neighborhood'] = df['Neighborhood'].str.replace(' Jiēdào', '')
df

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Jiāngnínglù,,
1,Zhōujiāqiáo,,
2,Huìnán Zhèn,31.056003,121.757317
3,Tángzhèn,31.210360,121.651043
4,Xiàyáng,,
...,...,...,...
209,Hóngméilù,,
210,Línfénlù,,
211,Péngpǔ Xīncūn,,
212,Mǎqiáo Zhèn,,


In [13]:
enrich_neighborhoods_with_geocoder(df, address)
df.head()

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Jiāngnínglù,,
1,Zhōujiāqiáo,,
2,Huìnán Zhèn,31.056003,121.757317
3,Tángzhèn,31.21036,121.651043
4,Xiàyáng,31.150194,121.120585


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214 entries, 0 to 213
Data columns (total 3 columns):
Neighborhood    214 non-null object
Latitude        130 non-null float64
Longitude       130 non-null float64
dtypes: float64(2), object(1)
memory usage: 5.1+ KB


In [29]:
# lots of missing data here but distribution is more even
df.dropna(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 130 entries, 2 to 213
Data columns (total 3 columns):
Neighborhood    130 non-null object
Latitude        130 non-null float64
Longitude       130 non-null float64
dtypes: float64(2), object(1)
memory usage: 4.1+ KB


### Visualize Shanghai neighborhoods
<a id="vis-neighborhoods"></a>

In [17]:
m = map_neighborhoods(df, address)
m

### Save neighborhood coordinates dataset

In [18]:
df.to_csv('data/shanghai_neighborhood_coords.csv')