# Data in the project

In [2]:
# Import Libraries
import json
import os

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans

## Districts in Taipei City
We get the district names and postal codes from Wikipedia.

In [3]:
# Download the Wikipedia page that lists Taiwanese postal codes.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() fails loudly on an HTTP error instead of parsing an error page.
response = requests.get('https://en.wikipedia.org/wiki/Postal_codes_in_Taiwan', timeout=30)
response.raise_for_status()
wiki_url = response.text  # NOTE: despite its name, this holds the page HTML, not the URL

# Parse the html data using BeautifulSoup
soup = BeautifulSoup(wiki_url, 'html.parser')

# Find all tables
tables = soup.find_all('table')

# The Taipei City table is at index 1, i.e. the second <table> element on the page
taipei_postalcode = tables[1]

# Guard against the page layout changing and index 1 pointing at another table
assert 'Taipei' in taipei_postalcode.get_text(), 'tables[1] is no longer the Taipei City table'

taipei_postalcode

<table class="wikitable">
<tbody><tr>
<th>Code</th>
<th>Division name</th>
<th>Chinese
</th></tr>
<tr>
<th colspan="3"><a href="/wiki/Taipei" title="Taipei">Taipei City</a>
</th></tr>
<tr>
<td>100</td>
<td><a href="/wiki/Zhongzheng_District" title="Zhongzheng District">Zhongzheng District</a></td>
<td>中正區
</td></tr>
<tr>
<td>103</td>
<td><a href="/wiki/Datong_District,_Taipei" title="Datong District, Taipei">Datong District</a></td>
<td>大同區
</td></tr>
<tr>
<td>104</td>
<td><a href="/wiki/Zhongshan_District,_Taipei" title="Zhongshan District, Taipei">Zhongshan District</a></td>
<td>中山區
</td></tr>
<tr>
<td>105</td>
<td><a href="/wiki/Songshan_District,_Taipei" title="Songshan District, Taipei">Songshan District</a></td>
<td>松山區
</td></tr>
<tr>
<td>106</td>
<td><a href="/wiki/Daan_District,_Taipei" title="Daan District, Taipei">Daan District</a></td>
<td>大安區
</td></tr>
<tr>
<td>108</td>
<td><a href="/wiki/Wanhua_District" title="Wanhua District">Wanhua District</a></td>
<td>萬華區
</td></tr>

In [4]:
# Start from an empty frame that declares the three fields scraped for each district
columns = ['PostalCode', 'Division', 'Chinese_Name']
df_taipei = pd.DataFrame(data=None, columns=columns)
df_taipei

Unnamed: 0,PostalCode,Division,Chinese_Name


In [8]:
# Collect one record per data row, then build the frame in a single call.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and growing a
# frame row by row is quadratic. Rebuilding from a fresh list also makes this cell
# idempotent: re-running it no longer appends duplicate rows.
records = []
for row in taipei_postalcode.find_all('tr'):
    cells = row.find_all('td')
    # Header rows contain only <th> cells; skip them and any row too short to unpack
    if len(cells) < 3:
        continue
    # Strip surrounding whitespace (the Chinese-name cell ends with a newline)
    postalcode, division, chinese_name = (cell.text.strip() for cell in cells[:3])
    records.append({'PostalCode': postalcode, 'Division': division, 'Chinese_Name': chinese_name})

# `columns` comes from the cell that declares the frame's schema; passing it
# keeps the column order fixed even when no rows were found
df_taipei = pd.DataFrame(records, columns=columns)
df_taipei

Unnamed: 0,PostalCode,Division,Chinese_Name
0,100,Zhongzheng District,中正區
1,103,Datong District,大同區
2,104,Zhongshan District,中山區
3,105,Songshan District,松山區
4,106,Daan District,大安區
5,108,Wanhua District,萬華區
6,110,Xinyi District,信義區
7,111,Shilin District,士林區
8,112,Beitou District,北投區
9,114,Neihu District,內湖區


## Combine the Coordinate data

In [9]:
# Load coordinate data
# The default is the original author's local path. Set the TP_COOR_PATH
# environment variable to point at the CSV on any other machine instead of
# editing this cell.
DEFAULT_COOR_PATH = '/Users/Brian/Python/IBM Data Science Certificate/Capstone_Project/tp_coor_data.csv'
coor_path = os.environ.get('TP_COOR_PATH', DEFAULT_COOR_PATH)

df_tp = pd.read_csv(coor_path)
df_tp

Unnamed: 0,PostalCode,Division,Chinese_Name,Latitude,Longitude
0,100,Zhongzheng District,中正區,25.032405,121.519884
1,103,Datong District,大同區,25.063424,121.513042
2,104,Zhongshan District,中山區,25.069699,121.53816
3,105,Songshan District,松山區,25.059991,121.557588
4,106,Daan District,大安區,25.02677,121.543445
5,108,Wanhua District,萬華區,25.02859,121.497986
6,110,Xinyi District,信義區,25.030621,121.57167
7,111,Shilin District,士林區,25.125467,121.550847
8,112,Beitou District,北投區,25.148068,121.517799
9,114,Neihu District,內湖區,25.083706,121.592383


## 