In [1]:
import re
import time
import os
from datetime import datetime
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from string import Template
import urllib
import json
import pickle


from pprint import pprint
import sys

import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score,davies_bouldin_score,calinski_harabasz_score
from sklearn.decomposition import PCA, SparsePCA
from sklearn.manifold import Isomap
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [3]:
# Let pandas show up to 500 rows before it truncates a displayed frame.
pd.options.display.max_rows = 500


In [4]:
## Data fetched from the Economic Geographic Information platform API
# NOTE: pickle.load can run code embedded in the file, so only point this at a
# pickle you produced yourself or otherwise trust.

api_pickle_path = '/content/drive/MyDrive/03_RESOURCE/GIS/gis_open_api.pickle'
with open(api_pickle_path, 'rb') as pickle_file:
    api_data = pickle.load(pickle_file)

In [5]:

# Branch information collected by the crawler

## Address and basic attributes of every branch
brn_df = pd.read_csv('/content/drive/MyDrive/03_RESOURCE/GIS/taishin_brn.csv')
brn_df = brn_df.drop_duplicates()

## Branch address -> coordinates
brn_xy_df = pd.read_csv('/content/drive/MyDrive/03_RESOURCE/GIS/brn_xy_df.csv')

## Attach the coordinates by address, then drop duplicated rows again
brn_df = brn_df.merge(brn_xy_df, on='addr', how='left').drop_duplicates()

## Base table that the other sources are joined onto
brn_code_df = brn_df[['brn', 'addr', 'lon', 'lat']]

## Economic level-3 release-area code of each branch
## ('南崁分行' is skipped in api_data and given a fixed code, appended last)
code3_rows = [
    (brn_name, brn_info['code3_data']['ADMIV']['CODE3'])
    for brn_name, brn_info in api_data.items()
    if brn_name != '南崁分行'
]
code3_rows.append(('南崁分行', 'A6800005028'))
brn_code3_df = pd.DataFrame(code3_rows, columns=['brn', 'eco_code3'])

## Statistical level-2 release-area codes of each branch
code2_col_map = {
    'ID': 'brn',
    '縣市': 'city',
    '鄉鎮市區': 'town',
    '村里': 'village',
    '二級發布區': 'code_2',
    '一級發布區': 'code_1',
    '最小統計區': 'code_min',
}
brn_code2_df = pd.read_csv('/content/drive/MyDrive/03_RESOURCE/GIS/分行統計區代碼對照.csv', encoding='CP950')
brn_code2_df = brn_code2_df[list(code2_col_map)].rename(columns=code2_col_map)

# Consolidate: level-2 codes first, then the level-3 code
brn_code_df = (
    brn_code_df
    .merge(brn_code2_df, how='left', on='brn')
    .merge(brn_code3_df, how='left', on='brn')
)

# Data cleaning
## Handle variant characters and township-level renames

brn_code_df['town'] = brn_code_df['town'].replace({'員林鎮': '員林市'})
in_xindian = brn_code_df['town'] == '新店區'
brn_code_df.loc[in_xindian, 'village'] = brn_code_df.loc[in_xindian, 'village'].replace('五?里', '五峰里')


In [6]:
# Telecom-signalling population statistics per township, read with the CP950 codec.
# NOTE(review): the preview of this frame shows the Chinese column descriptions sitting
# in row 0, i.e. a second header line appears to be read as data — confirm, and consider
# skiprows=[1] so the count columns parse as numbers.
tele_csv_path = '/content/drive/MyDrive/03_RESOURCE/GIS/109年11月行政區電信信令人口統計資料_鄉鎮市區.csv'
tele_df = pd.read_csv(tele_csv_path, encoding='CP950')

In [30]:
# Preview the first rows of the telecom-signalling table.
tele_df.head()

Unnamed: 0,COUNTY_ID,COUNTY,TOWN_ID,TOWN,NIGHT_WORK,DAY_WORK(7:00~13:00),DAY_WORK(13:00~19:00),DAY_WORK,NIGHT_WEEKEND,DAY_WEEKEND(7:00~13:00),DAY_WEEKEND(13:00~19:00),DAY_WEEKEND,MORNING_WORK,MIDDAY_WORK,AFTERNOON_WORK,EVENING_WORK,MORNING_WEEKEND,MIDDAY_WEEKEND,AFTERNOON_WEEKEND,EVENING_WEEKEND,INFO_TIME
0,縣市代碼,縣市名稱,鄉鎮市區代碼,鄉鎮市區名稱,平日夜間停留人數,平日上午活動人數,平日下午活動人數,平日日間活動人數,假日夜間停留人數,假日上午活動人數,假日下午活動人數,假日日間活動人數,平日早晨旅次,平日中午旅次,平日午後旅次,平日晚上旅次,假日早晨旅次,假日中午旅次,假日午後旅次,假日晚上旅次,資料時間
1,65000,新北市,65000010,板橋區,577007,468604,459821,461811,574620,553665,547852,552865,1370756.45,1199378.12,1217279.46,1417770.61,1181447.57,1447237.98,1452579.49,1427165.52,109Y11M
2,65000,新北市,65000020,三重區,426580,353762,336820,343466,424259,402368,379533,395318,1047368.37,873343.72,880980.15,1000009.8,887589.65,1004799.36,975547.52,939548.09,109Y11M
3,65000,新北市,65000030,中和區,470287,414013,398795,408358,466276,439917,417541,433865,1142144.94,995924.74,991048.66,1074345.45,966434.32,1062291.77,1036495.62,972990.59,109Y11M
4,65000,新北市,65000040,永和區,232342,169268,157426,161573,230134,215493,199485,210496,513518.18,415321.92,414020.3,495365.41,461549.08,526167.13,495412.61,485350.57,109Y11M


In [31]:
# Company / business / factory counts around each branch

# Built in a single pass. Concatenating a one-row frame per branch left every row with
# index 0, raised on an empty api_data, and reused poi_num_df for a list and a frame.
# NOTE(review): the column names are assigned by position, so this assumes every
# 'around_num' dict yields its values in factory / bu / cmp / stk order — TODO confirm.
poi_num_cols = ['factory_num', 'bu_num', 'cmp_num', 'stk_num']

poi_rows = []
for brn_name, brn_info in api_data.items():
    counts = list(brn_info['around_num'].values())
    # Keep the strictness of the per-row constructor: a wrong number of values is an error.
    if len(counts) != len(poi_num_cols):
        raise ValueError(
            f'{brn_name}: expected {len(poi_num_cols)} around_num values, got {len(counts)}'
        )
    poi_rows.append([*counts, brn_name])

poi_num_df = pd.DataFrame(poi_rows, columns=[*poi_num_cols, 'brn'])

In [32]:
%%capture
# Disabled cell: everything below is a plain triple-quoted string expression, so the
# OpenStreetMap POI pickle it mentions is not loaded.
'''
## openstreetmap資料
import pickle
with open('/content/drive/MyDrive/03_RESOURCE/GIS/brn_poi.pickle', 'rb') as f:
  poi = pickle.load(f)
'''

In [33]:
## Population data (Taiwan level-2 statistical areas)
## Both files key their rows by CODE2; rename it to code_2 to match brn_code_df.

pop_ratio = (
    pd.read_csv('/content/drive/MyDrive/03_RESOURCE/GIS/台灣二級統計區人口指標.csv')
    .rename(columns={'CODE2': 'code_2'})
)
pop_data = (
    pd.read_csv('/content/drive/MyDrive/03_RESOURCE/GIS/台灣二級統計區人口資料.csv')
    .rename(columns={'CODE2': 'code_2'})
)


In [34]:
## Annual income data
## Data year 107, unit: thousand dollars
# NOTE(review): the line above says year 107, but the URL path reads ias106/106_165-9 —
# confirm which year this file actually covers.
sal_csv_url = 'https://www.fia.gov.tw/WEB/fia/ias/ias106/106_165-9.csv'
sal_df = pd.read_csv(sal_csv_url)

In [35]:
%%capture
## E-invoice data
## Data year 107

# Disabled cell: the code below lives inside a string literal and is never executed.
# NOTE(review): inside that disabled code, EINV_AMT is summed from e_inv_cnt_df (the
# invoice-count frame) rather than e_inv_amt_df, and this happens before e_inv_cnt_df
# has its -99 values replaced with NaN — looks like a copy-paste slip; fix it before
# re-enabling the cell.
"""
e_inv_amt_df = pd.read_csv('https://egis.moea.gov.tw/EGIS_FILE/EGP_SD/107_B2C_SalesAmount.csv')
e_inv_store_cnt_df = pd.read_csv('https://egis.moea.gov.tw/EGIS_FILE/EGP_SD/107_B2C_StoreCount.csv')
e_inv_cnt_df = pd.read_csv('https://egis.moea.gov.tw/EGIS_FILE/EGP_SD/107_B2C_InvoiceCount.csv')

## 有點怪怪的

e_inv_amt_df = e_inv_amt_df.replace(-99,np.nan)
e_inv_amt_df['EINV_AMT'] = e_inv_cnt_df.loc[:,'1月':'12月'].sum(axis=1,skipna=False)

e_inv_store_cnt_df = e_inv_store_cnt_df.replace(-99,np.nan)
e_inv_store_cnt_df['EINV_BU_CNT'] = e_inv_store_cnt_df.loc[:,'1月':'12月'].sum(axis=1,skipna=False)

e_inv_cnt_df = e_inv_cnt_df.replace(-99,np.nan)
e_inv_cnt_df['EINV_CNT'] = e_inv_cnt_df.loc[:,'1月':'12月'].sum(axis=1,skipna=False)
"""

In [36]:
## Consumption-heat index published by the government

e_inv_ratio_url = 'https://sip.einvoice.nat.gov.tw/ods-main/ODS308E/download/691C0280-CEFB-488F-9E71-6AA4F39A41CD/1/1124193D-09F5-4711-AB9A-01848E3B88E4/0/?fileType=csv'
e_inv_ratio = pd.read_csv(e_inv_ratio_url)

In [37]:
def whitespace_remover(dataframe):
    """Remove every whitespace character from the object-dtype columns of ``dataframe``.

    Each run of whitespace is deleted wherever it occurs in a value (not only at
    the ends). Columns of any other dtype are left untouched.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    None
    """
    for column in dataframe.columns:
        if dataframe[column].dtype != 'object':
            # Only object-dtype columns are cleaned.
            continue
        # Raw string: '\s' in a plain literal is an invalid escape sequence
        # (SyntaxWarning on Python 3.12+).
        dataframe[column] = dataframe[column].str.replace(r'\s+', '', regex=True)

In [38]:
## Keep the 2020 rows and the columns needed for the composite index
village_keys = ['縣市', '鄉鎮市區', '村里']
e_inv_ratio = e_inv_ratio.loc[
    e_inv_ratio['年度'] == 2020,
    ['縣市', '鄉鎮市區', '村里', '主行業別', '消費熱度計算來源', '張數指標', '銷售額指標'],
]

## Composite index = mean of the invoice-count index and the sales-amount index
e_inv_ratio = e_inv_ratio.assign(
    **{'綜合指標': lambda frame: (frame['張數指標'] + frame['銷售額指標']) / 2}
)

## Average the composite index over the different calculation sources
e_inv_ratio = (
    e_inv_ratio
    .groupby(village_keys + ['主行業別'])[['綜合指標']]
    .mean()
    .reset_index()
)

## One row per village, one column per main industry; absent combinations become 0
e_inv_ratio = e_inv_ratio.pivot_table(
    index=village_keys,
    columns=['主行業別'],
    values=['綜合指標'],
).fillna(0)
e_inv_ratio.columns = ['_'.join(level_names) for level_names in e_inv_ratio.columns.values]
e_inv_ratio = e_inv_ratio.reset_index()


In [39]:
# Remove whitespace from the object-dtype columns of e_inv_ratio, in place.
whitespace_remover(e_inv_ratio)

In [40]:
# Relabel all six columns by position: three location keys, then three industry indices.
# NOTE(review): assumes the pivoted industry columns arrive in the order these names
# imply (hotel, retail, then 'ctring') — TODO confirm against the 主行業別 values.
e_inv_ratio.columns = ['city','town','village','hotel_ind', 'retail_ind', 'ctring_ind']

In [41]:
# Replace the variant character 羣 with 群 in this village name; sal_df gets the same
# replacement before the two tables are joined on village.
e_inv_ratio['village'] = e_inv_ratio['village'].replace({'羣賢里': '群賢里'})
#e_inv_ratio.query('village.str.contains("賢")', engine='python')

電子發票消費熱度指標

https://data.gov.tw/dataset/36843

https://sip.einvoice.nat.gov.tw/ods-main/ODS308E/download/691C0280-CEFB-488F-9E71-6AA4F39A41CD/1/1124193D-09F5-4711-AB9A-01848E3B88E4/0/?fileType=csv

finlab實價登錄爬蟲
https://www.finlab.tw/real-estate-analasys-histograms/

## 開始整併

In [42]:
# Indicator columns, presumably in this order: sex ratio, household size,
# population density, dependency ratio, child-dependency ratio,
# old-age dependency ratio, ageing index.
pop_ratio.head(3)

Unnamed: 0,city,code_2,M_F_RAT,P_H_CNT,P_DEN,DEPENDENCY_RAT,A0A14_A15A65_RAT,A65UP_A15A64_RAT,A65_A0A14_RAT,INFO_TIME
0,南投縣,A0801-01,91.86,2.91,3512.41,52.33,27.77,24.56,88.45,109Y12M
1,南投縣,A0801-02,80.5,2.23,1568.15,94.48,20.04,74.44,371.56,109Y12M
2,南投縣,A0801-03,115.34,2.73,384.38,40.75,13.08,27.66,211.48,109Y12M


In [43]:
# Chinese -> English names for the income columns.
# NOTE(review): sal_dict is not referenced by the cells visible here; the columns are
# relabelled by position just below.
sal_dict = {
    '縣市': 'city',
    '鄉鎮市區': 'town',
    '村里': 'village',
    '平均數': 'sal_mean',
    '中位數': 'sal_med',
}

# Positional relabel: the assignment needs the CSV to have exactly these 11 columns;
# that they come in this order is an assumption — TODO confirm.
sal_df.columns = [
    'city', 'town', 'village',
    'tax_unit_cnt', 'all_amt',
    'sal_mean', 'sal_med',
    'Q1', 'Q3', 'std', 's',
]
# Replace the variant character 羣 with 群 in this village name.
sal_df['village'] = sal_df['village'].replace({'羣賢里': '群賢里'})

In [44]:
# Level-2 areas whose P_DEN value exceeds 40000.
pop_ratio.loc[pop_ratio['P_DEN'].gt(40000)]

Unnamed: 0,city,code_2,M_F_RAT,P_H_CNT,P_DEN,DEPENDENCY_RAT,A0A14_A15A65_RAT,A65UP_A15A64_RAT,A65_A0A14_RAT,INFO_TIME
463,基隆市,A1701-15,86.34,2.07,72233.15,30.43,16.06,14.37,89.44,109Y12M
464,基隆市,A1701-16,91.48,2.14,68962.73,24.73,11.53,13.21,114.57,109Y12M
468,基隆市,A1701-20,102.07,2.61,46820.64,23.14,12.48,10.66,85.37,109Y12M
473,基隆市,A1702-04,96.97,2.53,49092.45,40.95,16.03,24.93,155.56,109Y12M
476,基隆市,A1702-07,91.91,2.43,40484.91,51.20,18.62,32.58,174.92,109Y12M
...,...,...,...,...,...,...,...,...,...,...
7763,高雄市,A6412-B8,94.42,2.52,52895.97,32.81,11.88,20.93,176.13,109Y12M
7765,高雄市,A6412-C0,95.67,2.40,43047.51,25.99,11.86,14.14,119.22,109Y12M
7769,高雄市,A6412-C5,90.73,2.26,54166.55,25.97,9.49,16.48,173.64,109Y12M
7778,高雄市,A6412-D6,91.93,2.61,43065.25,39.07,16.50,22.57,136.79,109Y12M


In [45]:
# Left-join every source onto the branch table, one merge per source.
brn_gis_df = (
    brn_code_df
    ## Population indicators, keyed by the level-2 statistical area code
    .merge(pop_ratio[['code_2', 'P_DEN', 'A65UP_A15A64_RAT']], how='left', on='code_2')
    ## Annual income, keyed by town / city / village
    .merge(
        sal_df[['city', 'town', 'village', 'sal_mean', 'sal_med']],
        how='left',
        on=['town', 'city', 'village'],
    )
    ## Consumption-heat indices, keyed by town / city / village
    .merge(e_inv_ratio, how='left', on=['town', 'city', 'village'])
    ## Company / business / factory counts, keyed by branch
    .merge(poi_num_df, how='left', on='brn')
)



In [93]:
# Feature columns carried into the final table; the three consumption-heat
# indices are currently left out.
feat_list = ['P_DEN', 'A65UP_A15A64_RAT', 'sal_mean']
feat_list += ['factory_num', 'bu_num', 'cmp_num', 'stk_num']
# excluded: 'hotel_ind', 'retail_ind', 'ctring_ind'

In [94]:
# Feature-only view of the merged table, then a preview with the identifying columns.
arr = brn_gis_df.loc[:, feat_list]
brn_gis_df[['brn', 'city', 'town', 'village', *feat_list]].head()

Unnamed: 0,brn,city,town,village,P_DEN,A65UP_A15A64_RAT,sal_mean,factory_num,bu_num,cmp_num,stk_num
0,營業部(總行),臺北市,中山區,民安里,19921.56,34.61,1290,0,1369,5066,15
1,敦南分行,臺北市,大安區,敦安里,41990.78,34.41,2085,0,829,4887,11
2,新生分行,臺北市,中正區,幸市里,27008.2,36.09,2241,1,399,2574,14
3,新莊分行,新北市,新莊區,中華里,60972.12,15.98,1021,6,1793,1122,0
4,桃園分行,桃園市,桃園區,文明里,15269.66,28.4,915,19,656,637,1


In [95]:
# Rows that have a missing value in any column after the merges.
brn_gis_df.loc[brn_gis_df.isna().any(axis='columns')]

Unnamed: 0,brn,addr,lon,lat,city,town,village,code_2,code_1,code_min,eco_code3,P_DEN,A65UP_A15A64_RAT,sal_mean,sal_med,hotel_ind,retail_ind,ctring_ind,factory_num,bu_num,cmp_num,stk_num


In [96]:
# Spot-check a single branch.
brn_gis_df.loc[brn_gis_df['brn'].eq('汐止分行')]

Unnamed: 0,brn,addr,lon,lat,city,town,village,code_2,code_1,code_min,eco_code3,P_DEN,A65UP_A15A64_RAT,sal_mean,sal_med,hotel_ind,retail_ind,ctring_ind,factory_num,bu_num,cmp_num,stk_num
75,汐止分行,新北市汐止區中興路135號,121.631582,25.065473,新北市,汐止區,中興里,A6511-44,A6511-44-007,A6511-0670-00,A6500011044,43171.45,15.69,804,568,0.0,95.5,94.25,125,593,820,3


## EDA

[觀光景點消費熱度分析-電子發票載具客源地區統計-資料集](https://sip.einvoice.nat.gov.tw/ods-main/ODS303E/691C0280-CEFB-488F-9E71-6AA4F39A41CD/30/Mjs=?FUNCTION_ID=ODS303E&BUILD_INFO=20211008-1333&SYSTEM_ID=ODS&SYSTEM_NAME=%E9%9A%A8%E9%81%B8&ENVIRONMENT_DISPLAY_NAME=&TITLE=%E6%AD%A1%E8%BF%8E%E8%92%9E%E8%87%A8+%E8%B2%A1%E6%94%BF%E9%83%A8%E9%9B%BB%E5%AD%90%E7%99%BC%E7%A5%A8+%E6%99%BA%E6%85%A7%E5%A5%BD%E7%94%9F%E6%B4%BB+%E6%9C%8D%E5%8B%99%E5%B9%B3%E5%8F%B0)

[電信信令人口統計之建置、分析與應用](https://ws.moi.gov.tw/Download.ashx?u=LzAwMS9VcGxvYWQvNDAwL3JlbGZpbGUvMC8xNDk0NS85NzMxZjkxNi01MzU5LTQzZDktYmVlOS0zNjMyYTUwOTcxMDYucGRm&n=6Zu75L%2Bh5L%2Bh5Luk5Lq65Y%2Bj57Wx6KiI5LmL5bu6572u44CB5YiG5p6Q6IiH5oeJ55SoLnBkZg%3D%3D&icon=..pdf)

In [121]:
# Final table: branch name, coordinates and location names plus the selected features.
fnl_df = brn_gis_df[['brn', 'lon', 'lat', 'city', 'town', 'village', *feat_list]]

In [122]:
# Text coordinate of each branch. Despite the column name 'lon_lat', the value is
# written latitude first: "<lat>,<lon>".
lat_lon_text = brn_gis_df['lat'].astype(str) + ',' + brn_gis_df['lon'].astype(str)
# DataFrame.insert raises ValueError when the column already exists, so drop any copy
# left by an earlier run to keep this cell re-runnable.
fnl_df = fnl_df.drop(columns='lon_lat', errors='ignore')
fnl_df.insert(0, 'lon_lat', lat_lon_text)

In [123]:
# Columns fed to the clustering step; the three consumption-heat indices stay excluded.
clstr_feat = 'P_DEN A65UP_A15A64_RAT sal_mean factory_num bu_num cmp_num stk_num'.split()
# excluded: 'hotel_ind', 'retail_ind', 'ctring_ind'

In [124]:
from sklearn.preprocessing import StandardScaler

In [125]:
# Standardise the clustering features before they are clustered.
feature_scaler = StandardScaler()
clstr_arr = feature_scaler.fit_transform(fnl_df[clstr_feat])

In [126]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans

In [127]:
# Agglomerative clustering of the scaled features into 5 clusters.
hierarchical_model = AgglomerativeClustering(n_clusters=5)
clustering = hierarchical_model.fit(clstr_arr)
# Alternative kept for reference:
#clustering = KMeans(n_clusters=4, max_iter=5000).fit(clstr_arr)


In [128]:
# Cluster label assigned to each row of clstr_arr.
clustering.labels_

array([1, 1, 1, 0, 3, 3, 0, 4, 3, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 3, 0, 3,
       3, 0, 1, 0, 0, 0, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 0, 1, 0, 3, 0, 0,
       2, 1, 0, 3, 4, 0, 0, 0, 1, 1, 1, 1, 0, 4, 0, 1, 0, 0, 3, 0, 1, 0,
       3, 4, 0, 3, 0, 0, 3, 1, 3, 2, 4, 0, 3, 1, 0, 3, 4, 0, 3, 0, 0, 3,
       1, 4, 3, 3, 3, 0, 4, 3, 3, 0, 0, 4, 0, 3, 3, 3])

In [129]:
# Put each row's cluster label in the first column. DataFrame.insert raises ValueError
# if 'clstr' is already present, so drop a stale copy first to keep the cell re-runnable.
fnl_df = fnl_df.drop(columns='clstr', errors='ignore')
fnl_df.insert(0, 'clstr', clustering.labels_)

In [139]:
# Cluster label to inspect; shows the branches assigned to it.
i = 4
fnl_df.loc[fnl_df['clstr'].eq(i)]

Unnamed: 0,clstr,lon_lat,brn,lon,lat,city,town,village,P_DEN,A65UP_A15A64_RAT,sal_mean,factory_num,bu_num,cmp_num,stk_num
7,4,"22.661373,120.29174599999999",高雄分行,120.291746,22.661373,高雄市,鼓山區,龍水里,26852.31,12.17,1527,1,481,944,1
48,4,"24.800079999999998,120.98954599999999",竹科分行,120.989546,24.80008,新竹市,東區,綠水里,8103.49,20.78,1828,33,280,383,4
57,4,"24.161794,120.67018999999999",民權分行,120.67019,24.161794,臺中市,北區,健行里,24816.94,18.04,1591,9,549,975,0
67,4,"24.15318,120.64960900000001",大墩分行,120.649609,24.15318,臺中市,南屯區,溝墘里,26486.19,12.18,1620,5,722,1466,0
76,4,"24.815442,121.024363",成功分行,121.024363,24.815442,新竹縣,竹北市,鹿場里,14869.55,5.44,2606,1,365,557,1
82,4,"25.015033,121.466439",新板分行,121.466439,25.015033,新北市,板橋區,福丘里,14547.66,15.16,1992,21,659,852,8
89,4,"24.782114,121.019691",關東橋分行,121.019691,24.782114,新竹市,東區,仙水里,4844.52,9.53,1655,3,535,439,1
94,4,"24.157928,120.646433",市府分行,120.646433,24.157928,臺中市,西屯區,惠來里,11859.94,10.87,2546,0,287,1052,2
99,4,"25.074393,121.60674399999999",東湖分行,121.606744,25.074393,臺北市,內湖區,金湖里,41822.87,14.89,2271,0,191,513,2


- 群1

- 群2
- 群3
- 群4

In [131]:
# Per-cluster summary statistics, transposed so that each cluster is a column.
# (CSV export left disabled: .to_csv('brn_gis.csv'))
fnl_df.groupby('clstr').describe().transpose()

Unnamed: 0,clstr,0,1,2,3,4
lon,count,40.0,19.0,2.0,34.0,9.0
lon,mean,121.18604,121.547911,121.525292,120.723606,120.929418
lon,std,0.494562,0.021556,0.150317,0.493432,0.418351
lon,min,120.201614,121.513333,121.419001,120.17996,120.291746
lon,25%,120.687709,121.53573,121.472146,120.296928,120.649609
lon,50%,121.477652,121.546319,121.525292,120.604718,120.989546
lon,75%,121.526983,121.557084,121.578437,121.165465,121.024363
lon,max,121.769916,121.613919,121.631582,121.747312,121.606744
lat,count,40.0,19.0,2.0,34.0,9.0
lat,mean,24.533234,25.050222,25.045867,23.876496,24.402371
