In [2]:
import re
import time
import os
from datetime import datetime
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from string import Template
import urllib
import json
import pickle


from pprint import pprint
import sys

import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score,davies_bouldin_score,calinski_harabasz_score
from sklearn.decomposition import PCA, SparsePCA
from sklearn.manifold import Isomap
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [3]:
pd.set_option('display.max_rows', 500)


In [12]:
## 經濟地理資訊平台API抓的資料

with open('/content/drive/MyDrive/03_RESOURCE/GIS/gis_open_api.pickle', 'rb') as f:
    api_data = pickle.load(f)

In [13]:

# 分行爬蟲資訊

## 各分行地址與基本資訊
brn_df = pd.read_csv('/content/drive/MyDrive/03_RESOURCE/GIS/taishin_brn.csv').drop_duplicates()

## 分行地址與點位
brn_xy_df = pd.read_csv('/content/drive/MyDrive/03_RESOURCE/GIS/brn_xy_df.csv')

## 合併並去掉重複
brn_df = brn_df.merge(brn_xy_df,how='left' ,on = 'addr').copy()
brn_df = brn_df.drop_duplicates()

## 作為主表
brn_code_df = brn_df[['brn', 'addr', 'lon', 'lat']]

## 整理經濟三級發布區
brn_code3_df = pd.DataFrame(
    [(x,api_data[x]['code3_data']['ADMIV']['CODE3']) for x in list(api_data.keys()) if x !='南崁分行'],
    columns = ['brn','eco_code3']
)
brn_code3_df = pd.concat([brn_code3_df, pd.DataFrame([['南崁分行','A6800005028']], columns=['brn','eco_code3'])])
brn_code3_df = brn_code3_df.reset_index(drop=True)

## 整理統計二級發布區
brn_code2_df = pd.read_csv('/content/drive/MyDrive/03_RESOURCE/GIS/分行統計區代碼對照.csv', encoding='CP950')
brn_code2_df = brn_code2_df[['ID','縣市', '鄉鎮市區', '村里', '二級發布區', '一級發布區', '最小統計區']].rename(
    {
      'ID': 'brn',
      '縣市':'city',
      '鄉鎮市區':'town',
      '村里':'village',
      '二級發布區':'code_2',
      '一級發布區':'code_1',
      '最小統計區':'code_min'
    },
    axis=1
)

# 整併
brn_code_df = brn_code_df.merge(brn_code2_df, how='left', on=['brn'])
brn_code_df = brn_code_df.merge(brn_code3_df, how='left', on=['brn'])

# 資料清理
## 處理異體字跟鄉鎮市區層級更名

brn_code_df['town'] = brn_code_df['town'].replace('員林鎮', '員林市')
brn_code_df.loc[brn_code_df['town']=='新店區','village'] = brn_code_df.loc[brn_code_df['town']=='新店區','village'].replace('五?里', '五峰里')


In [14]:
tele_df = pd.read_csv('/content/drive/MyDrive/03_RESOURCE/GIS/109年11月行政區電信信令人口統計資料_鄉鎮市區.csv', encoding='CP950')

In [15]:
# 公司行號工廠資訊

poi_num_df = []
for brn in list(api_data.keys()):
    df = pd.DataFrame([list(api_data[brn]['around_num'].values())],columns=['factory_num','bu_num', 'cmp_num', 'stk_num'])
    poi_num_df.append(df)
poi_num_df = pd.concat(poi_num_df)
poi_num_df['brn'] = list(api_data.keys())

In [16]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
%%capture
'''
## openstreetmap資料
import pickle
with open('/content/drive/MyDrive/03_RESOURCE/GIS/brn_poi.pickle', 'rb') as f:
  poi = pickle.load(f)
'''

In [18]:
## 人口資料

pop_ratio = pd.read_csv('/content/drive/MyDrive/03_RESOURCE/GIS/台灣二級統計區人口指標.csv')
pop_data = pd.read_csv('/content/drive/MyDrive/03_RESOURCE/GIS/台灣二級統計區人口資料.csv')

pop_ratio = pop_ratio.rename({'CODE2':'code_2'},axis=1)
pop_data = pop_data.rename({'CODE2':'code_2'},axis=1)


In [19]:
## 年收入資料 
## 資料年度107年 單位千元
sal_df = pd.read_csv('https://www.fia.gov.tw/WEB/fia/ias/ias106/106_165-9.csv')

In [20]:
%%capture
## 電子發票
## 資料年度107年 

"""
e_inv_amt_df = pd.read_csv('https://egis.moea.gov.tw/EGIS_FILE/EGP_SD/107_B2C_SalesAmount.csv')
e_inv_store_cnt_df = pd.read_csv('https://egis.moea.gov.tw/EGIS_FILE/EGP_SD/107_B2C_StoreCount.csv')
e_inv_cnt_df = pd.read_csv('https://egis.moea.gov.tw/EGIS_FILE/EGP_SD/107_B2C_InvoiceCount.csv')

## 有點怪怪的

e_inv_amt_df = e_inv_amt_df.replace(-99,np.nan)
e_inv_amt_df['EINV_AMT'] = e_inv_cnt_df.loc[:,'1月':'12月'].sum(axis=1,skipna=False)

e_inv_store_cnt_df = e_inv_store_cnt_df.replace(-99,np.nan)
e_inv_store_cnt_df['EINV_BU_CNT'] = e_inv_store_cnt_df.loc[:,'1月':'12月'].sum(axis=1,skipna=False)

e_inv_cnt_df = e_inv_cnt_df.replace(-99,np.nan)
e_inv_cnt_df['EINV_CNT'] = e_inv_cnt_df.loc[:,'1月':'12月'].sum(axis=1,skipna=False)
"""

In [21]:
## 政府製作的消費熱度

e_inv_ratio = pd.read_csv('https://sip.einvoice.nat.gov.tw/ods-main/ODS308E/download/691C0280-CEFB-488F-9E71-6AA4F39A41CD/1/1124193D-09F5-4711-AB9A-01848E3B88E4/0/?fileType=csv')

In [22]:
def whitespace_remover(dataframe):
    # iterating over the columns
    for i in dataframe.columns:        
        # checking datatype of each columns
        if dataframe[i].dtype == 'object':       
            # applying strip function on column
            dataframe[i] = dataframe[i].str.replace('\s+', '', regex=True)
        else:              
            # if condn. is False then it will do nothing.
            pass

In [23]:
e_inv_ratio = e_inv_ratio[e_inv_ratio['年度']==2020][['縣市','鄉鎮市區', '村里','主行業別', '消費熱度計算來源', '張數指標', '銷售額指標']]
e_inv_ratio.eval('綜合指標 = (張數指標+銷售額指標)/2', inplace=True)

## groupby mean計算不同計算來源

e_inv_ratio = e_inv_ratio.groupby(['縣市', '鄉鎮市區', '村里', '主行業別'])[['綜合指標']]\
.agg('mean')\
.reset_index(level=[0,1,2,3])

e_inv_ratio = e_inv_ratio.pivot_table(index=['縣市', '鄉鎮市區', '村里'], columns = ['主行業別'], values = ['綜合指標']).fillna(0)
e_inv_ratio.columns = ['_'.join(col) for col in e_inv_ratio.columns.values]
e_inv_ratio = e_inv_ratio.reset_index()


In [24]:
whitespace_remover(e_inv_ratio)

In [25]:
e_inv_ratio.columns = ['city','town','village','hotel_ind', 'retail_ind', 'ctring_ind']

In [26]:
e_inv_ratio['village'] = e_inv_ratio['village'].replace('羣賢里','群賢里')
#e_inv_ratio.query('village.str.contains("賢")', engine='python')

電子發票消費熱度指標

https://data.gov.tw/dataset/36843

https://sip.einvoice.nat.gov.tw/ods-main/ODS308E/download/691C0280-CEFB-488F-9E71-6AA4F39A41CD/1/1124193D-09F5-4711-AB9A-01848E3B88E4/0/?fileType=csv

finlab實價登錄爬蟲
https://www.finlab.tw/real-estate-analasys-histograms/

## 開始整併

In [27]:
'''
性比例	戶量	人口密度  扶養比	扶幼比	扶老比	老化指數
'''
pop_ratio.head(3)

Unnamed: 0,city,code_2,M_F_RAT,P_H_CNT,P_DEN,DEPENDENCY_RAT,A0A14_A15A65_RAT,A65UP_A15A64_RAT,A65_A0A14_RAT,INFO_TIME
0,南投縣,A0801-01,91.86,2.91,3512.41,52.33,27.77,24.56,88.45,109Y12M
1,南投縣,A0801-02,80.5,2.23,1568.15,94.48,20.04,74.44,371.56,109Y12M
2,南投縣,A0801-03,115.34,2.73,384.38,40.75,13.08,27.66,211.48,109Y12M


In [28]:
sal_dict={
  '縣市':'city',
  '鄉鎮市區':'town',
  '村里':'village',
  '平均數':'sal_mean',
  '中位數':'sal_med'
}

sal_df.columns = ['city', 'town', 'village', 'tax_unit_cnt', 'all_amt', 'sal_mean', 'sal_med', 'Q1', 'Q3', 'std', 's']
sal_df['village'] = sal_df['village'].replace('羣賢里','群賢里')

In [82]:
pop_ratio[pop_ratio['P_DEN']>40000]

Unnamed: 0,city,code_2,M_F_RAT,P_H_CNT,P_DEN,DEPENDENCY_RAT,A0A14_A15A65_RAT,A65UP_A15A64_RAT,A65_A0A14_RAT,INFO_TIME
463,基隆市,A1701-15,86.34,2.07,72233.15,30.43,16.06,14.37,89.44,109Y12M
464,基隆市,A1701-16,91.48,2.14,68962.73,24.73,11.53,13.21,114.57,109Y12M
468,基隆市,A1701-20,102.07,2.61,46820.64,23.14,12.48,10.66,85.37,109Y12M
473,基隆市,A1702-04,96.97,2.53,49092.45,40.95,16.03,24.93,155.56,109Y12M
476,基隆市,A1702-07,91.91,2.43,40484.91,51.20,18.62,32.58,174.92,109Y12M
...,...,...,...,...,...,...,...,...,...,...
7763,高雄市,A6412-B8,94.42,2.52,52895.97,32.81,11.88,20.93,176.13,109Y12M
7765,高雄市,A6412-C0,95.67,2.40,43047.51,25.99,11.86,14.14,119.22,109Y12M
7769,高雄市,A6412-C5,90.73,2.26,54166.55,25.97,9.49,16.48,173.64,109Y12M
7778,高雄市,A6412-D6,91.93,2.61,43065.25,39.07,16.50,22.57,136.79,109Y12M


In [29]:
## 人口指標
brn_gis_df = brn_code_df.merge(pop_ratio[['code_2','P_DEN','A65UP_A15A64_RAT']], how='left', left_on='code_2', right_on='code_2')

## 年收
brn_gis_df = brn_gis_df.merge(sal_df[['city', 'town', 'village','sal_mean', 'sal_med']], how='left', on=['town','city','village'])

## 消費熱度
brn_gis_df = brn_gis_df.merge(e_inv_ratio, how='left', on=['town','city','village'])


## 公司行號工廠
brn_gis_df = brn_gis_df.merge(poi_num_df, how='left', on='brn')



In [30]:
feat_list = [
    'P_DEN', 'A65UP_A15A64_RAT',
    'sal_mean', 'sal_med', 
    'factory_num', 'bu_num', 'cmp_num', 'stk_num', 
    'hotel_ind', 'retail_ind', 'ctring_ind'
]

In [31]:
arr = brn_gis_df[feat_list]
brn_gis_df[['brn','city','town','village']+feat_list].head()

Unnamed: 0,brn,city,town,village,P_DEN,A65UP_A15A64_RAT,sal_mean,sal_med,factory_num,bu_num,cmp_num,stk_num,hotel_ind,retail_ind,ctring_ind
0,營業部(總行),臺北市,中山區,民安里,19921.56,34.61,1290,685,0,1369,5066,15,60.0,99.0,99.0
1,敦南分行,臺北市,大安區,敦安里,41990.78,34.41,2085,961,0,829,4887,11,73.5,89.75,80.0
2,新生分行,臺北市,中正區,幸市里,27008.2,36.09,2241,935,1,399,2574,14,0.0,92.0,91.75
3,新莊分行,新北市,新莊區,中華里,60972.12,15.98,1021,642,6,1793,1122,0,0.0,63.0,64.75
4,桃園分行,桃園市,桃園區,文明里,15269.66,28.4,915,647,19,656,637,1,0.0,55.5,60.75


In [32]:
brn_gis_df[brn_gis_df.isnull().any(axis=1)]

Unnamed: 0,brn,addr,lon,lat,city,town,village,code_2,code_1,code_min,eco_code3,P_DEN,A65UP_A15A64_RAT,sal_mean,sal_med,hotel_ind,retail_ind,ctring_ind,factory_num,bu_num,cmp_num,stk_num


In [85]:
brn_gis_df[brn_gis_df['brn']=='汐止分行']

Unnamed: 0,brn,addr,lon,lat,city,town,village,code_2,code_1,code_min,eco_code3,P_DEN,A65UP_A15A64_RAT,sal_mean,sal_med,hotel_ind,retail_ind,ctring_ind,factory_num,bu_num,cmp_num,stk_num
75,汐止分行,新北市汐止區中興路135號,121.631582,25.065473,新北市,汐止區,中興里,A6511-44,A6511-44-007,A6511-0670-00,A6500011044,43171.45,15.69,804,568,0.0,95.5,94.25,125,593,820,3


## EDA

[觀光景點消費熱度分析-電子發票載具客源地區統計-資料集](https://sip.einvoice.nat.gov.tw/ods-main/ODS303E/691C0280-CEFB-488F-9E71-6AA4F39A41CD/30/Mjs=?FUNCTION_ID=ODS303E&BUILD_INFO=20211008-1333&SYSTEM_ID=ODS&SYSTEM_NAME=%E9%9A%A8%E9%81%B8&ENVIRONMENT_DISPLAY_NAME=&TITLE=%E6%AD%A1%E8%BF%8E%E8%92%9E%E8%87%A8+%E8%B2%A1%E6%94%BF%E9%83%A8%E9%9B%BB%E5%AD%90%E7%99%BC%E7%A5%A8+%E6%99%BA%E6%85%A7%E5%A5%BD%E7%94%9F%E6%B4%BB+%E6%9C%8D%E5%8B%99%E5%B9%B3%E5%8F%B0)

[電信信令人口統計之建置、分析與應用](https://ws.moi.gov.tw/Download.ashx?u=LzAwMS9VcGxvYWQvNDAwL3JlbGZpbGUvMC8xNDk0NS85NzMxZjkxNi01MzU5LTQzZDktYmVlOS0zNjMyYTUwOTcxMDYucGRm&n=6Zu75L%2Bh5L%2Bh5Luk5Lq65Y%2Bj57Wx6KiI5LmL5bu6572u44CB5YiG5p6Q6IiH5oeJ55SoLnBkZg%3D%3D&icon=..pdf)

In [33]:
fnl_df = brn_gis_df[['brn','lon','lat','city','town','village']+feat_list]

In [34]:
fnl_df.insert(0,'lon_lat', brn_gis_df['lat'].astype(str)+','+brn_gis_df['lon'].astype(str))

In [35]:
clstr_feat = [
  'P_DEN',
  'A65UP_A15A64_RAT',
  'sal_mean',
  'factory_num',
  'bu_num',
  'cmp_num',
  'stk_num',
  'hotel_ind',
  'retail_ind',
  'ctring_ind'
]

In [36]:
from sklearn.preprocessing import StandardScaler

In [37]:
clstr_arr = StandardScaler().fit_transform(fnl_df[clstr_feat])

In [50]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans

In [60]:
clustering = KMeans(n_clusters=4, max_iter=5000).fit(clstr_arr)
clustering = AgglomerativeClustering(n_clusters=5).fit(clstr_arr)

In [61]:
clustering.labels_

array([0, 0, 0, 4, 1, 2, 4, 2, 1, 0, 2, 0, 0, 1, 2, 0, 2, 0, 4, 1, 1, 1,
       2, 1, 0, 2, 1, 0, 2, 0, 2, 2, 1, 2, 2, 1, 1, 2, 0, 0, 4, 2, 2, 2,
       3, 0, 1, 2, 2, 2, 4, 4, 0, 0, 0, 0, 1, 2, 2, 0, 1, 0, 0, 4, 0, 1,
       2, 2, 1, 2, 4, 2, 2, 0, 2, 3, 2, 2, 2, 0, 1, 1, 2, 0, 2, 4, 4, 1,
       0, 2, 1, 2, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1, 2, 2])

In [53]:
fnl_df.insert(0,'clstr', clustering.labels_)

ValueError: ignored

In [62]:
fnl_df['clstr']=clustering.labels_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [73]:
for i in [0,1,2,3,4]:
  print(fnl_df[fnl_df['clstr']==i]['brn'])

0      營業部(總行)
1         敦南分行
2         新生分行
9       南京東路分行
11         信託部
12         國外部
15        天母分行
17    國際金融業務分行
24        信義分行
27        永福分行
29        大安分行
38        古亭分行
39        建橋分行
45        內湖分行
52        西門分行
53        敦北分行
54        忠孝分行
55        復興分行
59        建北分行
61       北新店分行
62        府城分行
64       基隆路分行
73        松江分行
79       南松山分行
83        北師分行
88        南港分行
Name: brn, dtype: object
4       桃園分行
8       台南分行
13      苓雅分行
19      嘉義分行
20      豐原分行
21      花蓮分行
23      七賢分行
26      三重分行
32      金華分行
35      彰化分行
36      五甲分行
46      永和分行
56      板南分行
60      三和分行
65      新店分行
68      延平分行
80      淡水分行
81     東基隆分行
87      北大分行
90      竹北分行
92      八德分行
93      員林分行
95      右昌分行
96      沙鹿分行
98      羅東分行
100     文山分行
101    副都心分行
Name: brn, dtype: object
5       台中分行
7       高雄分行
10      中壢分行
14      蘆洲分行
16      大里分行
22      新竹分行
25     北台中分行
28      屏東分行
30      龍潭分行
31      崇德分行
33      後甲分行
34      海佃分行
37      鳳山分行
41      太平分行
42      和平分行
43     北高雄分行
4

In [76]:
fnl_df[fnl_df['clstr']==3]

Unnamed: 0,clstr,lon_lat,brn,lon,lat,city,town,village,P_DEN,A65UP_A15A64_RAT,sal_mean,sal_med,factory_num,bu_num,cmp_num,stk_num,hotel_ind,retail_ind,ctring_ind
44,3,"25.026262,121.419001",南新莊分行,121.419001,25.026262,新北市,新莊區,裕民里,44429.92,16.03,735,541,141,1080,930,1,0.0,74.0,80.75
75,3,"25.065473,121.63158200000001",汐止分行,121.631582,25.065473,新北市,汐止區,中興里,43171.45,15.69,804,568,125,593,820,3,0.0,95.5,94.25


In [68]:
fnl_df.groupby('clstr').describe().T

Unnamed: 0,clstr,0,1,2,3,4
lon,count,26.0,27.0,39.0,2.0,10.0
lon,mean,121.442243,121.024736,120.788111,121.525292,121.391669
lon,std,0.3648,0.558062,0.447719,0.150317,0.377831
lon,min,120.201614,120.191417,120.17996,121.419001,120.323222
lon,25%,121.532617,120.495033,120.349112,121.472146,121.460424
lon,50%,121.541092,121.299504,120.67019,121.525292,121.503275
lon,75%,121.549947,121.504094,121.120587,121.578437,121.541127
lon,max,121.613919,121.769916,121.606744,121.631582,121.575038
lat,count,26.0,27.0,39.0,2.0,10.0
lat,mean,24.888778,24.270807,24.059829,25.045867,24.795244
