In [1]:
import sys,os
sys.path.append("..")
import django
django.setup()
import pandas as pd
from io import StringIO
import requests
import datetime
from crawlers.finlab.data_process_tools import char_filter
from crawlers.models import *

In [3]:
class CrawlBrokerInfoTW:
    """Crawl broker head-office and branch listings from the TWSE broker service pages.

    ``crawl_main`` returns one DataFrame with head offices and branches
    combined, with the Chinese column headers renamed to English field names.
    """

    def __init__(self):
        # Descriptive metadata; not read by any method of this class.
        self.target_name = "台股券商資訊"
        self.format = "non_time_series"

    @staticmethod
    def headquarter_info(timeout=30):
        """Fetch the head-office table.

        Args:
            timeout: Seconds to wait for the HTTP response before requests
                raises, so a stalled connection cannot hang the notebook.

        Returns:
            DataFrame built from the first HTML table on the page, with a
            ``department`` column set to '總公司' and the '分公司' column dropped.
        """
        r = requests.get('https://www.twse.com.tw/zh/brokerService/brokerServiceAudit',
                         timeout=timeout)
        html_df = pd.read_html(StringIO(r.text))
        df = pd.DataFrame(html_df[0])
        df['department'] = '總公司'
        df = df.drop(columns='分公司')
        return df

    @staticmethod
    def branch_info(broker_hq_id, timeout=30):
        """Fetch the branch table for one head-office broker code.

        Args:
            broker_hq_id: Broker code placed in the ``stkNo`` query parameter.
            timeout: Seconds to wait for the HTTP response.

        Returns:
            DataFrame built from the fourth HTML table (index 3) of the page.
        """
        url = ('https://www.twse.com.tw/brokerService/brokerServiceAudit?showType=list&stkNo='
               f'{broker_hq_id}&focus=6')
        r = requests.get(url, timeout=timeout)
        html_df = pd.read_html(StringIO(r.text))
        df = pd.DataFrame(html_df[3])
        return df

    def crawl_main(self):
        """Crawl every head office and its branches and return the combined table.

        Returns:
            DataFrame with columns renamed to ``stock_id``, ``broker_name``,
            ``date_of_establishment``, ``address`` and ``phone`` plus
            ``department``. Rows whose ``stock_id`` is the '查無資料' placeholder
            are removed and spaces are stripped from ``broker_name``.
        """
        broker_hq = self.headquarter_info()
        branch_data = pd.concat([self.branch_info(i) for i in broker_hq['證券商代號'].values])
        branch_data['department'] = '分公司'
        df_all = pd.concat([broker_hq, branch_data])
        df_all = df_all.rename(columns={'證券商代號': 'stock_id', '證券商名稱': 'broker_name',
                                        '開業日': 'date_of_establishment', '地址': 'address',
                                        '電話': 'phone'
                                        })
        # Copy the filtered slice so the assignment below writes to an
        # independent frame instead of a view of the concatenated one.
        df_all = df_all[df_all['stock_id'] != '查無資料'].copy()
        # Vectorised string replace leaves missing names as missing instead of
        # raising AttributeError on a non-string value.
        df_all['broker_name'] = df_all['broker_name'].str.replace(' ', '', regex=False)
        return df_all
    
# Run the full crawl (one HTTP request per head office plus one for the index
# page) and display the combined head-office/branch table.
a=CrawlBrokerInfoTW()
df=a.crawl_main()
df
    
            
    

Unnamed: 0,stock_id,broker_name,date_of_establishment,address,phone,department
0,1020,合庫,100/12/02,台北市大安區忠孝東路四段325號2樓(部分)、經紀部複委託科地址：台北市松山區長安東路二段2...,02-27528000,總公司
1,1030,土銀,51/02/09,台北市延平南路八十一號,02-23483948,總公司
2,1040,臺銀證券,97/01/02,台北市重慶南路1段58號4樓、5樓部分,02-23882188,總公司
3,1110,台灣企銀,65/07/01,台北市塔城街30號4樓,02-25597171,總公司
4,1160,日盛,50/12/08,台北市南京東路2段111號3樓及5、6、7、8、12、13樓部分,02-25048888,總公司
...,...,...,...,...,...,...
42,9A9j,永豐金-嘉義,860919,嘉義市中山路386號3樓,05-229-1345,分公司
43,9A9q,永豐金-潮州,820106,屏東縣潮州鎮中正路38之1號,08-789-0122,分公司
44,9A9r,永豐金-北高雄,840401,高雄市鼓山區龍德路473號3樓,07-555-0455,分公司
45,9A9s,永豐金-彰化,850501,彰化市民族路532號5、6樓,04-722-4976,分公司


In [9]:
class GetNTLSxy:
    """Look up coordinates and administrative areas for street addresses.

    Addresses are cleaned and posted to the MOI single-address matching
    service; the first HTML table of the response is returned as a DataFrame.
    """

    # Industrial/science-park names removed from addresses before matching.
    # Ordered longest first so a specific name (e.g. '南部科學工業園區') is removed
    # as a whole before its generic suffix ('科學工業園區') can leave a stray prefix.
    AREA_KEYWORDS = (
        '南部科學工業園區',
        '科學工業園區',
        '加工出口區',
        '南崗工業區',
        '大發工業區',
        '科學園區',
        '工業區',
    )
    MATCH_URL = 'https://moisagis.moi.gov.tw/moiap/gis2010/content/user/matchservice/singleMatch.cfm'
    # Seconds to wait for the matching service before requests raises.
    REQUEST_TIMEOUT = 30

    @classmethod
    def get_xy(cls, address):
        """Clean ``address`` and query the matching service for it.

        Args:
            address: Raw address string. An empty or ``None`` address yields
                ``None`` without a request being sent.

        Returns:
            DataFrame built from the first table in the response, with missing
            cells replaced via ``df.where(pd.notnull(df), None)``; ``None`` when
            the request or HTML parsing raises ``ValueError`` (for example when
            the response contains no table).

        Raises:
            Network errors from ``requests`` (including timeouts) propagate.
        """
        if not address:
            return None

        for keyword in cls.AREA_KEYWORDS:
            address = address.replace(keyword, '')

        # Project helper; presumably drops floor/annex suffix noise at these
        # markers -- confirm against crawlers.finlab.data_process_tools.
        address = char_filter(address, '及', '部分', '、', ',', '（')

        # Postal-code handling: drop only the leading run of non-letter
        # characters (e.g. '(114)' or '11492 '). Digits further in, such as the
        # house number of a short address like '台北市塔城街30號', must be kept.
        first_letter = next(
            (pos for pos, ch in enumerate(address) if ch.isalpha()),
            len(address),
        )
        address = address[first_letter:]

        form = {
            'address': address,
            'matchRange': '0',
            'fuzzyNum': '0',
            'roadEQstreet': 'false',
            'subnumEQnum': 'false',
            'isLockTown': 'false',
            'isLockVillage': 'false',
            'ex_coor': 'EPSG:4326',
            'U02DataYear': '2015',
            'output_xml': '1'
        }
        try:
            r = requests.post(cls.MATCH_URL, data=form, timeout=cls.REQUEST_TIMEOUT)
            html_df = pd.read_html(StringIO(r.text))
        except ValueError:
            return None
        df = pd.DataFrame(html_df[0])
        df = df.where(pd.notnull(df), None)
        return df

    # Some addresses are missing their district.
    @classmethod
    def main_process(cls, address):
        """Geocode ``address``, retrying once with a district inserted.

        Returns:
            The DataFrame from ``get_xy``; ``None`` when the lookup produced no
            table or an empty table. When the first result has no X value the
            lookup is repeated with '信義區' inserted after the first three
            characters and that second result is returned as-is.
        """
        df = cls.get_xy(address)
        if df is None or df.empty:
            return None
        # pd.isna covers both None and NaN: depending on the pandas version a
        # numeric column keeps NaN even after ``where(..., None)``.
        if pd.isna(df['X'].values[0]):
            # NOTE(review): the inserted district is always '信義區', whatever
            # the city is -- confirm this fallback is intended.
            address = address[:3] + '信義區' + address[3:]
            return cls.get_xy(address)
        return df

    # Update coordinate data in a table; start/end bound the updated range.
    @classmethod
    def update_xy_data(cls, model_name, start=None, end=None, only_null=True):
        """Geocode rows of a model and bulk-update their location fields.

        Args:
            model_name: Django model class (not a string) whose rows expose
                ``address``, ``id``, ``city``, ``district``, ``longitude`` and
                ``latitude``.
            start: Start of the queryset slice to process.
            end: End of the queryset slice to process.
            only_null: When true, only rows whose ``longitude`` is null are
                processed; otherwise all rows are. The slice applies either way.
        """
        if only_null:
            queryset = model_name.objects.filter(longitude__isnull=True)
        else:
            queryset = model_name.objects.all()
        obj_list = queryset[start:end]

        bulk_update_data = []
        for obj_check in obj_list:
            location = obj_check.address
            print(location, obj_check.id)
            df = cls.main_process(location)
            if df is None:
                print('pass')
                continue
            obj_check.city = df['縣市'].values[0]
            obj_check.district = df['鄉鎮'].values[0]
            # X is the longitude and Y the latitude of the match.
            obj_check.longitude = df['X'].values[0]
            obj_check.latitude = df['Y'].values[0]

            bulk_update_data.append(obj_check)
        update_fields_area = ['city', 'district', 'latitude', 'longitude']
        model_name.objects.bulk_update(bulk_update_data, update_fields_area, batch_size=1000)

# Smoke test: geocode one sample address (sends a live request to the matching service).
GetNTLSxy.main_process('台南市新營區民治路301號3樓及1樓')

Unnamed: 0,縣市,鄉鎮,地址,村里,二級發布區,一級發布區,最小統計區,比對代碼,比對說明,X,Y
0,臺南市,新營區,民治路301號3樓,民生里,A6701-18,A6701-18-007,A6701-0270-00,AL0-F2-0-2019M12,完全比對,120.303184,23.30525


In [6]:

# Number of companies still lacking a longitude. ``count()`` asks the database
# for the count instead of loading every matching row just to measure the list.
CompanyBasicInfoTW.objects.filter(longitude__isnull=True).count()

146

In [7]:

# Geocode every BrokerInfoTW row whose longitude is still null (defaults:
# only_null=True, no start/end slice) and bulk-update the location fields.
GetNTLSxy.update_xy_data(BrokerInfoTW)

台中市豐原市水源路669號1樓 2719


KeyboardInterrupt: 

In [None]:
from crawlers.models import *
# Backfill coordinates for CompanyBasicInfoTW rows, skipping the first 100 rows
# of the queryset, then write latitude/longitude back in one bulk update.
bulk_update_data = []
model_name = CompanyBasicInfoTW

obj_list = model_name.objects.all()

for obj_check in obj_list[100:]:
    location = obj_check.address
    print(location, obj_check.id)
    df = GetNTLSxy.main_process(location)
    if df is None:
        print('pass')
        continue

    # X is the longitude (~120 for Taiwan) and Y the latitude (~23); the two
    # were previously assigned the wrong way round.
    obj_check.longitude = df['X'].values[0]
    obj_check.latitude = df['Y'].values[0]
    bulk_update_data.append(obj_check)

update_fields_area = ['latitude', 'longitude']
model_name.objects.bulk_update(bulk_update_data, update_fields_area, batch_size=1000)




In [None]:
# Scratch check: removing the science-park name leaves the plain street address.
a = '南部科學工業園區台南市新市區大利一路6號'
a.replace('南部科學工業園區', '')

In [None]:
# Scratch check: strip each known park/industrial-zone name in turn,
# echoing the name being tried.
address = '南部科學工業園區台南市新市區大利一路6號'
for i in ('新竹科學工業園區', '新竹科學園區', '大發工業區', '南部科學工業園區'):
    print(i)
    address = address.replace(i, '')
address

In [None]:
# Scratch check: drop a bracketed postal code when its closing parenthesis
# sits within the first six characters.
a = '(114)台北市內湖區新湖一路151號7樓'
r = a.index(')')
a = a[r + 1:] if r < 6 else a
a

In [None]:
# Scratch check: keep only letters among the first six characters, which drops
# a bracketed postal code. Works on ``address`` itself rather than on ``a``
# left over from another cell, so the cell runs on a fresh kernel.
address = '(114)台北市內湖區新湖一路151號7樓'
filter_num = filter(str.isalpha, address[:6])
address = ''.join(filter_num) + address[6:]
address

In [None]:
# Inspect the first six characters of ``a`` (defined in an earlier cell).
a[:6]

In [None]:
def char_slice(target, *trash_key):
    """Cut ``target`` off just before each marker, applied in the order given.

    For every marker found in the (already shortened) text, everything from
    its first occurrence onwards is discarded. Markers that do not occur leave
    the text unchanged.
    """
    for marker in trash_key:
        cut_at = target.find(marker)
        if cut_at != -1:
            target = target[:cut_at]
    return target


char_slice('台北市大安區敦化南路2段97號32樓及地下1樓', '及', '部分')