In [None]:
import math
import json
import logging
import requests
import traceback
import pandas as pd
from datetime import datetime
from time import sleep

# Emit INFO-and-above records, each prefixed with a timestamp and its level.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)
# Filesystem location assigned to root_path; nothing in this part of the
# notebook reads it.
root_path = "/Users/fang/stock_data/basic_data"

In [None]:
def getStockList(url):
    """Return the four-character security codes found in the table at *url*.

    The first HTML table at *url* is read (decoded as Big5-HKSCS, first row
    used as the header). For every entry of the '有價證券代號及名稱' column the
    text before the first full-width space is taken, and only pieces that are
    exactly four characters long are kept, in table order.
    """
    table = pd.read_html(url, encoding='big5hkscs', header=0)[0]
    # Cells are expected to look like "<code><full-width space><name>", so the
    # first piece of the split is the code.
    leading_parts = (entry.split('　')[0] for entry in table['有價證券代號及名稱'])
    return [part for part in leading_parts if len(part) == 4]

# Download the security-code tables used for the two quote queries below.
logging.info('get stock list')
# Both tables come from the same host and endpoint and differ only in strMode.
# Use HTTPS for both so neither listing is fetched over plain HTTP.
twse_url = 'https://isin.twse.com.tw/isin/C_public.jsp?strMode=2'
tpex_url = 'https://isin.twse.com.tw/isin/C_public.jsp?strMode=4'
twse_list = getStockList(twse_url)
tpex_list = getStockList(tpex_url)

logging.info('get stock price')
# One HTTP session and a browser-like User-Agent shared by the quote requests.
session = requests.Session()
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
# Raw response bodies (JSON text), one entry per batch request.
final_result = []

# TPEX stock price
# Query the quote API in batches of 100 codes; each code is sent as
# 'otc_<code>.tw' and the codes of one batch are joined with '|'.
# Iterating by slice means an empty tpex_list sends no request at all.
for start in range(0, len(tpex_list), 100):
    if start:
        # Pause between consecutive batch requests.
        sleep(10)
    query = '|'.join('otc_%s.tw' % code for code in tpex_list[start:start + 100])
    resp = session.get(
        'https://mis.twse.com.tw/stock/api/getStockInfo.jsp?json=1&delay=0&ex_ch=' + query,
        headers=headers,
        # Without a timeout a stalled connection would block the script forever.
        timeout=30,
    )
    final_result.append(resp.text)
    logging.info('tpex batch: {}'.format(start // 100 + 1))

# TWSE stock price
# Query the quote API in batches of 100 codes; each code is sent as
# 'tse_<code>.tw' and the codes of one batch are joined with '|'.
# The requests go through the `session` and `headers` defined earlier in this
# script, so they carry the same User-Agent as the TPEX requests.
# Iterating by slice means an empty twse_list sends no request at all.
for start in range(0, len(twse_list), 100):
    if start:
        # Pause between consecutive batch requests.
        sleep(10)
    query = '|'.join('tse_%s.tw' % code for code in twse_list[start:start + 100])
    resp = session.get(
        'https://mis.twse.com.tw/stock/api/getStockInfo.jsp?json=1&delay=0&ex_ch=' + query,
        headers=headers,
        # Without a timeout a stalled connection would block the script forever.
        timeout=30,
    )
    final_result.append(resp.text)
    logging.info('twse batch: {}'.format(start // 100 + 1))

# Turn every raw API response into one record per stock, keyed by stock code.
logging.info('parse data')
json_dict = {}
incomplete_count = 0
for result in final_result:
    # A response whose 'msgArray' is missing or null contributes no records.
    for json_obj in json.loads(result).get('msgArray') or []:
        data = {}
        data['股票代號'] = json_obj.get('c')
        data['日期'] = datetime.strptime(json_obj.get('d'), '%Y%m%d').strftime('%Y-%m-%d')
        try:
            # Fields are filled in order; if one value is missing or not
            # numeric, the fields already set are kept and the rest are
            # left out (best-effort, as before).
            volume = int(json_obj.get('v'))
            data['成交股數'] = volume
            last_price = float(json_obj.get('z'))
            # NOTE(review): the amount scales 'v' by 1000 while 成交股數 stores
            # 'v' unscaled -- confirm the unit of 'v' against the API.
            data['成交金額'] = int(last_price * volume * 1000)
            data['開盤價'] = float(json_obj.get('o'))
            data['最高價'] = float(json_obj.get('h'))
            data['最低價'] = float(json_obj.get('l'))
            # The closing price is 'z', the same value the change below is
            # computed from; 'h' is the high and was used here by mistake.
            data['收盤價'] = last_price
            prev_close = float(json_obj.get('y'))
            data['漲跌價差'] = round(last_price - prev_close, 2)
            data['漲跌幅'] = round(data['漲跌價差'] / prev_close * 100, 2)
        except (TypeError, ValueError, ArithmeticError):
            # TypeError: field absent (None); ValueError: non-numeric text;
            # ArithmeticError: zero previous close or a non-finite product.
            incomplete_count += 1
        json_dict[data['股票代號']] = data
if incomplete_count:
    logging.info('records with incomplete price data: {}'.format(incomplete_count))