In [1]:
import zipfile

In [2]:
import os

In [3]:
from pathlib import Path

In [4]:
import pandas as pd

In [5]:
import tqdm

In [6]:
#! ls -al dataset/fut_tick_h5/

In [6]:
# Directory that is searched for the futures report zip archives.
FUT_ZIP_PATH = 'dataset/fut_zip'
# Directory that receives the per-contract-month HDF5 files (exec_<month>.h5).
FUT_H5_PATH = 'dataset/fut_tick_h5'
# Directory that receives one msgpack dump per processed report.
FUT_MSG_PATH = 'dataset/fut_tick_msgpack'

In [7]:
# Wrap the zip directory in a Path so it can be searched recursively.
fut_zip_path = Path(FUT_ZIP_PATH)

In [8]:
# Keep only entries that really are zip archives: a bare rglob('*') also yields
# sub-directories and stray files, which zipfile.ZipFile cannot open.
rptzipfiles = sorted(p for p in fut_zip_path.rglob('*') if zipfile.is_zipfile(p))

In [9]:
# File stems of every msgpack dump already present under FUT_MSG_PATH.
processed_futs = [
    msgpack_file.stem
    for msgpack_file in Path(FUT_MSG_PATH).rglob('*.msgpack')
]

In [13]:
# Stems of the reports that already have a msgpack dump. A set keeps the
# membership test below O(1) per archive instead of scanning a list each time.
processed_futs = {p.stem for p in Path(FUT_MSG_PATH).rglob('*.msgpack')}
for fut_opt in tqdm.tqdm(rptzipfiles):
    # Skip anything that is not a regular file (e.g. a sub-directory) and
    # anything whose msgpack dump already exists.
    if fut_opt.is_file() and fut_opt.stem not in processed_futs:
        rtp2h5(fut_opt)

100%|██████████| 2038/2038 [50:54<00:00,  2.04s/it]


In [10]:
def _read_fut_rpt_zip(rptzipfile_path):
    with zipfile.ZipFile(rptzipfile_path) as zipf:
        fut_data = zipf.read(zipf.namelist()[0]).decode('big5')
    fut_data_line = [l for l in fut_data.split('\r\n')]
    header = [r.strip(' ') for r in fut_data_line[0].replace('交易日期', '成交日期').replace('交割年月', '到期月份(週別)').split(',')]
    fut_data_list = [[r.strip(' ') for r in l.split(',')] for l in fut_data_line[1:]]
    return header, fut_data_list

In [11]:
def _fut_data2df(fut_data_list, header):
    df_fut_tick = pd.DataFrame(fut_data_list, columns=header).dropna()
    df_fut_tick['成交日期時間']= pd.to_datetime(df_fut_tick['成交日期'] + df_fut_tick['成交時間'], 
                                          format='%Y%m%d%H%M%S%f')
    df_fut_tick = df_fut_tick[['成交日期時間', '商品代號', '到期月份(週別)', '成交價格', '成交數量(B+S)']]
    df_fut_tick['成交價格'] = df_fut_tick['成交價格'].astype(float)
    df_fut_tick['成交數量(B+S)'] = df_fut_tick['成交數量(B+S)'].astype(int)
    return df_fut_tick

In [12]:
def rtp2h5(rptzipfile_path):
    """Convert one report zip into a msgpack dump plus per-month HDF5 tables.

    The whole report is written to ``FUT_MSG_PATH/<zip stem>.msgpack``. The
    rows whose '商品代號' is 'TX' are then split by '到期月份(週別)' and appended
    to ``FUT_H5_PATH/exec_<month>.h5`` under the key 'TXF'. Month values that
    contain '/' are skipped.

    Args:
        rptzipfile_path: Location of the zip archive (``str`` or ``Path``).

    Note:
        The HDF5 tables are opened in append mode, so converting the same
        report twice appends its rows twice.
    """
    # Accept plain strings as well as Path objects (``.stem`` is needed below).
    rptzipfile_path = Path(rptzipfile_path)
    # Make sure both output directories exist before anything is written.
    for out_dir in (FUT_MSG_PATH, FUT_H5_PATH):
        Path(out_dir).mkdir(parents=True, exist_ok=True)

    header, fut_data_list = _read_fut_rpt_zip(f"{rptzipfile_path}")
    df_fut_tick = _fut_data2df(fut_data_list, header)
    # NOTE(review): DataFrame.to_msgpack was deprecated in pandas 0.25 and
    # removed in 1.0. It is kept because the '*.msgpack' files double as the
    # "already processed" marker; migrating needs a coordinated format change.
    df_fut_tick.to_msgpack(f"{FUT_MSG_PATH}/{rptzipfile_path.stem}.msgpack",
                           encoding='utf-8', compress='zlib')
    df_fut_txf_tick = df_fut_tick[df_fut_tick['商品代號'] == 'TX'].reset_index(drop=True)
    exec_dates = df_fut_txf_tick['到期月份(週別)'].unique()
    for exec_date in exec_dates:
        if '/' not in exec_date:
            df_fut_txf_tick_exec = df_fut_txf_tick[df_fut_txf_tick['到期月份(週別)'] == exec_date]
            # 'complib' was misspelled 'comlib' before: older pandas silently
            # ignored the unknown keyword, newer pandas raises TypeError.
            df_fut_txf_tick_exec.to_hdf(f"{FUT_H5_PATH}/exec_{exec_date}.h5", key='TXF',
                                        mode='a', format='t', append=True,
                                        complevel=5, complib='zlib')

In [22]:
# Run the two helpers on one sample archive (the 10th from the end of the
# sorted list; the index is hard-coded).
rptzipfile_path = str(rptzipfiles[-10])
header, fut_data_list = _read_fut_rpt_zip(rptzipfile_path)
df_fut_tick = _fut_data2df(fut_data_list, header)

In [23]:
# Keep only the rows whose product code is 'TX' and renumber them from zero.
df_fut_txf_tick = (
    df_fut_tick
    .loc[df_fut_tick['商品代號'].eq('TX')]
    .reset_index(drop=True)
)

In [31]:
# Distinct values of the '到期月份(週別)' column among the TX rows.
exec_dates = df_fut_txf_tick['到期月份(週別)'].unique()

In [52]:
# Probe a single value from exec_dates (the index 5 is hard-coded).
exec_date = exec_dates[5]
# Only values without '/' are written to HDF5 by rtp2h5; print it if it qualifies.
if '/' not in exec_date:
    print(exec_date)
# NOTE: this selection runs whether or not the check above passed.
df_fut_txf_tick_exec = df_fut_txf_tick[df_fut_txf_tick['到期月份(週別)'] == exec_date]

201811


In [56]:
# Append the selected rows to the per-month HDF5 table under the key 'TXF'.
# 'complib' was misspelled 'comlib': older pandas silently ignored the unknown
# keyword, newer pandas raises TypeError.
df_fut_txf_tick_exec.to_hdf(f"{FUT_H5_PATH}/exec_{exec_date}.h5", key='TXF',
                            mode='a', format='t', append=True,
                            complevel=5, complib='zlib')

In [11]:
# Step-by-step version of the zip-reading logic for one sample archive.
rptzipfile_path = rptzipfiles[-10]
with zipfile.ZipFile(rptzipfile_path) as zipf:
    # Decode the first archive member as Big5 text.
    fut_data = zipf.read(zipf.namelist()[0]).decode('big5')
fut_data_line = fut_data.split('\r\n')
# Header: unify the older column names, then trim spaces around each name.
header = [
    col.strip(' ')
    for col in fut_data_line[0]
    .replace('交易日期', '成交日期')
    .replace('交割年月', '到期月份(週別)')
    .split(',')
]
# Data rows: split on commas and trim spaces around each field.
fut_data_list = [
    [field.strip(' ') for field in row.split(',')]
    for row in fut_data_line[1:]
]

In [12]:
# Build the raw frame and discard every row that has a missing field.
df_fut_tick = pd.DataFrame(data=fut_data_list, columns=header).dropna(axis=0, how='any')

In [13]:
# Right-pad the time to HHMMSS + 6 fraction digits before parsing. With a bare
# 6-digit HHMMSS value the separator-free '%S%f' pattern backtracks and reads
# '150708' as 15:07:00.8 instead of 15:07:08.
df_fut_tick['成交日期時間'] = pd.to_datetime(
    df_fut_tick['成交日期'] + df_fut_tick['成交時間'].str.ljust(12, '0'),
    format='%Y%m%d%H%M%S%f')

In [14]:
# Keep the five columns of interest. The explicit copy makes the result an
# independent frame, so later column assignments do not write to a slice
# (SettingWithCopyWarning).
df_fut_tick = df_fut_tick[['成交日期時間', '商品代號', '到期月份(週別)', '成交價格', '成交數量(B+S)']].copy()

In [15]:
# Convert the price column from strings to floats.
df_fut_tick['成交價格'] = df_fut_tick['成交價格'].astype(float)

In [16]:
# Convert the volume column from strings to integers.
df_fut_tick['成交數量(B+S)'] = df_fut_tick['成交數量(B+S)'].astype(int)

In [76]:
# Check the resulting column dtypes.
df_fut_tick.dtypes

成交日期時間       datetime64[ns]
商品代號                 object
到期月份(週別)             object
成交價格                float64
成交數量(B+S)             int64
dtype: object

In [43]:
# Parse only the first ten data lines instead of parsing the whole file and
# slicing afterwards; the resulting list is the same.
fut_data_list = [[r.strip(' ') for r in l.split(',')] for l in fut_data_line[1:11]]

In [44]:
# Show the first ten parsed rows.
fut_data_list[:10]

[['20181016', 'BRF', '201812', '150708', '2484.5', '2', '-', '-', ''],
 ['20181016', 'BRF', '201812', '153913', '2480.5', '2', '-', '-', ''],
 ['20181016', 'BRF', '201812', '172756', '2481', '2', '-', '-', ''],
 ['20181016', 'BRF', '201812', '173133', '2481', '2', '-', '-', ''],
 ['20181016', 'BRF', '201812', '174412', '2469', '2', '-', '-', ''],
 ['20181016', 'BRF', '201812', '174432', '2469.5', '2', '-', '-', ''],
 ['20181016', 'BRF', '201812', '180213', '2470', '4', '-', '-', ''],
 ['20181016', 'BRF', '201812', '180358', '2469.5', '2', '-', '-', ''],
 ['20181016', 'BRF', '201812', '202904', '2470', '2', '-', '-', ''],
 ['20181016', 'BRF', '201812', '211637', '2470', '2', '-', '-', '']]

In [None]:
# NOTE(review): unfinished scratch input, commented out because pandas has no
# attribute `Da`; evaluating it raises AttributeError and stops "Run All".
# pd.Da

In [24]:
# Print the first 1000 characters of the decoded report text.
print(fut_data[:1000])

交易日期,商品代號,交割年月,成交時間,成交價格,成交數量(B+S),近月價格,遠月價格
20110117,CBF    ,201101     ,11310200,33.2,8,-,-
20110117,CBF    ,201101     ,13345400,33,6,-,-
20110117,CBF    ,201102     ,09450300,33.4,4,-,-
20110117,CBF    ,201102     ,11310200,33.2,8,-,-
20110117,CBF    ,201102     ,13345400,33,6,-,-
20110117,CCF    ,201101     ,08460100,16.7,2,-,-
20110117,CCF    ,201101     ,08472300,16.75,2,-,-
20110117,CCF    ,201101     ,09030800,16.75,2,-,-
20110117,CCF    ,201101     ,09042400,16.85,2,-,-
20110117,CCF    ,201101     ,09111900,16.8,20,-,-
20110117,CCF    ,201101     ,09112000,16.8,20,-,-
20110117,CCF    ,201101     ,09113000,16.8,20,-,-
20110117,CCF    ,201101     ,09114500,16.85,10,-,-
20110117,CCF    ,201101     ,09144400,16.8,70,-,-
20110117,CCF    ,201101     ,09210200,16.85,4,-,-
20110117,CCF    ,201101     ,09210300,16.85,2,-,-
20110117,CCF    ,201101     ,09212900,16.85,60,-,-
20110117,CCF    ,201101     ,09240500,16.85,20,-,-
20110117,CCF    ,201101     ,09280200,16.9,


In [32]:
# Same slice shown as a repr, which makes the '\r\n' line endings visible.
fut_data[:1000]

'成交日期,商品代號,到期月份(週別),成交時間,成交價格,成交數量(B+S),近月價格,遠月價格,開盤集合競價 \r\n20181016,BRF    ,201812     ,150708,2484.5,2,-,-, \r\n20181016,BRF    ,201812     ,153913,2480.5,2,-,-, \r\n20181016,BRF    ,201812     ,172756,2481,2,-,-, \r\n20181016,BRF    ,201812     ,173133,2481,2,-,-, \r\n20181016,BRF    ,201812     ,174412,2469,2,-,-, \r\n20181016,BRF    ,201812     ,174432,2469.5,2,-,-, \r\n20181016,BRF    ,201812     ,180213,2470,4,-,-, \r\n20181016,BRF    ,201812     ,180358,2469.5,2,-,-, \r\n20181016,BRF    ,201812     ,202904,2470,2,-,-, \r\n20181016,BRF    ,201812     ,211637,2470,2,-,-, \r\n20181016,BRF    ,201812     ,212632,2469.5,2,-,-, \r\n20181016,BRF    ,201812     ,213819,2467.5,2,-,-, \r\n20181016,BRF    ,201812     ,214524,2477,2,-,-, \r\n20181016,BRF    ,201812     ,215046,2472,2,-,-, \r\n20181016,BRF    ,201812     ,215612,2479,2,-,-, \r\n20181016,BRF    ,201812     ,215926,2480,2,-,-, \r\n20181016,BRF    ,201812     ,221456,2484,2,-,-, \r\n20181016,BRF    ,201812     ,224156,2479,2,

In [31]:
# Print the first 1000 characters of the decoded report text.
print(fut_data[:1000])

成交日期,商品代號,到期月份(週別),成交時間,成交價格,成交數量(B+S),近月價格,遠月價格,開盤集合競價 
20181016,BRF    ,201812     ,150708,2484.5,2,-,-, 
20181016,BRF    ,201812     ,153913,2480.5,2,-,-, 
20181016,BRF    ,201812     ,172756,2481,2,-,-, 
20181016,BRF    ,201812     ,173133,2481,2,-,-, 
20181016,BRF    ,201812     ,174412,2469,2,-,-, 
20181016,BRF    ,201812     ,174432,2469.5,2,-,-, 
20181016,BRF    ,201812     ,180213,2470,4,-,-, 
20181016,BRF    ,201812     ,180358,2469.5,2,-,-, 
20181016,BRF    ,201812     ,202904,2470,2,-,-, 
20181016,BRF    ,201812     ,211637,2470,2,-,-, 
20181016,BRF    ,201812     ,212632,2469.5,2,-,-, 
20181016,BRF    ,201812     ,213819,2467.5,2,-,-, 
20181016,BRF    ,201812     ,214524,2477,2,-,-, 
20181016,BRF    ,201812     ,215046,2472,2,-,-, 
20181016,BRF    ,201812     ,215612,2479,2,-,-, 
20181016,BRF    ,201812     ,215926,2480,2,-,-, 
20181016,BRF    ,201812     ,221456,2484,2,-,-, 
20181016,BRF    ,201812     ,224156,2479,2,-,-, 
20181016,BRF    ,201812     ,2
