In [1]:
import numpy as np
import pandas as pd
import numba as nb
from h5py import File
from tqdm import tqdm
import nutils
import common as cm


In [2]:
with File("/mnt/nas/data/1s数据/stk_strN_0_endN_999.h5", "r") as f:
    print(f.keys())
    timestamp = f["datatime"][:]
    midprice = f["mid"][:]
    codes = f["stock_list"][:]

<KeysViewHDF5 ['datatime', 'mid', 'stock_list', 'volume']>


In [3]:
midprice.shape

(10337940, 1000)

In [4]:
timestamp.shape

(10337940, 2)

In [5]:
def convert_nano(data):
    dates = data[:, 0]
    times = data[:, 1]
    date_time = dates * 1e6 + times
    nanoseconds = pd.to_datetime(date_time, format="%Y%m%d%H%M%S").values.astype(
        np.int64
    )
    return nanoseconds


In [6]:
nano_ts = convert_nano(timestamp)

In [7]:
import nutils
from importlib import reload
reload(nutils)
import nutils

In [8]:
from numba import njit, prange
from tqdm import tqdm


def calculate_return(df, timestamp):
    n, m = df.shape
    result = np.zeros((n, m))
    ts = convert_nano(timestamp)
    for i in tqdm(range(m)):
        px = df[:, i]
        ret = nutils.log_return(ts, px, 180) # 180 log return
        result[:, i] = ret
    return result


ret = calculate_return(midprice, timestamp)

100%|██████████| 1000/1000 [25:34<00:00,  1.53s/it]


In [9]:
np.save("../data/n1000_ret.npy",ret)

In [10]:
ret_no_nan = np.where(np.isnan(ret), 0, ret)
ret_no_nan *= 1e4

In [11]:
mkt_index_ret = np.mean(ret_no_nan, axis=1)

In [12]:
# np.save("../data/n1000_mkt_index_ret.npy",mkt_index_ret)

In [13]:
from numba import prange

nb.jit(nopython=True, parallel=True)


def calculate_excess_returns(
    stock_log_returns, stock_index, index_log_returns, index_index, coef
):
    n_stock_timestamps = stock_log_returns.shape[0]
    n_index_timestamps = index_log_returns.shape[0]
    excess_returns = np.full(n_stock_timestamps, np.nan)
    index_pointer = 0
    for i in prange(n_stock_timestamps):
        stock_date, stock_time = stock_index[i]
        while index_pointer < n_index_timestamps and (
            index_index[index_pointer, 0] < stock_date
            or (
                index_index[index_pointer, 0] == stock_date
                and index_index[index_pointer, 1] < stock_time
            )
        ):
            index_pointer += 1
        if index_pointer > 0:
            beta = index_log_returns[index_pointer - 1]
            excess_returns[i] = stock_log_returns[i] - coef * beta

    return excess_returns

In [14]:
import SharedArray as sa
from h5py import File
from tqdm import tqdm
from sklearn.linear_model import LinearRegression


coef_list = []
for cur_code in tqdm(cm.SELECTED_CODES):
    stk_idx = np.where(codes == int(cur_code))[0]
    train_mask = (timestamp[:, 0] < 20210401) & (timestamp[:, 0] >= 20210101)
    train_x = ret_no_nan[train_mask][:, stk_idx]
    train_y = mkt_index_ret[train_mask]
    lr = LinearRegression()
    lr.fit(train_x, train_y)
    coef = lr.coef_
    print(f"{cur_code}:{coef}")
    coef_list.append(coef)

    raw_label = sa.attach(f"label_{cur_code}")
    # raw_timestamp = sa.attach(f"timestamp_{cur_code}")
    with File(f"/mnt/nas/data/股票数据hdf5/stkCode_{cur_code}.h5", "r") as f:
        raw_timestamp = f["timestamp"][:]
    ret = calculate_excess_returns(
        raw_label, raw_timestamp, mkt_index_ret, timestamp, coef
    )
    np.save(
        f"/mnt/disk2/alpha_label/{cur_code}.npy",
        ret.astype(np.float32),
    )

  0%|          | 0/100 [00:00<?, ?it/s]

000537:[0.16026164]


  excess_returns[i] = stock_log_returns[i] - coef * beta
  1%|          | 1/100 [00:26<43:17, 26.23s/it]

000627:[0.13179676]


  excess_returns[i] = stock_log_returns[i] - coef * beta
  2%|▏         | 2/100 [00:45<35:47, 21.91s/it]

000925:[0.15328174]


  excess_returns[i] = stock_log_returns[i] - coef * beta
  3%|▎         | 3/100 [01:05<34:17, 21.21s/it]

000950:[0.19227236]


  excess_returns[i] = stock_log_returns[i] - coef * beta
  4%|▍         | 4/100 [01:24<32:33, 20.35s/it]

002058:[0.07973737]


  excess_returns[i] = stock_log_returns[i] - coef * beta
  5%|▌         | 5/100 [01:36<27:41, 17.49s/it]

002166:[0.16213876]


  excess_returns[i] = stock_log_returns[i] - coef * beta
  6%|▌         | 6/100 [01:56<28:28, 18.18s/it]

002308:[0.14197577]


  excess_returns[i] = stock_log_returns[i] - coef * beta
  7%|▋         | 7/100 [02:14<28:14, 18.22s/it]

002399:[0.12155109]


  excess_returns[i] = stock_log_returns[i] - coef * beta
  8%|▊         | 8/100 [02:32<27:45, 18.10s/it]

002498:[0.13813454]


  excess_returns[i] = stock_log_returns[i] - coef * beta
  9%|▉         | 9/100 [02:53<28:56, 19.08s/it]

002557:[0.09635673]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 10%|█         | 10/100 [03:14<29:21, 19.58s/it]

002577:[0.17291677]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 11%|█         | 11/100 [03:30<27:16, 18.38s/it]

002594:[0.12256326]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 12%|█▏        | 12/100 [03:51<28:15, 19.27s/it]

002901:[0.11139859]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 13%|█▎        | 13/100 [04:09<27:18, 18.83s/it]

002941:[0.08804911]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 14%|█▍        | 14/100 [04:28<27:17, 19.04s/it]

002946:[0.09512656]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 15%|█▌        | 15/100 [04:47<26:51, 18.96s/it]

300053:[0.14274367]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 16%|█▌        | 16/100 [05:06<26:40, 19.05s/it]

300137:[0.03898725]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 17%|█▋        | 17/100 [05:24<25:51, 18.69s/it]

300141:[0.10411162]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 18%|█▊        | 18/100 [05:43<25:25, 18.60s/it]

300215:[0.14228719]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 19%|█▉        | 19/100 [06:01<24:57, 18.49s/it]

300225:[0.05044437]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 20%|██        | 20/100 [06:20<24:50, 18.63s/it]

300241:[0.14597853]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 21%|██        | 21/100 [06:39<24:51, 18.88s/it]

300252:[0.13381387]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 22%|██▏       | 22/100 [06:58<24:29, 18.83s/it]

300366:[0.13240743]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 23%|██▎       | 23/100 [07:18<24:38, 19.20s/it]

300498:[0.17292186]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 24%|██▍       | 24/100 [07:39<25:04, 19.80s/it]

300564:[0.08016474]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 25%|██▌       | 25/100 [07:54<22:56, 18.36s/it]

300605:[0.10986299]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 26%|██▌       | 26/100 [08:12<22:17, 18.07s/it]

300640:[0.15156549]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 27%|██▋       | 27/100 [08:28<21:16, 17.49s/it]

300688:[0.14500212]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 28%|██▊       | 28/100 [08:46<21:13, 17.69s/it]

300713:[0.0726249]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 29%|██▉       | 29/100 [09:04<20:58, 17.73s/it]

300867:[0.07795653]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 30%|███       | 30/100 [09:22<20:53, 17.91s/it]

300870:[0.13169778]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 31%|███       | 31/100 [09:39<20:10, 17.55s/it]

300908:[0.07348264]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 32%|███▏      | 32/100 [09:56<19:37, 17.31s/it]

300913:[0.05534974]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 33%|███▎      | 33/100 [10:13<19:16, 17.26s/it]

600006:[0.08936568]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 34%|███▍      | 34/100 [10:36<21:03, 19.14s/it]

600012:[0.08004887]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 35%|███▌      | 35/100 [10:55<20:29, 18.92s/it]

600107:[0.06635609]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 36%|███▌      | 36/100 [11:12<19:45, 18.52s/it]

600123:[0.16918399]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 37%|███▋      | 37/100 [11:33<20:00, 19.06s/it]

600127:[0.08373524]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 38%|███▊      | 38/100 [11:53<20:12, 19.56s/it]

600163:[0.07056127]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 39%|███▉      | 39/100 [12:13<20:01, 19.70s/it]

600176:[0.07479304]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 40%|████      | 40/100 [12:34<20:04, 20.08s/it]

600218:[0.12452311]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 41%|████      | 41/100 [12:54<19:33, 19.89s/it]

600232:[0.14261635]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 42%|████▏     | 42/100 [13:10<18:03, 18.69s/it]

600267:[0.1089325]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 43%|████▎     | 43/100 [13:29<18:02, 19.00s/it]

600302:[0.15394634]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 44%|████▍     | 44/100 [13:45<16:49, 18.02s/it]

600395:[0.08876545]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 45%|████▌     | 45/100 [14:05<17:10, 18.74s/it]

600426:[0.06201917]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 46%|████▌     | 46/100 [14:27<17:34, 19.53s/it]

600428:[0.06331044]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 47%|████▋     | 47/100 [14:47<17:30, 19.82s/it]

600493:[0.17170827]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 48%|████▊     | 48/100 [15:04<16:19, 18.84s/it]

600557:[0.10330203]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 49%|████▉     | 49/100 [15:23<16:06, 18.95s/it]

600578:[0.04475684]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 50%|█████     | 50/100 [15:43<15:55, 19.12s/it]

600644:[0.08482161]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 51%|█████     | 51/100 [16:01<15:30, 19.00s/it]

600647:[0.1291762]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 52%|█████▏    | 52/100 [16:16<14:13, 17.79s/it]

600665:[0.10507805]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 53%|█████▎    | 53/100 [16:33<13:39, 17.44s/it]

600704:[0.11825818]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 54%|█████▍    | 54/100 [16:53<14:05, 18.37s/it]

600740:[0.06548602]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 55%|█████▌    | 55/100 [17:14<14:20, 19.12s/it]

600797:[0.21239892]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 56%|█████▌    | 56/100 [17:34<14:06, 19.24s/it]

600817:[0.02488196]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 57%|█████▋    | 57/100 [17:50<13:03, 18.22s/it]

600834:[0.06577956]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 58%|█████▊    | 58/100 [18:08<12:42, 18.16s/it]

600859:[0.08175312]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 59%|█████▉    | 59/100 [18:29<12:59, 19.01s/it]

600862:[0.07604835]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 60%|██████    | 60/100 [18:55<14:03, 21.09s/it]

600893:[0.08467922]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 61%|██████    | 61/100 [19:17<14:01, 21.57s/it]

600984:[0.08498409]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 62%|██████▏   | 62/100 [19:38<13:29, 21.30s/it]

601019:[0.22970706]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 63%|██████▎   | 63/100 [19:56<12:30, 20.28s/it]

601330:[0.08953275]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 64%|██████▍   | 64/100 [20:16<12:09, 20.28s/it]

601881:[0.22867093]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 65%|██████▌   | 65/100 [20:38<12:04, 20.69s/it]

603006:[0.0925511]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 66%|██████▌   | 66/100 [20:54<10:59, 19.41s/it]

603017:[0.16290543]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 67%|██████▋   | 67/100 [21:11<10:13, 18.59s/it]

603018:[0.12473671]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 68%|██████▊   | 68/100 [21:30<09:55, 18.60s/it]

603037:[0.16067655]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 69%|██████▉   | 69/100 [21:46<09:12, 17.83s/it]

603192:[0.17921427]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 70%|███████   | 70/100 [22:02<08:43, 17.45s/it]

603212:[0.10406936]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 71%|███████   | 71/100 [22:22<08:46, 18.15s/it]

603269:[0.18754741]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 72%|███████▏  | 72/100 [22:38<08:09, 17.48s/it]

603357:[0.21901256]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 73%|███████▎  | 73/100 [22:56<07:55, 17.61s/it]

603368:[0.19102623]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 74%|███████▍  | 74/100 [23:15<07:47, 18.00s/it]

603388:[0.15409904]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 75%|███████▌  | 75/100 [23:31<07:14, 17.38s/it]

603390:[0.11796219]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 76%|███████▌  | 76/100 [23:47<06:48, 17.03s/it]

603559:[0.11753058]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 77%|███████▋  | 77/100 [24:04<06:33, 17.09s/it]

603595:[0.0863164]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 78%|███████▊  | 78/100 [24:23<06:29, 17.71s/it]

603693:[0.08437617]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 79%|███████▉  | 79/100 [24:43<06:25, 18.34s/it]

603712:[0.10221839]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 80%|████████  | 80/100 [25:04<06:22, 19.13s/it]

603777:[0.12893438]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 81%|████████  | 81/100 [25:24<06:07, 19.36s/it]

603818:[0.05471472]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 82%|████████▏ | 82/100 [25:43<05:44, 19.15s/it]

603856:[0.12051306]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 83%|████████▎ | 83/100 [26:00<05:18, 18.74s/it]

603878:[0.06186202]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 84%|████████▍ | 84/100 [26:17<04:50, 18.17s/it]

603939:[0.0965888]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 85%|████████▌ | 85/100 [26:38<04:46, 19.08s/it]

603990:[0.07175229]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 86%|████████▌ | 86/100 [26:59<04:32, 19.49s/it]

605128:[0.09472393]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 87%|████████▋ | 87/100 [27:16<04:03, 18.76s/it]

605166:[0.09399218]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 88%|████████▊ | 88/100 [27:34<03:42, 18.55s/it]

688057:[0.10506137]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 89%|████████▉ | 89/100 [27:49<03:11, 17.43s/it]

688165:[0.13318798]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 90%|█████████ | 90/100 [28:04<02:47, 16.77s/it]

688215:[0.11099876]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 91%|█████████ | 91/100 [28:16<02:18, 15.38s/it]

688286:[0.08336096]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 92%|█████████▏| 92/100 [28:32<02:03, 15.40s/it]

688309:[0.08924548]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 93%|█████████▎| 93/100 [28:43<01:39, 14.27s/it]

688313:[0.12445778]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 94%|█████████▍| 94/100 [29:00<01:30, 15.13s/it]

688366:[0.04716276]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 95%|█████████▌| 95/100 [29:18<01:19, 15.99s/it]

688386:[0.03713328]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 96%|█████████▌| 96/100 [29:34<01:03, 15.79s/it]

688668:[0.10395921]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 97%|█████████▋| 97/100 [29:50<00:47, 15.84s/it]

688678:[0.07189467]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 98%|█████████▊| 98/100 [30:06<00:32, 16.04s/it]

688777:[0.08473007]


  excess_returns[i] = stock_log_returns[i] - coef * beta
 99%|█████████▉| 99/100 [30:25<00:17, 17.01s/it]

689009:[0.03650756]


  excess_returns[i] = stock_log_returns[i] - coef * beta
100%|██████████| 100/100 [30:47<00:00, 18.48s/it]


In [15]:
codes = codes.flatten()

In [None]:
import SharedArray as sa

code_ret = sa.attach("label_000537")

In [None]:
timestamp

In [None]:
cur_code = "000537"
tmp = np.load(f"/mnt/disk2/alpha_label/{cur_code}.npy")

In [None]:
tmp