In [7]:
from h5py import File
import numpy as np
from tqdm import tqdm
import SharedArray as sa
import numba as nb
import nutils
import common as cm
import pandas as pd

In [36]:
with File("/mnt/nas/data/1s数据/stk_strN_0_endN_999.h5", "r") as f:
    print(f.keys())
    timestamp = f["datatime"][:]
    mid_price = f["mid"][:]
    codes = f["stock_list"][:]

<KeysViewHDF5 ['datatime', 'mid', 'stock_list', 'volume']>


In [8]:
code_100 = cm.SELECTED_CODES
code_str = [str(code[0]).zfill(6) for code in codes]
col_100 = [True if code in code_100 else False for code in code_str]
col_100 = [code_str.index(code) for code in code_100]
mp_100 = mid_price[:, col_100]

In [9]:
@nb.jit(nopython=True, parallel=True)
def calculate_log_returns(weighted_prices, index_1s):
    n_timestamps = weighted_prices.shape[0]
    log_returns = np.zeros(n_timestamps)
    raw_returns = np.zeros(n_timestamps)

    for i in nb.prange(n_timestamps):
        current_price = weighted_prices[i]
        future_prices = []
        for k in range(1, 181):
            future_index = i + k
            if (
                future_index < n_timestamps
                and index_1s[future_index, 0] == index_1s[i, 0]
            ):
                future_prices.append(weighted_prices[future_index])
            else:
                future_prices.append(current_price)
        future_prices = np.array(future_prices)
        raw_returns[i] = future_prices[-1] / current_price - 1
        log_returns[i] = np.log(future_prices[-1] / current_price) * 1e4

    return raw_returns, log_returns

In [10]:
mp_100 = pd.DataFrame(mp_100).ffill(axis=0).values
w_mp_100 = mp_100.astype(np.float64).mean(1)
rawret, logret = calculate_log_returns(w_mp_100, datetime)

In [11]:
def calculate_regression_alpha(
    stock_log_returns,
    stock_index,
    market_log_returns,
    market_index,
    start_date,
    end_date,
):
    from datetime import datetime as dt
    import statsmodels.api as sm

    # Convert string dates to datetime objects
    start_date = dt.strptime(start_date, "%Y%m%d")
    end_date = dt.strptime(end_date, "%Y%m%d")

    # Filter the data within the sample period
    sample_mask = (stock_index[:, 0] >= start_date) & (stock_index[:, 0] <= end_date)
    sample_stock_log_returns = stock_log_returns[sample_mask]
    sample_market_log_returns = market_log_returns[sample_mask]

    # Calculate alpha and beta using the sample data
    X = sm.add_constant(sample_market_log_returns)
    model = sm.OLS(sample_stock_log_returns, X).fit()
    alpha = model.params[0]
    beta = model.params[1]

    # Calculate excess returns using the estimated alpha and beta
    excess_returns = stock_log_returns - beta * market_log_returns

    return excess_returns, alpha, beta

In [26]:
import statsmodels.api as sm
from datetime import datetime


@nb.jit(nopython=True, parallel=True)
def calculate_log_returns(weighted_prices, index_1s):
    n_timestamps = weighted_prices.shape[0]
    log_returns = np.zeros(n_timestamps)
    raw_returns = np.zeros(n_timestamps)

    for i in nb.prange(n_timestamps):
        current_price = weighted_prices[i]
        future_prices = []
        for k in range(1, 181):
            future_index = i + k
            if (
                future_index < n_timestamps
                and index_1s[future_index, 0] == index_1s[i, 0]
            ):
                future_prices.append(weighted_prices[future_index])
            else:
                future_prices.append(current_price)
        future_prices = np.array(future_prices)
        raw_returns[i] = future_prices[-1] / current_price - 1
        log_returns[i] = np.log(future_prices[-1] / current_price) * 1e4

    return raw_returns, log_returns


@nb.jit(nopython=True, parallel=True)
def calculate_excess_returns(
    stock_log_returns, stock_index, index_log_returns, index_index
):
    n_stock_timestamps = stock_log_returns.shape[0]
    n_index_timestamps = index_log_returns.shape[0]

    # Create arrays to store results
    excess_returns = np.full(n_stock_timestamps, np.nan)

    # Initialize pointers
    index_pointer = 0

    # Iterate through stock timestamps
    for i in nb.prange(n_stock_timestamps):
        stock_date, stock_time = stock_index[i]

        # Move index pointer to the latest log return before or at the stock time
        while index_pointer < n_index_timestamps and (
            index_index[index_pointer, 0] < stock_date
            or (
                index_index[index_pointer, 0] == stock_date
                and index_index[index_pointer, 1] <= stock_time
            )
        ):
            index_pointer += 1

        # Ensure we have a valid pointer to use
        if index_pointer > 0:
            beta = index_log_returns[index_pointer - 1]
            excess_returns[i] = (
                np.log(np.exp(stock_log_returns[i] / 1e4) - np.exp(beta / 1e4) + 1)
            ) * 1e4

    return excess_returns


def find_closest_index(stock_timestamp, market_timestamps):
    # 找到市场数据中与股票数据时间戳最接近的时间戳索引
    closest_indices = np.searchsorted(market_timestamps, stock_timestamp) - 1
    closest_indices = np.clip(closest_indices, 0, len(market_timestamps) - 1)
    return closest_indices

def calculate_regression_params(stock_log_returns, stock_index, market_log_returns, market_index, start_date, end_date):
    # Filter the data within the sample period
    sample_mask = (stock_index >= start_date) & (stock_index <= end_date)
    sample_stock_log_returns = stock_log_returns[sample_mask]
    sample_stock_index = stock_index[sample_mask]

    # 找到与股票数据时间戳最接近的市场数据时间戳索引
    closest_indices = find_closest_index(sample_stock_index, market_index)
    sample_market_log_returns = market_log_returns[closest_indices]

    # Calculate alpha and beta using the sample data
    X = sm.add_constant(sample_market_log_returns)
    model = sm.OLS(sample_stock_log_returns, X).fit()
    alpha = model.params[0]
    beta = model.params[1]

    return alpha, beta

@njit(parallel=True)
def calculate_excess_returns(stock_log_returns, stock_index, index_log_returns, index_index, alpha, beta):
    n_stock_timestamps = stock_log_returns.shape[0]
    n_index_timestamps = index_log_returns.shape[0]

    # Create arrays to store results
    excess_returns = np.full(n_stock_timestamps, np.nan)

    # Initialize pointers
    index_pointer = 0

    # Iterate through stock timestamps
    for i in prange(n_stock_timestamps):
        stock_timestamp = stock_index[i]

        # Move index pointer to the latest log return before or at the stock time
        while index_pointer < n_index_timestamps and index_index[index_pointer] <= stock_timestamp:
            index_pointer += 1

        # Ensure we have a valid pointer to use
        if index_pointer > 0:
            closest_market_log_return = index_log_returns[index_pointer - 1]
            excess_returns[i] = stock_log_returns[i] - (alpha + beta * closest_market_log_return)

    return excess_returns

NameError: name 'njit' is not defined

In [27]:
start_date = 20210101
end_date = 20210401

# 遍历所选股票代码，计算并保存超额收益
for cur_code in tqdm(cm.SELECTED_CODES):
    raw_label = sa.attach(f"label_{cur_code}")
    with File(f"/mnt/nas/data/股票数据hdf5/stkCode_{cur_code}.h5", "r") as f:
        raw_timestamp = f["timestamp"][:]

    # 计算回归系数 alpha 和 beta
    alpha, beta = calculate_regression_params(raw_label, raw_timestamp, logret, datetime, start_date, end_date)
    
    # 计算超额收益
    excess_returns = calculate_excess_returns(raw_label, raw_timestamp, logret, datetime, alpha, beta)
    
    # 保存结果到新的目录
    np.save(f"/mnt/disk2/excess_return_regression/{cur_code}.npy", excess_returns.astype(np.float32))

print("超额收益计算并保存完毕。")

  0%|          | 0/100 [00:02<?, ?it/s]


IndexError: boolean index did not match indexed array along dimension 1; dimension is 1 but corresponding boolean dimension is 2

In [34]:
def combine_timestamp(date, time):
    return date * 1000000 + time

def find_closest_market_returns(stock_index, market_index, market_log_returns):
    n_stock = stock_index.shape[0]
    n_market = market_index.shape[0]
    closest_market_returns = np.empty(n_stock)

    market_pointer = 0
    for i in range(n_stock):
        stock_timestamp = stock_index[i]
        while market_pointer < n_market - 1 and market_index[market_pointer + 1] <= stock_timestamp:
            market_pointer += 1
        closest_market_returns[i] = market_log_returns[market_pointer]

    return closest_market_returns


with File(f"/mnt/nas/data/股票数据hdf5/stkCode_{cur_code}.h5", "r") as f:
    raw_timestamp = f["timestamp"][:]
with File("/mnt/nas/data/1s数据/stk_strN_0_endN_999.h5", "r") as f:
    mkt_timestamp = f["datatime"][:]

index = find_closest_market_returns(raw_timestamp, mkt_timestamp)

TypeError: find_closest_market_returns() missing 1 required positional argument: 'market_log_returns'

In [30]:
raw_timestamp

array([[20210104,    93000],
       [20210104,    93003],
       [20210104,    93006],
       ...,
       [20221230,   145651],
       [20221230,   145654],
       [20221230,   145657]])

In [32]:
mkt_timestamp

array([[20210104,    93000],
       [20210104,    93001],
       [20210104,    93002],
       ...,
       [20231229,   145657],
       [20231229,   145658],
       [20231229,   145659]], dtype=int32)

In [1]:
import numpy as np

In [6]:
code = "000537"
new = np.load(f"/mnt/disk2/factor_0626/000537/000537.npy")

In [9]:
raw = np.load(f"/mnt/nas/data/WY/factor_0527/stkCode_000537.npy")

In [10]:
len(raw)-len(new)

0