In [12]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from factorVAE.factor_VAE import FactorVAE

In [13]:
import os

# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = "cuda"
stock = "sp500"

In [14]:
features_dataset = torch.load(f"./dataset/{stock}/train/feat.pt")
returns_dataset = torch.load(f"./dataset/{stock}/train/ret.pt")
val_features_dataset = torch.load(f"./dataset/{stock}/val/feat.pt")
val_returns_dataset = torch.load(f"./dataset/{stock}/val/ret.pt")
test_features_dataset = torch.load(f"./dataset/{stock}/test/feat.pt")
test_returns_dataset = torch.load(f"./dataset/{stock}/test/ret.pt")

print(f"Total step: {features_dataset.shape[0]}")
print(f"Time span: {features_dataset.shape[1]}")
print(f"Stock size: {features_dataset.shape[2]}")
print(f"Feature size: {features_dataset.shape[3]}")

Total step: 948
Time span: 60
Stock size: 487
Feature size: 5


In [15]:
batch_size = 16
characteristic_size = features_dataset.shape[3]
stock_size = features_dataset.shape[2]
latent_size = 64
factor_size = 32
time_span = features_dataset.shape[1]
gru_input_size = 64
hidden_size = 64
lr = 1e-5
epochs = 250

In [16]:
from torch.utils.data import TensorDataset, DataLoader

def get_dataloader(data, label, device=device, batch_size=batch_size):
    data = torch.Tensor(data).to(device)
    label = torch.Tensor(label).to(device).long()
    ds = TensorDataset(data, label)
    dl = DataLoader(ds, batch_size=batch_size, shuffle=True)
    return dl

In [17]:
train_dl = get_dataloader(features_dataset, returns_dataset)
val_dl = get_dataloader(val_features_dataset, val_returns_dataset)
test_dl = get_dataloader(test_features_dataset, test_returns_dataset)

In [18]:
factor_VAE = FactorVAE(
    characteristic_size=characteristic_size,
    stock_size=stock_size,
    latent_size=latent_size,
    factor_size=factor_size,
    time_span=time_span,
    gru_input_size=gru_input_size,
    hidden_size=hidden_size
).to(device)

In [19]:
optimizer = torch.optim.Adam(factor_VAE.parameters(), lr=lr)

In [20]:
def train_loop(dataloader, model, optimizer):
    for batch, (feat, ret) in enumerate(dataloader):
        loss = model.run_model(feat, ret, gamma=1)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (batch + 1) % 20 == 0:
            print(f"batch: {batch}, loss: {loss.item()}")

In [21]:
def val_loop(dataloader, model):
    total_loss = 0
    total_batches = 0
    with torch.no_grad():  # 禁用梯度计算
        for batch, (feat, ret) in enumerate(dataloader):
            loss = model.run_model(feat, ret, gamma=1)

            total_loss += loss.item()
            total_batches += 1
            
    average_loss = total_loss / total_batches
    print(f"Validation Loss: {average_loss}")
    return average_loss

In [22]:
early_stop = 5
min_loss = 10
times = 0
for i in range(epochs):
    print(f"=== Epoch: {i} ===")
    train_loop(train_dl, factor_VAE, optimizer)
    val_loss = val_loop(val_dl, factor_VAE)
    if val_loss < min_loss:
        min_loss = val_loss
        times = 0
        best_model = factor_VAE.state_dict()
        torch.save(best_model, "best_model.pt")
    else:
        times += 1
    if times == early_stop:
        break

=== Epoch: 0 ===
batch: 19, loss: 1.970694899559021
batch: 39, loss: 1.6346464157104492
batch: 59, loss: 0.8889301419258118
Validation Loss: 1.3166942993799846
=== Epoch: 1 ===
batch: 19, loss: 1.258047342300415
batch: 39, loss: 1.0256068706512451
batch: 59, loss: 0.7358337044715881
Validation Loss: 0.873470253414578
=== Epoch: 2 ===
batch: 19, loss: 0.8524231314659119
batch: 39, loss: 0.8250256776809692
batch: 59, loss: 0.6811049580574036
Validation Loss: 0.7880741424030728
=== Epoch: 3 ===
batch: 19, loss: 0.7838475108146667
batch: 39, loss: 0.7645266056060791
batch: 59, loss: 0.6326335668563843
Validation Loss: 0.7307743430137634
=== Epoch: 4 ===
batch: 19, loss: 0.727276086807251
batch: 39, loss: 0.7098177671432495
batch: 59, loss: 0.5850622653961182
Validation Loss: 0.6784059007962545
=== Epoch: 5 ===
batch: 19, loss: 0.6367621421813965
batch: 39, loss: 0.6105685830116272
batch: 59, loss: 0.525255560874939
Validation Loss: 0.583411349190606
=== Epoch: 6 ===
batch: 19, loss: 0.5732

In [23]:
factor_VAE = FactorVAE(
    characteristic_size=characteristic_size,
    stock_size=stock_size,
    latent_size=latent_size,
    factor_size=factor_size,
    time_span=time_span,
    gru_input_size=gru_input_size,
    hidden_size=hidden_size
).to(device)
best_model_state = torch.load("best_model.pt")
factor_VAE.load_state_dict(best_model_state)
factor_VAE.eval()
loss = val_loop(test_dl, factor_VAE)

Validation Loss: -8.395851254463196


In [24]:
result = np.empty((len(test_features_dataset), stock_size))
for i in range(len(test_features_dataset)):
    tmp = factor_VAE.prediction(test_features_dataset[i].unsqueeze(0).to(device))
    result[i] = tmp[0].squeeze().cpu().numpy()
    '''
    from torch.distributions import Normal
    n = Normal(tmp[1], tmp[2])
    n.sample()
    '''

In [25]:
result.shape

(249, 487)

In [26]:
ret = test_returns_dataset.cpu().numpy()

In [27]:
def calculate_returns(result, k, n):
    num_days, num_stocks = result.shape
    prev_selection = np.arange(num_stocks)  # 初始选择所有股票
    returns_data = []
    for i in range(num_days - 1):
        today_returns = result[i]
        sorted_indices = np.argsort(today_returns)[::-1]  # 按照收益率从高到低排序
        # 选择前k个股票
        today_selection = sorted_indices[:k]
        # 计算前一天的选择股票与当天选择股票的交集
        intersection_size = len(set(prev_selection) & set(today_selection))
        # 如果交集大小小于 k - n，则交换掉前一天选择的股票中排名靠后的 n 支股票
        if intersection_size < k - n:
            prev_selection = today_selection
        # 如果交集大小大于等于 k - n，则不进行交换
        else:
            today_selection = prev_selection
        # 计算每天持有的 k 支股票的总收益率
        daily_return = np.mean(ret[i+1][today_selection])  # 注意注意 ：这里是 ret 不是 result
        returns_data.append(daily_return)
    df = pd.DataFrame({"daily_return": returns_data})
    return df

In [34]:
k = 400
n = k // 10
pred = calculate_returns(result, k, n)
pred

Unnamed: 0,daily_return
0,0.015230
1,-0.010417
2,0.023965
3,0.000243
4,0.007307
...,...
243,0.012125
244,0.003419
245,0.005955
246,0.001124


In [35]:
ARR = (1 + pred).cumprod()
ARR

Unnamed: 0,daily_return
0,1.015230
1,1.004654
2,1.028731
3,1.028981
4,1.036500
...,...
243,1.143701
244,1.147611
245,1.154446
246,1.155743


In [30]:
ARR.to_csv(f'{stock}.csv', index=None)

In [36]:
portfolio_df_performance = pred

In [37]:
alpha_df_performance = pd.DataFrame()
alpha_df_performance['portfolio_daily_return'] = portfolio_df_performance['daily_return']
alpha_df_performance['portfolio_net_value'] = (alpha_df_performance['portfolio_daily_return'] + 1).cumprod()

net_value_columns = ['portfolio_net_value']

alpha_statistics_df = pd.DataFrame(index=alpha_df_performance[net_value_columns].columns,
                                    columns=["年化收益", "年化波动率", "最大回撤率", "夏普率", "Calmar", "IR"])

# alpha_df_performance.set_index("dt", inplace=True)
alpha_df_performance.index = pd.to_datetime(alpha_df_performance.index)
monthly_statistics_df = alpha_df_performance[net_value_columns].resample('m').last()
monthly_statistics_df = pd.concat([alpha_df_performance[:1][
                                        ['portfolio_net_value']],
                                    monthly_statistics_df])
monthly_statistics_df = monthly_statistics_df.pct_change()
monthly_statistics_df = monthly_statistics_df.dropna()
monthly_statistics_df.index = monthly_statistics_df.index.date
## TODO 补充第一年的数据
yearly_statistics_df = alpha_df_performance[net_value_columns].resample('y').last()
yearly_statistics_df = pd.concat([alpha_df_performance[:1][
                                        ['portfolio_net_value']],
                                    yearly_statistics_df])
yearly_statistics_df = yearly_statistics_df.pct_change()
yearly_statistics_df = yearly_statistics_df.dropna()
yearly_statistics_df.index = yearly_statistics_df.index.date

alpha_statistics_df.loc[:, "年化收益"] = np.mean(
    (alpha_df_performance[net_value_columns].tail(1)) ** (252 / len(alpha_df_performance)) - 1)
alpha_statistics_df.loc[:, "年化波动率"] = np.std(
    alpha_df_performance[net_value_columns] / alpha_df_performance[net_value_columns].shift(1) - 1) * np.sqrt(
    252)
alpha_statistics_df.loc[:, "累积收益"] = np.mean(alpha_df_performance[net_value_columns].tail(1) - 1)
alpha_statistics_df.loc[:, "累积波动率"] = np.std(
    alpha_df_performance[net_value_columns] / alpha_df_performance[net_value_columns].shift(1) - 1)
alpha_statistics_df.loc[:, "最大回撤率"] = np.min(
    (alpha_df_performance[net_value_columns] - alpha_df_performance[net_value_columns].cummax()) /
    alpha_df_performance[net_value_columns].cummax())
alpha_statistics_df.loc[:, "夏普率"] = alpha_statistics_df["年化收益"] / alpha_statistics_df["年化波动率"]
alpha_statistics_df.loc[:, "Calmar"] = alpha_statistics_df["年化收益"] / abs(alpha_statistics_df["最大回撤率"])
alpha_statistics_df.loc[:, "IR"] = np.mean(
    alpha_df_performance[net_value_columns] / alpha_df_performance[net_value_columns].shift(1) - 1) * np.sqrt(
    252) / np.std(alpha_df_performance[net_value_columns] / alpha_df_performance[net_value_columns].shift(1) - 1)

  monthly_statistics_df = alpha_df_performance[net_value_columns].resample('m').last()
  yearly_statistics_df = alpha_df_performance[net_value_columns].resample('y').last()
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)


In [38]:
alpha_statistics_df

Unnamed: 0,年化收益,年化波动率,最大回撤率,夏普率,Calmar,IR,累积收益,累积波动率
portfolio_net_value,0.159973,0.141798,-0.132042,1.128173,1.211524,1.01311,0.157244,0.008932


FactorVAE
ASR = ARR / AVol;   CR = ARR / abs(MDD)
        ARR    AVol    MDD    ASR    CR    IR
hs300 -0.048  0.134  -0.175 -0.355 -0.271 -0.348

zz500  0.006  0.127  -0.147  0.047  0.041  0.112

sp500  0.160  0.142  -0.132  1.128  1.211  1.013
   
nas100 0.356  0.159  -0.119  2.234  2.995  1.907