In [1]:
import torch
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Select the compute device: prefer CUDA, then Apple MPS, then fall back to CPU.
# `torch.backends.mps` is looked up defensively because it does not exist on
# older PyTorch builds.
_mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    device = "cuda"
elif _mps_backend is not None and _mps_backend.is_available():
    device = "mps"
else:
    device = "cpu"
print('device:', device)

device: mps


In [2]:
# Load the prepared panel and drop the 'Unnamed: 0' column
# (presumably a row index written out by an earlier to_csv -- confirm).
data = pd.read_csv('ready.csv').drop(columns=['Unnamed: 0'])

# Within every month, bucket the monthly return into quintiles 0..4.
monthly_quintile = data.groupby('year_month')['month_ret'].transform(
    lambda month_returns: pd.qcut(month_returns, 5, labels=False)
)

# Collapse the quintiles into three classes: 0 = Bottom, 1 = Middle, 2 = Top.
# Quintile 4 gets +1, quintile 0 gets -1, everything else stays at the base of 1.
in_top_quintile = (monthly_quintile == 4).astype('int64')
in_bottom_quintile = (monthly_quintile == 0).astype('int64')
data['rank'] = in_top_quintile - in_bottom_quintile + 1

# Rows strictly before 2018 are used for model fitting
# (string comparison on the 'year_month' values).
data_for_model = data[data['year_month'] < '2018']

data

Unnamed: 0,year_month,PERMNO,RI_Spread,CP_Spread,Skew,chmom,dolvol,Idiovol,indmom,maxret,mom1m,mom12m,mvel1,retvol,month_ret,rank
0,1996-02,10078,0.410379,0.665843,0.660771,0.809163,0.923366,0.163088,1.000000,0.304835,0.567332,0.238016,0.063198,0.534237,0.141306,2
1,1996-02,10104,0.399722,0.651412,0.705032,0.711171,0.844706,0.096106,0.619444,0.165541,0.701425,0.125665,0.156297,0.302605,0.089004,1
2,1996-02,10107,0.395449,0.654715,0.676885,0.689841,0.970092,0.062931,0.619444,0.131657,0.619231,0.113476,0.423907,0.283192,0.066892,1
3,1996-02,10138,0.343600,0.637898,0.639781,0.705431,0.426155,0.079755,0.758034,0.247625,0.667062,0.157208,0.009942,0.301626,-0.004630,1
4,1996-02,10145,0.371552,0.661506,0.671413,0.722396,0.565909,0.024808,0.676100,0.084367,0.614563,0.099848,0.112486,0.139834,0.119871,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395432,2022-12,93356,0.201888,0.719230,0.265991,0.303625,0.391336,0.004968,0.189687,0.045434,0.246141,0.123820,0.001976,0.058314,-0.097215,1
395433,2022-12,93369,0.417082,0.700210,0.265746,0.196569,0.484192,0.028148,0.189687,0.070337,0.019417,0.044460,0.001744,0.275433,0.209944,2
395434,2022-12,93374,0.214812,0.745106,0.227228,0.254677,0.442281,0.001835,0.287259,0.040151,0.231290,0.092570,0.002319,0.054230,-0.032653,1
395435,2022-12,93423,0.182525,0.714803,0.271413,0.247365,0.489572,0.013803,0.189687,0.060868,0.229787,0.079632,0.000772,0.091424,-0.034867,1


# Train with a feed-forward NN (in > 64 > 32 > 16 > 8 > out)

In [3]:
# Fix the random seeds for PyTorch and NumPy
torch.manual_seed(42)
np.random.seed(42)

# Separate the class label from the predictors, then make a 70/30 train/test split
labels = data_for_model['rank']
features = data_for_model.drop(columns=['year_month', 'PERMNO', 'month_ret', 'rank'])
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=42,
)
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

# Convert to PyTorch tensors and move them to the selected device.
# Labels are int64 (long) class indices; test_labels is left as a pandas Series.
train_labels = torch.tensor(train_labels.to_numpy(), dtype=torch.int64).to(device)
train_features = torch.tensor(train_features.to_numpy(), dtype=torch.float32).to(device)
test_features = torch.tensor(test_features.to_numpy(), dtype=torch.float32).to(device)


class mymodel(nn.Module):
    """Feed-forward classifier: input -> 64 -> 32 -> 16 -> 8 -> output.

    Each hidden layer is followed by a CELU activation and dropout (p=0.05).
    ``forward`` returns softmax probabilities, not logits.

    Args:
        input_size: Number of input features per sample.
        output_size: Number of classes.
        is_training: Extra switch for dropout, kept for backward compatibility.
            Dropout is applied only when this flag is True AND the module is in
            training mode, so both ``model.eval()`` and ``is_training = False``
            disable it.
    """

    def __init__(self, input_size, output_size, is_training=True):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 16)
        self.fc4 = nn.Linear(16, 8)
        self.fc5 = nn.Linear(8, output_size)
        self.is_training = is_training

    def forward(self, x):
        """Return class probabilities of shape (batch, output_size).

        Args:
            x: Float tensor of shape (batch, input_size).
        """
        # Honour the standard nn.Module train/eval mode as well as the legacy
        # flag; previously model.eval() alone left dropout switched on.
        apply_dropout = self.training and self.is_training
        for hidden in (self.fc1, self.fc2, self.fc3, self.fc4):
            x = torch.celu(hidden(x))
            x = nn.functional.dropout(x, p=0.05, training=apply_dropout)
        x = self.fc5(x)
        # Softmax over the class dimension so each row sums to 1
        return nn.functional.softmax(x, dim=1)

# Build the model and move it to the selected device
nnmodel = mymodel(train_features.shape[1], 3, True).to(device)

# Early-stopping state
patience = 200  # number of epochs without improvement to tolerate before stopping
stop_early = 0  # number of epochs elapsed since the last improvement
best_loss = float('inf')  # lowest training loss seen so far
n_epoch = 1000  # maximum number of training epochs

# Loss function and optimizer.
# The model's forward() already returns softmax probabilities, so the matching
# loss is NLLLoss applied to log-probabilities. Feeding log(softmax) into
# CrossEntropyLoss would apply log-softmax a second time.
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(nnmodel.parameters(), lr=0.05)

# Train the model (full-batch: one optimizer step per epoch)
nnmodel.train()
for epoch in range(n_epoch):
    optimizer.zero_grad()
    outputs = nnmodel(train_features)
    # Clamp before the log so a probability that underflows to 0 cannot
    # produce -inf / NaN in the loss or its gradients.
    loss = criterion(torch.log(outputs.clamp_min(1e-12)), train_labels)
    loss.backward()
    optimizer.step()

    # Read the scalar loss once per epoch
    epoch_loss = loss.item()

    # If the loss improved, record it and reset the wait counter
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        stop_early = 0
    else:
        stop_early += 1

    # Stop once the wait counter reaches the configured patience
    if stop_early >= patience:
        print('Early stopping at epoch:', epoch)
        break

    print('Epoch [%d/%d], Loss: %.4f' %(epoch+1, n_epoch, epoch_loss))

Training Features Shape: (210907, 12)
Training Labels Shape: (210907,)
Testing Features Shape: (90390, 12)
Testing Labels Shape: (90390,)
Epoch [1/1000], Loss: 1.2316
Epoch [2/1000], Loss: 1.0372
Epoch [3/1000], Loss: 1.1786
Epoch [4/1000], Loss: 0.9804
Epoch [5/1000], Loss: 1.0794
Epoch [6/1000], Loss: 0.9751
Epoch [7/1000], Loss: 0.9597
Epoch [8/1000], Loss: 0.9621
Epoch [9/1000], Loss: 0.9649
Epoch [10/1000], Loss: 0.9618
Epoch [11/1000], Loss: 0.9586
Epoch [12/1000], Loss: 0.9596
Epoch [13/1000], Loss: 0.9555
Epoch [14/1000], Loss: 0.9520
Epoch [15/1000], Loss: 0.9609
Epoch [16/1000], Loss: 0.9797
Epoch [17/1000], Loss: 0.9794
Epoch [18/1000], Loss: 0.9719
Epoch [19/1000], Loss: 0.9483
Epoch [20/1000], Loss: 0.9454
Epoch [21/1000], Loss: 0.9576
Epoch [22/1000], Loss: 0.9497
Epoch [23/1000], Loss: 0.9471
Epoch [24/1000], Loss: 0.9488
Epoch [25/1000], Loss: 0.9510
Epoch [26/1000], Loss: 0.9498
Epoch [27/1000], Loss: 0.9449
Epoch [28/1000], Loss: 0.9403
Epoch [29/1000], Loss: 0.9442
E

# Backtest vs. US Equity ETFs

In [4]:
# Out-of-sample slice: every row from 2018 onwards, re-indexed from 0
data_for_backtest = data[data['year_month'] >= '2018'].copy().reset_index(drop=True)
features = data_for_backtest.drop(columns=['year_month', 'PERMNO', 'month_ret', 'rank'])

# Switch the network to inference mode; the custom flag is cleared as well
# because the model's dropout reads `is_training`.
nnmodel.is_training = False
nnmodel.eval()

# Score the backtest rows without tracking gradients
backtest_inputs = torch.tensor(features.values, dtype=torch.float).to(device)
with torch.no_grad():
    outputs = nnmodel(backtest_inputs)
outputs = outputs.cpu().numpy()

# One probability column per class
prob_0, prob_1, prob_2 = outputs[:, 0], outputs[:, 1], outputs[:, 2]
data_for_backtest['prob_0'] = prob_0  # Bottom
data_for_backtest['prob_1'] = prob_1  # Middle
data_for_backtest['prob_2'] = prob_2  # Top

# Order by month and security, then compute the top-minus-bottom score
data_for_backtest = data_for_backtest.sort_values(['year_month', 'PERMNO'])
data_for_backtest['tmb'] = data_for_backtest['prob_2'] - data_for_backtest['prob_0']

# Keep only the columns written to the CSV, under their export names
export_columns = ['year_month', 'PERMNO', 'month_ret', 'prob_0', 'prob_2', 'tmb']
df = data_for_backtest[export_columns].rename(
    columns={'prob_0': 'Bot_prob_pred', 'prob_2': 'Top_prob_pred'}
)
df.to_csv('NN3_1model.csv', index=False)
display(df)

Unnamed: 0,year_month,PERMNO,month_ret,Bot_prob_pred,Top_prob_pred,tmb
0,2018-01,10032,-0.015973,0.199374,0.204340,0.004966
1,2018-01,10104,0.095383,0.122411,0.126025,0.003614
2,2018-01,10107,0.110706,0.112416,0.116328,0.003912
3,2018-01,10138,0.063854,0.124467,0.148684,0.024217
4,2018-01,10145,0.041145,0.108326,0.121310,0.012984
...,...,...,...,...,...,...
94135,2022-12,93356,-0.097215,0.226323,0.221248,-0.005075
94136,2022-12,93369,0.209944,0.325047,0.276810,-0.048237
94137,2022-12,93374,-0.032653,0.224475,0.219704,-0.004771
94138,2022-12,93423,-0.034867,0.294543,0.261355,-0.033188


# Training with XGB Model

In [6]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

# Columns that are identifiers or targets rather than predictors
non_feature_cols = ['year_month', 'PERMNO', 'month_ret', 'rank']

# Fit on every pre-2018 row; score on 2018 onwards
train_labels = data_for_model['rank']
train_features = data_for_model.drop(columns=non_feature_cols)

data_for_backtest = data[data['year_month'] >= '2018'].copy().reset_index(drop=True)
test_labels = data_for_backtest['rank']
test_features = data_for_backtest.drop(columns=non_feature_cols)

print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

# Three-class gradient-boosted trees that output class probabilities
model = xgb.XGBClassifier(
    objective='multi:softprob',
    num_class=3,
    seed=42,
    max_depth=6,
    colsample_bytree=0.8,
    subsample=0.3,
    n_estimators=50,
    learning_rate=0.1,
    n_jobs=-1,
)
model.fit(train_features, train_labels)

# One probability column per class
preds = model.predict_proba(test_features)
prob_0, prob_1, prob_2 = preds[:, 0], preds[:, 1], preds[:, 2]
data_for_backtest['prob_0'] = prob_0  # Bottom
data_for_backtest['prob_1'] = prob_1  # Middle
data_for_backtest['prob_2'] = prob_2  # Top

# Order by month and security, then compute the top-minus-bottom score
data_for_backtest = data_for_backtest.sort_values(['year_month', 'PERMNO'])
data_for_backtest['tmb'] = data_for_backtest['prob_2'] - data_for_backtest['prob_0']

# Keep only the columns written to the CSV, under their export names
df = data_for_backtest[['year_month', 'PERMNO', 'month_ret', 'prob_0', 'prob_2', 'tmb']].rename(
    columns={'prob_0': 'Bot_prob_pred', 'prob_2': 'Top_prob_pred'}
)
df.to_csv('XGB_1model.csv', index=False)
display(df)

Training Features Shape: (301297, 12)
Training Labels Shape: (301297,)
Testing Features Shape: (94140, 12)
Testing Labels Shape: (94140,)


Unnamed: 0,year_month,PERMNO,month_ret,Bot_prob_pred,Top_prob_pred,tmb
0,2018-01,10032,-0.015973,0.191600,0.193778,0.002179
1,2018-01,10104,0.095383,0.115451,0.138631,0.023180
2,2018-01,10107,0.110706,0.104313,0.132272,0.027959
3,2018-01,10138,0.063854,0.104247,0.126917,0.022671
4,2018-01,10145,0.041145,0.056241,0.073755,0.017514
...,...,...,...,...,...,...
94135,2022-12,93356,-0.097215,0.192080,0.191660,-0.000420
94136,2022-12,93369,0.209944,0.277053,0.295869,0.018816
94137,2022-12,93374,-0.032653,0.155818,0.152669,-0.003149
94138,2022-12,93423,-0.034867,0.247458,0.297129,0.049672


# Training with LGBM Model

In [7]:
import lightgbm as lgb

# Columns that are identifiers or targets rather than predictors
non_feature_cols = ['year_month', 'PERMNO', 'month_ret', 'rank']

# Fit on every pre-2018 row; score on 2018 onwards
train_labels = data_for_model['rank']
train_features = data_for_model.drop(columns=non_feature_cols)

data_for_backtest = data[data['year_month'] >= '2018'].copy().reset_index(drop=True)
test_labels = data_for_backtest['rank']
test_features = data_for_backtest.drop(columns=non_feature_cols)

print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

# Three-class LightGBM classifier
model = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=3,
    random_state=42,
    n_jobs=-1,
    verbose=-1,
    max_depth=6,
    num_leaves=13,
    n_estimators=50,
    colsample_bytree=0.5,
    learning_rate=0.1,
)
model.fit(train_features, train_labels)

# One probability column per class
preds = model.predict_proba(test_features)
prob_0, prob_1, prob_2 = preds[:, 0], preds[:, 1], preds[:, 2]
data_for_backtest['prob_0'] = prob_0  # Bottom
data_for_backtest['prob_1'] = prob_1  # Middle
data_for_backtest['prob_2'] = prob_2  # Top

# Order by month and security, then compute the top-minus-bottom score
data_for_backtest = data_for_backtest.sort_values(['year_month', 'PERMNO'])
data_for_backtest['tmb'] = data_for_backtest['prob_2'] - data_for_backtest['prob_0']

# Keep only the columns written to the CSV, under their export names
df = data_for_backtest[['year_month', 'PERMNO', 'month_ret', 'prob_0', 'prob_2', 'tmb']].rename(
    columns={'prob_0': 'Bot_prob_pred', 'prob_2': 'Top_prob_pred'}
)
df.to_csv('LGBM_1model.csv', index=False)
display(df)

Training Features Shape: (301297, 12)
Training Labels Shape: (301297,)
Testing Features Shape: (94140, 12)
Testing Labels Shape: (94140,)


Unnamed: 0,year_month,PERMNO,month_ret,Bot_prob_pred,Top_prob_pred,tmb
0,2018-01,10032,-0.015973,0.193547,0.186120,-0.007427
1,2018-01,10104,0.095383,0.122560,0.149633,0.027073
2,2018-01,10107,0.110706,0.102054,0.126042,0.023987
3,2018-01,10138,0.063854,0.114287,0.133100,0.018813
4,2018-01,10145,0.041145,0.078999,0.078144,-0.000855
...,...,...,...,...,...,...
94135,2022-12,93356,-0.097215,0.205429,0.217694,0.012265
94136,2022-12,93369,0.209944,0.271360,0.292348,0.020988
94137,2022-12,93374,-0.032653,0.163826,0.177368,0.013543
94138,2022-12,93423,-0.034867,0.248167,0.251198,0.003031
