# **ReMASTER: Extension of Market-Guided Stock Transformer for Stock Price Forecasting Using Novel Stock Indices**

In [None]:
import torch
import os
# Optional: uncomment to restrict which GPU ID(s) are visible; replace "1" with the ID(s) you want to use
# os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Report whether PyTorch can see a CUDA device in this runtime.
print(f'Can I use GPU now? -- {torch.cuda.is_available()}')
# Optional: uncomment to pick an explicit device object.
# if torch.cuda.is_available():
#     # device = torch.device("cuda:0")  # Use the first CUDA device
#     device = torch.device("cuda")
#     print("Using CUDA device:", device)

### **Step 1: Install Required Libraries**
Run the following commands to install necessary libraries like pandas, torch, and qlib (from GitHub):

In [None]:
# Write the list of currently installed packages to requirements.txt.
!pip freeze > requirements.txt

In [None]:
# Install the requirements for this notebook.
# qlib is needed to load the datasets.
# pandas and torch are installed with specific versions.
# !pip install pandas==1.5.3 torch==1.11.0
# Record the installed packages as they are before the installs below run
# (this overwrites any requirements.txt written earlier).
!pip freeze > requirements.txt
!pip install pandas==1.5.3
# The +cu113 torch build is fetched from the PyTorch wheel index given by -f.
!pip install torch==1.11.0+cu113 -f https://download.pytorch.org/whl/torch_stable.html


# Install dependencies for qlib
!pip install --upgrade cython setuptools

# Install qlib from GitHub, as the PyPI version only supports Python 3.7 and 3.8
!pip install git+https://github.com/microsoft/qlib.git

# Alternative installs, kept for reference (disabled).
# !pip install pyqlib
# !pip install pylib==0.9.1.99




### **Step 2: Import Libraries**


In [None]:
import time
import os
import torch
import copy
import numpy as np
import pandas as pd
import pickle
import qlib
from torch.utils.data import DataLoader, Sampler
import torch.optim as optim

### **Step 3: Verify GPU Availability and Set Up Data**
Initialize qlib and check GPU status:




In [None]:
# Initialise qlib with its default settings, then report whether PyTorch
# can see a CUDA device (same message as the check at the top of the notebook).
qlib.init()
print(f'Can I use GPU now? -- {torch.cuda.is_available()}')

### **Step 4: Load Data from Google Drive**
Set up the data: mount Google Drive and point the notebook at the data folder stored there.

In [None]:
import os

from google.colab import drive

# Make Google Drive visible to the Colab runtime.
drive.mount('/content/drive')

# Folder on the mounted drive that holds the project data.
data_path = '/content/drive/My Drive/ECE 570/data'

# Sanity check: print every entry found in the data folder.
print("Files in data folder:")
folder_entries = os.listdir(data_path)
for filename in folder_entries:
    print(filename)


## **BASE_MODEL.PY**
### **Helper Functions and Classes**
Define the necessary functions and classes for data processing and model setup.

In [None]:
from torch.utils.data import DataLoader
from torch.utils.data import Sampler
import torch
import torch.optim as optim

def calc_ic(pred, label):
    """Return the correlation between predictions and labels.

    Args:
        pred: One-dimensional sequence of predicted values.
        label: One-dimensional sequence of target values, same length as ``pred``.

    Returns:
        A ``(ic, ric)`` tuple: pandas' default (Pearson) correlation and the
        Spearman rank correlation of the two columns.
    """
    # Both inputs go through one DataFrame so they share a positional index.
    frame = pd.DataFrame({'pred': pred, 'label': label})
    scores, targets = frame['pred'], frame['label']
    return scores.corr(targets), scores.corr(targets, method='spearman')

**Custom Data Sampler**

In [None]:
from torch.utils.data import DataLoader
from torch.utils.data import Sampler
import torch
import torch.optim as optim


# MASTER's daily batch sampler (``__len__`` corrected to count daily batches)

class DailyBatchSamplerRandom(Sampler):
    """Yield one array of positional indices per distinct ``datetime`` value.

    Args:
        data_source: Object exposing ``get_index()``, which must return an
            index with a level named ``"datetime"``.
            NOTE(review): the start offsets assume the rows of ``data_source``
            are stored in ascending ``datetime`` order, so each day is one
            contiguous slice -- confirm against the dataset.
        shuffle: When true, the days are visited in a random order drawn from
            ``np.random``; the indices inside a day are never shuffled.
    """

    def __init__(self, data_source, shuffle=False):
        self.data_source = data_source
        self.shuffle = shuffle
        # Number of samples in each daily batch. The explicit dtype only
        # silences pandas' warning about dtype-less empty Series.
        self.daily_count = (
            pd.Series(index=self.data_source.get_index(), dtype=float)
            .groupby("datetime")
            .size()
            .values
        )
        # Begin index of each batch: running total minus the batch's own size.
        self.daily_index = np.cumsum(self.daily_count) - self.daily_count

    def __iter__(self):
        day_order = np.arange(len(self.daily_count))
        if self.shuffle:
            np.random.shuffle(day_order)
        for day in day_order:
            begin = self.daily_index[day]
            yield np.arange(begin, begin + self.daily_count[day])

    def __len__(self):
        # A sampler's length is the number of items ``__iter__`` yields,
        # i.e. the number of days, not the number of individual samples.
        return len(self.daily_count)

# ====================================================================================================================================================================================
#
#                           below is experimental
#
# ====================================================================================================================================================================================

# because nq100 is alr pandas df. added helpers as well
# import torch
# from torch.utils.data import Dataset, DataLoader, Sampler
# import pandas as pd
# import numpy as np
# from torch.utils.data import TensorDataset, DataLoader

# class TimeSeriesDataset(Dataset):
#     def __init__(self, df, lookback_window=20, num_stocks=100):  # Add num_stocks as a parameter
#         self.df = df
#         self.lookback_window = lookback_window
#         self.num_stocks = num_stocks  # Store the number of stocks
#         self.data = self._create_3d_tensor()

#     def _create_3d_tensor(self):
#         num_features = self.df.shape[1]  # Get the number of features from the DataFrame

#         # Calculate the correct data_points_per_stock to ensure compatibility
#         data_points_per_stock = len(self.df) // (self.num_stocks * num_features)

#         # Reshape it to (num_stocks, data_points_per_stock, num_features)
#         reshaped_data = self.df.values.reshape(
#             data_points_per_stock, self.num_stocks, num_features  # Corrected order of dimensions
#         )

#         # You can convert back to a NumPy array or keep it as a tensor if needed
#         return torch.tensor(reshaped_data, dtype=torch.float32)

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         return self.data[idx]


# class DailyBatchSamplerRandom(torch.utils.data.Sampler):
#     def __init__(self, dataset, shuffle=False):
#         self.dataset = dataset  # Store the dataset object
#         self.shuffle = shuffle
#         self.num_stocks = dataset.num_stocks  # Access num_stocks from the dataset
#         self.len = len(dataset) // self.num_stocks
#         self.indices = torch.arange(self.num_stocks)

#     def __iter__(self):
#         if self.shuffle:
#             rand_idx = torch.randperm(self.num_stocks)
#             self.indices = self.indices[rand_idx]
#         for i in range(self.len):
#             yield self.indices + i * self.num_stocks

#     def __len__(self):
#         return self.len




# **MASTER.PY**
### **Model Definition**
Define the architecture of the ReMASTER model, with positional encoding, self-attention, and gated layers.

In [None]:
# import torch.nn as nn
# from torch.nn.modules.linear import Linear
# from torch.nn.modules.dropout import Dropout
# from torch.nn.modules.normalization import LayerNorm
# import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import Linear, LayerNorm, Dropout
import math
import copy
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Sampler

class PositionalEncoding(nn.Module):
    """Add a fixed sinusoidal position table to the input.

    Even columns of the table hold sines and odd columns hold cosines of
    ``position * div_term``. The table is registered as the buffer ``pe``.

    Args:
        d_model: Size of the last input dimension. Odd values are supported.
        max_len: Number of positions in the table; inputs may not have more
            entries than this along dimension 1.
    """

    def __init__(self, d_model, max_len=100):
        super(PositionalEncoding, self).__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one cosine column fewer than sine
        # columns, so trim the angles to fit instead of raising a shape error.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer("pe", pe)

    def forward(self, x):
        """Return ``x`` plus the first ``x.shape[1]`` rows of the table."""
        return x + self.pe[:x.shape[1], :]

**Implementation of the SAttention, TAttention, Gate, TemporalAttention, MASTER, SequenceModel, and MASTERModel classes**

In [None]:
# -------------------------------------for ReMASTER import TensorDataset and pandas------------------------------
from torch.utils.data import TensorDataset, DataLoader
import pandas as pd
import datetime
import numpy as np
import torch

class SAttention(nn.Module):
    """Multi-head attention computed across the first input axis.

    The projected queries, keys and values are transposed on axes 0 and 1
    before the attention product, so entries along axis 0 attend to each
    other (MASTER uses this layer for inter-stock aggregation). A residual
    connection and a feed-forward block with its own LayerNorm follow.

    Args:
        d_model: Size of the last input dimension.
        nhead: Number of heads; the last head takes any leftover columns when
            ``d_model`` is not divisible by ``nhead``.
        dropout: Dropout probability for the attention weights and the FFN.
    """

    def __init__(self, d_model, nhead, dropout):
        super().__init__()

        self.d_model = d_model
        self.nhead = nhead
        # Softmax scaling factor: square root of the per-head width.
        self.temperature = math.sqrt(self.d_model / nhead)

        # Bias-free query / key / value projections.
        self.qtrans = nn.Linear(d_model, d_model, bias=False)
        self.ktrans = nn.Linear(d_model, d_model, bias=False)
        self.vtrans = nn.Linear(d_model, d_model, bias=False)

        # One dropout module per head.
        self.attn_dropout = nn.ModuleList([Dropout(p=dropout) for _ in range(nhead)])

        # LayerNorm on the input and on the FFN input.
        self.norm1 = LayerNorm(d_model, eps=1e-5)
        self.norm2 = LayerNorm(d_model, eps=1e-5)
        self.ffn = nn.Sequential(
            Linear(d_model, d_model),
            nn.ReLU(),
            Dropout(p=dropout),
            Linear(d_model, d_model),
            Dropout(p=dropout),
        )

    def forward(self, x):
        x = self.norm1(x)
        q, k, v = (
            proj(x).transpose(0, 1)
            for proj in (self.qtrans, self.ktrans, self.vtrans)
        )

        head_width = int(self.d_model / self.nhead)
        last_head = self.nhead - 1
        head_outputs = []
        for h in range(self.nhead):
            lo = h * head_width
            # An open-ended slice lets the final head absorb the remainder.
            hi = None if h == last_head else lo + head_width
            qh, kh, vh = q[:, :, lo:hi], k[:, :, lo:hi], v[:, :, lo:hi]

            weights = torch.softmax(torch.matmul(qh, kh.transpose(1, 2)) / self.temperature, dim=-1)
            weights = self.attn_dropout[h](weights)
            head_outputs.append(torch.matmul(weights, vh).transpose(0, 1))
        att_output = torch.concat(head_outputs, dim=-1)

        # Residual on the normalised input, then the feed-forward block.
        xt = self.norm2(x + att_output)
        return xt + self.ffn(xt)


class TAttention(nn.Module):
    """Multi-head attention computed across axis 1 of the input.

    Queries, keys and values keep the input layout, so entries along axis 1
    attend to each other (MASTER uses this layer for intra-stock
    aggregation). The attention logits are not scaled before the softmax.
    A residual connection and a feed-forward block with its own LayerNorm
    follow.

    Args:
        d_model: Size of the last input dimension.
        nhead: Number of heads; the last head takes any leftover columns when
            ``d_model`` is not divisible by ``nhead``.
        dropout: Dropout probability. Attention-weight dropout modules are
            only created when this is greater than zero.
    """

    def __init__(self, d_model, nhead, dropout):
        super().__init__()
        self.d_model = d_model
        self.nhead = nhead

        # Bias-free query / key / value projections.
        self.qtrans = nn.Linear(d_model, d_model, bias=False)
        self.ktrans = nn.Linear(d_model, d_model, bias=False)
        self.vtrans = nn.Linear(d_model, d_model, bias=False)

        # One dropout module per head, or an empty list when dropout is off.
        if dropout > 0:
            self.attn_dropout = nn.ModuleList([Dropout(p=dropout) for _ in range(nhead)])
        else:
            self.attn_dropout = []

        # LayerNorm on the input and on the FFN input.
        self.norm1 = LayerNorm(d_model, eps=1e-5)
        self.norm2 = LayerNorm(d_model, eps=1e-5)
        self.ffn = nn.Sequential(
            Linear(d_model, d_model),
            nn.ReLU(),
            Dropout(p=dropout),
            Linear(d_model, d_model),
            Dropout(p=dropout),
        )

    def forward(self, x):
        x = self.norm1(x)
        q, k, v = self.qtrans(x), self.ktrans(x), self.vtrans(x)

        head_width = int(self.d_model / self.nhead)
        last_head = self.nhead - 1
        use_dropout = bool(self.attn_dropout)
        head_outputs = []
        for h in range(self.nhead):
            lo = h * head_width
            # An open-ended slice lets the final head absorb the remainder.
            hi = None if h == last_head else lo + head_width
            qh, kh, vh = q[:, :, lo:hi], k[:, :, lo:hi], v[:, :, lo:hi]

            weights = torch.softmax(torch.matmul(qh, kh.transpose(1, 2)), dim=-1)
            if use_dropout:
                weights = self.attn_dropout[h](weights)
            head_outputs.append(torch.matmul(weights, vh))
        att_output = torch.concat(head_outputs, dim=-1)

        # Residual on the normalised input, then the feed-forward block.
        xt = self.norm2(x + att_output)
        return xt + self.ffn(xt)


class Gate(nn.Module):
    """Turn a gate input into non-negative weights over ``d_output`` features.

    The input is projected linearly, divided by a temperature, passed through
    a softmax over the last dimension and multiplied by ``d_output``, so the
    weights of each row sum to ``d_output``.

    Args:
        d_input: Size of the last dimension of the gate input.
        d_output: Number of weights to produce.
        beta: Softmax temperature. ``None`` selects the default of ``1.0``;
            ``MASTER`` passes ``None`` when no temperature is given, and
            dividing by ``None`` would otherwise fail in ``forward``.
    """

    def __init__(self, d_input, d_output,  beta=1.0):
        super().__init__()
        self.trans = nn.Linear(d_input, d_output)
        self.d_output = d_output
        self.t = 1.0 if beta is None else beta

    def forward(self, gate_input):
        """Return the scaled softmax weights for ``gate_input``."""
        output = self.trans(gate_input)
        output = torch.softmax(output / self.t, dim=-1)
        return self.d_output * output


class TemporalAttention(nn.Module):
    """Collapse axis 1 of the input into a single attention-weighted row.

    The input is projected by a bias-free linear layer. The projection of the
    last entry along axis 1 serves as the query, and its dot products with
    every projected entry are softmax-normalised into weights. The output is
    the weighted sum of the unprojected input rows.
    """

    def __init__(self, d_model):
        super().__init__()
        self.trans = nn.Linear(d_model, d_model, bias=False)

    def forward(self, z):
        projected = self.trans(z)  # [N, T, D]
        last_step = projected[:, -1, :].unsqueeze(-1)  # [N, D, 1]
        scores = torch.matmul(projected, last_step).squeeze(-1)  # [N, T]
        weights = torch.softmax(scores, dim=1).unsqueeze(1)  # [N, 1, T]
        # [N, 1, T] @ [N, T, D] --> [N, 1, D], then drop the singleton axis.
        return torch.matmul(weights, z).squeeze(1)


class MASTER(nn.Module):
    """Gated feature selection followed by the MASTER attention stack.

    The last dimension of the input is split by column index: columns before
    ``gate_input_start_index`` are the features, and columns from
    ``gate_input_start_index`` up to ``gate_input_end_index`` feed the gate.

    Args:
        d_feat: Number of feature columns the gate weights and the first
            linear layer expect.
        d_model: Hidden width of the attention stack.
        t_nhead: Number of heads in the ``TAttention`` layer.
        s_nhead: Number of heads in the ``SAttention`` layer.
        T_dropout_rate: Dropout probability passed to ``TAttention``.
        S_dropout_rate: Dropout probability passed to ``SAttention``.
        gate_input_start_index: First column of the gate input.
        gate_input_end_index: Column just past the end of the gate input.
        beta: Temperature forwarded to ``Gate``.
    """

    def __init__(self, d_feat=158, d_model=256, t_nhead=4, s_nhead=2, T_dropout_rate=0.5, S_dropout_rate=0.5,
                 gate_input_start_index=158, gate_input_end_index=221, beta=None):
        super().__init__()
        # Market-information gate.
        self.gate_input_start_index = gate_input_start_index
        self.gate_input_end_index = gate_input_end_index
        self.d_gate_input = gate_input_end_index - gate_input_start_index  # F'
        self.feature_gate = Gate(self.d_gate_input, d_feat, beta=beta)

        stages = [
            # feature layer
            nn.Linear(d_feat, d_model),
            PositionalEncoding(d_model),
            # intra-stock aggregation
            TAttention(d_model=d_model, nhead=t_nhead, dropout=T_dropout_rate),
            # inter-stock aggregation
            SAttention(d_model=d_model, nhead=s_nhead, dropout=S_dropout_rate),
            TemporalAttention(d_model=d_model),
            # decoder
            nn.Linear(d_model, 1),
        ]
        self.layers = nn.Sequential(*stages)

    def forward(self, x):
        start = self.gate_input_start_index
        end = self.gate_input_end_index

        # Gate weights come from the gate columns of the last entry on axis 1.
        gate_weights = self.feature_gate(x[:, -1, start:end])
        # Scale the feature columns (N, T, D) by the per-row gate weights.
        gated = x[:, :, :start] * gate_weights.unsqueeze(1)

        return self.layers(gated).squeeze(-1)


# --------------------------------------------------------------------------------------------------------------
# class sequencemodel
# ORIGINALLY PART OF BASE_MODEL.PY
#   ** moved here to avoid import since this doesn't work in colab notebooks
# --------------------------------------------------------------------------------------------------------------
class SequenceModel():
    """Training, evaluation and prediction loop around ``self.model``.

    Subclasses are expected to assign ``self.model`` and then call
    ``init_model`` to create the optimizer and move the model to the device.

    Args:
        n_epochs: Maximum number of training epochs.
        lr: Learning rate for the Adam optimizer.
        GPU: CUDA device index. ``None`` selects the default CUDA device.
            Ignored when CUDA is unavailable, in which case the CPU is used.
        seed: Seed for NumPy and PyTorch; ``None`` leaves the RNGs untouched.
        train_stop_loss_thred: Training stops early once the epoch's training
            loss is at or below this value; ``None`` disables early stopping.
        save_path: Prefix prepended to the checkpoint file name.
        save_prefix: Second prefix placed before ``master_<seed>.pkl``.
    """

    def __init__(self, n_epochs, lr, GPU=None, seed=None, train_stop_loss_thred=None, save_path = 'model/', save_prefix= ''):
        self.n_epochs = n_epochs
        self.lr = lr
        if torch.cuda.is_available():
            # "cuda:None" is not a valid device string, so an unspecified GPU
            # falls back to the default CUDA device.
            self.device = torch.device("cuda" if GPU is None else f"cuda:{GPU}")
        else:
            self.device = torch.device("cpu")
        self.seed = seed
        self.train_stop_loss_thred = train_stop_loss_thred

        if self.seed is not None:
            np.random.seed(self.seed)
            torch.manual_seed(self.seed)
        self.fitted = False

        self.model = None
        self.train_optimizer = None

        self.save_path = save_path
        self.save_prefix = save_prefix

    def init_model(self):
        """Create the Adam optimizer and move the model to ``self.device``.

        Raises:
            ValueError: If ``self.model`` has not been assigned.
        """
        if self.model is None:
            raise ValueError("model has not been initialized")

        self.train_optimizer = optim.Adam(self.model.parameters(), self.lr)
        self.model.to(self.device)

    def loss_fn(self, pred, label):
        """Return the mean squared error over entries whose label is not NaN."""
        mask = ~torch.isnan(label)
        loss = (pred[mask]-label[mask])**2
        return torch.mean(loss)

    def train_epoch(self, data_loader):
        """Run one optimisation pass over ``data_loader``; return the mean loss."""
        self.model.train()
        losses = []

        for data in data_loader:
            data = torch.squeeze(data, dim=0)
            # data.shape: (N, T, F)
            #   N - number of stocks
            #   T - length of lookback_window, 8
            #   F - 158 factors + 63 market information + 1 label
            feature = data[:, :, 0:-1].to(self.device)
            label = data[:, -1, -1].to(self.device)

            pred = self.model(feature.float())
            loss = self.loss_fn(pred, label)
            losses.append(loss.item())

            self.train_optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_value_(self.model.parameters(), 3.0)
            self.train_optimizer.step()

        return float(np.mean(losses))

    def test_epoch(self, data_loader):
        """Return the mean loss over ``data_loader`` without updating the model."""
        self.model.eval()
        losses = []

        # No gradients are needed for evaluation; skipping them avoids
        # building autograd graphs for every validation batch.
        with torch.no_grad():
            for data in data_loader:
                data = torch.squeeze(data, dim=0)
                feature = data[:, :, 0:-1].to(self.device)
                label = data[:, -1, -1].to(self.device)
                pred = self.model(feature.float())
                loss = self.loss_fn(pred, label)
                losses.append(loss.item())

        return float(np.mean(losses))

    def _init_data_loader(self, data, shuffle=True, drop_last=True):
        """Wrap ``data`` in a DataLoader driven by ``DailyBatchSamplerRandom``."""
        sampler = DailyBatchSamplerRandom(data, shuffle)
        data_loader = DataLoader(data, sampler=sampler, drop_last=drop_last)
        return data_loader

    def load_param(self, param_path):
        """Load a saved state dict from ``param_path`` and mark the model fitted."""
        self.model.load_state_dict(torch.load(param_path, map_location=self.device))
        self.fitted = True

    def fit(self, dl_train, dl_valid):
        """Train for up to ``n_epochs`` epochs, then save the final parameters.

        The parameters written to disk are those of the last epoch that ran;
        the file is ``<save_path><save_prefix>master_<seed>.pkl``.
        """
        train_loader = self._init_data_loader(dl_train, shuffle=True, drop_last=True)
        valid_loader = self._init_data_loader(dl_valid, shuffle=False, drop_last=True)

        self.fitted = True
        for step in range(self.n_epochs):
            train_loss = self.train_epoch(train_loader)
            val_loss = self.test_epoch(valid_loader)

            print("Epoch %d, train_loss %.6f, valid_loss %.6f " % (step, train_loss, val_loss))

            # A missing threshold means "never stop early"; comparing a float
            # with None would raise.
            if self.train_stop_loss_thred is not None and train_loss <= self.train_stop_loss_thred:
                break

        param_file = f'{self.save_path}{self.save_prefix}master_{self.seed}.pkl'
        # Create the target directory so torch.save cannot fail on a missing folder.
        target_dir = os.path.dirname(param_file)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        torch.save(self.model.state_dict(), param_file)

    def predict(self, dl_test):
        """Predict on ``dl_test`` and compute per-batch correlation metrics.

        Returns:
            ``(predictions, metrics)``: a Series indexed by
            ``dl_test.get_index()`` and a dict with the keys ``IC``, ``ICIR``,
            ``RIC`` and ``RICIR``.

        Raises:
            ValueError: If the model has been neither fitted nor loaded.
        """
        if not self.fitted:
            raise ValueError("model is not fitted yet!")

        test_loader = self._init_data_loader(dl_test, shuffle=False, drop_last=False)

        preds = []
        ic = []
        ric = []

        self.model.eval()
        for data in test_loader:
            data = torch.squeeze(data, dim=0)
            feature = data[:, :, 0:-1].to(self.device)
            label = data[:, -1, -1]
            with torch.no_grad():
                pred = self.model(feature.float()).detach().cpu().numpy()
            preds.append(pred.ravel())

            daily_ic, daily_ric = calc_ic(pred, label.detach().numpy())
            ic.append(daily_ic)
            ric.append(daily_ric)

        predictions = pd.Series(np.concatenate(preds), index=dl_test.get_index())

        metrics = {
            'IC': np.mean(ic),
            'ICIR': np.mean(ic)/np.std(ic),
            'RIC': np.mean(ric),
            'RICIR': np.mean(ric)/np.std(ric)
        }

        return predictions, metrics


#--------------------------------------------------------------------------------------------------------------------------------------------
# BACK TO MASTER.PY
#--------------------------------------------------------------------------------------------------------------------------------------------

class MASTERModel(SequenceModel):
    """``SequenceModel`` whose network is a ``MASTER`` instance.

    The architecture arguments are stored on the instance and forwarded to
    ``MASTER`` by ``init_model``; every other keyword argument goes to
    ``SequenceModel.__init__``.
    """

    def __init__(
            self, d_feat: int = 20, d_model: int = 64, t_nhead: int = 4, s_nhead: int = 2, gate_input_start_index=None, gate_input_end_index=None,
            T_dropout_rate=0.5, S_dropout_rate=0.5, beta=5.0, **kwargs,
    ):
        super().__init__(**kwargs)

        # Network widths and head counts.
        self.d_feat = d_feat
        self.d_model = d_model
        self.t_nhead = t_nhead
        self.s_nhead = s_nhead

        # Column range of the gate input.
        self.gate_input_start_index = gate_input_start_index
        self.gate_input_end_index = gate_input_end_index

        # Regularisation and gate temperature.
        self.T_dropout_rate = T_dropout_rate
        self.S_dropout_rate = S_dropout_rate
        self.beta = beta

        self.init_model()

    def init_model(self):
        """Build the ``MASTER`` network, then set up the optimizer and device."""
        network_config = dict(
            d_feat=self.d_feat,
            d_model=self.d_model,
            t_nhead=self.t_nhead,
            s_nhead=self.s_nhead,
            T_dropout_rate=self.T_dropout_rate,
            S_dropout_rate=self.S_dropout_rate,
            gate_input_start_index=self.gate_input_start_index,
            gate_input_end_index=self.gate_input_end_index,
            beta=self.beta,
        )
        self.model = MASTER(**network_config)
        super().init_model()

## **MAIN.PY**
### **Model Initialization and Training**

In [None]:
import pickle

universe = 'csi800'  # or 'csi300'

# Folder on the mounted drive that holds this universe's pickled datasets.
data_dir = f'/content/drive/My Drive/ECE 570/data/{universe}'

# Please install qlib first before loading the data.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(f'{data_dir}/{universe}_dl_train.pkl', 'rb') as f:
    dl_train = pickle.load(f)
with open(f'{data_dir}/{universe}_dl_valid.pkl', 'rb') as f:
    dl_valid = pickle.load(f)
with open(f'{data_dir}/{universe}_dl_test.pkl', 'rb') as f:
    dl_test = pickle.load(f)
print("Data Loaded.")

# Model architecture.
d_feat = 158
d_model = 256
t_nhead = 4
s_nhead = 2
dropout = 0.5
# Columns [158, 221) of the input are used as the gate input.
gate_input_start_index=158
gate_input_end_index = 221

# Gate temperature depends on the universe.
if universe == 'csi300':
    beta = 10
elif universe == 'csi800':
    beta = 5
else:
    # Fail here instead of hitting a NameError on `beta` further down.
    raise ValueError(f"unsupported universe: {universe!r}")

# Training settings.
n_epoch = 5
lr = 8e-6
GPU = 0
seed = 0
train_stop_loss_thred = 0.95

model = MASTERModel(
    d_feat = d_feat, d_model = d_model, t_nhead = t_nhead, s_nhead = s_nhead, T_dropout_rate=dropout, S_dropout_rate=dropout,
    beta=beta, gate_input_end_index=gate_input_end_index, gate_input_start_index=gate_input_start_index,
    n_epochs=n_epoch, lr = lr, GPU = GPU, seed = seed, train_stop_loss_thred = train_stop_loss_thred,
    save_path='/content/drive/My Drive/ECE 570/data/model/', save_prefix=universe
)

# Train
model.fit(dl_train, dl_valid)
print("Model Trained.")

# Test
predictions, metrics = model.predict(dl_test)
print(metrics)

### **Model Loading and Testing**

In [None]:
# Load and Test
# Folder on the mounted drive that holds the project data.
data_path = '/content/drive/My Drive/ECE 570/data'

# List the files in the folder (to verify the mount).
print("Files in data folder:")
for filename in os.listdir(data_path):
    print(filename)

# Checkpoint to evaluate. The path previously ended in a stray '.', which does
# not match the '<prefix>master_<seed>.pkl' name that `fit` writes.
# NOTE(review): the training cell saves under '.../ECE 570/data/model/', while
# this path points at '.../ECE 570/model/' -- confirm which checkpoint is intended.
param_path = f'/content/drive/My Drive/ECE 570/model/{universe}master_0.pkl'
model.load_param(param_path)
# Report success only after the parameters have actually been loaded.
print(f'Model Loaded from {param_path}')
predictions, metrics = model.predict(dl_test)
print(metrics)


# This is where the official ReMASTER ends. happy investing! :)