In [1]:
# Configuration variables
dir = '/kaggle/input/your-data-folder'  # Adjust this path for your Kaggle setup
import os

num_companies = 150 # max is 1026
num_days = 1245
num_features = 5
window_size = 20 # if you change this is not changed everywhere yet unfortunately todo
calculate_correlation = False
train_batch = 1
val_batch = 1
K = 5
epochs = 20
val_min_num = 10
use_kfold = False

if 'KAGGLE_KERNEL_RUN_TYPE' in os.environ:
    print("Running on Kaggle!")
    dir = '/kaggle/input/rsr-dataset/Data/'
    train_batch = 32
    val_batch = 32
    epochs = 100
    num_companies = 1026
    calculate_correlation = True

else:
    dir = '/home/study/IdeaProjects/Graph-Machine-Learning/Temporal_RSR/data' # Samuel's directory
    print("Running locally!")
    # turn

SAVE_PREPROCESSED_DATA = False  # Set to True to save preprocessed data for faster loading


Running locally!


In [2]:
import torch
import numpy as np
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import os

In [3]:
"""
COPIED FROM THE PAPER
source code: https://github.com/fulifeng/Temporal_Relational_Stock_Ranking
"""
def load_EOD_data(data_path, market_name, tickers, steps=1):
    eod_data = []
    masks = []
    ground_truth = []
    base_price = []

    # Determine the expected number of rows based on the first ticker's data
    first_ticker_path = os.path.join(data_path, market_name + '_' + tickers[0] + '_1.csv')
    try:
        first_df = pd.read_csv(first_ticker_path, header=None)
        num_days = first_df.shape[0] - (1 if market_name == 'NASDAQ' else 0) # Remove last row for NASDAQ
        num_features = first_df.shape[1] - 1 # Exclude the date column
    except Exception as e:
        print(f"Error reading first ticker file {first_ticker_path}: {e}")
        return None, None, None, None

    eod_data = np.zeros([len(tickers), num_days, num_features], dtype=np.float32)
    masks = np.ones([len(tickers), num_days], dtype=np.float32)
    ground_truth = np.zeros([len(tickers), num_days], dtype=np.float32) # We're not using this one
    base_price = np.zeros([len(tickers), num_days], dtype=np.float32)

    for index, ticker in enumerate(tickers):
        if index % 50 == 0:
          print(f"Processed [{index}/{tickers.shape[0]}] tickers")
        single_EOD_path = os.path.join(data_path, market_name + '_' + ticker + '_1.csv')

        try:
            single_df = pd.read_csv(single_EOD_path, header=None)
            if market_name == 'NASDAQ':
                single_df = single_df[:-1] # remove the last day since lots of missing data

            # Handle missing values (-1234)
            single_EOD = single_df.values
            mask_row_indices, mask_col_indices = np.where(np.abs(single_EOD + 1234) < 1e-8)
            single_EOD[mask_row_indices, mask_col_indices] = 1.1 # Replace missing values

            # Update masks based on missing closing price
            missing_close_indices = np.where(np.abs(single_EOD[:, -1] + 1234) < 1e-8)[0]
            masks[index, missing_close_indices] = 0.0

            eod_data[index, :, :] = single_EOD[:, 1:] # Exclude date column
            base_price[index, :] = single_EOD[:, -1]

        except Exception as e:
            print(f"Error reading ticker file {single_EOD_path}: {e}")
            # Mark all days for this ticker as invalid if file reading fails
            masks[index, :] = 0.0


    print('eod data shape:', eod_data.shape)
    return eod_data, masks, ground_truth, base_price

In [4]:
"""
COPIED FROM THE PAPER
source code: https://github.com/fulifeng/Temporal_Relational_Stock_Ranking
"""
def load_relation_data(relation_file):
    relation_encoding = np.load(relation_file)
    print('relation encoding shape:', relation_encoding.shape)
    rel_shape = [relation_encoding.shape[0], relation_encoding.shape[1]]
    mask_flags = np.equal(np.zeros(rel_shape, dtype=int),
                          np.sum(relation_encoding, axis=2))
    mask = np.where(mask_flags, np.ones(rel_shape) * -1e9, np.zeros(rel_shape))
    return relation_encoding, mask

# Loading data

In [5]:
# market = "NYSE"
market = "NASDAQ"

In [6]:
industry_encodings, industry_mask = load_relation_data(dir+f'/relation/sector_industry/{market}_industry_relation.npy')

relation encoding shape: (1026, 1026, 97)


In [7]:
wiki_encodings, wiki_mask = load_relation_data(dir+f'/relation/wikidata/{market}_wiki_relation.npy')

relation encoding shape: (1026, 1026, 43)


In [8]:
# Load company names
tickers = np.loadtxt(dir+f'/{market}_tickers.csv', dtype=str)
print('tickers shape (# of companies):', tickers.shape)

tickers shape (# of companies): (1026,)


In [9]:
eod_data, eod_masks, eod_ground_truth, eod_base_price = load_EOD_data(dir+"/2013-01-01", market, tickers)

Processed [0/1026] tickers
Processed [50/1026] tickers
Processed [100/1026] tickers
Processed [150/1026] tickers
Processed [200/1026] tickers
Processed [250/1026] tickers
Processed [300/1026] tickers
Processed [350/1026] tickers
Processed [400/1026] tickers
Processed [450/1026] tickers
Processed [500/1026] tickers
Processed [550/1026] tickers
Processed [600/1026] tickers
Processed [650/1026] tickers
Processed [700/1026] tickers
Processed [750/1026] tickers
Processed [800/1026] tickers
Processed [850/1026] tickers
Processed [900/1026] tickers
Processed [950/1026] tickers
Processed [1000/1026] tickers
eod data shape: (1026, 1245, 5)


In [10]:
# Use subset of data for the experiments
n_companies = 150

wiki_encodings = wiki_encodings[:n_companies, :n_companies, :]
wiki_mask = wiki_mask[:n_companies, :n_companies]
industry_encodings = industry_encodings[:n_companies, :n_companies, :]
industry_mask = industry_mask[:n_companies, :n_companies]

eod_data, eod_masks, eod_ground_truth, eod_base_price = load_EOD_data(dir+"/2013-01-01", "NASDAQ", tickers[:n_companies])

Processed [0/150] tickers
Processed [50/150] tickers
Processed [100/150] tickers
eod data shape: (150, 1245, 5)


# Graph based Models

In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [12]:
# ============================================================================
# Data Preparation Functions
# ============================================================================

def build_adjacency_matrix(industry_encodings, industry_mask, wiki_encodings, wiki_mask, device):
    """
    Build normalized adjacency matrix from relation encodings and masks

    Args:
        industry_encodings: [num_companies, num_companies, num_relation_types]
        industry_mask: [num_companies, num_companies] (-1e9 for no relation, 0 for valid)
        wiki_encodings: [num_companies, num_companies, num_relation_types]
        wiki_mask: [num_companies, num_companies]

    Returns:
        adjacency_matrix: [num_companies, num_companies] - normalized adjacency
    """
    # Combine relation encodings by summing across relation types
    industry_adj = torch.sum(industry_encodings, dim=-1)  # [companies, companies]
    wiki_adj = torch.sum(wiki_encodings, dim=-1)

    # Combine both relation types
    combined_adj = industry_adj + wiki_adj

    # Apply masks: where mask is -1e9 (no relation), set adjacency to 0
    combined_mask = industry_mask + wiki_mask
    combined_adj = torch.where(combined_mask < -1e8, torch.zeros_like(combined_adj), combined_adj)

    # Normalize: row-wise normalization (each row sums to 1)
    row_sums = combined_adj.sum(dim=1, keepdim=True)
    adjacency_matrix = combined_adj / (row_sums + 1e-8) # [0, 1)

    return adjacency_matrix.to(device)


def prepare_data(eod_data, masks, base_price, device, window_size=20, prediction_horizon=1):
    """
    Create sliding windows for time series prediction with mask handling

    Args:
        eod_data: [num_companies, num_days, num_features]
        masks: [num_companies, num_days] - 1.0 for valid, 0.0 for missing
        base_price: [num_companies, num_days] - closing price of stock
        window_size: Number of historical days to use as input
        prediction_horizon: Number of days ahead to predict (usually 1)

    Returns:
        X: Input windows [num_samples, num_companies, window_size, num_features]
        y: Target returns [num_samples, num_companies, prediction_horizon]
        sample_masks: Valid sample indicators [num_samples, num_companies]
    """
    num_companies, num_days, num_features = eod_data.shape
    num_samples = num_days - window_size - prediction_horizon + 1

    X = torch.zeros(num_samples, num_companies, window_size, num_features, device=device)
    y = torch.zeros(num_samples, num_companies, prediction_horizon, device=device)
    sample_masks = torch.zeros(num_samples, num_companies, device=device)

    for i in range(num_samples):
        X[i] = eod_data[:, i:i+window_size, :]
        y[i, :, :] = base_price[:, i+window_size : i+window_size+prediction_horizon] #

        # A sample is valid if all days in the window AND the target day are valid
        window_valid = masks[:, i:i+window_size].min(dim=1)[0]  # [num_companies]
        target_valid = masks[:, i+window_size : i+window_size+prediction_horizon].min(dim=1)[0]
        sample_masks[i] = window_valid * target_valid

    return X, y, sample_masks

In [13]:
!export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

# Load data
# num_companies = eod_data.shape[0] # Change if using subset
num_companies = 150
num_days = 1245
num_features = 5

# Subsample to only use the first num_companies
eod_data = torch.tensor(eod_data[:num_companies])
masks = torch.tensor(eod_masks[:num_companies])
price_prediction = torch.tensor(eod_base_price[:num_companies])  # FIXED: subsample this too!

# Relation data - subsample both dimensions since it's company x company
industry_encodings = torch.tensor(industry_encodings[:num_companies, :num_companies])
industry_mask = torch.tensor(industry_mask[:num_companies, :num_companies])
wiki_encodings = torch.tensor(wiki_encodings[:num_companies, :num_companies])
wiki_mask = torch.tensor(wiki_mask[:num_companies, :num_companies])

print(f"EOD data shape: {eod_data.shape}")
print(f"Masks shape: {masks.shape}")
print(f"Ground truth shape: {price_prediction.shape}")
print(f"Industry encodings shape: {industry_encodings.shape}")


Using device: cuda
EOD data shape: torch.Size([150, 1245, 5])
Masks shape: torch.Size([150, 1245])
Ground truth shape: torch.Size([150, 1245])
Industry encodings shape: torch.Size([150, 150, 97])


In [15]:
def get_adjacency_matrix(prediction_horizon=1, window_size=100):
  # Build adjacency matrix from relations
  adjacency_matrix = build_adjacency_matrix(
      industry_encodings, industry_mask,
      wiki_encodings, wiki_mask,
      device=device
  )
  #print(f"Adjacency matrix shape: {adjacency_matrix.shape}")

  # Prepare temporal data with masks
  X_train, y_train, train_masks = prepare_data(
      eod_data, masks, price_prediction,
      window_size=window_size,
      device=device,
      prediction_horizon=prediction_horizon
  )
  #print(f"Training data: X={X_train.shape}, y={y_train.shape}, masks={train_masks.shape}")

  return (adjacency_matrix, X_train, y_train, train_masks)

## Simple G-Var

In [16]:
"""
Simple G-VAR (Graph Vector AutoRegression) for Stock Price Prediction
Combines temporal dependencies (VAR) with graph structure (GNN)
Updated to match the paper's data format
"""

# ============================================================================
# Simple G-Var
# ============================================================================

class GVarModel(nn.Module):
    def __init__(self, input_dim, output_dim, num_companies, device, K=2):
        """
        Args:
            input_dim: Number of features * window_size per company
            output_dim: Prediction dimension (1 for return prediction)
            num_companies: Number of stocks (e.g., 150)
            K: Number of graph hops
        """
        super(GVarModel, self).__init__()
        self.device = device
        self.K = K
        self.num_companies = num_companies
        self.output_dim = output_dim

        self.graph_layers = nn.ModuleList([
            nn.Linear(input_dim, 1) for _ in range(K + 1)
        ])

    def forward(self, x, adjacency_matrix):
        """
        Args:
            x: Historical data [batch, num_companies, time_steps, input_dim]
            adjacency_matrix: Graph structure [num_companies, num_companies]
        Returns:
            predictions: [batch, num_companies, output_dim]
        """
        batch_size = x.shape[0]

        # Step 1: Extract temporal features for each stock independently
        # Reshape to process all companies' time series
        x_reshaped = x.view(x.shape[0], x.shape[1], -1)  # [batch, companies, time_steps * features]

        # Compute powers of adjacency matrix: A^0 (self), A^1 (neighbors), A^2 (2-hop), ...
        S_powers = [torch.eye(self.num_companies, device=adjacency_matrix.device)]
        for k in range(self.K):
            S_powers.append(torch.matmul(S_powers[-1], adjacency_matrix))

        # Step 2: Aggregate information from k-hop neighbors
        output = torch.zeros(x.shape[0], x.shape[1], self.output_dim, device=self.device)
        for k in range(self.K + 1):
            # Transform features at each hop level
            transformed = self.graph_layers[k](x_reshaped)  # [batch, companies, hidden_dim]

            # Aggregate from k-hop neighbors: S^k @ transformed
            aggregated = torch.matmul(S_powers[k], transformed)  # [batch, companies, hidden_dim]
            output += aggregated

        return output

In [17]:
torch.cuda.empty_cache()

torch.cuda.memory_allocated()

#import gc
#gc.collect()

0

In [18]:
# prediction_horizon=1
# window_size=100

# adjacency_matrix, X_train, y_train, train_masks = get_adjacency_matrix(prediction_horizon, window_size)

# print("train_masks.sum():", train_masks.sum())
# print("train_masks size:", train_masks.shape[0]*train_masks.shape[1])


# # Initialize model
# model = GVarModel(
#     input_dim=num_features*window_size,
#     output_dim=prediction_horizon, # = prediction_horizon
#     num_companies=num_companies,
#     device=device,
#     K=1
# ).to(device)

# # Training with masked loss
# criterion = nn.MSELoss(reduction='none')  # Don't reduce yet, we'll apply masks
# #criterion = nn.L1Loss(reduction='none')  # Don't reduce yet, we'll apply masks
# optimizer = optim.Adam(model.parameters(), lr=0.001)

# epochs = 500
# for epoch in range(epochs):
#     model.train()
#     optimizer.zero_grad()

#     # Forward pass
#     #predictions = model(X_train, adjacency_matrix)  # [batch, companies, 1]
#     predictions = model(X_train, torch.eye(num_companies, device=device))  # [batch, companies, 1]


#     # Calculate masked loss (only on valid samples)
#     loss_per_sample = criterion(predictions, y_train)  # [batch, companies, 1]
#     masked_loss = loss_per_sample * train_masks.unsqueeze(-1)  # Apply mask

#     # Average loss over valid samples only
#     num_valid = train_masks.sum() + 1e-8
#     #loss = masked_loss[:,:,model.output_dim-1].sum() / num_valid # Loss only for prediction_horizon day in future (1 day)
#     loss = masked_loss.sum() / num_valid # Loss for all days up to prediction_horizon

#     # Backward pass
#     loss.backward()
#     optimizer.step()

#     if (epoch + 1) % 50 == 0:
#         print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.6f}')

# print("Training compled")

# GNN

In [19]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

def build_graph_matrix(industry_encodings, industry_mask, wiki_encodings, wiki_mask, device):
    """
    Build adjacency and degree matrices from relation encodings and masks

    Returns:
        adjacency_matrix: [num_companies, num_companies]
        degree_matrix: [num_companies, num_companies]
    """
    # Combine relation encodings by summing across relation types
    industry_adj = torch.sum(industry_encodings, dim=-1)  # [companies, companies]
    wiki_adj = torch.sum(wiki_encodings, dim=-1)

    combined_adj = industry_adj + wiki_adj

    combined_mask = industry_mask + wiki_mask
    #combined_adj = torch.where(combined_mask < -1e8, torch.zeros_like(combined_adj), combined_adj)
    degree_matrix = torch.diag(torch.pow(torch.sum(combined_adj, dim=1), -0.5))  # [companies, companies]

    graph_shift_operator =  degree_matrix @ combined_adj.float() @  degree_matrix
    return graph_shift_operator.to(device)

def prepare_data(eod_data, masks, base_price, device, window_size=20, prediction_horizon=1):
    """
    Create sliding windows for time series prediction with mask handling

    Returns:
        X: Input windows [num_samples, num_companies, window_size, num_features]
        y: Target returns [num_samples, num_companies, prediction_horizon]
        sample_masks: Valid sample indicators [num_samples, num_companies]
    """
    num_companies, num_days, num_features = eod_data.shape
    num_samples = num_days - window_size - prediction_horizon + 1

    X = torch.zeros(num_samples, num_companies, window_size, num_features, device=device)
    y = torch.zeros(num_samples, num_companies, prediction_horizon, device=device)
    sample_masks = torch.zeros(num_samples, num_companies, device=device)

    for i in range(num_samples):
        X[i] = eod_data[:, i:i+window_size, :]
        y[i, :, :] = base_price[:, i+window_size : i+window_size+prediction_horizon] #

        # A sample is valid if all days in the window AND the target day are valid
        window_valid = masks[:, i:i+window_size].min(dim=1)[0]  # [num_companies]
        target_valid = masks[:, i+window_size : i+window_size+prediction_horizon].min(dim=1)[0]
        sample_masks[i] = window_valid * target_valid

    return X, y, sample_masks

class GCNModel(nn.Module):
    def __init__(self, layers_dim, num_companies, S, device, K=1, L=1):
        """
        Args:
            input_dim: Number of features * window_size per company
            output_dim: Prediction dimension (1 for return prediction)
            num_companies: Number of stocks (e.g., 150)
            K: Number of graph hops
        """
        super(GCNModel, self).__init__()
        self.device = device
        self.K = K
        self.L = L
        self.num_companies = num_companies
        self.layers_dim = layers_dim

        # Compute powers of adjacency matrix: A^0 (self), A^1 (neighbors), A^2 (2-hop), ...
        self.S_powers = [S]
        for k in range(self.K):
            self.S_powers.append(self.S_powers[-1] @ S)

        self.gcn_layer1 = nn.ModuleList([
            nn.Linear(layers_dim[0][0], layers_dim[0][1]) for _ in range(K)
        ])

        self.activation1 = F.ReLU()

        self.gcn_layer2 = nn.ModuleList([
            nn.Linear(layers_dim[1][0], layers_dim[1][1]) for _ in range(K)
        ])

        self.activation2 = F.ReLU()

    def forward(self, x):
        """
        Args:
            x: Historical data [batch, num_companies, time_steps, input_dim]
            adjacency_matrix: Graph structure [num_companies, num_companies]
        Returns:
            predictions: [batch, num_companies, output_dim]
        """
        batch_size = x.shape[0]
        x_reshaped = x.view(x.shape[0], x.shape[1], -1)  # [batch, companies, time_steps * features]

        x_i = x_reshaped
        output = torch.zeros(batch_size, self.num_companies, self.layers_dim[0][1], device=self.device)
        for k in range(self.K):
            # Transform features at each hop level
            transformed = self.gcn_layer1[k](self.S_powers[k] @ x_i)

            # Aggregate from k-hop neighbors using graph structure
            output += transformed
        x_i = self.activation1(output)

        output = torch.zeros(batch_size, self.num_companies, self.layers_dim[1][1], device=self.device)
        for k in range(self.K):
            # Transform features at each hop level
            transformed = self.gcn_layer2[k](self.S_powers[k] @ x_i)

            # Aggregate from k-hop neighbors using graph structure
            output += transformed
        x_i = self.activation2(output)
        return x_i

## GCNGATModel

In [23]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

def build_graph_matrix(industry_encodings, industry_mask, wiki_encodings, wiki_mask, device):
    """
    Build adjacency and degree matrices from relation encodings and masks

    Returns:
        adjacency_matrix: [num_companies, num_companies]
        degree_matrix: [num_companies, num_companies]
    """
    # Combine relation encodings by summing across relation types
    industry_adj = torch.sum(industry_encodings, dim=-1)  # [companies, companies]
    wiki_adj = torch.sum(wiki_encodings, dim=-1)

    combined_adj = industry_adj + wiki_adj

    combined_mask = industry_mask + wiki_mask
    #combined_adj = torch.where(combined_mask < -1e8, torch.zeros_like(combined_adj), combined_adj)
    degree_matrix = torch.diag(torch.pow(torch.sum(combined_adj, dim=1), -0.5))  # [companies, companies]

    graph_shift_operator =  degree_matrix @ combined_adj.float() @  degree_matrix
    return graph_shift_operator.to(device)

def prepare_data(eod_data, masks, base_price, device, window_size=20, prediction_horizon=1):
    """
    Create sliding windows for time series prediction with mask handling

    Returns:
        X: Input windows [num_samples, num_companies, window_size, num_features]
        y: Target returns [num_samples, num_companies, prediction_horizon]
        sample_masks: Valid sample indicators [num_samples, num_companies]
    """
    num_companies, num_days, num_features = eod_data.shape
    num_companies = 150
    num_samples = num_days - window_size - prediction_horizon + 1

    X = torch.zeros(num_samples, num_companies, window_size, num_features, device=device)
    y = torch.zeros(num_samples, num_companies, prediction_horizon, device=device)
    sample_masks = torch.zeros(num_samples, num_companies, device=device)

    for i in range(num_samples):
        X[i] = eod_data[:, i:i+window_size, :]
        y[i, :, :] = base_price[:, i+window_size : i+window_size+prediction_horizon] #

        # A sample is valid if all days in the window AND the target day are valid
        window_valid = masks[:, i:i+window_size].min(dim=1)[0]  # [num_companies]
        target_valid = masks[:, i+window_size : i+window_size+prediction_horizon].min(dim=1)[0]
        sample_masks[i] = window_valid * target_valid

    return X, y, sample_masks

class GCNGATModel(nn.Module):
    def __init__(self, layers_dim, num_companies, adjacency_matrix, S, device, K=1, L=1):
        """
        Args:
            input_dim: Number of features * window_size per company
            output_dim: Prediction dimension (1 for return prediction)
            num_companies: Number of stocks (e.g., 150)
            K: Number of graph hops
        """
        super(GCNGATModel, self).__init__()
        self.device = device
        self.K = K
        self.L = L
        self.num_companies = num_companies
        self.layers_dim = layers_dim
        self.w_linear = nn.Linear(layers_dim[0][0], 15)
        self.a = nn.Parameter(torch.randn(size=(2*15, 1))) # size should be 2*out feature size? but what is the out feature size? # todod maybe zero initialize does not work, maybe ones is better
        # todo maybe use seed to prevent randomness in initialization
        
        # todo do i need to initialize the a parameter?


        
        self.S = S
        self.adjacency_matrix = adjacency_matrix
        self.mask = (self.adjacency_matrix == 0)

        # Compute powers of adjacency matrix: A^0 (self), A^1 (neighbors), A^2 (2-hop), ...
        # self.S_powers = [S]
        # for k in range(self.K):
        #     self.S_powers.append(self.S_powers[-1] @ S)

        # self.gcn_layer1 = nn.ModuleList([
        #     nn.Linear(layers_dim[0][0], layers_dim[0][1]) for _ in range(K) # todo I think normal gcn would only have k of 1, and then multiple layers would allow higher k
        # ])

        # self.activation1 = F.ReLU()

        # self.gcn_layer2 = nn.ModuleList([
        #     nn.Linear(layers_dim[1][0], layers_dim[1][1]) for _ in range(K)
        # ])

        # self.activation2 = F.ReLU()

    def forward(self, x):
        """
        Args:
            x: Historical data [batch, num_companies, time_steps, input_dim]
            adjacency_matrix: Graph structure [num_companies, num_companies]
        Returns:
            predictions: [batch, num_companies, output_dim]
        """
        batch_size = x.shape[0]
        x_reshaped = x.view(x.shape[0], x.shape[1], -1)  # [batch, companies, time_steps * features]
        
        x_i = x_reshaped

        h = self.w_linear(x_i) # this is first linear transformation of the x input 
        # so now we have all hi*W, but now we need to concatenate all pairs for in the 

        # h contains all node(companies) embeddings
        # first calculate the first part dot procut with a, after the second part
        first_part = h @ self.a[:15, :]
        second_part = h @ self.a[15:, :] # check if i should mention inspiration https://epichka.com/blog/2023/gat-paper-explained/
        # print(first_part.shape)
        # print(second_part.shape)
        partly = first_part+ second_part.mT
        # print(partly.shape)
        e = F.leaky_relu(partly)
        # this above was the a^t[Whi || Whj]
        
        # print(e.shape)

        # print(mask.shape)
        e = e.masked_fill(self.mask, float('-inf'))
        attention = F.softmax(e, dim=-1)
        output = attention @ h
        
        
        # output = torch.zeros(batch_size, self.num_companies, self.layers_dim[0][1], device=self.device)
        # for k in range(self.K):
        #     # Transform features at each hop level
        #     transformed = self.gcn_layer1[k](self.S_powers[k] @ x_i)

        #     # Aggregate from k-hop neighbors using graph structure
        #     output += transformed
        # x_i = self.activation1(output)

        # output = torch.zeros(batch_size, self.num_companies, self.layers_dim[1][1], device=self.device)
        # for k in range(self.K):
        #     # Transform features at each hop level
        #     transformed = self.gcn_layer2[k](self.S_powers[k] @ x_i)

        #     # Aggregate from k-hop neighbors using graph structure
        #     output += transformed
        # x_i = self.activation2(output)
        return output

In [24]:
window_size=30
graph_shift_operator = build_graph_matrix(industry_encodings, industry_mask, wiki_encodings, wiki_mask, device)
adjacency_matrix = build_adjacency_matrix(
      industry_encodings, industry_mask,
      wiki_encodings, wiki_mask,
      device=device
  )
X_train, y_train, train_masks = prepare_data(
    eod_data=eod_data,
    masks=masks,
    base_price=price_prediction,
    device=device,
    window_size=window_size)

# Initialize model
model = GCNGATModel(
    layers_dim=[(num_features*window_size, 15), (15, 1)],
    num_companies=num_companies,
    adjacency_matrix=adjacency_matrix,
    S=graph_shift_operator,
    device=device,
    K=1,
    L=1
).to(device)

# Training with masked loss
criterion = nn.MSELoss(reduction='none')  # Don't reduce yet, we'll apply masks
#criterion = nn.L1Loss(reduction='none')  # Don't reduce yet, we'll apply masks
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 1000
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()

    # Forward pass
    #predictions = model(X_train, adjacency_matrix)  # [batch, companies, 1]
    predictions = model(X_train)  # [batch, companies, 1]

    # Calculate masked loss (only on valid samples)
    loss_per_sample = criterion(predictions, y_train)  # [batch, companies, 1]
    masked_loss = loss_per_sample * train_masks.unsqueeze(-1)  # Apply mask

    # Average loss over valid samples only
    num_valid = train_masks.sum() + 1e-8
    #loss = masked_loss[:,:,model.output_dim-1].sum() / num_valid # Loss only for prediction_horizon day in future (1 day)
    loss = masked_loss.sum() / num_valid # Loss for all days up to prediction_horizon

    # Backward pass
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.6f}')

print("Training complted")

  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [10/1000], Loss: 0.620821
Epoch [20/1000], Loss: 0.599193
Epoch [30/1000], Loss: 0.251606
Epoch [40/1000], Loss: 0.091520
Epoch [50/1000], Loss: 0.091906
Epoch [60/1000], Loss: 0.060357
Epoch [70/1000], Loss: 0.059873
Epoch [80/1000], Loss: 0.056290
Epoch [90/1000], Loss: 0.055382
Epoch [100/1000], Loss: 0.054681
Epoch [110/1000], Loss: 0.054128
Epoch [120/1000], Loss: 0.053693
Epoch [130/1000], Loss: 0.053249
Epoch [140/1000], Loss: 0.052836
Epoch [150/1000], Loss: 0.052438
Epoch [160/1000], Loss: 0.052053
Epoch [170/1000], Loss: 0.051680
Epoch [180/1000], Loss: 0.051318
Epoch [190/1000], Loss: 0.050967
Epoch [200/1000], Loss: 0.050626
Epoch [210/1000], Loss: 0.050295
Epoch [220/1000], Loss: 0.049973
Epoch [230/1000], Loss: 0.049660
Epoch [240/1000], Loss: 0.049355
Epoch [250/1000], Loss: 0.049058
Epoch [260/1000], Loss: 0.048768
Epoch [270/1000], Loss: 0.048485
Epoch [280/1000], Loss: 0.048208
Epoch [290/1000], Loss: 0.047938
Epoch [300/1000], Loss: 0.047674
Epoch [310/1000], L

In [22]:
# window_size=30
# graph_shift_operator = build_graph_matrix(industry_encodings, industry_mask, wiki_encodings, wiki_mask, device)
# X_train, y_train, train_masks = prepare_data(
#     eod_data=eod_data,
#     masks=masks,
#     base_price=price_prediction,
#     device=device,
#     window_size=window_size)

# # Initialize model
# model = GCNModel(
#     layers_dim=[(num_features*window_size, 15), (15, 1)],
#     num_companies=num_companies,
#     S=graph_shift_operator,
#     device=device,
#     K=1,
#     L=1
# ).to(device)

# # Training with masked loss
# criterion = nn.MSELoss(reduction='none')  # Don't reduce yet, we'll apply masks
# #criterion = nn.L1Loss(reduction='none')  # Don't reduce yet, we'll apply masks
# optimizer = optim.Adam(model.parameters(), lr=0.001)

# epochs = 1000
# for epoch in range(epochs):
#     model.train()
#     optimizer.zero_grad()

#     # Forward pass
#     #predictions = model(X_train, adjacency_matrix)  # [batch, companies, 1]
#     predictions = model(X_train)  # [batch, companies, 1]


#     # Calculate masked loss (only on valid samples)
#     loss_per_sample = criterion(predictions, y_train)  # [batch, companies, 1]
#     masked_loss = loss_per_sample * train_masks.unsqueeze(-1)  # Apply mask

#     # Average loss over valid samples only
#     num_valid = train_masks.sum() + 1e-8
#     #loss = masked_loss[:,:,model.output_dim-1].sum() / num_valid # Loss only for prediction_horizon day in future (1 day)
#     loss = masked_loss.sum() / num_valid # Loss for all days up to prediction_horizon

#     # Backward pass
#     loss.backward()
#     optimizer.step()

#     if (epoch + 1) % 10 == 0:
#         print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.6f}')

# print("Training complted")