In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import os
import pandas as pd

In [None]:
# Dataset preprocessing
# Planned features: the Close and Volume columns,
# plus an embedding for each stock's name.

In [None]:
# Define the dataset path.
# The location can be overridden with the STOCK_DATA_DIR environment variable so
# the notebook is not tied to a single machine; the original path is the fallback.
data_dir = os.environ.get(
    "STOCK_DATA_DIR",
    "/Users/ayushkushwaha/Desktop/Sem-8-final-project/dataset",
)

# Directory that receives the split files.  It is created up front because
# DataFrame.to_csv does not create missing parent directories.
splits_dir = "dataset/splits"
os.makedirs(splits_dir, exist_ok=True)

# Define the split ratios
train_ratio = 0.7
val_ratio = 0.15  # Test ratio will be 1 - train_ratio - val_ratio

# Iterate through each CSV file (sorted: os.listdir order is arbitrary)
for file in sorted(os.listdir(data_dir)):
    if file.endswith(".csv"):
        file_path = os.path.join(data_dir, file)
        df = pd.read_csv(file_path, parse_dates=['Date'])  # Ensure 'Date' column is parsed as datetime
        df = df.sort_values(by='Date')  # Chronological order so the split is train -> val -> test in time

        # Split indices
        train_size = int(len(df) * train_ratio)
        val_size = int(len(df) * val_ratio)

        train_data = df.iloc[:train_size]
        val_data = df.iloc[train_size:train_size + val_size]
        test_data = df.iloc[train_size + val_size:]  # Remainder, so no row is lost to rounding

        # Save each split next to the others, prefixed with the split name
        train_data.to_csv(f"{splits_dir}/train_{file}", index=False)
        val_data.to_csv(f"{splits_dir}/val_{file}", index=False)
        test_data.to_csv(f"{splits_dir}/test_{file}", index=False)

        print(f"Processed {file}: Train({len(train_data)}), Val({len(val_data)}), Test({len(test_data)})")

In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Reproducibility: seed the generators used for weight initialisation, dropout
# and DataLoader shuffling so a fresh "Restart & Run All" repeats the same run.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Hyperparameters
SEQ_LEN = 30  # Number of past days used for prediction
PRED_LEN = 1  # Predict the next day's price
D_MODEL = 64  # Transformer model dimension
NHEAD = 4  # Multi-head attention heads
NUM_LAYERS = 3  # Transformer layers
BATCH_SIZE = 32
EPOCHS = 20
LR = 0.001

# Load dataset
data_dir = "dataset/historical_data"

# Function to load and preprocess stock data
def load_stock_data(file_path):
    """Read one stock CSV and return its daily returns.

    The file must contain a 'Date' column (parsed as datetime) and a 'Close'
    column.  Rows are sorted by date and the percentage change of 'Close'
    between consecutive rows is computed.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A float NumPy array of shape (n_returns, 1).  The first row (which has
        no previous close) and any non-finite return are dropped.
    """
    df = pd.read_csv(file_path, parse_dates=['Date'])
    df = df.sort_values(by='Date')
    returns = df['Close'].pct_change()  # Use returns instead of raw prices
    # A zero previous close yields +/-inf; treat it like a missing value so it
    # cannot poison the loss later on.
    returns = returns.replace([np.inf, -np.inf], np.nan)
    # Only the return itself decides whether a row is usable: dropping on every
    # column would discard rows (or the whole stock) for gaps in unrelated columns.
    returns = returns.dropna()
    return returns.to_numpy().reshape(-1, 1)  # Return as (n, 1) NumPy array

# Prepare dataset for Transformer
class StockDataset(Dataset):
    """Sliding-window dataset over a single time series.

    Sample ``idx`` pairs the ``seq_len`` rows starting at ``idx`` (the model
    input) with the ``pred_len`` rows that immediately follow (the target).

    Args:
        data: Indexable sequence of rows, e.g. the (n, 1) array returned by
            ``load_stock_data``.
        seq_len: Number of rows in each input window.
        pred_len: Number of rows in each target window.
    """

    def __init__(self, data, seq_len=SEQ_LEN, pred_len=PRED_LEN):
        self.data = data
        self.seq_len = seq_len
        self.pred_len = pred_len

    def __len__(self):
        # Clamp at zero: a series shorter than one full window would otherwise
        # produce a negative count, which makes len() raise ValueError.
        return max(0, len(self.data) - self.seq_len - self.pred_len + 1)

    def __getitem__(self, idx):
        """Return ``(x, y)`` float32 tensors for the window starting at ``idx``."""
        x = self.data[idx:idx + self.seq_len]
        y = self.data[idx + self.seq_len:idx + self.seq_len + self.pred_len]
        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

# Load every stock and split each one chronologically (70% / 15% / 15%).
# The split and the windowing are done per stock: concatenating the stocks first
# would create windows that straddle two unrelated series and would split by
# file order instead of by time.
all_parts, train_parts, val_parts, test_parts = [], [], [], []
for file in sorted(os.listdir(data_dir)):  # sorted: os.listdir order is arbitrary
    if file.endswith(".csv"):
        file_path = os.path.join(data_dir, file)
        stock_data = load_stock_data(file_path)
        stock_train_size = int(len(stock_data) * 0.7)
        stock_val_size = int(len(stock_data) * 0.15)
        all_parts.append(stock_data)
        train_parts.append(stock_data[:stock_train_size])
        val_parts.append(stock_data[stock_train_size:stock_train_size + stock_val_size])
        test_parts.append(stock_data[stock_train_size + stock_val_size:])

if not all_parts:
    raise FileNotFoundError(f"No .csv files found in {data_dir!r}")

# Flat arrays, kept under their original names for inspection
all_data = np.concatenate(all_parts, axis=0)
train_data = np.concatenate(train_parts, axis=0)
val_data = np.concatenate(val_parts, axis=0)
test_data = np.concatenate(test_parts, axis=0)
train_size = len(train_data)
val_size = len(val_data)


def _build_split_dataset(parts, split_name):
    """Combine the per-stock sliding-window datasets of one split."""
    min_rows = SEQ_LEN + PRED_LEN  # rows needed for a single (input, target) window
    datasets = [StockDataset(part) for part in parts if len(part) >= min_rows]
    if not datasets:
        raise ValueError(f"No stock has at least {min_rows} rows in the {split_name} split")
    return torch.utils.data.ConcatDataset(datasets)


# Create PyTorch datasets and dataloaders
train_dataset = _build_split_dataset(train_parts, "train")
val_dataset = _build_split_dataset(val_parts, "validation")
test_dataset = _build_split_dataset(test_parts, "test")

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Transformer model
class TransformerModel(nn.Module):
    """Encoder-only Transformer that predicts from the last time step.

    Args:
        d_model: Width of the encoder layers.
        nhead: Number of attention heads (must divide ``d_model``).
        num_layers: Number of stacked encoder layers.
        dropout: Dropout probability inside each encoder layer.
        input_dim: Number of features per time step in the input.  Defaults to
            1, matching the single 'Return' column produced by
            ``load_stock_data``.

    Input:  ``(batch, seq_len, input_dim)``
    Output: ``(batch, PRED_LEN)``
    """

    def __init__(self, d_model=D_MODEL, nhead=NHEAD, num_layers=NUM_LAYERS, dropout=0.1, input_dim=1):
        super(TransformerModel, self).__init__()
        # The encoder requires d_model features per step; project the raw
        # features up to that width instead of feeding them in directly.
        self.input_proj = nn.Linear(input_dim, d_model)
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(d_model, PRED_LEN)

    def forward(self, x):
        x = self.input_proj(x)  # (batch, seq_len, input_dim) -> (batch, seq_len, d_model)
        x = x.permute(1, 0, 2)  # Transformer expects (seq_len, batch, features)
        out = self.transformer_encoder(x)
        out = out[-1]  # Take the last output for prediction
        return self.fc(out)

# Build the network on the selected device, then the loss and the optimiser
model = TransformerModel()
model = model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=LR)

# Training loop
def train_model(model, train_loader, val_loader, epochs=EPOCHS):
    """Train ``model`` and report the mean train/validation loss per epoch.

    Uses the module-level ``optimizer``, ``criterion`` and ``device``.

    Args:
        model: Network to optimise; called as ``model(x)``.
        train_loader: Iterable of ``(x, y)`` batches used for gradient updates.
        val_loader: Iterable of ``(x, y)`` batches evaluated without gradients.
        epochs: Number of passes over ``train_loader``.
    """
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            y_pred = model(x)
            # Align the target with the prediction.  The dataset yields targets
            # with a trailing feature axis; left as-is the loss would broadcast
            # the two tensors against each other and average over every
            # prediction/target pair in the batch.
            loss = criterion(y_pred, y.reshape(y_pred.shape))
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for x, y in val_loader:
                x, y = x.to(device), y.to(device)
                y_pred = model(x)
                loss = criterion(y_pred, y.reshape(y_pred.shape))
                val_loss += loss.item()

        # max(..., 1) keeps the report from dividing by zero on an empty loader
        avg_train_loss = train_loss / max(len(train_loader), 1)
        avg_val_loss = val_loss / max(len(val_loader), 1)
        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

# Fit the network using the default number of epochs
train_model(model=model, train_loader=train_loader, val_loader=val_loader)

# Collect predictions and targets over a loader
def evaluate_model(model, test_loader):
    """Run ``model`` over ``test_loader`` without gradients.

    Uses the module-level ``device``.

    Returns:
        ``(preds, actuals)`` as NumPy arrays, each the concatenation of the
        per-batch model outputs and per-batch targets.  The targets are
        returned exactly as the loader yields them; they are not reshaped to
        match the predictions.
    """
    model.eval()
    pred_chunks = []
    target_chunks = []
    with torch.no_grad():
        for features, targets in test_loader:
            features = features.to(device)
            targets = targets.to(device)
            batch_out = model(features)
            pred_chunks.append(batch_out.cpu().numpy())
            target_chunks.append(targets.cpu().numpy())

    stacked_preds = np.concatenate(pred_chunks)
    stacked_targets = np.concatenate(target_chunks)
    return stacked_preds, stacked_targets

# Get predictions
preds, actuals = evaluate_model(model, test_loader)
# The targets carry an extra trailing axis compared with the predictions; align
# them first, otherwise the subtraction broadcasts every prediction against
# every target and the reported MSE is wrong.
actuals = actuals.reshape(preds.shape)
print(f"Test MSE: {np.mean((preds - actuals)**2):.6f}")

In [None]:
class customdataset(Dataset):
    """Minimal map-style dataset over a sequence of samples.

    Args:
        data: Indexable sequence in which ``data[i]`` is the i-th sample
            (for example an ``(input, target)`` pair).
    """
    def __init__(self,data) -> None:
        super().__init__()
        self.data = data
    def __len__(self):
        # Number of samples.  Must agree with __getitem__, which indexes
        # self.data directly; measuring self.data[0] would report the size of
        # the first sample instead.
        return len(self.data)
    def __getitem__(self, index):
        return self.data[index]

In [None]:
class TransformerModel(nn.Module):
    """Encoder-only Transformer over sequences of integer token ids.

    NOTE(review): this class has the same name as the TransformerModel defined
    in an earlier cell and replaces it once this cell runs.

    Args:
        input_dim: Number of distinct token ids (embedding table size).
        model_dim: Embedding / encoder width.
        num_heads: Attention heads per encoder layer (must divide ``model_dim``).
        ff_dim: Hidden width of each encoder layer's feed-forward network.
        output_dim: Size of the prediction made from the last position.
        transformer_num: Number of stacked encoder layers.
        dropout: Dropout probability inside each encoder layer.
        max_len: Longest sequence the positional encoding supports.

    Input:  integer tensor ``(batch, seq_len)``
    Output: float tensor ``(batch, output_dim)``
    """
    def __init__(self, input_dim, model_dim, num_heads, ff_dim, output_dim,transformer_num = 2, dropout=0.1, max_len=5000):
        super(TransformerModel,self).__init__()
        self.embedding = nn.Embedding(input_dim, model_dim)

        # Fixed sinusoidal positional encoding, shape (1, max_len, model_dim).
        # Registered as a buffer so it follows the module across devices
        # without being trained.
        position = torch.arange(max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, model_dim, 2, dtype=torch.float32)
            * (-torch.log(torch.tensor(10000.0)) / model_dim)
        )
        pe = torch.zeros(1, max_len, model_dim)
        pe[0, :, 0::2] = torch.sin(position * div_term)
        pe[0, :, 1::2] = torch.cos(position * div_term[: model_dim // 2])
        self.register_buffer("positional_encoding", pe)

        # ModuleList (not a bare nn.Module) so the layers are registered as
        # sub-modules and their parameters reach the optimizer.
        self.transformers = nn.ModuleList(
            [
                nn.TransformerEncoderLayer(
                    d_model=model_dim,
                    nhead=num_heads,
                    dim_feedforward=ff_dim,
                    dropout=dropout,
                    batch_first=True,
                )
                for _ in range(transformer_num)
            ]
        )
        self.fc_out = nn.Linear(model_dim, output_dim)

    def forward(self, x):
        seq_len = x.size(1)
        if seq_len > self.positional_encoding.size(1):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.positional_encoding.size(1)}"
            )
        x = self.embedding(x) + self.positional_encoding[:, :seq_len, :]
        for layer in self.transformers:
            x = layer(x)
        x = self.fc_out(x[:, -1, :])  # Output from the last token
        return x

In [None]:
# Settings for the cells below
transformer_num = 3  # forwarded to the model constructor as `transformer_num`
lr = 0.001           # learning rate handed to the optimizer
epochs = 25          # passes over the dataloader
batch_size = 32      # samples per batch

In [None]:
# Wrap the samples in the custom dataset and iterate over them in shuffled batches.
# NOTE(review): `data` is not defined in any visible cell — confirm where it comes from.
dataset = customdataset(data)
dataloader = DataLoader(dataset, batch_size= batch_size, shuffle= True)

In [None]:
# Model, Loss, Optimizer
# NOTE(review): `StockTransformer`, `N_HEADS`, `N_LAYERS` and `DROPOUT` are not defined
# in any visible cell (the visible constants are D_MODEL, NHEAD and NUM_LAYERS, and the
# visible model class is TransformerModel) — confirm the intended names before running.
model = StockTransformer(D_MODEL, N_HEADS, N_LAYERS, DROPOUT,transformer_num = transformer_num)
criterion = nn.MSELoss()  # mean squared error between outputs and targets
optimizer = optim.Adam(model.parameters(), lr=lr)

In [None]:
# Training Loop
# Runs `epochs` passes over `dataloader`, taking one optimizer step per batch and
# printing the mean batch loss after every pass.
# NOTE(review): `model` is called with two arguments (inputs, tgt_inputs) here, while
# the TransformerModel.forward defined in this notebook accepts a single input —
# confirm which model class this loop is meant to drive.
for epoch in range(epochs):
    total_loss = 0  # running sum of per-batch losses for this epoch
    for inputs, targets in dataloader:
        inputs, targets = inputs.unsqueeze(-1), targets.unsqueeze(-1)  # Add feature dim
        tgt_inputs = torch.zeros_like(targets)  # Decoder input (can be improved)
        optimizer.zero_grad()  # clear gradients left over from the previous batch
        outputs = model(inputs, tgt_inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(dataloader):.4f}")

print("Training complete.")