In [1]:
import csv
import math

import torch
import torch.nn as nn

# from torch_geometric.nn.pool import global_mean_pool

from torch.utils.data import Dataset, DataLoader, random_split

In [2]:
from dataset import CVFConfigForTransformerDataset, CVFConfigForTransformerTestDataset

In [3]:
device = "cuda"

In [4]:
torch.autograd.set_detect_anomaly(True)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x7534561c8ad0>

In [5]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).float().unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)
        )

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)

    def forward(self, x):
        seq_len = x.size(1)
        return x + self.pe[:, :seq_len]

In [6]:
# pe = PositionalEncoding(10)
# src = torch.randint(0, 5, (2, 2, 10))  # Random token IDs as input
# pe(src).shape

In [7]:
batch_size = 256

dataset = CVFConfigForTransformerDataset(
    device,
    "implicit_graph_n5",
    "implicit_graph_n5_pt_adj_list.txt",
    "implicit_graph_n5_config_rank_dataset.csv",
    D=5,
    program="dijkstra",
)


train_size = int(1.0 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])


loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

Total configs: 243.


In [8]:
class EmbeddingProjectionModel(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(EmbeddingProjectionModel, self).__init__()
        self.projection = nn.Linear(input_dim, output_dim)  # Project Z to D
    
    def forward(self, x):
        # Apply the linear transformation to the input tensor
        return self.projection(x)  # Output shape: (B, S, D)

In [9]:
class SimpleTransformer(nn.Module):
    def __init__(
        self,
        d_model,
        nhead,
        num_encoder_layers,
        dim_feedforward,
        max_len=5000,
    ):
        super(SimpleTransformer, self).__init__()

        # Word Embeddings Layer
        self.embedding = EmbeddingProjectionModel(dataset.D, d_model)

        # Positional Encoding
        self.positional_encoding = PositionalEncoding(d_model, max_len)

        # Transformer Encoder Layer
        self.transformer_encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            batch_first=True,
        )

        self.transformer_encoder = nn.TransformerEncoder(
            self.transformer_encoder_layer, num_layers=num_encoder_layers
        )

        # Output Layer: For Language Modeling (next token prediction)
        self.output_layer = nn.Linear(d_model, 1)

    def forward(self, src, attention_mask):
        # src shape: (batch_size, seq_len)

        # Embed the input
        src = self.embedding(src)  # (batch_size, seq_len, d_model)

        # Add positional encoding
        src = self.positional_encoding(src)  # (batch_size, seq_len, d_model)

        # Transform the input using Transformer Encoder
        memory = self.transformer_encoder(
            src.transpose(0, 1), src_key_padding_mask=attention_mask.transpose(0, 1)
        )  # (seq_len, batch_size, d_model)

        # Final output layer
        output = self.output_layer(
            memory.transpose(0, 1)
        )  # (batch_size, seq_len, vocab_size)

        # output = torch.relu(output)

        # output = global_mean_pool(output, torch.zeros(output.size(1)).to(device).long())

        return output

    def fit(self, epochs, dataloader):
        criterion = torch.nn.MSELoss()
        optimizer = torch.optim.Adam(self.parameters(), lr=0.001, weight_decay=0.00001)
        for epoch in range(1, epochs + 1):
            self.train()
            total_loss = 0
            count = 0
            for batch in dataloader:
                x = batch[0][0]
                attention_mask = batch[0][1]
                y = batch[1]
                out = self(x, attention_mask)
                out = out.squeeze(-1)
                out = out[attention_mask]
                y = y[attention_mask]
                # print("out", out)
                optimizer.zero_grad()
                loss = criterion(out, y)
                total_loss += loss
                count += 1
                loss.backward()
                optimizer.step()

            print(
                "Training set | Epoch %s | MSE Loss: %s"
                % (
                    epoch,
                    round((total_loss / count).item(), 4),
                )
            )

In [10]:
# Example usage
d_model = 8  # Embedding dimension (also used in Transformer)
nhead = 2  # Number of attention heads
num_encoder_layers = 6  # Number of layers in the Transformer Encoder
dim_feedforward = 512  # Feedforward layer dimension
max_len = 100  # Max sequence length
N = dataset.D


epochs = 10

# Initialize the model
model = SimpleTransformer(d_model, nhead, num_encoder_layers, dim_feedforward, max_len)
model.to(device)

model.fit(epochs, loader)

Training set | Epoch 1 | MSE Loss: 59.744
Training set | Epoch 2 | MSE Loss: 42.9202
Training set | Epoch 3 | MSE Loss: 29.3638
Training set | Epoch 4 | MSE Loss: 22.0744
Training set | Epoch 5 | MSE Loss: 16.2885
Training set | Epoch 6 | MSE Loss: 9.7014
Training set | Epoch 7 | MSE Loss: 7.4842
Training set | Epoch 8 | MSE Loss: 5.8663
Training set | Epoch 9 | MSE Loss: 4.7507
Training set | Epoch 10 | MSE Loss: 4.1998


In [12]:
model.eval()

criterion = torch.nn.MSELoss()

f = open(
    f"test_results/test_result_trans.csv",
    "w",
    newline="",
)
csv_writer = csv.writer(f)
csv_writer.writerow(["Dataset", "Actual", "Predicted"])


test_dataset = CVFConfigForTransformerTestDataset(
    device,
    "implicit_graph_n5",
    "implicit_graph_n5_config_rank_dataset.csv",
    D=5,
    program="dijkstra",
)

with torch.no_grad():
    test_dataloader = DataLoader(test_dataset, batch_size=1024)

    total_loss = 0
    total_matched = 0
    count = 0
    total_seq_count = 0
    for batch in test_dataloader:
        x = batch[0][:, 0, :]
        padd = torch.full((dataset.sequence_length - 1, dataset.D), -1).to(device)
        padded_batches = [torch.cat([batch.unsqueeze(0), padd]) for batch in x]
        x = torch.stack(padded_batches)
        attention_mask = torch.full(
            (x.shape[0], dataset.sequence_length), 0, dtype=torch.bool
        ).to(device)
        attention_mask[:, 0] = True
        y = batch[1]
        out = model(x, attention_mask)
        # out = out.squeeze(-1)
        out = out[attention_mask]
        csv_writer.writerows(
            (j.item(), k.item())
            for (j, k) in zip(y.detach().cpu().numpy(), out.detach().cpu().numpy())
        )
        loss = criterion(out, y)
        total_loss += loss
        out = torch.round(out)
        matched = (out == y).sum().item()
        total_seq_count += out.numel()
        total_matched += matched
        count += 1

    print(
        f"Test set | MSE loss: {round((total_loss / count).item(), 4)} | Total matched: {total_matched:,} out of {total_seq_count:,} (Accuracy: {round(total_matched / total_seq_count * 100, 2):,}%)",
    )

f.close()

Total configs: 243.


Test set | MSE loss: 66.3446 | Total matched: 45 out of 243 (Accuracy: 18.52%)
