In [1]:
from train.tomita import *
import torch
from torch.utils.data import Dataset, DataLoader, random_split
import numpy as np
import random
from train.config import read_config

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# read config
config = read_config('train/config.json')

In [2]:
from train.dset import get_dataloader
train_loader, test_loader = get_dataloader(
        config.num_samples, config.seq_length, config.test_split, config.func_name, config.batch_size
    )
# get one batch
for i, (x, y) in enumerate(train_loader):
    print(x)
    print(y)
    break

tensor([[2, 1, 1, 2, 1, 2, 2, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 2, 2, 1, 2, 2, 2, 1, 0, 0],
        [2, 1, 2, 2, 2, 2, 1, 2, 2, 0],
        [2, 2, 2, 1, 2, 2, 1, 0, 0, 0],
        [2, 1, 1, 1, 2, 1, 2, 0, 0, 0],
        [2, 1, 1, 2, 1, 2, 2, 2, 0, 0],
        [1, 2, 1, 0, 0, 0, 0, 0, 0, 0],
        [2, 2, 1, 2, 2, 1, 1, 2, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
tensor([[0],
        [1],
        [0],
        [0],
        [1],
        [0],
        [0],
        [1],
        [0],
        [1]])


In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.encoding = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        self.encoding[:, 0::2] = torch.sin(position * div_term)
        self.encoding[:, 1::2] = torch.cos(position * div_term)
        self.encoding = self.encoding.unsqueeze(0)

    def forward(self, x):
        return x + self.encoding[:, :x.size(1)].detach().to(x.device)

class TransformerClassifier(nn.Module):
    def __init__(self, input_dim, d_model, nhead, num_classes, num_encoder_layers, dim_feedforward, max_seq_length):
        super(TransformerClassifier, self).__init__()
        self.embedding = nn.Embedding(input_dim, d_model)
        self.pos_encoder = PositionalEncoding(d_model, max_seq_length)
        encoder_layers = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers=num_encoder_layers)
        self.output = nn.Linear(d_model, num_classes)

    def forward(self, x):
        x = self.embedding(x) * math.sqrt(x.size(1))
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x)
        x = x.mean(dim=1)
        x = self.output(x)
        return x

# Hyperparameters and input data
input_dim = 3  # Assuming input tokens can be 0, 1, or 2
d_model = 512
nhead = 8
num_classes = 2
num_encoder_layers = 3
dim_feedforward = 2048
max_seq_length = 10

model = TransformerClassifier(input_dim, d_model, nhead, num_classes, num_encoder_layers, dim_feedforward, max_seq_length)

model.to(device)



TransformerClassifier(
  (embedding): Embedding(3, 512)
  (pos_encoder): PositionalEncoding()
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-2): 3 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (output): Linear(in_features=512, out_features=2, bias=True)
)

In [5]:
import torch
import torch.nn as nn

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:x.size(0), :]


class TransformerModel(nn.Module):
    def __init__(self, ntoken, ninput, nhead, nhid, nlayers, noutput):
        super(TransformerModel, self).__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(ninput)
        encoder_layers = nn.TransformerEncoderLayer(d_model=ninput, nhead=nhead, dim_feedforward=nhid)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers=nlayers)
        self.encoder = nn.Embedding(ntoken, ninput)
        self.ninput = ninput
        self.decoder = nn.Linear(ninput, noutput)

        self.init_weights()

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        src = self.encoder(src) * torch.sqrt(torch.tensor(self.ninput, dtype=torch.float))
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src)
        output = self.decoder(output.mean(dim=0))
        return output


# Parameters for the model and data
ntokens = 3  # Size of vocabulary, 0, 1 and 2 in this task
ninput = 16  # Input dimension (embedding size)
nhead = 2  # Number of heads in the multi-head attention models
nhid = 32  # Dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2  # Number of nn.TransformerEncoderLayer in nn.TransformerEncoder
noutput = 1  # Number of output classes, in this case binary classification

# Initialize model
model = TransformerModel(ntokens, ninput, nhead, nhid, nlayers, noutput)
model.to(device)



RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [4]:
# define loss and optimizer
import torch.optim as optim
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005)

# train
model.train()
running_loss = 0.0
for epoch in range(200):
    for i, (x, y) in enumerate(train_loader):
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        outputs = model(x)
        loss = criterion(outputs, y.flatten())
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    if epoch % 10 == 0:
        print('[%d] loss: %.3f' % (epoch, running_loss / 10))
        running_loss = 0.0

RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling `cublasSgemm( handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)`

In [12]:
# test
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for i, (x, y) in enumerate(test_loader):
        x, y = x.to(device), y.to(device)
        outputs = model(x)
        _, predicted = torch.max(outputs.data, 1)
        total += y.size(0)
        correct += (predicted == y.flatten()).sum().item()
print('Accuracy of the network on the %d test samples: %d %%' % (total, 100 * correct / total))

Accuracy of the network on the 20 test samples: 40 %
