In [1]:
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader

In [None]:
# 데이터만 keras에서 가져옴

from tensorflow import keras

vocab_size = 20000
maxlen = 200
(x_train, y_train), (x_test, y_test) = keras.datasets.imdb.load_data(num_words=vocab_size)

def pad_sequence(x, max_len):
    x = np.asarray(x)
    if len(x) < max_len:
        x = np.concatenate([np.zeros(max_len - len(x)), x])
    return x[len(x) - max_len:]

x_train = np.array(list(map(lambda x:pad_sequence(x,maxlen),x_train)))
x_test = np.array(list(map(lambda x:pad_sequence(x,maxlen),x_test)))

# transformer model 정의

In [24]:
class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, heads, ff_dim, dropout=0.1):
        super().__init__()
        self.attention = nn.MultiheadAttention(embed_dim, heads, dropout=dropout)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.ff = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, embed_dim),
        )
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
    def forward(self, x):
        attn_output = self.attention(x, x, x)[0]
        attn_output = self.dropout1(attn_output)
        out1 = self.norm1(x + attn_output)
        ff_output = self.ff(out1)
        ff_output = self.dropout2(ff_output)
        out2 = self.norm2(out1 + ff_output)
        return out2

In [29]:
class PositionalEncoding(nn.Module):
    def __init__(self, max_len, embed_size, dtype=torch.float32, **kwargs):
        super().__init__(**kwargs)
        p,i = torch.meshgrid(torch.arange(max_len), 2*torch.arange(embed_size//2))
        self.pos_emb = torch.zeros(1, max_len, embed_size)
        self.pos_emb[:, :, 0::2] = torch.sin(p / 10_000 ** (i / embed_size))
        self.pos_emb[:, :, 1::2] = torch.cos(p / 10_000 ** (i / embed_size))
        self.pos_emb = self.pos_emb.type(dtype)

    def forward(self, x):
        _, batch_max_length = x.size()
        return self.pos_emb[:,:batch_max_length, :]


In [35]:
class TokenAndPositionEmbedding(nn.Module):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, embed_dim)
        self.pos_emb = PositionalEncoding(maxlen, embed_dim)
        # 논문에서처럼 고정된 positional embedding을 사용하지 않고 학습 가능한 embedding을 사용할 경우
        # self.pos_emb = nn.Embedding(maxlen, embed_dim)
        
    def forward(self, x):
        maxlen = x.shape[-1]
        positions = torch.arange(start=0, end=maxlen, dtype=torch.long)
        # 다음 부분이 없으면 position은 model.to(device) 해도 gpu로 넘어가지 않음
        positions = positions.to(x.device)
        positions = positions.unsqueeze(0).expand(x.shape)
        x = self.token_emb(x) + self.pos_emb(positions).to(x.device)
        return x

In [36]:
class Transformer(nn.Module):
    def __init__(self,maxlen,embed_dim,num_heads,ff_dim,hidden_dim=20,dropout=0.1):
        super().__init__()
        self.embedding_layer = TokenAndPositionEmbedding(maxlen,vocab_size,embed_dim)
        self.transformer_block = TransformerBlock(embed_dim,num_heads,ff_dim)
        self.first_linear = nn.Linear(embed_dim,hidden_dim)
        self.last_linear = nn.Linear(hidden_dim,2)
        self.dropout = nn.Dropout(dropout)

    def forward(self,x):
        x = self.embedding_layer(x)
        x = self.transformer_block(x)
        x = torch.mean(x, dim=1)   # global average pooling
        x = self.dropout(x)
        x = self.first_linear(x)
        x = F.relu(x)
        x = self.dropout(x)
        x = self.last_linear(x)
        return x


In [37]:
# for check
embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer
max_len = 200
transformer = Transformer(max_len, embed_dim,num_heads,ff_dim)
a = transformer(torch.Tensor(x_train[:4]).long())
a.shape

torch.Size([4, 2])

In [38]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
optimizer = torch.optim.Adam(transformer.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss().to(device)
trainDS = torch.utils.data.TensorDataset(torch.Tensor(x_train).long(), torch.Tensor(y_train).long())
testDS = torch.utils.data.TensorDataset(torch.Tensor(x_test).long(), torch.Tensor(y_test).long())
trainDL = torch.utils.data.DataLoader(trainDS, batch_size=64, shuffle=True)
testDL = torch.utils.data.DataLoader(testDS, batch_size=64, shuffle=False)

In [39]:
epochs = 10
transformer = transformer.to(device)

for epoch in range(epochs):
    train_correct = 0
    train_loss = 0
    transformer.train()
    for inputs, targets in trainDL:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        pred = transformer(inputs)
        loss = criterion(pred, targets)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()*inputs.size(0)
        train_correct += (pred.argmax(1) == targets).sum().item()
    train_loss /= len(trainDL.dataset)
    train_acc = train_correct / len(trainDL.dataset)

    val_correct = 0 
    val_loss = 0
    transformer.eval()
    with torch.no_grad():
        for inputs, targets in testDL:
            inputs, targets = inputs.to(device), targets.to(device)
            pred = transformer(inputs)
            loss = criterion(pred, targets)
            val_loss += loss.item()*inputs.size(0)
            val_correct += (pred.argmax(1) == targets).sum().item()
    val_loss /= len(testDL.dataset)
    val_acc = val_correct / len(testDL.dataset)
    print(f'Epoch {epoch+1} of {epochs}')
    print(f'Train Loss: {train_loss:.4f} | Train Acc: {train_acc*100:.2f}%')
    print(f'Val Loss: {val_loss:.4f} | Val Acc: {val_acc*100:.2f}%')
    print('---------------------------------')


Epoch 1 of 10
Train Loss: 0.6717 | Train Acc: 57.50%
Val Loss: 0.5841 | Val Acc: 68.85%
---------------------------------
Epoch 2 of 10
Train Loss: 0.4771 | Train Acc: 77.59%
Val Loss: 0.4385 | Val Acc: 79.83%
---------------------------------


# skorch

In [164]:
from skorch import NeuralNetClassifier
from skorch.dataset import ValidSplit
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

net = NeuralNetClassifier(
                Transformer,
                module__maxlen=max_len,
                module__embed_dim=embed_dim,
                module__num_heads=num_heads,
                module__ff_dim=ff_dim,
                max_epochs=10,
                lr=0.001,
                batch_size=32,
                criterion=nn.CrossEntropyLoss,
                optimizer=torch.optim.Adam,
                train_split=ValidSplit(10),
                device=device)

In [165]:
net.fit(torch.Tensor(x_train).type(torch.long), 
        torch.Tensor(y_train).type(torch.long))
print('completed')

  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m0.6350[0m       [32m0.7248[0m        [35m0.5414[0m  7.6202
      2        [36m0.4462[0m       [32m0.7808[0m        [35m0.4642[0m  7.4182
      3        [36m0.3560[0m       [32m0.8004[0m        [35m0.4427[0m  7.4269
      4        [36m0.3042[0m       [32m0.8052[0m        [35m0.4340[0m  7.6639
      5        [36m0.2628[0m       [32m0.8164[0m        0.4447  7.8591
      6        [36m0.2415[0m       [32m0.8204[0m        0.4381  7.9238
      7        [36m0.2196[0m       [32m0.8248[0m        0.4625  7.6630
      8        [36m0.2070[0m       0.8200        0.4918  7.8470
      9        [36m0.1850[0m       0.8140        0.5291  7.6456
     10        [36m0.1648[0m       0.8116        0.5599  7.8527
completed
