from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import Multi30k
from typing import Iterable, List


SRC_LANGUAGE = 'de'
TGT_LANGUAGE = 'en'

# Place-holders
token_transform = {}
vocab_transform = {}


# Create source and target language tokenizer. Make sure to install the dependencies.
# pip install -U spacy
# python -m spacy download en_core_web_sm
# python -m spacy download de_core_news_sm
token_transform[SRC_LANGUAGE] = get_tokenizer('spacy', language='de_core_news_sm')
token_transform[TGT_LANGUAGE] = get_tokenizer('spacy', language='en_core_web_sm')


# helper function to yield list of tokens
def yield_tokens(data_iter: Iterable, language: str) -> List[str]:
    language_index = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1}

    for data_sample in data_iter:
        yield token_transform[language](data_sample[language_index[language]])

# Define special symbols and indices
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    # Training data Iterator
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    # Create torchtext's Vocab object
    vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(train_iter, ln),
                                                    min_freq=1,
                                                    specials=special_symbols,
                                                    special_first=True)

# Set UNK_IDX as the default index. This index is returned when the token is not found.
# If not set, it throws RuntimeError when the queried token is not found in the Vocabulary.
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
  vocab_transform[ln].set_default_index(UNK_IDX)

In [1]:
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from numpy import load
from torch.utils.data import Dataset

In [2]:
# load array
Data = load('Data.npy')
# print the array

In [3]:
Data=np.nan_to_num(Data)

In [4]:
Xdata=[]
Ydata=[]
Xtrain_data=[]
Ytrain_data=[]
Xtest_data=[]
Ytest_data=[]

for idx in range(len(Data)//39):
#     print(np.isinf(Data[39*idx:39*(idx)+38][:,:].tolist()).any())
    if(np.isinf(Data[39*idx:39*(idx)+38][:,:].tolist()).any()):
        print(np.isinf(Data[39*idx:39*(idx)+38][:,:].tolist()).any())
        continue
    if idx< 3735:
        Xtrain_data.append(Data[39*idx:39*(idx)+38][:,:].tolist())
        Ytrain_data.append(Data[39*idx:39*(idx)+38][:,5].tolist())
    else:
        Xtest_data.append(Data[39*idx:39*(idx)+38][:,:].tolist())
        Ytest_data.append(Data[39*idx:39*(idx)+38][:,5].tolist())

Xtrain_data=np.vstack(Xtrain_data)
Ytrain_data=np.vstack(Ytrain_data)

Xtrain_data=torch.FloatTensor(Xtrain_data)
Ytrain_data=torch.IntTensor(Ytrain_data)
Ytrain_data=Ytrain_data.view(-1)


Xtest_data=np.vstack(Xtest_data)
Ytest_data=np.vstack(Ytest_data)
Xtest_data=torch.FloatTensor(Xtest_data)
Ytest_data=torch.IntTensor(Ytest_data)
Ytest_data=Ytest_data.view(-1)

In [5]:
Xtrain_data.size()

torch.Size([141930, 6])

In [6]:
Ytrain_data=2*(Ytrain_data>0).long()+(Ytrain_data==0).long()
Ytest_data=2*(Ytest_data>0).long()+(Ytest_data==0).long()

Ydata=(torch.stack([Ydata>0,Ydata==0, Ydata<0],axis=0)).float()

In [7]:
Ytrain_data=Ytrain_data.T
Ytest_data=Ytest_data.T

from torch.utils.data import DataLoader, random_split
Xtrain_data,Xtest_data=random_split(Xdata, [157700-15770, 15770])
Ytrain_data,Ytest_data=random_split(Ydata, [157700-15770, 15770])

Xtrain_data=Xtrain_data.dataset.data
Xtest_data=Xtest_data.dataset.data
# Ytrain_data=Ytrain_data.dataset.data.long()
# Ytest_data=Ytest_data.dataset.data.long()
Ytrain_data=Ytrain_data.dataset.data
Ytest_data=Ytest_data.dataset.data

In [8]:
Xtrain_data.size()

torch.Size([141930, 6])

In [9]:
np.isinf(Data).any()

False

In [10]:
Xtrain_data[54763//6]

tensor([7.9934e+04,        inf,        inf, 3.4947e+03, 2.1360e+03, 2.1000e+01])

In [11]:
np.argmax(np.isinf(Xtrain_data))

tensor(54763)

In [12]:
np.isinf(Xtest_data).any()

tensor(1, dtype=torch.uint8)

In [13]:
print(np.isinf(Xtest_data).any())

tensor(1, dtype=torch.uint8)


In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def batchify(data, bsz):
    # 데이터셋을 bsz 파트들로 나눕니다.
    nbatch = data.size(0) // bsz
    # 깔끔하게 나누어 떨어지지 않는 추가적인 부분(나머지들) 은 잘라냅니다.
    data = data.narrow(0, 0, nbatch * bsz)
    # 데이터에 대하여 bsz 배치들로 동등하게 나눕니다.
    data = data.view(bsz, -1,data.size(1)).transpose(0,1).contiguous()
    return data.to(device)

# batch_size = 25
# eval_batch_size = 25

# Xtrain_data = batchify(Xtrain_data, batch_size)
# Xtest_data = batchify(Xtest_data, eval_batch_size)
# Ytrain_data = batchify(Ytrain_data, batch_size)
# Ytest_data = batchify(Ytest_data, eval_batch_size)

In [15]:
bptt = 38
def get_batch(source, i,bs):
    seq_len = min(bptt*bs, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].reshape(-1)
    return data, target

In [16]:
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer
import math
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
class PositionalEncoding(nn.Module):
    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

# Seq2Seq Network
class Seq2SeqTransformer(nn.Module):
    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        self.transformer = Transformer(d_model=emb_size,
                                       nhead=nhead,
                                       num_encoder_layers=num_encoder_layers,
                                       num_decoder_layers=num_decoder_layers,
                                       dim_feedforward=dim_feedforward,
                                       dropout=dropout)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
#         self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.src_tok_emb = nn.Linear(6, emb_size)
#         self.tgt_tok_emb = nn.Linear(3, emb_size)
        self.positional_encoding = PositionalEncoding(
            emb_size, dropout=dropout)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
#         tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
#         src_emb = self.positional_encoding(src)
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg).squeeze())
#         outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None,
#                                 src_padding_mask, tgt_padding_mask, memory_key_padding_mask)


        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        return self.transformer.encoder(self.positional_encoding(
                            self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        return self.transformer.decoder(self.positional_encoding(
                          self.tgt_tok_emb(tgt)), memory,
                          tgt_mask)

In [17]:
def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask


def create_mask(src, tgt):
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len),device=DEVICE).type(torch.bool)
#     src_padding_mask = (src == PAD_IDX).transpose(0, 1)
#     tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, None, None
#     src_padding_mask = (src == PAD_IDX).transpose(0, 1)
#     tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
#     return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

In [18]:
torch.manual_seed(0)

# SRC_VOCAB_SIZE = len(vocab_transform[SRC_LANGUAGE])
# TGT_VOCAB_SIZE = len(vocab_transform[TGT_LANGUAGE])
SRC_VOCAB_SIZE = 6
TGT_VOCAB_SIZE = 3
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 10
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)

for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(DEVICE)

loss_fn = torch.nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

In [19]:
from torch.nn.utils.rnn import pad_sequence

# helper function to club together sequential operations
def sequential_transforms(*transforms):
    def func(txt_input):
        for transform in transforms:
            txt_input = transform(txt_input)
        return txt_input
    return func

# function to add BOS/EOS and create tensor for input sequence indices
# def tensor_transform(token_ids: List[int]):
#     return torch.cat((torch.tensor([BOS_IDX]),
#                       torch.tensor(token_ids),
#                       torch.tensor([EOS_IDX])))

# src and tgt language text transforms to convert raw strings into tensors indices
# text_transform = {}
# for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
#     text_transform[ln] = sequential_transforms(token_transform[ln], #Tokenization
#                                                vocab_transform[ln], #Numericalization
#                                                tensor_transform) # Add BOS/EOS and create tensor


# function to collate data samples into batch tesors
def collate_fn(batch):
    src_batch, tgt_batch = [], []
    for src_sample, tgt_sample in batch:
        src_batch.append(text_transform[SRC_LANGUAGE](src_sample.rstrip("\n")))
        tgt_batch.append(text_transform[TGT_LANGUAGE](tgt_sample.rstrip("\n")))

    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)
    return src_batch, tgt_batch

In [20]:
from torch.utils.data import DataLoader

def train_epoch(model, optimizer):
    model.train()
    losses = 0
#     train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
#     train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

#     for src, tgt in train_dataloader:
    for batch, i in enumerate(range(0, Xtrain_data.size(0) - 1, BATCH_SIZE*bptt)):
        data, org_targets = get_batch(Xtrain_data, i,BATCH_SIZE)
        if (data.isnan().any() or data.isinf().any()):
#            print(data)
            continue
#         _,targets = get_batch(Ytrain_data,i)
        targets,_ = get_batch(Ytrain_data,i,BATCH_SIZE)
#         src = src.to(DEVICE)
#         tgt = tgt.to(DEVICE)
        targets=torch.unsqueeze(targets,1)
        src=batchify(data,BATCH_SIZE)
        tgt=batchify(targets,BATCH_SIZE)
#         print(targets.size())
#         print(tgt.size())
#         print(data.size())
#         print(src.size())
#         src = data.to(DEVICE)
#         tgt = targets.to(DEVICE)
#         src=torch.unsqueeze(src,0)
#         tgt=torch.unsqueeze(tgt,0)
#         tgt_input = tgt[:-1, :]
        tgt_input = tgt[:-1]
        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)
#         print(tgt_input.size())
        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        optimizer.zero_grad()

#         tgt_out = tgt[1:, :]
        tgt_out = tgt[1:]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()

        optimizer.step()
        losses += loss.item()
        
    return losses / Xtrain_data.size(0)


def evaluate(model):
    model.eval()
    losses = 0
    total=0
    correct=0
    correct1=0
    tot1=0
    correct2=0
    tot2=0
    correct0=0
    tot0=0
    with torch.no_grad():
        for i in range(0, Xtest_data.size(0) - 1, bptt):

            data, org_targets = get_batch(Xtest_data, i,BATCH_SIZE)
            if (data.isnan().any() or data.isinf().any()):
#            print(data)
                continue
            targets,_ = get_batch(Ytest_data,i,BATCH_SIZE)
            targets=torch.unsqueeze(targets,1)
            src=batchify(data,BATCH_SIZE)
            tgt=batchify(targets,BATCH_SIZE)

            tgt_input = tgt[:-1]

            src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

            logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

            
            tgt_out = tgt[1:]
#             print(logits.size())
#             print(tgt_out)
            loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
            if(loss.isnan()):
                print(src,tgt_input)
                break
#            print(loss)
            losses += loss.item()
#             output_flat = logits.view(-1)
            _,predicted=torch.max(logits,-1)
#             print(predicted)
#             print(logits.size())
#             print(tgt_out.size())
            
            correct += (tgt_out.squeeze() == predicted).sum().item()
            total+=len(predicted)*BATCH_SIZE
            tot0+=(0== tgt_out.squeeze()).sum().item()
            tot1+=(1== tgt_out.squeeze()).sum().item()
            tot2+=(2== tgt_out.squeeze()).sum().item()
            correct0+=((0== predicted) &(0==tgt_out.squeeze())).sum().item()
            correct1+=((1== predicted)&(1==tgt_out.squeeze())).sum().item()
            correct2+=((2== predicted)&(2==tgt_out.squeeze())).sum().item()
            
    print(total)
    print(correct)
#     print("Acc0:",correct0,tot0)     
#     print("Acc1:",correct1,tot1)     
#     print("Acc2:",correct2,tot2)   
    print("Acc:",correct/total*100)            
    print("Acc0:",correct0/tot0*100,correct0,tot0)     
    print("Acc1:",correct1/tot1*100,correct1,tot1)     
    print("Acc2:",correct2/tot2*100,correct2,tot2)     
    return losses / Xtest_data.size(0)

In [21]:
evaluate(transformer)

148080
20690
Acc: 13.972177201512695
Acc0: 0.0 0 32749
Acc1: 0.12874223061743506 122 94763
Acc2: 100.0 20568 20568


0.07646666717967733

In [None]:
from timeit import default_timer as timer
NUM_EPOCHS = 200
best_val_loss=100000000
for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_epoch(transformer, optimizer)
    end_time = timer()
    val_loss = evaluate(transformer)
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = transformer
PATH='best_model_seq'
torch.save(best_model.state_dict(), PATH)

# function to generate output sequence using greedy algorithm
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.item()

        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys


# actual function to translate input sentence into target language
def translate(model: torch.nn.Module, src_sentence: str):
    model.eval()
    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = greedy_decode(
        model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")

148080
129928
Acc: 87.74176121015668
Acc0: 85.46215151607683 27988 32749
Acc1: 90.59231978725873 85848 94763
Acc2: 78.23803967327888 16092 20568
Epoch: 1, Train loss: 0.001, Val loss: 0.009, Epoch time = 12.556s
148080
131512
Acc: 88.81145326850351
Acc0: 85.15069162417173 27886 32749
Acc1: 92.3039582959594 87470 94763
Acc2: 78.54920264488527 16156 20568
Epoch: 2, Train loss: 0.001, Val loss: 0.008, Epoch time = 12.604s
148080
132656
Acc: 89.58400864397623
Acc0: 85.07129988701945 27860 32749
Acc1: 94.41554193092242 89471 94763
Acc2: 74.50894593543367 15325 20568
Epoch: 3, Train loss: 0.001, Val loss: 0.008, Epoch time = 12.490s
148080
132760
Acc: 89.65424095083738
Acc0: 85.09572811383553 27868 32749
Acc1: 95.52673511813683 90524 94763
Acc2: 69.85608712563204 14368 20568
Epoch: 4, Train loss: 0.001, Val loss: 0.008, Epoch time = 12.551s
148080
132458
Acc: 89.45029713668288
Acc0: 86.39347766344011 28293 32749
Acc1: 95.38005339636779 90385 94763
Acc2: 66.99727732399843 13780 20568
Epoch: 5