In [1]:
import torch
from konlpy.tag import Okt
import spacy
import numpy as np
import time
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.autograd import Variable

In [2]:
# Tokenizers shared by the rest of the notebook.
# `okt` is the KoNLPy Okt tagger; its `.morphs()` method is used below to split Korean sentences.
okt = Okt()
# spaCy pipeline loaded from the "en_core_web_sm" package; used below to tokenize English sentences.
nlp = spacy.load("en_core_web_sm")
# Reserved vocabulary entries. NOTE(review): this list is not passed to En2KoDataset
# below, which falls back to its own (identical) default -- keep the two in sync.
special_tokens = ["<pad>", "<unk>", "<sos>", "<eos>"]

In [3]:
# Toy English/Korean parallel corpus: one dict per sentence pair, keyed by
# "en" (English text) and "ko" (Korean text).
# NOTE(review): apostrophes are inconsistent -- most contractions use the
# typographic ’ (U+2019) while "You're" uses ASCII '. This may produce
# different tokens for otherwise similar contractions; confirm and normalise.
parallel_corpus = [
    {"en": "Hello.", "ko": "안녕하세요."},
    {"en": "How are you?", "ko": "잘 지내세요?"},
    {"en": "I am a student.", "ko": "저는 학생입니다."},
    {"en": "What is your name?", "ko": "당신의 이름은 무엇인가요?"},
    {"en": "My name is John.", "ko": "제 이름은 존입니다."},
    {"en": "Nice to meet you.", "ko": "만나서 반갑습니다."},
    {"en": "Good morning!", "ko": "좋은 아침이에요!"},
    {"en": "Good night.", "ko": "안녕히 주무세요."},
    {"en": "Where are you going?", "ko": "어디 가세요?"},
    {"en": "I’m going home.", "ko": "집에 가는 중이에요."},
    {"en": "See you later.", "ko": "나중에 봐요."},
    {"en": "Thank you very much.", "ko": "정말 감사합니다."},
    {"en": "You're welcome.", "ko": "천만에요."},
    {"en": "Excuse me.", "ko": "실례합니다."},
    {"en": "I’m sorry.", "ko": "죄송합니다."},
    {"en": "No problem.", "ko": "괜찮아요."},
    {"en": "Do you speak English?", "ko": "영어 할 줄 아세요?"},
    {"en": "I speak a little Korean.", "ko": "한국어 조금 할 줄 알아요."},
    {"en": "How much is this?", "ko": "이거 얼마예요?"},
    {"en": "Where is the restroom?", "ko": "화장실 어디예요?"},
    {"en": "Can you help me?", "ko": "도와주실 수 있나요?"},
    {"en": "I don’t understand.", "ko": "이해하지 못했어요."},
    {"en": "Please speak slowly.", "ko": "천천히 말씀해 주세요."},
    {"en": "I’m hungry.", "ko": "배고파요."},
    {"en": "I’m tired.", "ko": "피곤해요."},
    {"en": "What time is it?", "ko": "지금 몇 시예요?"},
    {"en": "Today is Monday.", "ko": "오늘은 월요일이에요."},
    {"en": "It’s raining.", "ko": "비가 오고 있어요."},
    {"en": "I like music.", "ko": "저는 음악을 좋아해요."},
    {"en": "Let’s go together.", "ko": "같이 가요."},
    {"en": "Have a nice day!", "ko": "좋은 하루 되세요!"},
    {"en": "Goodbye!", "ko": "안녕히 가세요!"}
]

In [4]:
class En2KoDataset(Dataset):
    """Tokenised English/Korean sentence pairs with per-language vocabularies.

    Each vocabulary is ``special_tokens`` followed by the sorted set of tokens
    observed in the corpus for that language. Every sentence is encoded as
    ``[<sos>] + token ids + [<eos>]``; tokens missing from the vocabulary map
    to ``<unk>``.

    NOTE(review): despite the class name, ``__getitem__`` yields
    ``(Korean ids, English ids)``: ``srcs`` holds the Korean side and ``trgs``
    the English side, while ``src_sentences`` holds the raw *English* text.
    That ordering is kept unchanged here -- confirm which translation
    direction the downstream model expects.

    Args:
        corpus: iterable of dicts with an ``'en'`` and a ``'ko'`` string each.
        okt: Korean tokenizer exposing ``morphs(sentence) -> list of str``.
        nlp: English tokenizer; calling it on a sentence must yield objects
            with a ``.text`` attribute.
        random: stored on the instance but not used by this class.
        special_tokens: reserved vocabulary entries placed at the lowest
            indices. Must contain ``"<unk>"``, ``"<sos>"`` and ``"<eos>"``.

    Attributes:
        src_sentences / tgt_sentences: raw English / Korean sentences.
        en_tokens / ko_tokens: sets of distinct tokens per language.
        en_vocab / ko_vocab: vocabulary lists (index == token id).
        en_token2idx, en_idx2token, ko_token2idx, ko_idx2token: lookup dicts.
        srcs / trgs: encoded Korean / English id lists (see note above).
        n_sentences: number of sentence pairs.
    """

    def __init__(self, corpus, okt, nlp, random=False, special_tokens = ["<pad>", "<unk>", "<sos>", "<eos>"]):
        self.random = random

        # Keep the injected tokenizers so no method depends on notebook globals.
        self.okt = okt
        self.nlp = nlp

        # Copy: the default list is shared between calls and must not be aliased.
        self.special_tokens = list(special_tokens)

        # Materialise once so that one-shot iterables are also supported.
        corpus = list(corpus)
        self.src_sentences = [line['en'] for line in corpus]
        self.tgt_sentences = [line['ko'] for line in corpus]

        # Tokenise every sentence exactly once; the results feed both the
        # vocabulary construction and the id encoding below.
        en_tokenised = [self._en_tokenize(text) for text in self.src_sentences]
        ko_tokenised = [self._ko_tokenize(text) for text in self.tgt_sentences]

        self.en_tokens = set().union(*en_tokenised)
        self.ko_tokens = set().union(*ko_tokenised)

        # Sorting makes the token -> id assignment deterministic across runs.
        self.en_vocab = self.special_tokens + sorted(self.en_tokens)
        self.en_token2idx = {token: idx for idx, token in enumerate(self.en_vocab)}
        self.en_idx2token = {idx: token for token, idx in self.en_token2idx.items()}

        self.ko_vocab = self.special_tokens + sorted(self.ko_tokens)
        self.ko_token2idx = {token: idx for idx, token in enumerate(self.ko_vocab)}
        self.ko_idx2token = {idx: token for token, idx in self.ko_token2idx.items()}

        # Korean ids go to ``srcs`` and English ids to ``trgs`` (original ordering).
        self.srcs = [self._wrap_ids(tokens, self.ko_token2idx) for tokens in ko_tokenised]
        self.trgs = [self._wrap_ids(tokens, self.en_token2idx) for tokens in en_tokenised]

        self.n_sentences = len(self.src_sentences)
        print(self.n_sentences)

    def _ko_tokenize(self, sentence):
        """Split a Korean sentence into tokens with the injected ``okt``."""
        return list(self.okt.morphs(sentence))

    def _en_tokenize(self, sentence):
        """Split an English sentence into token strings with the injected ``nlp``."""
        return [token.text for token in self.nlp(sentence)]

    @staticmethod
    def _wrap_ids(tokens, token2idx):
        """Map tokens to ids (unknown -> ``<unk>``) and add ``<sos>`` / ``<eos>``."""
        unk = token2idx["<unk>"]
        ids = [token2idx.get(tok, unk) for tok in tokens]
        return [token2idx["<sos>"]] + ids + [token2idx["<eos>"]]

    def ko_encode(self, sentence, token2idx):
        """Encode a Korean sentence as ``[<sos>] + ids + [<eos>]`` using ``token2idx``."""
        return self._wrap_ids(self._ko_tokenize(sentence), token2idx)

    def en_encode(self, sentence, token2idx):
        """Encode an English sentence as ``[<sos>] + ids + [<eos>]`` using ``token2idx``."""
        return self._wrap_ids(self._en_tokenize(sentence), token2idx)

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.src_sentences)

    def __getitem__(self, index):
        """Return ``(srcs[index], trgs[index])`` as two lists of token ids."""
        return self.srcs[index], self.trgs[index]


In [5]:
# Build the vocabularies and encode every sentence pair of the toy corpus.
# The constructor prints the number of sentence pairs as a side effect.
dataset = En2KoDataset(parallel_corpus, okt, nlp)

32


In [12]:
def subsequent_mask(size):
    """Build a causal attention mask that hides later positions.

    Returns a boolean tensor of shape ``(1, size, size)`` whose entry
    ``[0, i, j]`` is ``True`` when ``j <= i`` (position ``i`` may look at
    position ``j``) and ``False`` when ``j > i``.
    """
    # Ones strictly above the main diagonal mark the "future" positions...
    future_positions = torch.triu(torch.ones(1, size, size), diagonal=1)
    # ...so the allowed positions are exactly those that were zeroed out.
    return future_positions == 0

    
class Batch:
    """Hold one batch of source/target ids together with their attention masks.

    Args:
        src: integer tensor indexed as ``(batch, src_len)``.
        trg: optional integer tensor indexed as ``(batch, trg_len)``.
        pad: id used for padding positions (default ``0``).

    Attributes:
        src: the source tensor as given.
        src_mask: ``(src != pad)`` with an extra axis inserted before the
            last one, i.e. ``True`` at non-padding source positions.
        trg: ``trg`` without its last column (decoder input), or ``None``.
        trg_y: ``trg`` without its first column (expected output), or ``None``.
        trg_mask: padding mask combined with the causal mask, or ``None``.
        ntokens: 0-d tensor counting the non-padding entries of ``trg_y``,
            or ``None``.

    The four target attributes are always defined; they are ``None`` when no
    ``trg`` is supplied (for example at inference time).
    """

    def __init__(self, src, trg=None, pad=0):
        self.src = src
        self.src_mask = (src != pad).unsqueeze(-2)

        # Define the target-side attributes unconditionally so that reading
        # them on a source-only batch gives None instead of AttributeError.
        self.trg = None
        self.trg_y = None
        self.trg_mask = None
        self.ntokens = None

        if trg is not None:
            # Shift by one: the decoder reads trg[:, :-1] and is trained to
            # produce trg[:, 1:].
            self.trg = trg[:, :-1]
            self.trg_y = trg[:, 1:]
            self.trg_mask = self.make_std_mask(self.trg, pad)
            self.ntokens = (self.trg_y != pad).sum()

    @staticmethod
    def make_std_mask(tgt, pad):
        """Create a mask to hide padding and future words.

        Combines the padding mask ``(tgt != pad)`` with the causal mask from
        ``subsequent_mask``; the two broadcast against each other under ``&``.
        """
        tgt_mask = (tgt != pad).unsqueeze(-2)
        # Plain tensors are used directly: the autograd ``Variable`` wrapper
        # is deprecated and no longer needed.
        causal_mask = subsequent_mask(tgt.size(-1)).type_as(tgt_mask)
        return tgt_mask & causal_mask
        
def custom_collate_fn(raw_data):
    """Collate a list of ``(src_ids, trg_ids)`` pairs into a padded ``Batch``.

    Each id list is converted to a tensor, then the source and target
    sequences are right-padded separately with ``0`` up to the longest
    sequence on that side, batch dimension first.
    """
    # Transpose the list of pairs into one tuple of sources and one of targets.
    src_seqs, trg_seqs = zip(*raw_data)

    padded_sides = [
        pad_sequence(
            [torch.tensor(ids) for ids in seqs],
            batch_first=True,
            padding_value=0,
        )
        for seqs in (src_seqs, trg_seqs)
    ]

    return Batch(*padded_sides)

In [14]:
# Shuffled mini-batches of 4 sentence pairs; custom_collate_fn pads each side
# and wraps the result in a Batch object.
loader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=custom_collate_fn)

In [15]:
# Sanity check: iterate over the loader once and print the padded tensor shapes.
for batch in loader:
    # Each `batch` is the Batch object returned by custom_collate_fn.
    print("Source:", batch.src.shape)     # (B, T_src)
    print("Target:", batch.trg.shape)     # (B, T_trg - 1): Batch drops the last target column

Source: torch.Size([4, 8])
Target: torch.Size([4, 7])
Source: torch.Size([4, 8])
Target: torch.Size([4, 6])
Source: torch.Size([4, 6])
Target: torch.Size([4, 6])
Source: torch.Size([4, 8])
Target: torch.Size([4, 6])
Source: torch.Size([4, 7])
Target: torch.Size([4, 6])
Source: torch.Size([4, 7])
Target: torch.Size([4, 6])
Source: torch.Size([4, 9])
Target: torch.Size([4, 6])
Source: torch.Size([4, 7])
Target: torch.Size([4, 6])
