In [12]:
import pandas as pd
import re
from dataloader import *
from transformer import *
from bleu import *
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from datasets import load_dataset
import string
import torch
import time
from tqdm import tqdm
import spacy
from collections import Counter

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

In [111]:
tokenizer = Tokenizer()
dataloader = DataLoader(tokenize = tokenizer.tokenize)

# load English data
filenames = 'Data/multi30k-dataset/data/task1/tok/train.lc.norm.tok.en'
textENG = dataloader.load_doc(filenames)
sentencesENG = dataloader.to_sentence(textENG)

# load French data
filenames = 'Data/multi30k-dataset/data/task1/tok/train.lc.norm.tok.fr'
textFR = dataloader.load_doc(filenames)
sentencesFR = dataloader.to_sentence(textFR)

# load German data
filenames ='Data/multi30k-dataset/data/task1/tok/train.lc.norm.tok.de'
textDEU = dataloader.load_doc(filenames)
sentencesDEU = dataloader.to_sentence(textDEU)

dataset initializing start


In [116]:
spacy_eng = spacy.load("en_core_web_sm")
spacy_ger = spacy.load("de_core_news_sm")
spacy_fr = spacy.load("fr_core_news_sm")

def tokenize_eng(text):
   return [tok.text for tok in spacy_eng.tokenizer(text)]

def tokenize_ger(text):
    return [tok.text for tok in spacy_ger.tokenizer(text)]
    
def tokenize_fr(text):
    return [tok.text for tok in spacy_fr.tokenizer(text)]

In [120]:
def to_vocab(lines):
    vocab = Counter()
    for line in tqdm(lines):
        vocab.update(line)
    return vocab

def trim_vocab(vocab, min_occurance):
    tokens = [k for k,c in vocab.items() if c >= min_occurance]
    tokens = ['__PAD__', '__SOS__', '__UNK__'] + tokens 
    return {tok : idx for idx, tok in enumerate(tokens)}

def update_dataset(lines, vocab):
    new_lines = list()
    for line in lines:
        new_tokens = list()
        new_tokens.append(vocab['__SOS__'])
        for token in line:
            if token in vocab.keys():
                new_tokens.append(vocab[token])
            else:
                new_tokens.append(vocab['__UNK__'])
        while len(new_tokens) < 64:
            new_tokens.append(vocab['__PAD__'])
        new_lines.append(new_tokens)
    return new_lines

In [121]:
vocab_eng = to_vocab([tokenize_eng(sentence) for sentence in sentencesENG])
vocab_eng = trim_vocab(vocab_eng, 2)
tokenized_eng = update_dataset([tokenize_eng(sentence) for sentence in sentencesENG], vocab_eng)

vocab_ger = to_vocab([tokenize_ger(sentence) for sentence in sentencesDEU])
vocab_ger = trim_vocab(vocab_ger, 2)
tokenized_ger = update_dataset([tokenize_ger(sentence) for sentence in sentencesDEU], vocab_ger)

vocab_fr = to_vocab([tokenize_fr(sentence) for sentence in sentencesFR])
vocab_fr = trim_vocab(vocab_fr, 2)
tokenized_fr = update_dataset([tokenize_fr(sentence) for sentence in sentencesFR], vocab_fr)

100%|██████████| 29000/29000 [00:00<00:00, 169601.52it/s]
100%|██████████| 29000/29000 [00:00<00:00, 184704.32it/s]
100%|██████████| 29000/29000 [00:00<00:00, 202325.44it/s]


In [158]:
# Custom Iterator Generator
def create_Iterators(source, target, batch_size, train_size=0.9):
    df = pd.DataFrame(data = {"source": source, "target": target})

    train, rem = train_test_split(df, train_size=train_size, random_state = 42)
    valid_size = 0.5
    valid, test = train_test_split(rem, train_size=valid_size, random_state = 42)
    
    train_x, train_y = torch.tensor(list(train["source"].values)), torch.tensor(list(train["target"].values))
    valid_x, valid_y = torch.tensor(list(valid["source"].values)), torch.tensor(list(valid["target"].values))
    test_x, test_y = torch.tensor(list(test["source"].values)), torch.tensor(list(test["target"].values))

    train = torch.utils.data.TensorDataset(train_x, train_y)
    validate = torch.utils.data.TensorDataset(valid_x, valid_y)
    test = torch.utils.data.TensorDataset(test_x, test_y)

    train_iterator = torch.utils.data.DataLoader(dataset = train, batch_size = batch_size, shuffle = True)
    valid_iterator = torch.utils.data.DataLoader(dataset = validate, batch_size = batch_size, shuffle = True)
    test_iterator = torch.utils.data.DataLoader(dataset = test, batch_size = batch_size, shuffle = True)
    
    return train_iterator, valid_iterator, test_iterator 

In [176]:
train_iter_A, valid_iter_A, test_iter_A = create_Iterators(tokenized_fr, tokenized_eng, batch_size=64)
train_iter_B, valid_iter_B, test_iter_B = create_Iterators(tokenized_ger, tokenized_eng, batch_size=64)

In [177]:
next(iter(train_iter_A))

[tensor([[   1,    3,  104,  ...,    0,    0,    0],
         [   1,   18,   29,  ...,    0,    0,    0],
         [   1,   70,  144,  ...,    0,    0,    0],
         ...,
         [   1,   18,   29,  ...,    0,    0,    0],
         [   1,   87, 2531,  ...,    0,    0,    0],
         [   1,   18,   94,  ...,    0,    0,    0]]),
 tensor([[   1,    3, 2681,  ...,    0,    0,    0],
         [   1,   20,   30,  ...,    0,    0,    0],
         [   1,  117,    8,  ...,    0,    0,    0],
         ...,
         [   1,   30,   63,  ...,    0,    0,    0],
         [   1,   83, 1527,  ...,    0,    0,    0],
         [   1,   20,   89,  ...,    0,    0,    0]])]

In [168]:
embedding_enc_A = torch.nn.Embedding(len(vocab_fr), 128)
embedding_dec_A = torch.nn.Embedding(len(vocab_eng), 128)

In [169]:
transformerA = Transformer(
    src_pad_idx = vocab_fr['__PAD__'],
    trg_pad_idx = vocab_eng['__PAD__'],
    trg_sos_idx = vocab_eng['__SOS__'],
    embedding_enc = embedding_enc_A,
    embedding_dec = embedding_dec_A,
    enc_voc_size = len(vocab_fr), 
    dec_voc_size = len(vocab_eng), 
    d_model = 128, 
    n_head = 8, 
    max_len = 72,
    ffn_hidden = 512, 
    n_layers = 2, 
    drop_prob = 0.2, 
    device = device
)

In [178]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(tqdm(iterator)):
        src = batch[0]
        trg = batch[1]
        print(src.shape)
        optimizer.zero_grad()
        output = model(src, trg[:, :-1])
        output_reshape = output.contiguous().view(-1, output.shape[-1])
        trg = trg[:, 1:].contiguous().view(-1)

        loss = criterion(output_reshape, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()
        #print('step :', round((i / len(iterator)) * 100, 2), '% , loss :', loss.item())

    return epoch_loss / len(iterator)


def evaluate(model, iterator, target_vocab, criterion):
    model.eval()
    epoch_loss = 0
    batch_bleu = []
    with torch.no_grad():
        for i, batch in enumerate(tqdm(iterator)):
            src = batch[0]
            trg = batch[1]
            output = model(src, trg[:, :-1])
            output_reshape = output.contiguous().view(-1, output.shape[-1])
            trg = trg[:, 1:].contiguous().view(-1)

            loss = criterion(output_reshape, trg)
            epoch_loss += loss.item()

            total_bleu = []
            for j in range(len(batch[0])):
                trg_words = idx_to_word(batch[1][j], target_vocab)
                output_words = output[j].max(dim=1)[1]
                output_words = idx_to_word(output_words, target_vocab)
                bleu = get_bleu(hypotheses=output_words.split(), reference=trg_words.split())
                total_bleu.append(bleu)

            total_bleu = sum(total_bleu) / len(total_bleu)
            batch_bleu.append(total_bleu)

    batch_bleu = sum(batch_bleu) / len(batch_bleu)
    return epoch_loss / len(iterator), batch_bleu


def run(model, train_iter, valid_iter, target_vocab, optimizer, criterion, clip, scheduler, total_epoch, best_loss):
    train_losses, test_losses, bleus = [], [], []
    for step in range(total_epoch):
        start_time = time.time()
        train_loss = train(model, train_iter, optimizer, criterion, clip)
        valid_loss, bleu = evaluate(model, valid_iter, target_vocab, criterion)
        end_time = time.time()

        if step > 5:
            scheduler.step(valid_loss)

        train_losses.append(train_loss)
        test_losses.append(valid_loss)
        bleus.append(bleu)
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        # if valid_loss < best_loss:
        #     best_loss = valid_loss
        #     torch.save(model.state_dict(), 'saved/model-{0}.pt'.format(valid_loss))

        # f = open('result/train_loss.txt', 'w')
        # f.write(str(train_losses))
        # f.close()

        # f = open('result/bleu.txt', 'w')
        # f.write(str(bleus))
        # f.close()

        # f = open('result/test_loss.txt', 'w')
        # f.write(str(test_losses))
        # f.close()

        print(f'Epoch: {step + 1} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
        print(f'\tVal Loss: {valid_loss:.3f} |  Val PPL: {math.exp(valid_loss):7.3f}')
        print(f'\tBLEU Score: {bleu:.3f}')

In [179]:
optimizer = torch.optim.Adam(params=transformerA.parameters(),
                 lr=0.1,
                 weight_decay=5e-4,
                 eps=5e-9)
criterion = criterion = nn.CrossEntropyLoss(ignore_index=vocab_fr['__PAD__'])

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                 verbose=True,
                                                 factor=0.5,
                                                 patience=5)

run(transformerA, 
    train_iter=train_iter_A,
    valid_iter=valid_iter_A, 
    target_vocab=vocab_eng, 
    optimizer=optimizer, 
    criterion=criterion, 
    clip=1.0, 
    scheduler=scheduler,
    total_epoch=20,
    best_loss=float('inf'))

  0%|          | 0/408 [00:00<?, ?it/s]

torch.Size([64, 64])


  0%|          | 0/408 [00:05<?, ?it/s]


KeyboardInterrupt: 