In [1]:
from modeling_lstm_seq2seq import *
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import MultiplicativeLR
from torch.utils.data import DataLoader
from tqdm import tqdm
import pickle

from datasets import load_dataset
from torchtext.data.utils import get_tokenizer
import torchtext



In [2]:
# import os

# os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [3]:
NUM_EPOCHS = 1
LEARNING_RATE = 0.001
BATCH_SIZE = 12
MAX_LENGTH = 128

LSTM_LOCAL_PATH = "local_lstm"
LSTM_GLOBAL_PATH = "global_lstm"
LSTM_PATH = "classic_lstm"

In [4]:
train_ds = load_dataset('wmt16', 'de-en', split='train[:1%]')
val_ds = load_dataset('wmt16', 'de-en', split='validation')
test_ds = load_dataset('wmt16', 'de-en', split='test')

Reusing dataset wmt16 (/home/neemesh20529/.cache/huggingface/datasets/wmt16/de-en/1.0.0/9e0038fe4cc117bd474d2774032cc133e355146ed0a47021b2040ca9db4645c0)
Reusing dataset wmt16 (/home/neemesh20529/.cache/huggingface/datasets/wmt16/de-en/1.0.0/9e0038fe4cc117bd474d2774032cc133e355146ed0a47021b2040ca9db4645c0)


In [5]:
train_ds = train_ds.train_test_split(test_size = 0.5)["train"]

In [7]:
de_tokenizer = get_tokenizer('spacy', language='de')
en_tokenizer = get_tokenizer('spacy', language='en')

2023-04-04 04:23:02.567994: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
2023-04-04 04:23:03.527399: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2023-04-04 04:23:03.527508: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2023-04-04 04:23:03.528797: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:3b:00.0 name: Tesla V100-PCIE-32GB computeCapability: 7.0
coreClock: 1.38GHz coreCount: 80 deviceMemorySize: 31.75GiB deviceMemoryBandwidth: 836.37GiB/s
2023-04-04 04:23:03.528843: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
2023-04-04 04:23:03.530214: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.10
2

In [8]:
def tokenize_de(text):
    return [tok for tok in de_tokenizer(text)]

def tokenize_en(text):
    return [tok for tok in en_tokenizer(text)]

In [9]:
tokenized_train_eng = [tokenize_en(text['en']) for text in tqdm(train_ds['translation'])]
tokenized_train_ger = [tokenize_de(text['de']) for text in tqdm(train_ds['translation'])]

100%|██████████| 22744/22744 [00:02<00:00, 9873.87it/s] 
100%|██████████| 22744/22744 [00:03<00:00, 7380.36it/s]


In [10]:
tokenized_val_eng = [tokenize_en(text['en']) for text in tqdm(val_ds['translation'])]
tokenized_val_ger = [tokenize_de(text['de']) for text in tqdm(val_ds['translation'])]

100%|██████████| 2169/2169 [00:00<00:00, 8184.05it/s]
100%|██████████| 2169/2169 [00:00<00:00, 6100.19it/s]


In [11]:
# create a vocabulary of the training samples for only the top 50000 most common words without torchtext

eng_vocab = {}
ger_vocab = {}

for text in tqdm(tokenized_train_eng):
    for word in text:
        if word in eng_vocab:
            eng_vocab[word] += 1
        else:
            eng_vocab[word] = 1

for text in tqdm(tokenized_train_ger):
    for word in text:
        if word in ger_vocab:
            ger_vocab[word] += 1
        else:
            ger_vocab[word] = 1

100%|██████████| 22744/22744 [00:00<00:00, 153833.28it/s]
100%|██████████| 22744/22744 [00:00<00:00, 130299.66it/s]


In [12]:
eng_vocab = {k: v for k, v in sorted(eng_vocab.items(), key=lambda item: item[1], reverse=True)}
ger_vocab = {k: v for k, v in sorted(ger_vocab.items(), key=lambda item: item[1], reverse=True)}

In [13]:
eng_vocab = dict(list(eng_vocab.items())[:50000])
ger_vocab = dict(list(ger_vocab.items())[:50000])

eng_vocab = {k: i+2 for i, k in enumerate(eng_vocab.keys())}
ger_vocab = {k: i+2 for i, k in enumerate(ger_vocab.keys())}

eng_vocab['<unk>'] = 0
ger_vocab['<unk>'] = 0

eng_vocab['<eos>'] = 1
ger_vocab['<eos>'] = 1

eng_vocab['<pad>'] = 0
ger_vocab['<pad>'] = 0

In [14]:
### pad the sequences to the same length

def pad_seq(seq, max_length):
    seq += ["<pad>" for i in range(max_length - len(seq))]
    return seq

tokenized_train_eng = [pad_seq(text + ['<eos>'], MAX_LENGTH) for text in tqdm(tokenized_train_eng)]
tokenized_train_ger = [pad_seq(text + ['<eos>'], MAX_LENGTH) for text in tqdm(tokenized_train_ger)]

tokenized_val_eng = [pad_seq(text + ['<eos>'], MAX_LENGTH) for text in tqdm(tokenized_val_eng)]
tokenized_val_ger = [pad_seq(text + ['<eos>'], MAX_LENGTH) for text in tqdm(tokenized_val_ger)]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45489/45489 [00:02<00:00, 18382.03it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45489/45489 [00:02<00:00, 17444.43it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2169/2169 [00:00<00:00, 18342.25it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2169/2169 [00:00<00:00, 18745.17it/s]


In [15]:
### pad the sequences to the same length

def pad_seq(seq, max_length):
    if len(seq)>max_length:
        return seq[:max_length-1] + ['<eos>']
    elif len(seq) == max_length:
        return seq
    seq += ["<pad>" for i in range(max_length - len(seq))]
    return seq

tokenized_train_eng = [pad_seq(text + ['<eos>'], MAX_LENGTH) for text in tqdm(tokenized_train_eng)]
tokenized_train_ger = [pad_seq(text + ['<eos>'], MAX_LENGTH) for text in tqdm(tokenized_train_ger)]

tokenized_val_eng = [pad_seq(text + ['<eos>'], MAX_LENGTH) for text in tqdm(tokenized_val_eng)]
tokenized_val_ger = [pad_seq(text + ['<eos>'], MAX_LENGTH) for text in tqdm(tokenized_val_ger)]

100%|██████████| 22744/22744 [00:00<00:00, 190047.79it/s]
100%|██████████| 22744/22744 [00:00<00:00, 76710.99it/s]
100%|██████████| 2169/2169 [00:00<00:00, 98543.59it/s]
100%|██████████| 2169/2169 [00:00<00:00, 216998.51it/s]


In [14]:
def encode_eng(text):
    encoded = []
    for token in text:
        try:
            encoded.append(eng_vocab[token])
        except:
            encoded.append(eng_vocab['<unk>'])
    return encoded

def encode_ger(text):
    encoded = []
    for token in text:
        try:
            encoded.append(ger_vocab[token])
        except:
            encoded.append(ger_vocab['<unk>'])
    return encoded

In [15]:
encoded_train_eng = [encode_eng(text) for text in tqdm(tokenized_train_eng)]
encoded_train_ger = [encode_ger(text) for text in tqdm(tokenized_train_ger)]

100%|██████████| 22744/22744 [00:00<00:00, 67712.92it/s]
100%|██████████| 22744/22744 [00:00<00:00, 67985.38it/s]


In [16]:
encoded_val_eng = [encode_eng(text) for text in tqdm(tokenized_val_eng)]
encoded_val_ger = [encode_ger(text) for text in tqdm(tokenized_val_ger)]

100%|██████████| 2169/2169 [00:00<00:00, 51083.41it/s]
100%|██████████| 2169/2169 [00:00<00:00, 58577.93it/s]


In [18]:
tokenized_train_dataloader = DataLoader(list(zip(encoded_train_eng, encoded_train_ger)), batch_size=BATCH_SIZE, shuffle=True)
tokenized_val_dataloader = DataLoader(list(zip(encoded_val_eng, encoded_val_ger)), batch_size=BATCH_SIZE, shuffle=True)

In [18]:
config = Config(
    input_size=len(ger_vocab),
    embedding_size=1000,
    hidden_size=1000,
    num_layers=4,
    vocab_size=len(ger_vocab),
    dropout=0.2,
    device="cuda",
    max_length=MAX_LENGTH,
)

In [19]:
model = LSTMSeq2Seq(config, attention = False, alignment = "local-m", scoring_function = "general")
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [20]:
model.to(config.device)

LSTMSeq2Seq(
  (encoder): LSTMEncoder(
    (embedding): Embedding(35138, 1000)
    (lstm): LSTM(1000, 1000, num_layers=4, batch_first=True, dropout=0.2)
  )
  (decoder): LSTMDecoder(
    (embedding): Embedding(35138, 1000)
    (lstm): LSTM(1000, 1000, num_layers=4, batch_first=True, dropout=0.2)
  )
  (lm_head): Linear(in_features=1000, out_features=35138, bias=True)
)

In [21]:
import evaluate
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")

In [22]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

def compute_metrics(preds, labels):
    if isinstance(preds, tuple):
        preds = preds[0]
    
    decoded_preds, decoded_labels = postprocess_text(preds, labels)

    bleu_1 = bleu.compute(predictions=decoded_preds, references=decoded_labels, max_order=1)
    bleu_2 = bleu.compute(predictions=decoded_preds, references=decoded_labels, max_order=2)
    rouge_l = rouge.compute(predictions=decoded_preds, references=decoded_labels, rouge_types=["rougeL"])
    result = {"bleu_1": bleu_1["bleu"], "bleu_2": bleu_2["bleu"], "rouge_l": rouge_l["rougeL"]}

    return result

In [23]:
def train(model, dataloader, optimizer, criterion, device):
    # model.train()
    epoch_loss = 0
    for batch in tqdm(dataloader):
        src = torch.stack(batch[0]).to(torch.int64).to(device)
        trg = torch.stack(batch[1]).to(torch.int64).to(device)
        src = src.transpose(0, 1)
        trg = trg.transpose(0, 1)
        optimizer.zero_grad()
        output = model(src, trg)
        # output = output[1:].view(-1, output.shape[2])
        # trg = trg[1:].view(-1)
        # output = output.max(dim = 2)

        output = output.permute(0, 2, 1)

        loss = criterion(output, trg)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return model, optimizer, epoch_loss / len(dataloader)

def evaluate(model, dataloader, criterion, device):
    # model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for batch in tqdm(dataloader):
            src = torch.stack(batch[0]).to(torch.int64).to(device)
            trg = torch.stack(batch[1]).to(torch.int64).to(device)
            output = model(src, trg, 0)
            # output = output[1:].view(-1, output.shape[2])
            # trg = trg[1:].view(-1)
            output = output.permute(0, 2, 1)
            loss = criterion(output, trg)
            epoch_loss += loss.item()
    return epoch_loss / len(dataloader)

In [24]:
for epoch in range(NUM_EPOCHS):
    model, optimizer, train_loss = train(model, tokenized_train_dataloader, optimizer, criterion, config.device)
    val_loss = evaluate(model, tokenized_val_dataloader, criterion, config.device)
    train_losses.append(train_loss)
    dev_losses.append(dev_loss)
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\t Val. Loss: {val_loss:.3f}')

100%|██████████| 1896/1896 [1:47:00<00:00,  3.39s/it]
100%|██████████| 181/181 [01:24<00:00,  2.15it/s]

Epoch: 01
	Train Loss: 10.404
	 Val. Loss: 10.405





In [26]:
torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    },
    LSTM_PATH + "/model.pt"
)
with open(LSTM_PATH + "/eng_vocab.pkl", "wb+") as f:
    pickle.dump(eng_vocab, f)

with open(LSTM_PATH + "/ger_vocab.pkl", "wb+") as f:
    pickle.dump(ger_vocab, f)

In [26]:
# model = LSTMSeq2Seq(config, attention = True, alignment = "local-m", scoring_function = "dot")
# ckpt = torch.load(LSTM_LOCAL_PATH + "/model.pt")
# model.load_state_dict(ckpt['model_state_dict'])

In [36]:
model.eval()

LSTMSeq2Seq(
  (encoder): LSTMEncoder(
    (embedding): Embedding(35224, 1000)
    (lstm): LSTM(1000, 1000, num_layers=4, batch_first=True, dropout=0.2)
  )
  (decoder): LSTMDecoder(
    (embedding): Embedding(35224, 1000)
    (lstm): LSTM(1000, 1000, num_layers=4, batch_first=True, dropout=0.2)
  )
  (lm_head): Linear(in_features=1000, out_features=35224, bias=True)
)

In [22]:
tokenized_test_eng = [tokenize_en(text['en']) for text in tqdm(test_ds['translation'])]
tokenized_test_ger = [tokenize_de(text['de']) for text in tqdm(test_ds['translation'])]

tokenized_test_eng = [pad_seq(text + ['<eos>'], MAX_LENGTH) for text in tqdm(tokenized_test_eng)]
tokenized_test_ger = [pad_seq(text + ['<eos>'], MAX_LENGTH) for text in tqdm(tokenized_test_ger)]

encoded_test_eng = [encode_eng(text) for text in tqdm(tokenized_test_eng)]
encoded_test_ger = [encode_ger(text) for text in tqdm(tokenized_test_ger)]

tokenized_test_dataloader = DataLoader(list(zip(encoded_test_eng, encoded_test_ger)), batch_size=BATCH_SIZE, shuffle=True)

100%|██████████| 2999/2999 [00:00<00:00, 8820.21it/s]
100%|██████████| 2999/2999 [00:00<00:00, 5632.16it/s]
100%|██████████| 2999/2999 [00:00<00:00, 140432.93it/s]
100%|██████████| 2999/2999 [00:00<00:00, 166607.74it/s]
100%|██████████| 2999/2999 [00:00<00:00, 55951.42it/s]
100%|██████████| 2999/2999 [00:00<00:00, 44763.64it/s]


In [23]:
eng_idx2word = {v: k for k, v in eng_vocab.items()}
ger_idx2word = {v: k for k, v in ger_vocab.items()}

In [47]:
bleu_1_scores = []
bleu_2_scores = []
rouge_scores = []

decoded_sent = []
trg_sent = []

with torch.no_grad():
    for batch in tqdm(tokenized_test_dataloader):
        src = torch.stack(batch[0]).to(torch.int64).to(config.device)
        trg = torch.stack(batch[1]).to(torch.int64).to(config.device)
        src = src.transpose(0, 1)
        trg = trg.transpose(0, 1)
        output = model(src, trg, 0)
        # output = output[1:].view(-1, output.shape[2])
        # trg = trg[1:].view(-1)
        # output = output.permute(1, 0, 2)
        for i in range(output.shape[0]):
            # print(output[i].shape)
            decoded_tokens = output[i].argmax(dim = 1)
            decoded_sent.append(' '.join([ger_idx2word[i.item()] for i in decoded_tokens]))
            trg_sent.append([' '.join([ger_idx2word[i.item()] for i in trg[i]])])

            # bleu_1_scores.append(bleu.compute(predictions = [decoded_sent], references = [trg_sent], max_order = 1))
bleu.compute(predictions = decoded_sent, references = trg_sent, max_order = 1)

100%|██████████| 250/250 [02:45<00:00,  1.51it/s]


{'bleu': 0.006034572437462622,
 'precisions': [0.03267755918639546],
 'brevity_penalty': 0.18467023204031024,
 'length_ratio': 0.3718600729470295,
 'translation_length': 389870,
 'reference_length': 1048432}

In [20]:
temp_model = LSTMSeq2Seq(config, attention = False, alignment = "local-m", scoring_function = "general")

In [25]:
temp_model = temp_model.to(config.device)

In [26]:
bleu_1_scores = []
bleu_2_scores = []
rouge_scores = []

decoded_sent = []
trg_sent = []

with torch.no_grad():
    for batch in tqdm(tokenized_test_dataloader):
        src = torch.stack(batch[0]).to(torch.int64).to(config.device)
        trg = torch.stack(batch[1]).to(torch.int64).to(config.device)
        src = src.transpose(0, 1)
        trg = trg.transpose(0, 1)
        output = model(src, trg, 0)
        # output = output[1:].view(-1, output.shape[2])
        # trg = trg[1:].view(-1)
        # output = output.permute(1, 0, 2)
        for i in range(output.shape[0]):
            # print(output[i].shape)
            decoded_tokens = output[i].argmax(dim = 1)
            decoded_sent.append(' '.join([ger_idx2word[i.item()] for i in decoded_tokens]))
            trg_sent.append([' '.join([ger_idx2word[i.item()] for i in trg[i]])])


            # bleu_1_scores.append(bleu.compute(predictions = [decoded_sent], references = [trg_sent], max_order = 1))

100%|██████████| 250/250 [02:40<00:00,  1.55it/s]


NameError: name 'bleu' is not defined

In [29]:
bleu.compute(predictions = decoded_sent, references = trg_sent, max_order = 1)

{'bleu': 0.004277348394593872,
 'precisions': [0.023072549582035733],
 'brevity_penalty': 0.18538689794058183,
 'length_ratio': 0.37239644183165366,
 'translation_length': 390464,
 'reference_length': 1048517}

In [20]:
model = LSTMSeq2Seq(config, attention = False, alignment = "local-m", scoring_function = "general")
ckpt = torch.load(LSTM_PATH + "/model.pt")
model.load_state_dict(ckpt['model_state_dict'])
model = model.to(config.device)

RuntimeError: Error(s) in loading state_dict for LSTMSeq2Seq:
	size mismatch for encoder.embedding.weight: copying a param with shape torch.Size([35224, 1000]) from checkpoint, the shape in current model is torch.Size([35061, 1000]).
	size mismatch for decoder.embedding.weight: copying a param with shape torch.Size([35224, 1000]) from checkpoint, the shape in current model is torch.Size([35061, 1000]).
	size mismatch for lm_head.weight: copying a param with shape torch.Size([35224, 1000]) from checkpoint, the shape in current model is torch.Size([35061, 1000]).
	size mismatch for lm_head.bias: copying a param with shape torch.Size([35224]) from checkpoint, the shape in current model is torch.Size([35061]).

In [None]:
torch.save(
    {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    },
    LSTM_SAVE_PATH
)