In [None]:
# standard library + numerical stack
import json
import numpy as np
import pandas as pd

# huggingface
from tokenizers import Tokenizer

# torch
from torch.utils.data import DataLoader
from torch import Tensor
import torch

from torchmetrics.functional.text import sacre_bleu_score

# custom
from utils import decoding_utils, data_utils, transformer_utils as tfu
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
DEVICE = torch.device(_device_name)

In [None]:

# Read the run configuration. JSON text is UTF-8 by specification, so the
# encoding is pinned instead of being left to the platform/locale default.
with open("params.json", "r", encoding="utf-8") as fp:
    params = json.load(fp)

# Unpack the configuration into module-level names used by the cells below.
PAD_IDX = params["PAD_IDX"]
vocab_size = params["vocab_size"]
tokens_per_batch = params["tokens_per_batch"]
epochs = params["epochs"]
grad_accumulation = params["grad_accumulation"]
d_model = params["d_model"]
n_heads = params["n_heads"]
d_ff = params["d_ff"]
n_layers = params["n_layers"]
dropout = params["dropout"]

# Load the dataset; only its "test" entry is read later in this notebook.
# The file holds natural-language text, so decoding it with a non-UTF-8
# locale default could fail or corrupt non-ASCII characters.
with open("wmt14.json", "r", encoding="utf-8") as fp:
    wmt14 = json.load(fp)

# Restore the serialized tokenizer from disk.
tokenizer = Tokenizer.from_file("bpe_tokenizer.json")

In [None]:
# Keep the first two fields of every test record as a [src, tgt] pair; the
# third field is discarded.
test_data = [[src, tgt] for src, tgt, _ in wmt14["test"]]

# shuffle=False keeps the batches in dataset order.
# Pinned host memory only helps host-to-GPU copies, so it is requested only
# when running on CUDA (on a CPU-only machine it merely triggers a warning).
test_dataloader = DataLoader(
    test_data,
    batch_size=512,
    collate_fn=data_utils.collate_fn,
    pin_memory=(DEVICE.type == "cuda"),
    shuffle=False,
)

# Build the network from the hyperparameters loaded from params.json; the
# encoder and decoder use the same number of layers.
model = tfu.Transformer(
    vocab_size=vocab_size,
    d_model=d_model,
    n_heads=n_heads,
    d_ff=d_ff,
    n_encoder_layers=n_layers,
    n_decoder_layers=n_layers,
    dropout=dropout
)
model = model.to(DEVICE)
# This notebook only evaluates: switch to inference mode so dropout is
# disabled while decoding. Harmless if the scoring helper already does this.
# NOTE(review): assumes tfu.Transformer is a torch.nn.Module — it already
# relies on .to() and .load_state_dict() here.
model.eval()

path = "drive/MyDrive/nmt/snapshot.tar"
# map_location remaps the stored tensors onto the active device, so a
# checkpoint written on a GPU machine still loads on a CPU-only host.
checkpoint = torch.load(path, map_location=DEVICE)
model.load_state_dict(checkpoint['MODEL_STATE'])

In [None]:
# Score the model on the test batches; the helper returns the BLEU value and
# the predictions, which are reused when building the results table.
# NOTE(review): the meaning of the positional argument 3 is not visible in
# this notebook — confirm against decoding_utils.get_bleu_score.
bleu, preds = decoding_utils.get_bleu_score(
    tokenizer,
    model,
    test_dataloader,
    3,
    DEVICE,
)
# Display the score as a plain Python number.
bleu.item()

In [None]:
# Turn the list of [src, tgt] pairs into an array and swap its axes so that
# row 0 holds every source sentence and row 1 every target sentence.
test_data_numpy = np.transpose(np.array(test_data), axes=(1, 0))
src_column = test_data_numpy[0]
tgt_column = test_data_numpy[1]

# Side-by-side table: source, model prediction, target.
df = pd.DataFrame({"src": src_column, "pred": preds, "tgt": tgt_column})
df