# GENERATING BLEU SCORES FOR THE MODEL

### Loading the model and the vocabulary files

In [1]:
import torch, pickle
from torch.utils.data import DataLoader, Dataset
from nltk.translate.bleu_score import sentence_bleu

from prompt_model import Encoder, EncoderLayer, PositionwiseFeedforwardLayer, MultiHeadAttentionLayer, Decoder, DecoderLayer, Seq2Seq

model_path = 'models/conversational-ai-model-cpu.pt'
model = torch.load(model_path)

with open('./vocabs/source_vocab.pkl', 'rb') as f:
        src_vocab = pickle.load(f)

with open('./vocabs/target_vocab.pkl', 'rb') as f:
        trg_vocab = pickle.load(f)

In [2]:
from torchtext.data import Field

SRC = Field(tokenize=lambda x: x.split(),
            init_token='<sos>',
            eos_token='<eos>',
            lower=True)

TRG = Field(tokenize=lambda x: x.split(),
            init_token='<sos>',
            eos_token='<eos>',
            lower=True)
SRC.vocab = src_vocab
TRG.vocab = trg_vocab

<hr>

### Function to convert the prompts and replies to Torch Tensors

In [3]:
import spacy

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def translate_sentence(sentence, src_field, trg_field, model, device, max_len=50000):
    model.eval()
    if isinstance(sentence, str):
        nlp = spacy.load('en')
        tokens = [token.text.lower() for token in nlp(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    tokens = [src_field.init_token] + tokens + [src_field.eos_token]
    src_indexes = [src_field.vocab.stoi[token] for token in tokens]

    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)
    src_mask = model.make_src_mask(src_tensor)

    with torch.no_grad():
        enc_src = model.encoder(src_tensor, src_mask)
    trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]

    for _ in range(max_len):
        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
        trg_mask = model.make_trg_mask(trg_tensor)
        with torch.no_grad():
            output, attention = model.decoder(
                trg_tensor, enc_src, trg_mask, src_mask)
        pred_token = output.argmax(2)[:, -1].item()
        trg_indexes.append(pred_token)
        if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
            break

    trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]
    return trg_tokens[1:], attention

In [4]:
from tokenize import untokenize

def eng_to_python(src):
    src = src.split(" ")
    translation, _ = translate_sentence(src, SRC, TRG, model, device)
    return untokenize(translation[:-1]).decode('utf-8')

<hr>

### Preparing our testing data

In [5]:
prompts = []
original_replies = []

with open("data/test.txt", "r") as file:
    current_prompt = None
    current_reply = []

    for line in file:
        if line.startswith("#"):
            if current_prompt is not None:
                original_replies.append(''.join(current_reply).strip())

            current_prompt = line.strip("# ").strip()
            prompts.append(current_prompt)
            current_reply = []
        else:
            current_reply.append(line)
    
    if current_prompt is not None:
        original_replies.append(''.join(current_reply).strip())

In [9]:
bot_reply = []

for prompt in prompts:
    bot_reply.append(eng_to_python(prompt))

<hr>

### Generating the Scores

In [24]:
bleu_scores = []

for original, bot in zip(original_replies, bot_reply):
    reference = [original.split()]  
    candidate = bot.split() 
    score = sentence_bleu(reference, candidate)
    bleu_scores.append(score)

for i, score in enumerate(bleu_scores):
    print(f"BLEU score for reply pair {i+1}: {round(score, 1000)}")

BLEU score for reply pair 1: 1.2882297539194154e-231
BLEU score for reply pair 2: 1.1200407237786664e-231
BLEU score for reply pair 3: 1.3091834502273125e-231
BLEU score for reply pair 4: 1.2882297539194154e-231
BLEU score for reply pair 5: 1.2183324802375697e-231
BLEU score for reply pair 6: 8.844844403089352e-232
BLEU score for reply pair 7: 6.373704167435469e-155
BLEU score for reply pair 8: 1.2882297539194154e-231
BLEU score for reply pair 9: 7.601159375410181e-232
BLEU score for reply pair 10: 1.2882297539194154e-231
BLEU score for reply pair 11: 1.0518351895246305e-231
BLEU score for reply pair 12: 1.2882297539194154e-231
BLEU score for reply pair 13: 1.154647032204335e-231
BLEU score for reply pair 14: 1.0832677820940877e-231
BLEU score for reply pair 15: 1.2508498911928379e-231
BLEU score for reply pair 16: 1.2183324802375697e-231
BLEU score for reply pair 17: 1.2882297539194154e-231
BLEU score for reply pair 18: 1.2882297539194154e-231
BLEU score for reply pair 19: 1.288229753

<hr><hr>