In [1]:
import json
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def load_model(base_model, legacy=False):
    """Load the tokenizer and seq2seq model for ``base_model`` and move the model to the GPU.

    Args:
        base_model: Identifier or path handed to both ``from_pretrained`` calls.
        legacy: Forwarded unchanged to the tokenizer's ``legacy`` option.

    Returns:
        A ``(tokenizer, model)`` tuple; the model has had ``.cuda()`` called on it.
    """
    # The two language tags are registered with the tokenizer only; nothing in
    # this function resizes the model's embedding matrix to cover them.
    tokenizer_options = {
        'clean_up_tokenization_spaces': True,
        'legacy': legacy,
        'additional_special_tokens': ['<|Eng|>', '<|CajFr|>'],
    }
    tokenizer = AutoTokenizer.from_pretrained(base_model, **tokenizer_options)

    model = AutoModelForSeq2SeqLM.from_pretrained(base_model)
    model.cuda()
    return tokenizer, model

def data_generator(dataset, tokenizer, batch_size=32, shuffle=True):
    """Yield tokenized batches drawn from ``dataset``.

    The caller's ``dataset`` is never reordered: a permutation of indices is
    shuffled instead of the sequence itself, so other code that iterates the
    same list afterwards still sees the original order.

    Args:
        dataset: Indexable sequence of sentence-pair records.
        tokenizer: Tokenizer forwarded to ``transform_batch``.
        batch_size: Maximum number of records per yielded batch.
        shuffle: When True (the default) records are visited in a random order
            drawn from NumPy's global RNG; when False they are visited in order.

    Yields:
        Whatever ``transform_batch`` returns for each consecutive group of up
        to ``batch_size`` records. Nothing is yielded for an empty dataset.
    """
    n_items = len(dataset)
    order = np.random.permutation(n_items) if shuffle else np.arange(n_items)
    for start in range(0, n_items, batch_size):
        # int() turns NumPy integers into plain indices accepted by any sequence.
        raw_batch = [dataset[int(idx)] for idx in order[start:start + batch_size]]
        yield transform_batch(raw_batch, tokenizer)

def transform_batch(batch, tokenizer):
    """Encode every sentence pair in ``batch`` and stack the results on the GPU.

    The padding length is derived from the longest string in the batch measured
    in characters (not tokens), plus 4, and capped at 128.

    Args:
        batch: Non-empty sequence of records with 'English' and 'Cajun French' strings.
        tokenizer: Tokenizer forwarded to ``format_translation_data``.

    Returns:
        ``(batch_input_ids, batch_target_ids)`` — the per-pair tensors
        concatenated with ``torch.cat`` and moved with ``.cuda()``.
    """
    char_lengths = [
        len(pair[language])
        for pair in batch
        for language in ('English', 'Cajun French')
    ]
    max_seq_len = min(128, max(char_lengths) + 4)

    encoded_pairs = [
        format_translation_data(pair, tokenizer, max_seq_len) for pair in batch
    ]
    batch_input_ids = torch.cat([source_ids for source_ids, _ in encoded_pairs]).cuda()
    batch_target_ids = torch.cat([dest_ids for _, dest_ids in encoded_pairs]).cuda()
    return batch_input_ids, batch_target_ids

def format_translation_data(data, tokenizer, max_seq_len=128):
    """Encode one sentence pair in a randomly chosen translation direction.

    Args:
        data: Record holding 'English' and 'Cajun French' strings.
        tokenizer: Object whose ``encode`` returns something with a ``.cuda()`` method.
        max_seq_len: Length every sequence is padded or truncated to.

    Returns:
        ``(input_ids, target_ids)`` for the chosen source and target text.
    """
    def _encode(text):
        # Fixed-length encoding so all rows of a batch can be concatenated.
        return tokenizer.encode(
            text=text,
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=max_seq_len,
        ).cuda()

    # Drawing both names without replacement picks English->Cajun French or the
    # reverse with equal probability.
    source_lang, dest_lang = np.random.choice(['English', 'Cajun French'], size=2, replace=False)
    source_text, dest_text = data[source_lang], data[dest_lang]
    return _encode(source_text), _encode(dest_text)

def eval_model(model, tokenizer, test_data, batch_size):
    """Return the mean per-batch loss of ``model`` over ``test_data``.

    Batches come from ``data_generator`` and are padded to a fixed length, so
    padding is handled explicitly here:

    * label positions equal to the tokenizer's pad id are replaced with -100 so
      they are excluded from the loss instead of being scored as targets;
    * an attention mask marking the non-pad input positions is passed to the
      model so the encoder does not attend to padding.

    If the tokenizer has no pad id, inputs and labels are passed through as-is.

    Args:
        model: Seq2seq model accepting ``attention_mask`` and ``labels`` keywords
            and returning an object with a ``.loss`` tensor.
        tokenizer: Tokenizer used to build the batches; its ``pad_token_id`` is read.
        test_data: Sequence of sentence-pair records.
        batch_size: Number of records per batch.

    Returns:
        ``np.mean`` of the batch losses (``nan`` when ``test_data`` is empty).
        The model is left in eval mode.
    """
    model.eval()
    pad_id = tokenizer.pad_token_id
    batch_losses = []
    with torch.no_grad():
        for input_ids, target_ids in data_generator(test_data, tokenizer, batch_size):
            if pad_id is None:
                attention_mask, labels = None, target_ids
            else:
                attention_mask = (input_ids != pad_id).long()
                # -100 is the index the loss ignores; masked_fill returns a new
                # tensor, leaving target_ids untouched.
                labels = target_ids.masked_fill(target_ids == pad_id, -100)
            loss = model(input_ids, attention_mask=attention_mask, labels=labels).loss
            batch_losses.append(loss.item())
    return np.mean(batch_losses)

def calculate_bleu(reference, hypothesis):
    """Score ``hypothesis`` against a single ``reference`` with smoothed sentence BLEU.

    Args:
        reference: Token list for the reference translation.
        hypothesis: Token list for the candidate translation.

    Returns:
        The value returned by ``sentence_bleu`` using ``SmoothingFunction().method4``.
    """
    return sentence_bleu(
        [reference],  # sentence_bleu takes a list of references; only one is supplied
        hypothesis,
        smoothing_function=SmoothingFunction().method4,
    )

# Read the parallel corpus; its sentence pairs live under the top-level 'data' key.
with open('Data/corpus.json', encoding='utf-8') as corpus_file:
    corpus_data = json.loads(corpus_file.read())['data']

# The evaluation subset is simply the first 100 entries of the corpus.
# NOTE(review): confirm these entries were held out from fine-tuning.
testing_data = corpus_data[:100]

  from .autonotebook import tqdm as notebook_tqdm


### Calculate BLEU scores for local models

In [2]:
# Build the base NLLB architecture, then overwrite its weights with the
# fine-tuned checkpoint saved as a state dict.
base_model = 'facebook/nllb-200-1.3B'
tokenizer, model = load_model(base_model)
# weights_only=True limits unpickling to tensors and plain containers, so a
# tampered checkpoint file cannot execute arbitrary code while loading (and the
# torch.load warning about the unsafe default goes away).
model.load_state_dict(torch.load('../Models/nllb_1_model.pt', weights_only=True))

  model.load_state_dict(torch.load('../Models/nllb_1_model.pt'))


<All keys matched successfully>

In [None]:
# Seed both RNGs so the score is repeatable: NumPy drives the batch order and
# the per-sentence translation direction, torch drives the sampling in generate().
EVAL_SEED = 0
np.random.seed(EVAL_SEED)
torch.manual_seed(EVAL_SEED)

eval_generator = data_generator(testing_data, tokenizer, 5)

bleu_scores = []

for input_ids, target_ids in eval_generator:
    generated_ids = model.generate(
        input_ids,
        do_sample=True,
        temperature=0.75,
        repetition_penalty=1.5,
        max_length=128,
        num_beams=15,
        num_return_sequences=1,
        early_stopping=True
    )

    # Decode the target and output sequences for comparison
    target_texts = [tokenizer.decode(ids, skip_special_tokens=True) for ids in target_ids]
    output_texts = [tokenizer.decode(ids, skip_special_tokens=True) for ids in generated_ids]

    # Calculate BLEU scores for the batch (whitespace-tokenized); the loop
    # variables are named so they do not shadow the generated tensor above.
    for target_text, output_text in zip(target_texts, output_texts):
        reference = target_text.split()
        hypothesis = output_text.split()
        bleu_scores.append(calculate_bleu(reference, hypothesis))

# Print average BLEU score, guarding the empty case like the Gemini cell does
# (np.mean of an empty list would give nan plus a RuntimeWarning).
if bleu_scores:
    average_bleu_score = np.mean(bleu_scores)
    print(f"Average BLEU score: {average_bleu_score:.4f}")
else:
    print("No valid translations to calculate BLEU score.")

### Use Gemini API to calculate BLEU score on fine-tuned model.

In [None]:
import requests
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import os

# The key is read from the environment so it never appears in the notebook;
# it is None when GEMINI_API_KEY is not set.
api_key = os.environ.get("GEMINI_API_KEY")

# generateContent REST endpoint for the gemini-1.5-flash-latest model.
api_endpoint = "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-latest:generateContent"

# Function to call the Google Gemini API
def translate_with_gemini(input_text):
    """Send ``input_text`` to the Gemini endpoint and return the first candidate's text.

    The request uses the module-level ``api_endpoint`` and ``api_key``. Every
    failure mode is reported with a printed message and signalled by returning
    ``None`` so the caller can skip the item:

    * the request fails or times out, or the body is not valid JSON;
    * the first candidate's ``finishReason`` is ``PROHIBITED_CONTENT``;
    * the response has no candidates / content / parts / text (for example a
      quota error payload) — the raw response is printed for diagnosis.

    NOTE(review): the prompt is the raw input text with no translation
    instruction attached — confirm the target model is expected to translate
    bare sentences.

    Args:
        input_text: Text placed in the single ``parts`` entry of the request.

    Returns:
        The generated text as a string, or ``None`` when the item should be skipped.
    """
    headers = {"Content-Type": "application/json"}
    payload = {"contents": [{"parts": [{"text": input_text}]}]}

    try:
        response = requests.post(
            api_endpoint,
            params={"key": api_key},  # let requests build and encode the query string
            headers=headers,
            json=payload,
            timeout=60,  # seconds; without a timeout a stalled connection hangs the loop
        )
        response_data = response.json()
    except (requests.RequestException, ValueError) as exc:
        print(f"Request to Gemini failed ({exc}), skipping this item.")
        return None

    candidates = response_data.get("candidates") if isinstance(response_data, dict) else None
    if candidates:
        first_candidate = candidates[0]

        # Handle prohibited content rejection (finishReason may be absent)
        if first_candidate.get("finishReason") == "PROHIBITED_CONTENT":
            print("Prohibited content detected, skipping this item.")
            return None

        parts = first_candidate.get("content", {}).get("parts") or []
        if parts and "text" in parts[0]:
            return parts[0]["text"]

    # Handle 'resource exhausted' error and other unexpected responses; the raw
    # payload is shown only here, where it is needed to diagnose the failure.
    print(response_data)
    print("Unexpected response structure, skipping this item.")
    return None


In [None]:
bleu_scores = []

for pair in testing_data:
    source_sentence, gold_sentence = pair['English'], pair['Cajun French']

    # Ask Gemini for a translation; None means the item was rejected or the
    # response could not be used, so it is left out of the average.
    candidate = translate_with_gemini(source_sentence)
    if candidate is not None:
        # BLEU over whitespace-separated tokens
        bleu_scores.append(calculate_bleu(gold_sentence.split(), candidate.split()))

if not bleu_scores:
    print("No valid translations to calculate BLEU score.")
else:
    average_bleu_score = np.mean(bleu_scores)
    print(f"Average BLEU score: {average_bleu_score:.4f}")