In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install transformers==4.51.3 datasets==3.5.0 peft==0.15.2
!pip install accelerate==1.6.0 evaluate==0.4.3 sacrebleu==2.5.1
!pip install datasets sacremoses torch pandas numpy
!pip install unbabel-comet

In [2]:
import numpy as np
import pandas as pd
import sacrebleu
import torch
import gc
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftModel
from torch.utils.data import DataLoader
from comet import download_model, load_from_checkpoint

In [None]:
# comet_path = '/content/drive/MyDrive/wmt22-comet-da/checkpoints/model.ckpt'
comet_path = download_model('Unbabel/wmt22-comet-da')
comet_model = load_from_checkpoint(comet_path)

In [4]:
model_name = 'facebook/nllb-200-distilled-600M'
downloaded = '/content/drive/MyDrive/nllb-200-distilled-600M'

lora_path = '/content/drive/MyDrive/NLLB Checkpoints/MY-EN/Clementine/Final Model'

train_data_dir = '/content/drive/MyDrive/Corpus/train.xlsx'
test_data_dir = '/content/drive/MyDrive/Corpus/test.xlsx'
val_data_dir = '/content/drive/MyDrive/Corpus/val.xlsx'

In [None]:
!ls '/content/drive/MyDrive/NLLB Checkpoints/MY-EN/Clementine/Final Model'

In [None]:
!ls '/content/drive/MyDrive/nllb-200-distilled-600M'

In [7]:
torch.cuda.empty_cache()
gc.collect()

1061

In [8]:
MAX_SRC_LENGTH = 256
MAX_TGT_LENGTH = 256

In [9]:
src_lang="mya_Mymr"
tgt_lang="eng_Latn"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang="mya_Mymr", tgt_lang="eng_Latn")
base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

model = PeftModel.from_pretrained(base_model, lora_path)
model = model.merge_and_unload()
model = model.to("cuda")

model.eval()

In [11]:
def translate(sentence):
    inputs = tokenizer(
        sentence,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=MAX_SRC_LENGTH
    ).to("cuda")

    # Generate with attention mask
    generated = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],  # IMPORTANT!
        num_beams=5,
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang),
        max_new_tokens=MAX_TGT_LENGTH,
    )

    return tokenizer.batch_decode(generated, skip_special_tokens=True)[0]

In [12]:
val_df  = pd.read_excel('/content/drive/MyDrive/Corpus/val.xlsx')
test_df = pd.read_excel('/content/drive/MyDrive/Corpus/test.xlsx')
open_df = pd.read_excel('/content/drive/MyDrive/Corpus/open/thesis_data.xlsx')

## Validation Set

In [13]:
my_sentences =  val_df["my"].tolist()
en_references = val_df["en"].tolist()

In [None]:
my_sentences[0]

In [15]:
hypotheses = []
references = []

for src, ref in tqdm(zip(my_sentences, en_references), total=len(my_sentences)):
    hyp = translate(src)
    hypotheses.append(hyp)
    references.append([ref])   # sacreBLEU expects list of refs per line

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3000/3000 [37:21<00:00,  1.34it/s]


In [16]:
bleu = sacrebleu.corpus_bleu(hypotheses, references)
chrfpp = sacrebleu.corpus_chrf(hypotheses, references)
ter = sacrebleu.corpus_ter(hypotheses, references)
print("BLEU:", bleu.score)
print("chrF++:", chrfpp.score)
print("ter:", ter.score)
print("------------------")
print("Detailed BLEU analysis:")
print(f"Precision: {bleu.precisions}")
print(f"BP (Brevity Penalty): {bleu.bp}")
print(f"Ratio: {bleu.sys_len}/{bleu.ref_len}")

BLEU: 25.43786873577925
chrF++: 38.14507851365012
ter: 104.42204667211476
------------------
Detailed BLEU analysis:
Precision: [100.0, 56.666666666666664, 20.689655172413794, 3.5714285714285716]
BP (Brevity Penalty): 1.0
Ratio: 31/31


In [None]:
for index, h in enumerate(hypotheses):
  x = val_df.loc[index]['my']
  print(x)
  print(f'hyp: {hypotheses[index]}')
  print(f'ref: {references[index]}')
  print('---------')

In [18]:
src = []  # mm src
ref = []  # en ref
mt = []   # hyp mt
for i in my_sentences:
  src.append(i)
for i in en_references:
  ref.append(i)
for i in hypotheses:
  mt.append(i)
data = [{"src": s, "mt": m, "ref": r} for s, m, r in zip(src, mt, ref)]

In [19]:
comet_output = comet_model.predict(data, batch_size=8, gpus=1)
print(comet_output)

INFO:pytorch_lightning.utilities.rank_zero:ðŸ’¡ Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 375/375 [01:59<00:00,  3.13it/s]


Prediction({'scores': [0.7462993860244751, 0.844404399394989, 0.804506242275238, 0.8383184671401978, 0.8098359107971191, 0.8459204435348511, 0.8304367661476135, 0.8874509930610657, 0.8282618522644043, 0.8375798463821411, 0.786034107208252, 0.8548389673233032, 0.7793421149253845, 0.7653186321258545, 0.8889714479446411, 0.9084994792938232, 0.8732972145080566, 0.9271437525749207, 0.9201406836509705, 0.8868620991706848, 0.3090861141681671, 0.7313103079795837, 0.866609513759613, 0.8292749524116516, 0.8453717827796936, 0.7716851830482483, 0.8946549296379089, 0.8476064205169678, 0.7612476944923401, 0.7289016842842102, 0.8751383423805237, 0.8371801376342773, 0.8338136076927185, 0.8028976321220398, 0.8857662677764893, 0.8933863639831543, 0.8442670702934265, 0.8323000073432922, 0.9066194891929626, 0.9052395224571228, 0.8521187901496887, 0.8249386548995972, 0.8372147679328918, 0.7736263871192932, 0.7644525170326233, 0.691825270652771, 0.8565258979797363, 0.9449997544288635, 0.8411936163902283, 0.

## Closed Test Set

In [20]:
my_sentences =  test_df["my"].tolist()
en_references = test_df["en"].tolist()

In [21]:
hypotheses = []
references = []

for src, ref in tqdm(zip(my_sentences, en_references), total=len(my_sentences)):
    hyp = translate(src)
    hypotheses.append(hyp)
    references.append([ref])   # sacreBLEU expects list of refs per line

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3000/3000 [34:54<00:00,  1.43it/s]


In [22]:
bleu = sacrebleu.corpus_bleu(hypotheses, references)
chrfpp = sacrebleu.corpus_chrf(hypotheses, references)
ter = sacrebleu.corpus_ter(hypotheses, references)
print("BLEU:", bleu.score)
print("chrF++:", chrfpp.score)
print("ter:", ter.score)
print("------------------")
print("Detailed BLEU analysis:")
print(f"Precision: {bleu.precisions}")
print(f"BP (Brevity Penalty): {bleu.bp}")
print(f"Ratio: {bleu.sys_len}/{bleu.ref_len}")

BLEU: 25.450938600202846
chrF++: 62.28774100620197
ter: 21.280368859726902
------------------
Detailed BLEU analysis:
Precision: [92.3076923076923, 50.0, 18.181818181818183, 5.0]
BP (Brevity Penalty): 1.0
Ratio: 13/13


In [None]:
for index, h in enumerate(hypotheses):
  x = test_df.loc[index]['my']
  print(x)
  print(f'hyp: {hypotheses[index]}')
  print(f'ref: {references[index]}')
  print('---------')

In [24]:
src = []  # mm src
ref = []  # en ref
mt = []   # hyp mt
for i in my_sentences:
  src.append(i)
for i in en_references:
  ref.append(i)
for i in hypotheses:
  mt.append(i)
data = [{"src": s, "mt": m, "ref": r} for s, m, r in zip(src, mt, ref)]

comet_output = comet_model.predict(data, batch_size=8, gpus=1)

print(comet_output)

INFO:pytorch_lightning.utilities.rank_zero:ðŸ’¡ Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 375/375 [01:44<00:00,  3.60it/s]


Prediction({'scores': [0.8879945278167725, 0.9872603416442871, 0.7319658398628235, 0.8368770480155945, 0.7725635766983032, 0.5394095778465271, 0.8270629048347473, 0.8424126505851746, 0.7042601704597473, 0.8804618716239929, 0.7681321501731873, 0.762924075126648, 0.730593204498291, 0.6617562174797058, 0.6894875168800354, 0.6239281296730042, 0.7101783156394958, 0.3467797338962555, 0.6362544894218445, 0.7936367392539978, 0.841148316860199, 0.7127338647842407, 0.7404908537864685, 0.8022779226303101, 0.7218369245529175, 0.8275503516197205, 0.89219731092453, 0.8349968791007996, 0.8665211796760559, 0.7757955193519592, 0.40735265612602234, 0.8256471157073975, 0.5971366763114929, 0.6767626404762268, 0.4331584870815277, 0.3986790180206299, 0.7380669116973877, 0.6161659955978394, 0.7212344408035278, 0.7856491208076477, 0.8129657506942749, 0.7483763098716736, 0.7019874453544617, 0.8089272379875183, 0.5837937593460083, 0.7219492197036743, 0.5924434065818787, 0.6276965737342834, 0.7752797603607178, 0

## Opened Test Set

In [25]:
my_sentences =  open_df["my"].tolist()
en_references = open_df["en"].tolist()

In [None]:
my_sentences[0]

In [27]:
hypotheses = []
references = []

for src, ref in tqdm(zip(my_sentences, en_references), total=len(my_sentences)):
    hyp = translate(src)
    hypotheses.append(hyp)
    references.append([ref])   # sacreBLEU expects list of refs per line

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1317/1317 [11:22<00:00,  1.93it/s]


In [28]:
bleu = sacrebleu.corpus_bleu(hypotheses, references)
chrfpp = sacrebleu.corpus_chrf(hypotheses, references)
ter = sacrebleu.corpus_ter(hypotheses, references)
print("BLEU:", bleu.score)
print("chrF++:", chrfpp.score)
print("ter:", ter.score)
print("------------------")
print("Detailed BLEU analysis:")
print(f"Precision: {bleu.precisions}")
print(f"BP (Brevity Penalty): {bleu.bp}")
print(f"Ratio: {bleu.sys_len}/{bleu.ref_len}")

BLEU: 7.267884212102741
chrF++: 38.901698492118804
ter: 63.339058742700104
------------------
Detailed BLEU analysis:
Precision: [90.0, 5.555555555555555, 3.125, 1.7857142857142858]
BP (Brevity Penalty): 1.0
Ratio: 10/10


In [None]:
for index, h in enumerate(hypotheses):
  x = open_df.loc[index]['my']
  print(x)
  print(f'hyp: {hypotheses[index]}')
  print(f'ref: {references[index]}')
  print('---------')

In [30]:
src = []  # mm src
ref = []  # en ref
mt = []   # hyp mt
for i in my_sentences:
  src.append(i)
for i in en_references:
  ref.append(i)
for i in hypotheses:
  mt.append(i)
data = [{"src": s, "mt": m, "ref": r} for s, m, r in zip(src, mt, ref)]

comet_output = comet_model.predict(data, batch_size=8, gpus=1)

print(comet_output)

INFO:pytorch_lightning.utilities.rank_zero:ðŸ’¡ Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 165/165 [00:30<00:00,  5.39it/s]


Prediction({'scores': [0.7706853747367859, 0.6731371283531189, 0.7076800465583801, 0.9247676730155945, 0.6821689605712891, 0.6770399212837219, 0.9590702652931213, 0.663489043712616, 0.5163574814796448, 0.6211312413215637, 0.543829619884491, 0.6486284732818604, 0.9814432263374329, 0.5654116868972778, 0.6364551782608032, 0.7187997102737427, 0.7528113126754761, 0.7476404309272766, 0.8134772181510925, 0.7831705808639526, 0.8170834183692932, 0.6166976094245911, 0.7084717750549316, 0.8090780377388, 0.408481627702713, 0.5315746665000916, 0.8863703608512878, 0.9183129668235779, 0.5466567277908325, 0.9422414302825928, 0.7157169580459595, 0.8448777794837952, 0.491978257894516, 0.9168345332145691, 0.48648419976234436, 0.8958908319473267, 0.5457499623298645, 0.6984469294548035, 0.954015851020813, 0.5102893114089966, 0.5250628590583801, 0.6030157208442688, 0.6229382157325745, 0.795525312423706, 0.863481879234314, 0.6327207684516907, 0.6452838778495789, 0.5898043513298035, 0.7535742521286011, 0.5126