In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install sacrebleu
!pip install unbabel-comet

In [1]:
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import sacrebleu
import torch
from tqdm import tqdm
from comet import download_model, load_from_checkpoint

In [None]:
comet_path = download_model('Unbabel/wmt22-comet-da')
comet_model = load_from_checkpoint(comet_path)

In [4]:
model_name = 'facebook/nllb-200-distilled-600M'
downloaded = '/content/drive/MyDrive/nllb-200-distilled-600M'

train_data_dir = '/content/drive/MyDrive/Corpus/train.xlsx'
test_data_dir = '/content/drive/MyDrive/Corpus/test.xlsx'
val_data_dir = '/content/drive/MyDrive/Corpus/val.xlsx'

In [5]:
MAX_SRC_LENGTH = 256
MAX_TGT_LENGTH = 256

In [6]:
src_lang = "mya_Mymr"
tgt_lang = "eng_Latn"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang="mya_Mymr", tgt_lang="eng_Latn")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to("cuda")
model.eval()

In [8]:
def translate(sentence):

    inputs = tokenizer(
        sentence,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=MAX_SRC_LENGTH
    ).to("cuda")

    generated = model.generate(
        **inputs,
        num_beams=5,
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang),
        max_new_tokens=MAX_TGT_LENGTH,
    )

    return tokenizer.batch_decode(generated, skip_special_tokens=True)[0]

In [9]:
val_df  = pd.read_excel('/content/drive/MyDrive/Corpus/val.xlsx')
test_df = pd.read_excel('/content/drive/MyDrive/Corpus/test.xlsx')
open_df = pd.read_excel('/content/drive/MyDrive/Corpus/open/thesis_data.xlsx')

## Validation Set

In [10]:
my_sentences = val_df["my"].tolist()
en_references = val_df["en"].tolist()

In [None]:
my_sentences[0]

In [12]:
hypotheses = []
references = []

for src, ref in tqdm(zip(my_sentences, en_references), total=len(my_sentences)):
    hyp = translate(src)
    hypotheses.append(hyp)
    references.append([ref])   # sacreBLEU expects list of refs per line

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3000/3000 [42:12<00:00,  1.18it/s]


In [13]:
bleu = sacrebleu.corpus_bleu(hypotheses, references)
chrfpp = sacrebleu.corpus_chrf(hypotheses, references)
ter = sacrebleu.corpus_ter(hypotheses, references)
print("BLEU:", bleu.score)
print("chrF++:", chrfpp.score)
print("Ter:", ter.score)
print("------------------")
print("Detailed BLEU analysis:")
print(f"Precision: {bleu.precisions}")
print(f"BP (Brevity Penalty): {bleu.bp}")
print(f"Ratio: {bleu.sys_len}/{bleu.ref_len}")

BLEU: 30.848851559207468
chrF++: 42.313631815171355
Ter: 108.96213565785891
------------------
Detailed BLEU analysis:
Precision: [94.11764705882354, 45.45454545454545, 21.875, 9.67741935483871]
BP (Brevity Penalty): 1.0
Ratio: 34/34


In [None]:
for index, h in enumerate(hypotheses):
  x = val_df.loc[index]['my']
  print(x)
  print(f'hyp: {hypotheses[index]}')
  print(f'ref: {references[index]}')
  print('---------')

In [15]:
src = []  # mm src
ref = []  # en ref
mt = []   # hyp mt
for i in my_sentences:
  src.append(i)
for i in en_references:
  ref.append(i)
for i in hypotheses:
  mt.append(i)
data = [{"src": s, "mt": m, "ref": r} for s, m, r in zip(src, mt, ref)]

In [16]:
comet_output = comet_model.predict(data, batch_size=8, gpus=1)

INFO:pytorch_lightning.utilities.rank_zero:ðŸ’¡ Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 375/375 [02:07<00:00,  2.94it/s]


In [17]:
comet_output

Prediction([('scores',
             [0.7126298546791077,
              0.719385027885437,
              0.7428751587867737,
              0.691418468952179,
              0.7601484656333923,
              0.7004582285881042,
              0.790925145149231,
              0.6430992484092712,
              0.7109576463699341,
              0.7398127913475037,
              0.7071685791015625,
              0.7549905180931091,
              0.706490695476532,
              0.7867183089256287,
              0.8677572011947632,
              0.8550317883491516,
              0.8568609952926636,
              0.8831838369369507,
              0.8922865390777588,
              0.7591705918312073,
              0.6948071122169495,
              0.7588083744049072,
              0.6733438372612,
              0.8240727186203003,
              0.82406085729599,
              0.7764924168586731,
              0.7818686366081238,
              0.7554206252098083,
              0.70354163646698,
  

## Closed Test Set

In [18]:
my_sentences = test_df["my"].tolist()
en_references = test_df["en"].tolist()

In [None]:
my_sentences[0]

In [20]:
hypotheses = []
references = []

for src, ref in tqdm(zip(my_sentences, en_references), total=len(my_sentences)):
    hyp = translate(src)
    hypotheses.append(hyp)
    references.append([ref])   # sacreBLEU expects list of refs per line

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3000/3000 [39:58<00:00,  1.25it/s]


In [21]:
bleu = sacrebleu.corpus_bleu(hypotheses, references)
chrfpp = sacrebleu.corpus_chrf(hypotheses, references)
ter = sacrebleu.corpus_ter(hypotheses, references)
print("BLEU:", bleu.score)
print("chrF++:", chrfpp.score)
print("Ter:", ter.score)
print("------------------")
print("Detailed BLEU analysis:")
print(f"Precision: {bleu.precisions}")
print(f"BP (Brevity Penalty): {bleu.bp}")
print(f"Ratio: {bleu.sys_len}/{bleu.ref_len}")

BLEU: 22.416933501922298
chrF++: 27.140611260387836
Ter: 37.24064550452208
------------------
Detailed BLEU analysis:
Precision: [83.33333333333333, 54.54545454545455, 10.0, 5.555555555555555]
BP (Brevity Penalty): 1.0
Ratio: 12/12


In [None]:
for index, h in enumerate(hypotheses):
  x = test_df.loc[index]['my']
  print(x)
  print(f'hyp: {hypotheses[index]}')
  print(f'ref: {references[index]}')
  print('---------')

In [23]:
src = []  # mm src
ref = []  # en ref
mt = []   # hyp mt
for i in my_sentences:
  src.append(i)
for i in en_references:
  ref.append(i)
for i in hypotheses:
  mt.append(i)
data = [{"src": s, "mt": m, "ref": r} for s, m, r in zip(src, mt, ref)]

comet_output = comet_model.predict(data, batch_size=8, gpus=1)

comet_output

INFO:pytorch_lightning.utilities.rank_zero:ðŸ’¡ Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 375/375 [01:49<00:00,  3.42it/s]


Prediction([('scores',
             [0.6038986444473267,
              0.9686755537986755,
              0.707268238067627,
              0.8978967070579529,
              0.7360478639602661,
              0.48323482275009155,
              0.8345738649368286,
              0.8331150412559509,
              0.7576176524162292,
              0.7661628723144531,
              0.768405556678772,
              0.6271353363990784,
              0.34001001715660095,
              0.5253028869628906,
              0.6944777965545654,
              0.5963590741157532,
              0.4652526378631592,
              0.6236972808837891,
              0.3783349096775055,
              0.6994886994361877,
              0.8630408644676208,
              0.6785460114479065,
              0.8215250372886658,
              0.8399178981781006,
              0.77462238073349,
              0.7872636318206787,
              0.8921706080436707,
              0.7838489413261414,
              0.75482881069

## Open Test Set

In [24]:
my_sentences = open_df["my"].tolist()
en_references = open_df["en"].tolist()

In [None]:
my_sentences[0]

In [26]:
hypotheses = []
references = []

for src, ref in tqdm(zip(my_sentences, en_references), total=len(my_sentences)):
    hyp = translate(src)
    hypotheses.append(hyp)
    references.append([ref])   # sacreBLEU expects list of refs per line

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1317/1317 [11:57<00:00,  1.83it/s]


In [27]:
bleu = sacrebleu.corpus_bleu(hypotheses, references)
chrfpp = sacrebleu.corpus_chrf(hypotheses, references)
ter = sacrebleu.corpus_ter(hypotheses, references)
print("BLEU:", bleu.score)
print("chrF++:", chrfpp.score)
print("Ter:", ter.score)
print("------------------")
print("Detailed BLEU analysis:")
print(f"Precision: {bleu.precisions}")
print(f"BP (Brevity Penalty): {bleu.bp}")
print(f"Ratio: {bleu.sys_len}/{bleu.ref_len}")

BLEU: 14.535768424205482
chrF++: 28.47786257853076
Ter: 54.29062177945724
------------------
Detailed BLEU analysis:
Precision: [90.0, 22.22222222222222, 6.25, 3.5714285714285716]
BP (Brevity Penalty): 1.0
Ratio: 10/10


In [None]:
for index, h in enumerate(hypotheses):
  x = open_df.loc[index]['my']
  print(x)
  print(f'hyp: {hypotheses[index]}')
  print(f'ref: {references[index]}')
  print('---------')

In [29]:
src = []  # mm src
ref = []  # en ref
mt = []   # hyp mt
for i in my_sentences:
  src.append(i)
for i in en_references:
  ref.append(i)
for i in hypotheses:
  mt.append(i)
data = [{"src": s, "mt": m, "ref": r} for s, m, r in zip(src, mt, ref)]

comet_output = comet_model.predict(data, batch_size=8, gpus=1)

comet_output

INFO:pytorch_lightning.utilities.rank_zero:ðŸ’¡ Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 165/165 [00:29<00:00,  5.55it/s]


Prediction([('scores',
             [0.5861768126487732,
              0.552186906337738,
              0.7092905044555664,
              0.9016280770301819,
              0.6986328959465027,
              0.6172863841056824,
              0.6039941906929016,
              0.6879870295524597,
              0.5358574986457825,
              0.6718481183052063,
              0.5689143538475037,
              0.5648837089538574,
              0.7778329253196716,
              0.5984852910041809,
              0.670844554901123,
              0.6826345324516296,
              0.5392042398452759,
              0.7592801451683044,
              0.8452412486076355,
              0.8178825378417969,
              0.817083477973938,
              0.7102975249290466,
              0.5874270796775818,
              0.7791737914085388,
              0.4777141511440277,
              0.615534245967865,
              0.8863705396652222,
              0.8940228819847107,
              0.5466567277908