In [1]:
from transformers import MBartForConditionalGeneration, AutoTokenizer

In [2]:
from huggingface_hub import notebook_login
from transformers import MBart50Tokenizer
import tqdm

In [3]:
import torch

In [4]:
# Checkpoint to evaluate. The commented-out line is the stock mBART-50 model id;
# the active value points at a local checkpoint directory instead.
# NOTE(review): absolute, machine-specific path — the notebook will not run on
# another machine without editing this line.
# checkpoint = "facebook/mbart-large-50-many-to-many-mmt"
checkpoint ="/home/yush/kreol-benchmark/checkpoint_tests/checkpoint-120000_best"
# Load the tokenizer from the same location as the model weights.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [5]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
# (To force CPU execution, overwrite this with device = 'cpu'.)
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [6]:
# Load the mBART model weights from `checkpoint` and place the model on `device`
# in a single expression (Module.to returns the module itself).
model = MBartForConditionalGeneration.from_pretrained(checkpoint).to(device)

In [7]:
# Smoke test: translate a single English sentence with the loaded model.
example_english_phrase = "Would you prefer money or shares in the company?"
# example_creole_phrase = 'Mo kosmar li koumanse aswar'

# Source / target language codes read by the tokenizer.
# NOTE(review): 'cr_CR' is presumably a language code added for this checkpoint —
# confirm it exists in the tokenizer loaded above.
tokenizer.src_lang ='en_XX'
tokenizer.tgt_lang = 'cr_CR'

# Tokenize to PyTorch tensors and move them to the device the model lives on.
inputs = tokenizer(example_english_phrase, return_tensors="pt").to(device)

# Generate with the model's default generation settings (no explicit arguments).
generated_tokens = model.generate(**inputs)

# Decode the generated ids back to text without special tokens; as the last
# expression this is displayed as the cell output.
tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)


['Eski ou ti prefer larzan ou aksion dan konpayni?']

In [8]:
import evaluate

# Load the BLEU and chrF metrics through the `evaluate` library; both are used
# below to score the decoded predictions against the reference translations.
bleu = evaluate.load("bleu")
chrf = evaluate.load("chrf")

In [9]:
import pandas as pd
# Load the en-cr dev split (JSON Lines: one record per line). The 'input' and
# 'target' columns are read from it further down.
# NOTE(review): absolute, machine-specific path.
val = pd.read_json('/home/yush/kreol-benchmark/data/lang_data/en-cr/en-cr_dev.jsonl',lines=True)

In [10]:
# Set the translation direction (source 'en_XX', target 'cr_CR') before the dev
# set is tokenized. This repeats the assignment from the single-sentence example
# so the cell also works when that example is skipped.
tokenizer.src_lang='en_XX'
tokenizer.tgt_lang= 'cr_CR'

In [11]:
# Pull the source sentences and their reference translations out of the
# DataFrame as plain Python lists (same order as the rows of `val`).
val_inputs, val_labels = (list(val[column]) for column in ('input', 'target'))

In [12]:
# Tokenize every dev-set source sentence in one call: each sequence is truncated
# or padded to exactly 128 tokens, returned as PyTorch tensors, and the whole
# encoding is moved to `device`.
input_tokens = tokenizer(
    val_inputs,
    max_length=128,
    truncation=True,
    padding="max_length",
    return_tensors='pt',
).to(device)

In [13]:
def index_slice_dict(dicti,slice_begin,slice_end=None):
    """
    Slice every value of a mapping with the same ``[slice_begin:slice_end]`` range.

    Used to cut one batch out of a tokenizer encoding whose values (input ids,
    attention mask, ...) are all indexed by example along their first dimension.

    Parameters:
    - dicti (mapping): Mapping whose values support slicing (tensors, lists, str, ...).
    - slice_begin (int): Start index of the slice (inclusive).
    - slice_end (int or None): End index of the slice (exclusive). ``None``
      (the default) means "until the end".

    Returns:
    - dict: New dict with the same keys and each value sliced. The input mapping
      is not modified.
    """
    # Slicing with an end of None is identical to omitting the end, so no branch
    # is needed. The previous truthiness test (`if slice_end:`) treated
    # slice_end=0 as "no end" and returned the whole tail instead of an empty slice.
    return {key: value[slice_begin:slice_end] for key, value in dicti.items()}

In [21]:
# Ask PyTorch's caching allocator to release GPU memory it holds but is not
# currently using.
torch.cuda.empty_cache()

In [17]:
# Only 2-D tensors are supported.
def pad_and_concat(tensor1, tensor2, axis=1, pad_value=0):
    """
    Pad the shorter of two 2-D tensors along ``axis`` and join them along the other axis.

    Example: with ``axis=1`` two ``(rows, length)`` matrices of different
    ``length`` are right-padded to the longer length and then stacked row-wise.

    Parameters:
    - tensor1, tensor2 (torch.Tensor): 2-D tensors to be padded and concatenated.
    - axis (int): Axis along which the shorter tensor is padded (0, 1, -1 or -2).
      The concatenation happens along the *other* axis.
    - pad_value (scalar): Value written into the padded positions.

    Returns:
    - torch.Tensor: ``tensor1`` followed by ``tensor2`` along the non-padded axis,
      with whichever was shorter along ``axis`` padded at its end.

    Raises:
    - ValueError: If either tensor is not 2-D or ``axis`` is not a valid 2-D axis.
    """
    if tensor1.dim() != 2 or tensor2.dim() != 2:
        raise ValueError("pad_and_concat expects two 2-D tensors")
    if axis not in (-2, -1, 0, 1):
        raise ValueError(f"axis must be 0, 1, -1 or -2, got {axis}")

    # Normalise negative axes so the "other" axis can be derived reliably.
    axis = axis % 2
    other_axis = 1 - axis

    def _pad_to(tensor, target_len):
        # Extend `tensor` along `axis` up to `target_len` using `pad_value`.
        missing = target_len - tensor.shape[axis]
        if missing <= 0:
            return tensor
        pad_shape = list(tensor.shape)
        pad_shape[axis] = missing
        # new_full keeps the dtype and device of the tensor being padded, so
        # integer token ids are not promoted to float and no global device is needed.
        padding = tensor.new_full(pad_shape, pad_value)
        return torch.cat([tensor, padding], dim=axis)

    target_len = max(tensor1.shape[axis], tensor2.shape[axis])
    tensor1 = _pad_to(tensor1, target_len)
    tensor2 = _pad_to(tensor2, target_len)

    # Both tensors now agree along `axis`; join them along the other one.
    return torch.cat([tensor1, tensor2], dim=other_axis)

In [22]:
# Translate the whole dev set in roughly `batch_num` chunks so that a single
# generate() call does not have to hold every example at once.
batch_num = 5
# Floor division yields 0 when the dev set has fewer than `batch_num` rows, and
# range() rejects a zero step with ValueError — clamp the batch size to at least 1.
batch_size = max(1, len(val) // batch_num)
output = []
for i in range(0,len(val),batch_size):
    # Cut the same row range out of every tensor in the tokenizer encoding.
    input_dict = index_slice_dict(input_tokens,i,i+batch_size)
    output_tokens_bn = model.generate(**input_dict)
    # Decode this batch to text and append, preserving dev-set order.
    output_batch = tokenizer.batch_decode(output_tokens_bn, skip_special_tokens=True)
    output.extend(output_batch)
    # Release cached GPU memory between batches.
    torch.cuda.empty_cache()

0
--------
100
--------
200
--------
300
--------
400
--------


In [17]:
# chrF of the decoded predictions against the dev-set reference translations
# (unlabelled run).
chrf.compute(predictions=output,references=val_labels)

{'score': 46.111429700032176, 'char_order': 6, 'word_order': 0, 'beta': 2}

In [41]:
# chrF for the run the author labelled in the trailing comment.
chrf.compute(predictions=output,references=val_labels) # unidirectional sentences + train :(

{'score': 43.95595976980054, 'char_order': 6, 'word_order': 0, 'beta': 2}

In [17]:
# chrF for the run the author labelled in the trailing comment.
chrf.compute(predictions=output,references=val_labels) # unidirectional, early dictionary definitions + train

{'score': 44.820585206803734, 'char_order': 6, 'word_order': 0, 'beta': 2}

In [37]:
# chrF for the run the author labelled in the trailing comment.
chrf.compute(predictions=output,references=val_labels) # bidirectional, 150 epochs

{'score': 44.63455973729872, 'char_order': 6, 'word_order': 0, 'beta': 2}

In [17]:
# chrF for the run the author labelled in the trailing comment.
chrf.compute(predictions=output,references=val_labels) # bidirectional good

{'score': 44.202100788001715, 'char_order': 6, 'word_order': 0, 'beta': 2}

In [44]:
# chrF for the run the author labelled in the trailing comment.
chrf.compute(predictions=output,references=val_labels) # bidirectional

{'score': 44.51428623272526, 'char_order': 6, 'word_order': 0, 'beta': 2}

In [30]:
# chrF of the decoded predictions against the dev-set reference translations
# (unlabelled re-run of the same expression).
chrf.compute(predictions=output,references=val_labels)

{'score': 46.111429700032176, 'char_order': 6, 'word_order': 0, 'beta': 2}

In [32]:
# chrF of the decoded predictions against the dev-set reference translations
# (unlabelled re-run of the same expression).
chrf.compute(predictions=output,references=val_labels)

{'score': 44.25118855197231, 'char_order': 6, 'word_order': 0, 'beta': 2}

In [18]:
# chrF of the decoded predictions against the dev-set reference translations
# (unlabelled re-run of the same expression).
chrf.compute(predictions=output,references=val_labels)

{'score': 44.88389045784656, 'char_order': 6, 'word_order': 0, 'beta': 2}

In [18]:
# BLEU of the decoded predictions against the dev-set reference translations
# (unlabelled run). The result dict reports the score under the 'bleu' key.
bleu.compute(predictions=output,references=val_labels)

{'bleu': 0.232351109380634,
 'precisions': [0.5994308480364258,
  0.3108026554013277,
  0.18188824662813102,
  0.10883886906395827],
 'brevity_penalty': 0.9428478906353579,
 'length_ratio': 0.9444205547194152,
 'translation_length': 8785,
 'reference_length': 9302}

In [42]:
# BLEU for the run the author labelled in the trailing comment.
bleu.compute(predictions=output,references=val_labels) # unidirectional, train dictionary sentences

{'bleu': 0.21477393179739254,
 'precisions': [0.5857774674115456,
  0.29745427582797823,
  0.16978398314014753,
  0.10009868884816016],
 'brevity_penalty': 0.9206871281307047,
 'length_ratio': 0.9236723285314986,
 'translation_length': 8592,
 'reference_length': 9302}

In [18]:
# BLEU for the run the author labelled in the trailing comment.
bleu.compute(predictions=output,references=val_labels) # unidirectional, early train dictionary definitions

{'bleu': 0.22379070606879156,
 'precisions': [0.5999528524280999,
  0.31074649298597196,
  0.18065205772314272,
  0.10952040085898354],
 'brevity_penalty': 0.9080854612090485,
 'length_ratio': 0.9120619221672759,
 'translation_length': 8484,
 'reference_length': 9302}

In [1]:
# BLEU for the run the author labelled in the trailing comment.
# NOTE(review): the recorded output is a NameError — this cell was executed as
# In [1] on a kernel where `bleu` had not been loaded yet, so no score was
# recorded for this label.
bleu.compute(predictions=output,references=val_labels) # bidirectional good, 150 epochs

NameError: name 'bleu' is not defined

In [22]:
# BLEU for the run the author labelled in the trailing comment.
bleu.compute(predictions=output,references=val_labels) # bidirectional good

{'bleu': 0.22670356688311502,
 'precisions': [0.596147270033056,
  0.3071437205366856,
  0.17689437797504182,
  0.10379433599120154],
 'brevity_penalty': 0.9414833236589492,
 'length_ratio': 0.9431305095678348,
 'translation_length': 8773,
 'reference_length': 9302}

In [45]:
# BLEU for the run the author labelled in the trailing comment.
bleu.compute(predictions=output,references=val_labels) # bidirectional

{'bleu': 0.22710547384388105,
 'precisions': [0.5968992248062015,
  0.31207886199151486,
  0.18339100346020762,
  0.1127583749109052],
 'brevity_penalty': 0.911600492464646,
 'length_ratio': 0.9152870350462267,
 'translation_length': 8514,
 'reference_length': 9302}

In [31]:
# BLEU of the decoded predictions against the dev-set reference translations
# (unlabelled re-run of the same expression).
bleu.compute(predictions=output,references=val_labels)

{'bleu': 0.232351109380634,
 'precisions': [0.5994308480364258,
  0.3108026554013277,
  0.18188824662813102,
  0.10883886906395827],
 'brevity_penalty': 0.9428478906353579,
 'length_ratio': 0.9444205547194152,
 'translation_length': 8785,
 'reference_length': 9302}

: 

In [33]:
# BLEU of the decoded predictions against the dev-set reference translations
# (unlabelled re-run of the same expression).
bleu.compute(predictions=output,references=val_labels)

{'bleu': 0.21677202646600077,
 'precisions': [0.5791284403669725,
  0.29708029197080293,
  0.16904145077720206,
  0.0991552416562803],
 'brevity_penalty': 0.9354354658248155,
 'length_ratio': 0.9374328101483552,
 'translation_length': 8720,
 'reference_length': 9302}

In [17]:
# BLEU of the decoded predictions against the dev-set reference translations
# (unlabelled re-run of the same expression).
bleu.compute(predictions=output,references=val_labels)

{'bleu': 0.22213435249592578,
 'precisions': [0.5861203059011528,
  0.3022636484687084,
  0.17265816260791136,
  0.1019003029468466],
 'brevity_penalty': 0.940117001603055,
 'length_ratio': 0.9418404644162546,
 'translation_length': 8761,
 'reference_length': 9302}

In [14]:
# BLEU of the decoded predictions against the dev-set reference translations
# (unlabelled re-run of the same expression).
bleu.compute(predictions=output,references=val_labels)

{'bleu': 0.2034403402528741,
 'precisions': [0.5751253059797179,
  0.28382225522960763,
  0.16136693495184062,
  0.09110169491525423],
 'brevity_penalty': 0.9191779356555749,
 'length_ratio': 0.9222747796172867,
 'translation_length': 8579,
 'reference_length': 9302}

In [43]:
import numpy as np

In [46]:
# Draw 10 random dev-set indices (with replacement) for a manual spot check.
# The upper bound follows the dataset size instead of a hard-coded 500, so the
# lookups below stay in range if the dev split changes size.
r = list(np.random.randint(0, len(val_inputs), 10))

In [47]:
# Show source, model prediction and reference for each sampled index,
# followed by a separator line.
for i in r:
    print(
        f"Input: {val_inputs[i]}",
        f"Prediction: {output[i]}",
        f"Label: {val_labels[i]}",
        '------------',
        sep="\n",
    )

Input: But when his disciples saw this, they became angry and complained, why such a waste?
Prediction: Me ler so bann disip trouv sa, zot koumans plengne, kifer finn perdi sa?
Label: Letan bann disip trouv sa, zot mekontan, zot dir, kifer bizin fer gaspiyaz?
------------
Input: Pompeii takes you on an adventure where you will witness volcanic 3D historic.
Prediction: Pompei amenn u lor enn plas kot u pu truv volkanik 3D istorik.
Label: Pompei amenn ou dan enn avantir volkanik kot ou pou asiste an 3D enn evenman historik.
------------
Input: Return of bad spirit.
Prediction: Movezer.
Label: Li finn sorti dan enn long distans pou ekout lasazes Salomon, isi ena pli gran ki Salomon.
------------
Input: They dress up like sheep, but inside they are wolves who have come to attack you.
Prediction: Zot abiye kouma mouton, me andan zot se loulou ki finn vinn atak twa.
Label: Zot vini kouma bann mouton, me andan zot bann loulou feros ki vini pou atak zot.
------------
Input: When they saw him a

: 

In [26]:
# Show source, model prediction and reference for each sampled index,
# followed by a separator line.
for i in r:
    print(
        f"Input: {val_inputs[i]}",
        f"Prediction: {output[i]}",
        f"Label: {val_labels[i]}",
        '------------',
        sep="\n",
    )

Input: According to my grandfather, my great-grandfather, Ramsamy Ramsamy, was able to save money and buy a plot of land.
Prediction: Dapre mo granper, mo granper, Ramsamy Ramsamy, ti kapav sov larzan ek aste enn plot later.
Label: Dapre mo granper, mo aryer granper, Ramsamy Ramsamy, ti konn fer lekonomi e li ti aste enn porsion teren.
------------
Input: They paid it for a potter's field, as the Lord had commanded me.
Prediction: Zot ti pey mwa kouma Lesegner ti donn mwa lord.
Label: Zot finn servi sa kas la pou aste later potie kouma Lesegner finn donn lord.
------------
Input: When an evil spirit leaves a person, it travels through the desert, looking for a place to rest.
Prediction: Kan enn move lespri kit enn dimunn, li travers dezer, li rod enn plas pou repoze.
Label: Ler enn lespri inpir sorti dan enn dimoun, li vwayaz dan bann landrwa dezer pou rod enn plas pou repoze.
------------
Input: If the sky is red in the evening, you say the weather will be good.
Prediction: Si lesiel 

In [22]:
# Show source, model prediction and reference for each sampled index,
# followed by a separator line.
for i in r:
    print(
        f"Input: {val_inputs[i]}",
        f"Prediction: {output[i]}",
        f"Label: {val_labels[i]}",
        '------------',
        sep="\n",
    )

Input: The disciples had forgotten to bring any bread when they crossed the lake.
Prediction: Bann disip la ti bliye amenn dipin kan zot ti travers lak.
Label: Bann disip ti finn bliye amenn dipin avek zot kan zot ariv lot kote lak.
------------
Input: Some day you will see that horrible hhingh in the holy place, just as the prophet Daniel said.
Prediction: Enn zour zot pou trouv sa move nouvel dan sa plas la, parey kouma profet Daniel ti dir.
Label: Alor ler zot trouv bann sakrilez abominab parey kouma Profet Daniel ti anonse.
------------
Input: In a democracy, policy makers and people in positions of responsibility must be constantly accountable to citizens.
Prediction: Dan enn demokrasi, bann fer politik, bann dimunn dan pozisyon de responsabilite, bizin constantly kontribye a bann sitwayin.
Label: Dan enn demokrasi, bann desider e dimounn dan pozision responsabilite bizin touletan rann kont bann sitwayin.
------------
Input: John's followers took his body and buried it.
Prediction

In [20]:
# Show source, model prediction and reference for each sampled index,
# followed by a separator line.
for i in r:
    print(
        f"Input: {val_inputs[i]}",
        f"Prediction: {output[i]}",
        f"Label: {val_labels[i]}",
        '------------',
        sep="\n",
    )

Input: While the Pharisees were still there, Jesus asked them, wWhat do you think about the Messiah?
Prediction: Pandan ki bann Farizien ti ankor laba, Zezi dir zot, ki zot panse lor Lemesi.
Label: Ler bann Farizien ti finn rasanble, Zezi dimann zot ki zot panse lor Lemesi?
------------
Input: When they got there, he told them, sit here while I go over there and pray.
Prediction: Ler zot ariv laba, li dir zot, asiz laba e mo fer priye.
Label: Kan zot ariv laba, li dir bann-la, res la, mwa mo pe al inpe pli lwin pou priye.
------------
Input: Mary Magdalene, Mary the mother of James and Joseph, and the mother of James and John were some of these women.
Prediction: Marie Magdalenn, Marie, mama Zak ek Josef, mama Zak ek Zan ti parmi sa bann fam la.
Label: Parmi zot, ti ena Marie Magdalenn, Marie, mama Zak ek Zozef e mama bann garson Zebede.
------------
Input: But I tell you that Solomon with all his wealth wasn't as well clothed as the flowers.
Prediction: Me mo dir zot, salme Salomon ar

In [12]:
# Show source, model prediction and reference for the first ten dev-set rows,
# each followed by a separator line.
for i in range(10):
    print(
        f"Input: {val_inputs[i]}",
        f"Prediction: {output[i]}",
        f"Label: {val_labels[i]}",
        '------------',
        sep="\n",
    )


Input: I did not come to do away with them, but to give them their full meaning.
Prediction: Mo pa ti vini avek zot, me zot ti fer zot per zot.
Label: Mo pa finn vini pou aboli me pou donn zot zot vre sinifikasion.
------------
Input: The fact is, at the time, you had to pay the teacher in order to go to school.
Prediction: Letan zot al lekol, zot ti fer nwaye dan lekol.
Label: Anverite sa lepok la pou al lekol ti ena enn fiz pou pey profeser.
------------
Input: Angina can be described as a discomfort, heaviness, pressure, aching, burning.
Prediction: Antrefwa “mili” enn antropolog, dibwa, par lafors gravite.
Label: Nou capav dekrir anzinn couma enn sensasion inkonfortab, lourder, presion.
------------
Input: The boy said he would, but he didn't go.
Prediction: Bann garson la ti pou al plis, me li pa ti ale.
Label: Garson-la reponn wi papa, li pou ale me li pa ale.
------------
Input: Was it God in heaven or merely some human being?
Prediction: Eski u ti dan enn lot ka?
Label: Eski sa

In [33]:
# Display the first source sentence of the dev set.
val_inputs[0]

'I did not come to do away with them, but to give them their full meaning.'

In [34]:
# Display the reference translation of the first dev-set sentence.
val_labels[0]

'Mo pa finn vini pou aboli me pou donn zot zot vre sinifikasion.'

In [32]:
# Decode generated token ids to text, dropping special tokens.
# NOTE(review): when this cell was run, `output` held a tensor of token ids (see
# the `output` cell that follows), not the list of decoded strings built by the
# batch loop — this looks like a leftover from an earlier session.
tokenizer.batch_decode(output, skip_special_tokens=True)


['Mo pa vinn zwenn zot, me zot ti pe dimann zot.']

In [31]:
# Display `output`. In the recorded run it was a tensor of generated token ids
# for a single sequence rather than the list of decoded strings.
output

tensor([[    0, 20004,   890,   101,   230,  1196,    81, 19941,   190,    81,
            32,    68,  1104,    81, 19940,     2]])

In [5]:
# Display the dev-set DataFrame (pandas abbreviates the middle rows).
val

Unnamed: 0,input,target
0,"I did not come to do away with them, but to gi...",Mo pa finn vini pou aboli me pou donn zot zot ...
1,"The fact is, at the time, you had to pay the t...",Anverite sa lepok la pou al lekol ti ena enn f...
2,"Angina can be described as a discomfort, heavi...",Nou capav dekrir anzinn couma enn sensasion in...
3,"The boy said he would, but he didn't go.","Garson-la reponn wi papa, li pou ale me li pa ..."
4,Was it God in heaven or merely some human being?,Eski sa ti sorti depi dan lesiel ouswa dimoun ...
...,...,...
495,"The angel answered, the Holy Spirit will come ...","Anz la reponn, Lespri Sin pou vinn lor twa, e ..."
496,"At its end, it had a whole lot of Flame-Trees ...","Dan so finision, de kote sime, ti ena enn ta p..."
497,"The king will answer, whenever you did it for ...","Lerla lerwa reponn zot, sak fwa ki zot finn fe..."
498,You Pharisees and teachers of the Law of Moses...,"Maler lor zot profeser lalwa Moiz ek Farizien,..."


In [None]:
import numpy as np


def compute_metrics(eval_preds):
    """
    Decode predicted and reference token ids and score them with BLEU.

    Relies on the notebook-level `tokenizer` and on the `bleu` metric loaded
    with `evaluate.load("bleu")`.

    Parameters:
    - eval_preds: Pair ``(preds, labels)`` of token-id arrays. ``preds`` may be a
      tuple, in which case its first element is taken as the predictions.
      Positions equal to -100 in ``labels`` are treated as padding.

    Returns:
    - dict: ``{"bleu": <score>}`` where the score is the value the metric reports
      under its ``"bleu"`` key.
    """
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100s in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing: strip whitespace and wrap each reference in a
    # list, since the metric accepts several references per prediction.
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    # The notebook never defines `metric`; the BLEU metric it loads is bound to
    # `bleu`, and its result dict exposes the score as "bleu" (there is no
    # "score" key in its output).
    result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["bleu"]}