In [1]:
!pip install transformers



In [16]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# from AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("britllm/CuatroLLM")

model = AutoModelForSeq2SeqLM.from_pretrained("models/finetuned_eng_tgl_llama")
model.to("cuda")

tokenizer.pad_token = tokenizer.eos_token

ValueError: Unrecognized configuration class <class 'transformers.models.llama.configuration_llama.LlamaConfig'> for this kind of AutoModel: AutoModelForSeq2SeqLM.
Model type should be one of BartConfig, BigBirdPegasusConfig, BlenderbotConfig, BlenderbotSmallConfig, EncoderDecoderConfig, FSMTConfig, GPTSanJapaneseConfig, LEDConfig, LongT5Config, M2M100Config, MarianConfig, MBartConfig, MT5Config, MvpConfig, NllbMoeConfig, PegasusConfig, PegasusXConfig, PLBartConfig, ProphetNetConfig, Qwen2AudioConfig, SeamlessM4TConfig, SeamlessM4Tv2Config, SwitchTransformersConfig, T5Config, UMT5Config, XLMProphetNetConfig.

In [3]:
import pandas as pd

# Store items in Filipino-English DF

# Filipino lines
with open('en-fil.txt/QED-fil-reduced.txt', 'r') as file:
    fil_lines = file.readlines()

# English lines
with open('en-fil.txt/QED-en-reduced.txt', 'r') as file:
    en_lines = file.readlines()

fil_lines = ['Filipino: ' + fil_line for fil_line in fil_lines]
en_lines = ['English: ' + en_line for en_line in en_lines]

combined_items = list(zip(en_lines, fil_lines))
df = pd.DataFrame(combined_items, columns=['English', 'Filipino'])
print(len(df))

10000


In [4]:
from sklearn.model_selection import train_test_split

# Retrieve some matches for fine tuning
df['Split'] = 'unset'
for_translate, for_shots = train_test_split(df, test_size=0.1, random_state=42)
train, test = train_test_split(for_translate, test_size=0.1, random_state=42)

df.loc[for_shots.index, 'Split'] = 'shots'
df.loc[train.index, 'Split'] = 'train'
df.loc[test.index, 'Split'] = 'test'

In [5]:
source_sentences = list(test['English'])
actual_translations = list(test['Filipino'])

# Translation Predictions

In [17]:
def create_prompt(source_lang,
                  target_lang,
                  fuzzy_sources,
                  fuzzy_targets,
                  new_sources,
                  one_shot=True
                  ):

  prompts = []

  if one_shot:
    for fuzzy_src, fuzzy_tgt, new_src in zip(fuzzy_sources, fuzzy_targets, new_sources):
      segment = fuzzy_src + "\n" + fuzzy_tgt + "\n" + new_src
      prompts.append(segment)
  else:
    for new_src in new_sources:
      segment = new_src
      prompts.append(segment)

  return prompts

In [18]:
import random

shots_source = list(for_shots['English'])
shots_target = list(for_shots['Filipino'])

fuzzy_shots = []

for sent in source_sentences:
    idx = random.randint(0, len(shots_source) - 1)
    fuzzy_shots.append([shots_source[idx], shots_target[idx]])

fuzzy_source_sentences, fuzzy_target_prefixes = list(zip(*fuzzy_shots))
online_source_sentences = source_sentences

In [19]:
source_lang = "English"
target_lang = "Filipino"

# Create prompts
# Set one_shot=True to create a one-shot prompts
zero_shot_prompts = create_prompt(source_lang,
                        target_lang,
                        fuzzy_source_sentences,
                        fuzzy_target_prefixes,
                        online_source_sentences,
                        one_shot=False
                        )
one_shot_prompts = create_prompt(source_lang,
                        target_lang,
                        fuzzy_source_sentences,
                        fuzzy_target_prefixes,
                        online_source_sentences,
                        one_shot=True
                        )

In [15]:
model_inputs = tokenizer('English.', return_tensors='pt').to("cuda")
generated_ids = model.generate(**model_inputs)
print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0])

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


English.


In [10]:
zsp_translations = []
osp_translations = []
tokenizer.pad_token_id = tokenizer.eos_token_id

for p in zero_shot_prompts:
    model_inputs = tokenizer(p, return_tensors='pt').to("cuda")
    generated_ids = model.generate(**model_inputs)
    zsp_translations.append(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0])

for p in one_shot_prompts:
    model_inputs = tokenizer(p, return_tensors='pt').to("cuda")
    generated_ids = model.generate(**model_inputs)
    osp_translations.append(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0])

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

In [11]:
# Writing the translations
file_translation_osp = 'en-fil.txt/osp-translations-llama.txt'
file_translation_zsp = 'en-fil.txt/zsp-translations-llama.txt'

with open(file_translation_osp, 'w') as file:
    for l in osp_translations:
        file.write(l + "\n")

with open(file_translation_zsp, 'w') as file:
    for l in zsp_translations:
        file.write(l + "\n")

In [83]:
# Writing sources
file_online_sources = 'en-fil.txt/en-source.txt'

with open(file_online_sources, 'w') as file:
    for l in online_source_sentences:
        file.write(l)

In [7]:
# Writing the reference sentences (actual translations)
file_online_references = 'en-fil.txt/fil-references.txt'

with open(file_online_references, 'w') as file:
    for l in actual_translations:
        file.write(l)

## Load sentences

In [20]:
# Load test dataset

source_test_file = "en-fil.txt/en-source.txt"
target_test_file = "en-fil.txt/fil-references.txt"

with open(source_test_file, encoding="utf-8") as source, open(target_test_file, encoding="utf-8") as target:
  source_sentences = [sent.strip() for sent in source.readlines()]
  target_sentences = [sent.strip() for sent in target.readlines()]

print(source_sentences[0])
print(target_sentences[0])

English: That's an insult. stupid? troublesome- idiot
Filipino: Pang iinsulto yan Tanga?


In [21]:
# Read the translations (that we have generated, as discussed in Moslem et al.)
# These are the zero shot translations

translations_file_name = "en-fil.txt/zsp-translations-llama.txt"  # change to your path

with open(translations_file_name, encoding="utf-8") as translated:
  translations = [sent.strip() for sent in translated.readlines()]

print(translations_file_name, "\n")
print(*translations[0:5], sep="\n")

en-fil.txt/zsp-translations-llama.txt 

English: That's an insult. stupid? troublesome- idiot

English: 'Soo--oop of the e--e--evening, Beautiful, beautiful Soup!' &gt;
&
&


In [22]:
# Check if there are missing translations
count = 0
for idx, line in enumerate(translations):
  if len(line.strip()) == 0:
    count += 1
    print(idx,
          source_sentences[idx].strip(),
          target_sentences[idx].strip(),
          sep="\n",
          end="\n\n")
print("Missing translations:", count)

1
English: 'Soo--oop of the e--e--evening, Beautiful, beautiful Soup!' &gt;
Filipino: 'Soo-oop ng e-e-gabi, Maganda, maganda ang sopas!' &gt;

41
English: Since you lost your house, you must have no shoes left. Am I right?
Filipino: Dahil nagiba ang bahay nyo, siguro wala kang sapatos na natira, tama ba ako?

42
English: I am already excited about meeting my old school mates.
Filipino: Nasasabik na akong muling makita ang aking mga kaklase noon.

54
English: If I'm really bored . . .
Filipino: Pag wala na talaga ako magawa..

62
English: You Crazy Mother Fucker!!
Filipino: Walang-hiyang baliw!!

68
English: Wait a second!
Filipino: Teka lang!

69
English: I'll go first.
Filipino: Ako na mauna.

74
English: I'm alright.
Filipino: Okay lang po ako.

83
English: Jandi? This child! I'm sorry that I have to leave this way.
Filipino: JanDi? patawad kung umalis ako ng ganito salamat sa lahat hindi ko makakalimutan ang utang na loob ko sayo mag-ingat kayong dalawa nagsasabi ako ng totoo bakit 

IndexError: list index out of range

# Evaluation

## Calculate BLEU, chrF++, and TER for 0-shot LLaMA Fine-Tuned MT (Eng-Fil)

In [1]:
!pip3 install sacrebleu sentencepiece -q

In [23]:
import sacrebleu

references = target_sentences
translations = translations


# Calculate BLEU
bleu = sacrebleu.corpus_bleu(translations, [references])  # for spBLEU: tokenize='flores200'
bleu = round(bleu.score, 2)
print("BLEU:", bleu)

# Calculate chrF++
chrf = sacrebleu.corpus_chrf(translations, [references], word_order=2)  # for chrF++ word_order=2
chrf = round(chrf.score, 2)
print("chrF++:", chrf)

# Calculate TER
metric = sacrebleu.metrics.TER()
ter = metric.corpus_score(translations, [references])
ter = round(ter.score, 2)
print("TER:", ter)

BLEU: 0.08
chrF++: 4.89
TER: 122.35


In [25]:
# Read the translations (that we have generated, as discussed in Moslem et al.)
# These are the zero shot translations

translations_file_name = "en-fil.txt/osp-translations-llama.txt"  # change to your path

with open(translations_file_name, encoding="utf-8") as translated:
  translations = [sent.strip() for sent in translated.readlines()]

print(translations_file_name, "\n")
print(*translations[0:5], sep="\n")

en-fil.txt/osp-translations-llama.txt 

English: Say it again....my name.

Filipino: Sabihin mo ulit...ang pangalan ko.

English: That's an insult. stupid? troublesome- idiot


## Calculate BLEU, chrF++, and TER for 1-shot LLaMA Fine-Tuned MT (Eng-Fil)

In [26]:
import sacrebleu

references = target_sentences
translations = translations


# Calculate BLEU
bleu = sacrebleu.corpus_bleu(translations, [references])  # for spBLEU: tokenize='flores200'
bleu = round(bleu.score, 2)
print("BLEU:", bleu)

# Calculate chrF++
chrf = sacrebleu.corpus_chrf(translations, [references], word_order=2)  # for chrF++ word_order=2
chrf = round(chrf.score, 2)
print("chrF++:", chrf)

# Calculate TER
metric = sacrebleu.metrics.TER()
ter = metric.corpus_score(translations, [references])
ter = round(ter.score, 2)
print("TER:", ter)

BLEU: 0.14
chrF++: 6.25
TER: 116.4
