In [1]:
!pip install transformers



In [2]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# from AutoModelForSeq2SeqLM

# The tokenizer is taken from the base NLLB checkpoint, while the weights are
# loaded from a local fine-tuning checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-1.3B")

# Reuse the end-of-sequence token as the padding token.
# NOTE(review): confirm this override is actually required for this tokenizer.
tokenizer.pad_token = tokenizer.eos_token

# Load the fine-tuned weights and move them to the GPU (`.to` returns the model).
model = AutoModelForSeq2SeqLM.from_pretrained("results/checkpoint-504").to("cuda")

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  3.78it/s]


In [3]:
import pandas as pd

# Store items in Filipino-English DF
#
# Both files are read explicitly as UTF-8 so decoding does not depend on the
# platform's default encoding (the evaluation cells further down also read
# their inputs with encoding="utf-8").

# Filipino lines (readlines() keeps each line's trailing newline)
with open('en-fil.txt/QED-fil-reduced.txt', 'r', encoding='utf-8') as file:
    fil_lines = file.readlines()

# English lines
with open('en-fil.txt/QED-en-reduced.txt', 'r', encoding='utf-8') as file:
    en_lines = file.readlines()

# zip() below stops at the shorter input, which would silently drop lines and
# hide a misaligned parallel corpus. Fail loudly instead.
if len(en_lines) != len(fil_lines):
    raise ValueError(
        "Parallel corpus is misaligned: "
        f"{len(en_lines)} English lines vs {len(fil_lines)} Filipino lines"
    )

# Prefix every sentence with its language label.
fil_lines = ['Filipino: ' + fil_line for fil_line in fil_lines]
en_lines = ['English: ' + en_line for en_line in en_lines]

# One row per aligned sentence pair.
combined_items = list(zip(en_lines, fil_lines))
df = pd.DataFrame(combined_items, columns=['English', 'Filipino'])
print(len(df))

10000


In [4]:
from sklearn.model_selection import train_test_split

# Every row starts out unlabelled; the column is created before splitting so
# the derived subsets carry it as well.
df['Split'] = 'unset'

# First hold out 10% of all pairs as the pool of one-shot examples, then split
# the remainder 90/10 into train and test. Both splits use a fixed seed.
for_translate, for_shots = train_test_split(df, test_size=0.1, random_state=42)
train, test = train_test_split(for_translate, test_size=0.1, random_state=42)

# Record in the full frame which subset each row ended up in.
for split_name, subset in (('shots', for_shots), ('train', train), ('test', test)):
    df.loc[subset.index, 'Split'] = split_name

In [5]:
# Plain Python lists of the test split: inputs to translate and their references.
source_sentences = test['English'].tolist()
actual_translations = test['Filipino'].tolist()

# Translation Predictions

In [69]:
def create_prompt(source_lang,
                  target_lang,
                  fuzzy_sources,
                  fuzzy_targets,
                  new_sources,
                  one_shot=True
                  ):
    """Build one prompt string per sentence to translate.

    Args:
        source_lang: Accepted for interface compatibility; not used.
        target_lang: Accepted for interface compatibility; not used.
        fuzzy_sources: Example source sentences (used only when one_shot).
        fuzzy_targets: Example target sentences (used only when one_shot).
        new_sources: Sentences that should be translated.
        one_shot: When True, each prompt is the example source, the example
            target and the new sentence joined by newlines. When False, each
            prompt is just the new sentence.

    Returns:
        A new list of prompt strings. In one-shot mode the three inputs are
        zipped, so the result is as long as the shortest of them.
    """
    if not one_shot:
        # Zero-shot: the prompts are the sentences themselves (copied list).
        return list(new_sources)

    return [
        "\n".join((example_src, example_tgt, sentence))
        for example_src, example_tgt, sentence
        in zip(fuzzy_sources, fuzzy_targets, new_sources)
    ]

In [70]:
import random

# Pool of example pairs the one-shot demonstrations are drawn from.
shots_source = list(for_shots['English'])
shots_target = list(for_shots['Filipino'])

# Dedicated, seeded generator: the demonstrations (and therefore the one-shot
# prompts) are the same on every run, matching the seeded dataset splits.
SHOT_SEED = 42
shot_rng = random.Random(SHOT_SEED)

# One [source, target] example per test sentence, picked uniformly at random
# (with replacement) from the pool. Despite the "fuzzy" naming, no similarity
# matching is performed here.
fuzzy_shots = []
for _ in source_sentences:
    idx = shot_rng.randrange(len(shots_source))
    fuzzy_shots.append([shots_source[idx], shots_target[idx]])

# zip(*[]) yields nothing to unpack, so an empty test set is handled explicitly.
if fuzzy_shots:
    fuzzy_source_sentences, fuzzy_target_prefixes = list(zip(*fuzzy_shots))
else:
    fuzzy_source_sentences, fuzzy_target_prefixes = (), ()
online_source_sentences = source_sentences

In [71]:
source_lang = "English"
target_lang = "Filipino"

# Both prompt sets are built from the same inputs; only the one_shot flag
# differs (False -> bare sentence, True -> example pair + sentence).
prompt_inputs = (
    source_lang,
    target_lang,
    fuzzy_source_sentences,
    fuzzy_target_prefixes,
    online_source_sentences,
)
zero_shot_prompts = create_prompt(*prompt_inputs, one_shot=False)
one_shot_prompts = create_prompt(*prompt_inputs, one_shot=True)

Filipino
('English: They seem dropping out of the headquarters.\n', 'English: Oh.\n', 'English: Really.\n', 'English: Dickon\'s a kind lad an\' animals likes him."\n', "English: It was two o'clock before Dr. Kemp had finished his work for the night.\n", 'English: Huh?\n', "English: It seems like you're curious about these aspects of my life.\n", 'English: Then she ran down the path through the other door and then into the orchard, and when she stood and looked up there was the tree on the other side of the wall, and there was the robin just finishing his song and, beginning to preen his feathers with his beak.\n', "English: Now we've already done enough rhythmic dictation in the last two foundation levels for you to be able to do that.\n", "English: They're definitely a great couple, aren't they?\n", 'English: Already he was starting to suffer from a shortage of breath, just as in his earlier days when his lungs had been quite unreliable.\n', "English: I'm calling the Police!\n", 'Engl

In [None]:
def _generate_translations(prompts):
    """Translate the prompts one at a time and return the decoded outputs.

    Each prompt is tokenized on its own, moved to the GPU, passed to
    model.generate with default settings, and the first decoded sequence
    (special tokens removed) is kept.
    """
    outputs = []
    for prompt in prompts:
        encoded = tokenizer(prompt, return_tensors='pt').to("cuda")
        token_ids = model.generate(**encoded)
        decoded = tokenizer.batch_decode(token_ids, skip_special_tokens=True)
        outputs.append(decoded[0])
    return outputs


# Zero-shot prompts are processed first, then the one-shot prompts.
zsp_translations = _generate_translations(zero_shot_prompts)
osp_translations = _generate_translations(one_shot_prompts)

In [None]:
# Writing the translations (one per line)
file_translation_osp = 'en-fil.txt/osp-translations.txt'
file_translation_zsp = 'en-fil.txt/zsp-translations.txt'

# The files are written explicitly as UTF-8 because the evaluation cells read
# them back with encoding="utf-8"; relying on the platform default could
# produce files that fail to decode or decode differently.
with open(file_translation_osp, 'w', encoding='utf-8') as file:
    for translation in osp_translations:
        file.write(translation + "\n")

with open(file_translation_zsp, 'w', encoding='utf-8') as file:
    for translation in zsp_translations:
        file.write(translation + "\n")

In [83]:
# Writing sources (one sentence per line)
file_online_sources = 'en-fil.txt/en-source.txt'

# UTF-8 is used explicitly because this file is read back with
# encoding="utf-8" in the evaluation section.
with open(file_online_sources, 'w', encoding='utf-8') as file:
    for sentence in online_source_sentences:
        # Guarantee exactly one newline per sentence: a sentence that lacks a
        # trailing "\n" would otherwise fuse with the next one and shift every
        # following line out of alignment with the references.
        file.write(sentence.rstrip("\n") + "\n")

In [7]:
# Writing the reference sentences (actual translations), one per line
file_online_references = 'en-fil.txt/fil-references.txt'

# UTF-8 is used explicitly because this file is read back with
# encoding="utf-8" in the evaluation section.
with open(file_online_references, 'w', encoding='utf-8') as file:
    for reference in actual_translations:
        # Guarantee exactly one newline per reference: a reference that lacks a
        # trailing "\n" would otherwise fuse with the next one and shift every
        # following line out of alignment with the sources.
        file.write(reference.rstrip("\n") + "\n")

## Load sentences

In [9]:
# Load the test set back from disk: sources and references, line-aligned.

source_test_file = "en-fil.txt/en-source.txt"
target_test_file = "en-fil.txt/fil-references.txt"

# Both files are opened before either is read; surrounding whitespace
# (including the line break) is stripped from every line.
with open(source_test_file, encoding="utf-8") as src_handle, \
     open(target_test_file, encoding="utf-8") as tgt_handle:
    source_sentences = list(map(str.strip, src_handle))
    target_sentences = list(map(str.strip, tgt_handle))

# Show the first aligned pair as a quick sanity check.
print(source_sentences[0])
print(target_sentences[0])

English: That's an insult. stupid? troublesome- idiot
Filipino: Pang iinsulto yan Tanga?


In [10]:
# Load the generated translations (as discussed in Moslem et al.).
# This file holds the zero-shot output, one translation per line.

translations_file_name = "en-fil.txt/zsp-translations.txt"  # change to your path

with open(translations_file_name, encoding="utf-8") as translated:
    translations = list(map(str.strip, translated))

# Echo the file name and preview the first five translations.
print(translations_file_name, "\n")
print("\n".join(translations[0:5]))

en-fil.txt/zsp-translations.txt 

Filipino: Ito ay isang pag-insulto. bobo? nakaka-pansin- tanga
Filipino: 'Soo-oop ng email - e-gabi, Maganda, maganda sopas! ' &gt;
Filipino: Sunbae, gusto mo bang subukan ito na hawak ang raketa?
Filipino: O mapagmahal na poot!
Filipino: Geum JanDi...pag natatakot ka pa sa akin eh patay ka na!


In [11]:
# Report every empty translation together with its source and reference.
missing_positions = [
    position
    for position, text in enumerate(translations)
    if not text.strip()
]

for position in missing_positions:
    print(position,
          source_sentences[position].strip(),
          target_sentences[position].strip(),
          sep="\n",
          end="\n\n")

count = len(missing_positions)
print("Missing translations:", count)

Missing translations: 0


# Evaluation

## Calculate BLEU, chrF++, and TER

In [1]:
!pip3 install sacrebleu sentencepiece -q

In [12]:
import sacrebleu

# Score the currently loaded `translations` against the reference sentences.
references = target_sentences
# NOTE(review): hypotheses and references both still start with the
# "Filipino: " label, so that shared prefix is scored as matching text in
# every segment. Confirm whether it should be stripped before scoring.

# BLEU (for spBLEU pass tokenize='flores200')
bleu_result = sacrebleu.corpus_bleu(translations, [references])
bleu = round(bleu_result.score, 2)
print("BLEU:", bleu)

# chrF++ (word_order=2 selects chrF++)
chrf_result = sacrebleu.corpus_chrf(translations, [references], word_order=2)
chrf = round(chrf_result.score, 2)
print("chrF++:", chrf)

# TER
metric = sacrebleu.metrics.TER()
ter_result = metric.corpus_score(translations, [references])
ter = round(ter_result.score, 2)
print("TER:", ter)

BLEU: 40.14
chrF++: 61.79
TER: 50.53


In [13]:
# Load the generated translations (as discussed in Moslem et al.).
# This file holds the one-shot output, one translation per line.

translations_file_name = "en-fil.txt/osp-translations.txt"  # change to your path

with open(translations_file_name, encoding="utf-8") as translated:
    translations = list(map(str.strip, translated))

# Echo the file name and preview the first five translations.
print(translations_file_name, "\n")
print("\n".join(translations[0:5]))

en-fil.txt/osp-translations.txt 

Filipino: Sila ay tila drop ng punong-himpilan.
Filipino: Oh. Filipino: Oh. 'Soo-oop ng email-e-gabi, Maganda, maganda sopas! ' &gt;
Filipino: Talaga.
Filipino: Dickon ay isang uri batang lalaki ng isang 'mga hayop gusto sa kanya." Filipino: Hayop Dickon'sa uri batang lalaki ng isang 'paggusto sa kanya. "
Filipino: Ito ay 02:00 bago Dr Kemp ay tapos na sa kanyang trabaho para sa gabi.


In [14]:
import sacrebleu

# Score the currently loaded `translations` against the reference sentences.
references = target_sentences
# NOTE(review): hypotheses and references both still start with the
# "Filipino: " label, so that shared prefix is scored as matching text in
# every segment. Confirm whether it should be stripped before scoring.

# BLEU (for spBLEU pass tokenize='flores200')
bleu_result = sacrebleu.corpus_bleu(translations, [references])
bleu = round(bleu_result.score, 2)
print("BLEU:", bleu)

# chrF++ (word_order=2 selects chrF++)
chrf_result = sacrebleu.corpus_chrf(translations, [references], word_order=2)
chrf = round(chrf_result.score, 2)
print("chrF++:", chrf)

# TER
metric = sacrebleu.metrics.TER()
ter_result = metric.corpus_score(translations, [references])
ter = round(ter_result.score, 2)
print("TER:", ter)

BLEU: 7.56
chrF++: 30.25
TER: 214.24
