# Translation between low-resource languages
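This notebook translates between two low-resource languages by pivoting through English: a pretrained Vietnamese → English model produces an intermediate English translation, which a pretrained English → target-language model then translates into the target language (second part of Extension I). The available language pairs are: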
* Vietnamese - Khmer
* Vietnamese - Indonesian

## Get ALT dataset
Download the ALT dataset (its documentation can be found at [this link](https://www2.nict.go.jp/astrec-att/member/mutiyama/ALT/)), a parallel corpus containing translations into several Asian languages.

In [None]:
# Download dataset
!wget https://www2.nict.go.jp/astrec-att/member/mutiyama/ALT/ALT-Parallel-Corpus-20191206.zip
# Unzip dataset
!unzip ALT-Parallel-Corpus-20191206.zip
! rm ALT-Parallel-Corpus-20191206.zip

## Preparation

In [None]:
###
# Define the pretrained models to load and the target language
###

checkpoint_vi_en = "CLAck/vi-en"
# NOTE(review): this checkpoint name suggests an English -> Indonesian model while
# target_lang below is "khm" — confirm that the checkpoint matches the target language.
checkpoint_en_target = "CLAck/en-indo"

# Define target language ("khm" or "indo")
target_lang = "khm"

# mBART language code used for each supported target language
_MBART_LANG_CODES = {"khm": "km_KH", "indo": "id_ID"}

# Fail right here on an unsupported value instead of leaving `mbart_lang`
# undefined and hitting a NameError in a later cell.
if target_lang not in _MBART_LANG_CODES:
    raise ValueError(
        f"Unsupported target_lang {target_lang!r}; expected one of {sorted(_MBART_LANG_CODES)}"
    )

mbart_lang = _MBART_LANG_CODES[target_lang]

In [None]:
## Install useful libraries
%%capture
!pip install transformers datasets metrics sacrebleu transformers[sentencepiece]

In [None]:
# Rename data_id.txt to data_indo.txt in /content/ALT-Parallel-Corpus-20191206/.
# Needed to avoid confusion between id (-> name of Indonesian language) and id (-> name of index column in dataframe)
import os

# Only rename when the new file is not there yet, so that re-running this cell
# (after the rename already happened) does not raise FileNotFoundError.
if not os.path.exists('/content/ALT-Parallel-Corpus-20191206/data_indo.txt'):
    os.rename('/content/ALT-Parallel-Corpus-20191206/data_id.txt', '/content/ALT-Parallel-Corpus-20191206/data_indo.txt')

In [None]:
###
# Build the dataframes holding the English, Vietnamese and target-language sentences
###

import pandas as pd


def load_alt_language(lang_code):
    """Read the ALT file of one language into a dataframe indexed by the "id" column.

    The file is read as tab-separated text without a header; the text column
    is named after `lang_code`.
    """
    # NOTE(review): the parsing options are left exactly as they were — changing
    # them could alter the rows and therefore the seeded splits made later on.
    table = pd.read_csv(
        "/content/ALT-Parallel-Corpus-20191206/data_" + lang_code + ".txt",
        sep='\t',
        header=None,
        names=["id", lang_code],
    )
    return table.set_index("id")


df_en = load_alt_language("en")
df_vi = load_alt_language("vi")
df_target = load_alt_language(target_lang)

# Pairwise tables (rows with a missing sentence are dropped); these will be useful for the tokenizer
df_en_vi = df_en.join(df_vi).dropna()
df_en_target = df_en.join(df_target).dropna()

# Three-language table, used for evaluation
df_en_vi_target = df_en_vi.join(df_target).dropna()
df_en_vi_target.head()

In [None]:
###
# Split the data and build the test set.
# To make sure that test sentences have not been seen by the models during training,
# the exact splits used for training are replicated, and their test parts are then
# merged together: only sentences present in BOTH test parts are kept.
###

from sklearn.model_selection import train_test_split

# Same split parameters for both language pairs
split_options = dict(test_size=0.2, random_state=42)

# English - target language pair
train_df, test_df_en_target = train_test_split(df_en_target, **split_options)

# English - Vietnamese pair
train_df, test_df_en_vi = train_test_split(df_en_vi, **split_options)

# Attach the Vietnamese sentence to each target-language test row and drop the
# rows that have no Vietnamese counterpart in the Vietnamese test part
test_df = test_df_en_target.join(test_df_en_vi["vi"]).dropna()
test_df.head()

In [None]:
####
# Load tokenizers: for each translation direction, a pretrained mBart tokenizer
# and a basic Marian tokenizer
####

from transformers import AutoTokenizer, MBart50TokenizerFast, MarianTokenizer
import random, transformers

transformers.logging.set_verbosity(transformers.logging.CRITICAL)

# Both mBart tokenizers come from the same checkpoint and differ only in the target language code
mbart_checkpoint = "facebook/mbart-large-50"
mbart_tokenizer_vi = MBart50TokenizerFast.from_pretrained(
    mbart_checkpoint,
    src_lang="en_XX",
    tgt_lang="vi_VN",
)
mbart_tokenizer_target = MBart50TokenizerFast.from_pretrained(
    mbart_checkpoint,
    src_lang="en_XX",
    tgt_lang=mbart_lang,
)

# Basic Marian tokenizers (Chinese <-> English checkpoints); they are extended
# with Vietnamese / target-language tokens afterwards
marian_tokenizer_vi_en = MarianTokenizer.from_pretrained(
    "Helsinki-NLP/opus-mt-zh-en"
)
marian_tokenizer_en_target = MarianTokenizer.from_pretrained(
    "Helsinki-NLP/opus-mt-en-zh"
)

In [None]:
####
# Extend the Marian tokenizers by adding the tokens that mBart produces when
# tokenizing the sentences of the dataset
####

import transformers
from tqdm import tqdm

transformers.logging.set_verbosity(transformers.logging.CRITICAL)


def collect_mbart_tokens(sentences, mbart_tokenizer):
    """Return the distinct tokens mBart produces for `sentences`, in first-seen order.

    Each token has its leading "▁" characters stripped; tokens that become
    empty are skipped. A set is used for the membership test (the previous
    list lookup made this loop quadratic in the number of distinct tokens),
    while the list keeps the first-seen order unchanged.
    NOTE(review): the order presumably determines the ids the tokens receive
    when added to the Marian tokenizer, so it must not change — confirm.
    """
    seen = set()
    ordered_tokens = []
    for sentence in tqdm(list(sentences)):
        input_ids = mbart_tokenizer(sentence)["input_ids"]
        for token in mbart_tokenizer.convert_ids_to_tokens(input_ids):
            stripped = token.lstrip("▁")
            if stripped != "" and stripped not in seen:
                seen.add(stripped)
                ordered_tokens.append(stripped)
    return ordered_tokens


def extend_marian_tokenizer(marian_tokenizer, new_tokens, language_tags):
    """Add `new_tokens` and then `language_tags` to `marian_tokenizer` (in place).

    Prints the vocabulary size before and after, and returns the number of
    entries by which the tokenizer grew.
    """
    print(f"{len(new_tokens)} tokens to be added.")
    print(f"initial vocab size: {len(marian_tokenizer)}")
    initial_len = len(marian_tokenizer)
    marian_tokenizer.add_tokens(new_tokens, special_tokens=True)
    marian_tokenizer.add_tokens(language_tags, special_tokens=True)
    print(f"final vocab size: {len(marian_tokenizer)}")
    return len(marian_tokenizer) - initial_len


## Add vietnamese tokens to vi-en tokenizer
tokensZoo = collect_mbart_tokens(df_en_vi["vi"], mbart_tokenizer_vi)
added_tokens = extend_marian_tokenizer(marian_tokenizer_vi_en, tokensZoo, ["<zh>", "<vi>"])

## Add target language tokens to en-target tokenizer
tokensZoo = collect_mbart_tokens(df_en_target[target_lang], mbart_tokenizer_target)
added_tokens = extend_marian_tokenizer(
    marian_tokenizer_en_target, tokensZoo, ["<2zh>", "<2" + target_lang + ">"]
)

# First Phase
Vietnamese -> English

In [None]:
####
# Load the pretrained Vietnamese -> English model from `checkpoint_vi_en`.
# The other names imported here (data collator, training arguments, trainer)
# are used by later cells.
####
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

model_vi_en = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_vi_en)

In [None]:
###
# Tokenize the Vietnamese (input) and English (label) sentences of the test set
# and build the first dataset
###

from datasets import Dataset

batch_size = 64
columns_to_return = ['input_ids', 'labels', 'attention_mask']


def preprocess_function(examples):
    """Tokenize a batch: Vietnamese sentences as inputs, English sentences as labels.

    Both sides are truncated/padded to 64 tokens with the extended vi-en
    Marian tokenizer; the label ids are stored under "labels".
    """
    vi_sentences = list(examples["vi"])
    en_sentences = list(examples["en"])

    encoded = marian_tokenizer_vi_en(vi_sentences, max_length=64, truncation=True, padding="max_length")

    # Labels are tokenized in target-tokenizer mode
    with marian_tokenizer_vi_en.as_target_tokenizer():
        encoded_labels = marian_tokenizer_vi_en(en_sentences, max_length=64, truncation=True, padding="max_length")

    encoded["labels"] = encoded_labels["input_ids"]
    return encoded


ds_vi = Dataset.from_pandas(test_df).map(preprocess_function, batched=True, batch_size=16)
ds_vi.set_format(type='torch', columns=columns_to_return)


In [None]:
###
# Define the trainer arguments and the trainer used to translate from Vietnamese to English.
# In the cells shown here the trainer is only used through `predict`.
###

import torch

args = Seq2SeqTrainingArguments(
  "./",
  evaluation_strategy = "epoch",
  save_strategy="epoch",
  learning_rate=2e-5,
  per_device_train_batch_size=batch_size,
  per_device_eval_batch_size=batch_size,
  weight_decay=0.01,
  save_total_limit=1,
  num_train_epochs=5,
  predict_with_generate=True,
  # Mixed precision needs a CUDA device: enable it only when one is available
  # so that the notebook also runs (more slowly) on a CPU-only machine.
  fp16=torch.cuda.is_available(),
  disable_tqdm=False,
)

data_collator = DataCollatorForSeq2Seq(marian_tokenizer_vi_en, model=model_vi_en)

trainer = Seq2SeqTrainer(
    model_vi_en,
    args,
    data_collator=data_collator,
    tokenizer=marian_tokenizer_vi_en,
)

In [None]:
###
# Translate from Vietnamese to English and decode predicted tokens
# to get translated sentences
###

import numpy as np

predictions = trainer.predict(ds_vi)

# Generated sequences may be padded with -100, which the tokenizer cannot
# decode: replace it with the pad token id before decoding.
generated_ids = np.where(
    predictions[0] != -100, predictions[0], marian_tokenizer_vi_en.pad_token_id
)

translated_en_sentences = [
    marian_tokenizer_vi_en.decode(token_ids, skip_special_tokens=True)
    for token_ids in tqdm(generated_ids)
]

print(translated_en_sentences[:3])

# Second Phase
English -> Target language

In [None]:
###
# Build the dataframe for the second phase: the English translations produced
# by the first phase (prefixed with the special token that identifies the
# target language) next to the reference sentences in the target language
###

test_df_en_target = pd.DataFrame(
    {
        "en": translated_en_sentences,
        target_lang: test_df[target_lang].values,
    }
)
target_tag = f"<2{target_lang}> "
test_df_en_target["en"] = target_tag + test_df_en_target["en"]
test_df_en_target.head()

In [None]:
####
# Tokenize the English inputs and the target-language labels of the second phase
####

from transformers import AutoTokenizer, MBart50TokenizerFast, MarianTokenizer
import random
from tqdm import tqdm
import transformers


transformers.logging.set_verbosity(transformers.logging.CRITICAL)

max_input_length = 64
max_target_length = 64
batch_size = 16

# To tokenize English, use a basic Marian tokenizer (the extended version
# has some problems when tokenizing English)
# NOTE(review): here the language tags are added right after the base vocabulary,
# whereas the extended tokenizer adds them after the mBart tokens — presumably
# this mirrors how the model was trained; confirm.
pure_marian_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-zh")
pure_marian_tokenizer.add_tokens(["<2zh>", "<2"+target_lang+">"], special_tokens=True)


def preprocess_function(examples):
    """Tokenize a batch: English sentences as inputs, target-language sentences as labels.

    Inputs go through the basic Marian tokenizer, labels through the extended
    en-target tokenizer; both are truncated/padded to their maximum length and
    the label ids are stored under "labels".
    """
    en_sentences = list(examples["en"])
    target_sentences = list(examples[target_lang])

    encoded = pure_marian_tokenizer(en_sentences, max_length=max_input_length, truncation=True, padding="max_length")

    # Labels are tokenized in target-tokenizer mode
    with marian_tokenizer_en_target.as_target_tokenizer():
        encoded_labels = marian_tokenizer_en_target(target_sentences, max_length=max_target_length, truncation=True, padding="max_length")

    encoded["labels"] = encoded_labels["input_ids"]
    return encoded


# create dataset object and tokenize it
test_ds_en_target = Dataset.from_pandas(test_df_en_target).map(preprocess_function, batched=True, batch_size=batch_size)
test_ds_en_target.set_format(type='torch', columns=columns_to_return)

In [None]:
####
# Load the sacreBLEU metric used to score the predictions
# (the helper functions defined in this cell compute the BLEU score from the
# predicted and reference token ids)
####
# NOTE(review): `load_metric` is reported as deprecated in newer `datasets`
# releases — confirm the installed version still provides it.
from datasets import load_metric
import numpy as np
metric = load_metric("sacrebleu")

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from predictions and labels.

    Returns a tuple `(preds, labels)` where `preds` is a list of stripped
    strings and `labels` is a list of one-element lists, each holding one
    stripped label.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        # every label is wrapped in its own single-item list
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels
    
def compute_metrics(eval_preds):
    """Compute the BLEU score and the mean generated length for a set of predictions.

    `eval_preds` is a `(predictions, labels)` pair of token-id arrays; if the
    predictions come as a tuple, its first element is used. Returns a dict
    with the keys "bleu" and "gen_len", both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_token_id = marian_tokenizer_en_target.pad_token_id

    # Replace -100 in the predictions and in the labels as we can't decode them.
    preds = np.where(preds != -100, preds, pad_token_id)
    labels = np.where(labels != -100, labels, pad_token_id)

    decoded_preds = marian_tokenizer_en_target.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = marian_tokenizer_en_target.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    print("Decoded preds: ", decoded_preds[0:3])
    print("Decoded labels: ", decoded_labels[0:3])

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Generated length = number of non-padding tokens in each prediction
    prediction_lens = [np.count_nonzero(pred != pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [None]:
###
# Load the pretrained English -> target language model from `checkpoint_en_target`
# (the comment in the configuration cell describes the checkpoints as folders;
# the original header of this cell says the model is downloaded from the hub)
###

model_en_target = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_en_target)

In [None]:
###
# Define the trainer arguments and the trainer for the English -> target language model.
# In the cells shown here the trainer is only used through `predict`.
###

import torch

args = Seq2SeqTrainingArguments(
  ".",
  evaluation_strategy = "epoch",
  save_strategy="epoch",
  learning_rate=2e-5,
  per_device_train_batch_size=batch_size,
  per_device_eval_batch_size=batch_size,
  weight_decay=0.01,
  save_total_limit=1,
  num_train_epochs=5,
  predict_with_generate=True,
  # Mixed precision needs a CUDA device: enable it only when one is available
  # so that the notebook also runs (more slowly) on a CPU-only machine.
  fp16=torch.cuda.is_available(),
  disable_tqdm=False,
)

data_collator = DataCollatorForSeq2Seq(marian_tokenizer_en_target, model=model_en_target)

trainer = Seq2SeqTrainer(
    model_en_target,
    args,
    data_collator=data_collator,
    tokenizer=marian_tokenizer_en_target,
    compute_metrics=compute_metrics
)

In [None]:
###
# Translate from English to target language and evaluate predictions
###

prediction_output = trainer.predict(test_ds_en_target)

# Report only the evaluation metrics instead of dumping the whole prediction
# output, which also contains the raw arrays of predicted and label token ids.
print(prediction_output.metrics)