# Get ALT dataset
Download the ALT dataset (see the [documentation](https://www2.nict.go.jp/astrec-att/member/mutiyama/ALT/)), which contains translations of English sentences into several Asian languages.

In [None]:
# Download dataset
!wget https://www2.nict.go.jp/astrec-att/member/mutiyama/ALT/ALT-Parallel-Corpus-20191206.zip
# Unzip dataset
!unzip ALT-Parallel-Corpus-20191206.zip
! rm ALT-Parallel-Corpus-20191206.zip

In [None]:
# Mount Google Drive (Colab only). Later cells load and save model checkpoints
# under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Preparation

In [None]:
%%capture
# Install the libraries used below; the %%capture cell magic hides pip's output.
# NOTE(review): versions are unpinned, and `metrics` is not imported anywhere in
# the visible notebook — confirm it is actually needed.
!pip install transformers datasets metrics sacrebleu transformers[sentencepiece]

Start from the pretrained MarianMT English-to-Chinese model.

In [None]:
# Hub identifier of the base checkpoint: later cells load its tokenizer and,
# when not resuming from a saved checkpoint, its weights.
model_checkpoint = "Helsinki-NLP/opus-mt-en-zh"

Select the target language and the corresponding mBART language code.

In [None]:
# Target language code. It is also used to build the corpus file name
# ("data_<language>.txt") and the "<2<language>>" tag added to the English side.
# NOTE(review): confirm the corpus really ships a file for every code listed
# below (e.g. "data_indo.txt").
language = 'vi'

# mBART-50 language code for each supported target language.
MBART_LANGUAGE_CODES = {
  'vi': "vi_VN",    # Vietnamese
  'indo': "id_ID",  # Indonesian
  'fil': 'tl_XX',   # Filipino
}

# Fail here with a clear message instead of leaving `mbart_language` undefined,
# which would only surface later as a NameError when the mBART tokenizer is loaded.
if language not in MBART_LANGUAGE_CODES:
  raise ValueError(
    f"Unsupported language {language!r}; expected one of {sorted(MBART_LANGUAGE_CODES)}"
  )

mbart_language = MBART_LANGUAGE_CODES[language]

In [None]:
# Read the English and target-language sentences and align them by sentence id
import pandas as pd


def _read_alt_column(column):
  """Read `data_<column>.txt` from the corpus folder into a frame indexed by "id".

  The file is parsed as tab-separated with no header row; its text column is
  named after `column`.
  NOTE(review): read_csv keeps its default quote handling here, so sentences
  containing double quotes may not come back exactly as written — confirm.
  """
  path = "/content/ALT-Parallel-Corpus-20191206/data_" + column + ".txt"
  frame = pd.read_csv(path, sep='\t', header=None, names=["id", column])
  return frame.set_index("id")


df_en = _read_alt_column("en")
df_target = _read_alt_column(language)

# Left-join the target sentences onto the English ones, then drop every id that
# lacks a sentence on either side.
df_en_target = df_en.join(df_target)
df_en_target.dropna(inplace=True)
df_en_target.head()

In [None]:
###
# Split dataset into training, evaluation and test
###

from sklearn.model_selection import train_test_split
from datasets import Dataset

## "pure-finetuning": train on English -> target language pairs only.
## "mixed-finetuning": also add English -> Chinese pairs to the training dataset.

TRAINING_TYPE = "pure-finetuning"
#TRAINING_TYPE = "mixed-finetuning"

# Reject typos up front; otherwise no branch would run and train_df / eval_df /
# test_df would stay undefined (or stale from an earlier run).
if TRAINING_TYPE not in ("pure-finetuning", "mixed-finetuning"):
  raise ValueError(f"Unknown TRAINING_TYPE: {TRAINING_TYPE!r}")

## Add special token to the English side of the target language dataset
## ("<2zz> " for language zz). Sentences that already start with the tag are
## left untouched, so re-running this cell does not stack the tag.
target_tag = "<2" + language + "> "
english = df_en_target["en"]
already_tagged = english.str.startswith(target_tag, na=False)
df_en_target["en"] = english.where(already_tagged, target_tag + english)

# 80% train; the remaining 20% is halved into evaluation and test.
train_df, test_df = train_test_split(df_en_target, test_size=0.2, random_state=42)
eval_df, test_df = train_test_split(test_df, test_size=0.5, random_state=42)

if TRAINING_TYPE == "mixed-finetuning": # add also Chinese sentences

  # evaluation and test sets only have target language, training set also has Chinese

  # The Chinese text column is deliberately named after `language` so that it
  # lines up with the target-language column when the frames are concatenated.
  df_en = pd.read_csv("/content/ALT-Parallel-Corpus-20191206/data_en.txt", sep='\t', header=None, names=["id", "en"])
  df_zh = pd.read_csv("/content/ALT-Parallel-Corpus-20191206/data_zh.txt", sep='\t', header=None, names=["id", language])
  df_en = df_en.set_index("id")
  df_zh = df_zh.set_index("id")
  df_en_zh = df_en.join(df_zh)
  df_en_zh.dropna(inplace=True)

  ## Add special token to Chinese dataset
  df_en_zh["en"] = "<2zh> " + df_en_zh["en"]

  # Keep 80% of the Chinese pairs (same fraction as the target-language
  # training split) and append them to the training set.
  df_en_zh_train, _ = train_test_split(df_en_zh, test_size=0.2, random_state=21)
  train_df = pd.concat([train_df[["en", language]], df_en_zh_train[["en", language]]])

  # Label the report with the selected language instead of always "Vietnamese";
  # unknown codes fall back to the code itself.
  language_names = {"vi": "Vietnamese", "indo": "Indonesian", "fil": "Filipino"}
  target_name = language_names.get(language, language)

  print(f"{target_name} dataset size: ", df_en_target.shape)
  print("Chinese dataset size: ", df_en_zh.shape)
  print(f"{target_name} + Chinese dataset size: ", train_df.shape)




In [None]:
####
# Load tokenizers: basic Marian tokenizer + pretrained mBart tokenizer
####

from transformers import AutoTokenizer, MBart50TokenizerFast, MarianTokenizer
import random, transformers

# From here on only CRITICAL messages from transformers are shown.
transformers.logging.set_verbosity(transformers.logging.CRITICAL)

# Marian tokenizer that belongs to the base translation checkpoint.
marian_tokenizer = MarianTokenizer.from_pretrained(model_checkpoint)

# mBART-50 tokenizer set up for English -> the selected target language.
mbart_language_pair = {"src_lang": "en_XX", "tgt_lang": mbart_language}
mbart_tokenizer = MBart50TokenizerFast.from_pretrained(
  "facebook/mbart-large-50",
  **mbart_language_pair,
)

In [None]:
# Check how many sentences are longer than the length limit

num_tokens = 128

# Display name of the selected target language for the report below; unknown
# codes fall back to the code itself.
language_names = {"vi": "Vietnamese", "indo": "Indonesian", "fil": "Filipino"}
target_name = language_names.get(language, language)
total_sentences = df_en_target.shape[0]

# English side: count the sentences whose mBART encoding exceeds the limit.
tokenized_en_sentences = mbart_tokenizer(list(df_en_target["en"]))["input_ids"]
longer_en = sum(len(s) > num_tokens for s in tokenized_en_sentences)
print(f"Out of {total_sentences} English sentences, {longer_en} ({longer_en/total_sentences * 100}%) have more than {num_tokens} tokens")

# Target side: read the column of the selected `language` rather than a
# hard-coded "vi", which raised KeyError for every other language.
tokenized_target_sentences = mbart_tokenizer(list(df_en_target[language]))["input_ids"]
longer_target = sum(len(s) > num_tokens for s in tokenized_target_sentences)
print(f"Out of {total_sentences} {target_name} sentences, {longer_target} ({longer_target/total_sentences * 100}%) have more than {num_tokens} tokens")

# Former variable names, kept so that code relying on them still works.
tokenized_vi_sentences, longer_vi = tokenized_target_sentences, longer_target

In [None]:
####
# Extend Marian tokenizer by adding tokens from target language according
# to how mBart tokenizes sentences in the dataset
####

import transformers
from tqdm import tqdm

transformers.logging.set_verbosity(transformers.logging.CRITICAL)

# Distinct mBART pieces in first-seen order. The order is kept exactly as
# before because this list is handed to add_tokens as-is; the companion set only
# replaces the linear `in list` scan with a constant-time membership test.
tokensZoo = []
seen_pieces = set()

for sentence in tqdm(list(df_en_target[language])):
  token_ids = mbart_tokenizer(sentence)["input_ids"]
  for token in mbart_tokenizer.convert_ids_to_tokens(token_ids):
    # Drop the leading "▁" marker(s); skip pieces that are empty afterwards.
    piece = token.lstrip("▁")
    if piece and piece not in seen_pieces:
      seen_pieces.add(piece)
      tokensZoo.append(piece)


print(f"{len(tokensZoo)} tokens to be added.")
initial_len = len(marian_tokenizer)
print(f"initial vocab size: {initial_len}")
marian_tokenizer.add_tokens(tokensZoo, special_tokens=True)
# Language tags that the split cell prepends to the English sentences.
marian_tokenizer.add_tokens(["<2zh>", "<2"+language+">"], special_tokens=True)
print(f"final vocab size: {len(marian_tokenizer)}")
# Growth of the tokenizer length caused by the two add_tokens calls above.
added_tokens = len(marian_tokenizer) - initial_len

In [None]:
####
# Tokenize the sentences in the dataset
####

from transformers import AutoTokenizer, MBart50TokenizerFast, MarianTokenizer
import random
from tqdm import tqdm
import transformers


transformers.logging.set_verbosity(transformers.logging.CRITICAL)

# Sequence-length caps used when tokenizing, and the batch size shared by
# Dataset.map and the training arguments.
max_input_length = 64
max_target_length = 64
batch_size = 16

# Wrap each pandas split in a datasets.Dataset.
train_ds, eval_ds, test_ds = (
    Dataset.from_pandas(split_df) for split_df in (train_df, eval_df, test_df)
)

# English is tokenized with a fresh copy of the base tokenizer (the extended
# version has some problems when tokenizing English); only the two language
# tags are added to it.
pure_marian_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
language_tags = ["<2zh>", f"<2{language}>"]
pure_marian_tokenizer.add_tokens(language_tags, special_tokens=True)

def preprocess_function(examples):
    """Tokenize one batch of sentence pairs for the seq2seq model.

    Args:
        examples: mapping with an "en" list of source sentences and a
            `language` list of target sentences (one batch from Dataset.map).

    Returns:
        The source encoding produced by `pure_marian_tokenizer`, with an added
        "labels" entry holding the target token ids in which every padding
        position is replaced by -100.
    """
    sources = list(examples["en"])
    targets = list(examples[language])

    model_inputs = pure_marian_tokenizer(sources, max_length=max_input_length, truncation=True, padding="max_length")
    # Setup the tokenizer for targets
    with marian_tokenizer.as_target_tokenizer():
        labels = marian_tokenizer(targets, max_length=max_target_length, truncation=True, padding="max_length")

    # Sanity check: all padded source rows must share one length. Comparing the
    # set of lengths also works for a batch holding a single example, where
    # indexing row 1 used to raise IndexError.
    if len({len(row) for row in model_inputs["input_ids"]}) > 1:
        print("Error!")

    # Labels are padded to max_length here, so mark the padding with -100;
    # compute_metrics already maps -100 back to the pad id before decoding.
    pad_id = marian_tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in row]
        for row in labels["input_ids"]
    ]

    return model_inputs

columns_to_return = ['input_ids', 'labels', 'attention_mask']


def _tokenize_split(split):
    """Run `preprocess_function` over `split` in batches and expose the model columns as torch tensors."""
    tokenized = split.map(preprocess_function, batched=True, batch_size=batch_size)
    tokenized.set_format(type='torch', columns=columns_to_return)
    return tokenized


train_ds = _tokenize_split(train_ds)
eval_ds = _tokenize_split(eval_ds)
test_ds = _tokenize_split(test_ds)


In [None]:
####
# Define the function to compute the BLEU score during training
####
from datasets import load_metric
import numpy as np
# sacreBLEU metric object; compute_metrics below calls metric.compute on the
# decoded predictions and references.
# NOTE(review): recent `datasets` releases may no longer provide load_metric —
# confirm against the installed version.
metric = load_metric("sacrebleu")

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from predictions and references.

    Each reference is additionally wrapped in a one-element list, so the second
    returned value is a list of lists.

    Returns:
        (cleaned_predictions, wrapped_references)
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels
    
def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for one evaluation pass.

    Args:
        eval_preds: pair (predictions, labels) of token-id arrays. When the
            predictions arrive as a tuple, its first element is used.

    Returns:
        dict with "bleu" (the metric's "score") and "gen_len" (mean number of
        non-padding prediction tokens), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = marian_tokenizer.pad_token_id

    # -100 is not a decodable token id. Replace it with the pad id in the
    # predictions as well as in the labels, so decoding cannot fail on it and
    # it is not counted as a generated token below.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = marian_tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = marian_tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    print("Decoded preds: ", decoded_preds[0:3])
    print("Decoded labels: ", decoded_labels[0:3])

    # Chinese is scored with sacreBLEU's 'zh' tokenization, every other
    # language with the metric's default.
    if language == 'zh':
      result = metric.compute(predictions=decoded_preds, references=decoded_labels, tokenize='zh')
    else:
      result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [None]:
####
# Download the initial translation model
####
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# True: resume from the checkpoint stored on Drive.
# False: start from the base checkpoint and grow its embeddings to the length
# of the extended tokenizer.
USE_PRETRAINED = True

weights_source = "/content/drive/MyDrive/mixed_ckp_final" if USE_PRETRAINED else model_checkpoint
model = AutoModelForSeq2SeqLM.from_pretrained(weights_source)

if not USE_PRETRAINED:
  model.resize_token_embeddings(len(marian_tokenizer))

# Define training parameters and train

In [None]:
###
# Define training arguments
###

# Every option passed to Seq2SeqTrainingArguments, collected in one place.
training_options = dict(
  output_dir="/content/drive/MyDrive/mixed_ckp",
  evaluation_strategy="epoch",
  save_strategy="epoch",
  learning_rate=2e-5,
  per_device_train_batch_size=batch_size,
  per_device_eval_batch_size=batch_size,
  weight_decay=0.01,
  save_total_limit=1,
  num_train_epochs=5,
  predict_with_generate=True,
  fp16=True,  # CUDA purposes
  disable_tqdm=False,
  # resume_from_checkpoint is intentionally left unset ("final_checkpoint" was tried)
)
args = Seq2SeqTrainingArguments(**training_options)

# Collator built from the extended tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(marian_tokenizer, model=model)

# Trainer that fits on train_ds and scores eval_ds with compute_metrics.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=data_collator,
    tokenizer=marian_tokenizer,
    compute_metrics=compute_metrics,
)


In [None]:
import torch
# Run fine-tuning with the trainer configured above; the training arguments set
# both evaluation and checkpoint saving to happen once per epoch.
trainer.train()

In [None]:
# Save the fine-tuned model on Drive.
# NOTE(review): the model-loading cell reads ".../mixed_ckp_final" while this
# writes ".../pure_ckp_final" — confirm the two paths are meant to differ.
final_checkpoint_dir = "/content/drive/MyDrive/pure_ckp_final"
trainer.save_model(final_checkpoint_dir)

In [None]:
# Run the trainer's prediction pass on the held-out test split and print what it returns.
test_output = trainer.predict(test_ds)
print(test_output)