# Translation English-Kazakh
This notebook fine-tunes a translation model for English→Kazakh on the WikiMatrix dataset (Extension II). The starting model is an English→Turkish model.

## Get WikiMatrix dataset
Download the WikiMatrix dataset (the documentation can be found at [this link](https://github.com/facebookresearch/LASER/tree/main/tasks/WikiMatrix)). We use the English–Kazakh pairs as the target data, and the English–Turkish pairs for the optional mixed finetuning.

In [None]:
# Download the WikiMatrix English-Kazakh archive (gzipped TSV) into the
# current working directory
!wget https://dl.fbaipublicfiles.com/laser/WikiMatrix/v1/WikiMatrix.en-kk.tsv.gz
# Download the WikiMatrix English-Turkish archive (gzipped TSV); its sentences
# are added to the training set only for mixed finetuning
!wget https://dl.fbaipublicfiles.com/laser/WikiMatrix/v1/WikiMatrix.en-tr.tsv.gz

## Preparation

In [None]:
## Install useful libraries
%%capture
!pip install transformers datasets metrics sacrebleu transformers[sentencepiece]

Select model paths and training options

In [None]:
## Model paths
model_checkpoint = "Helsinki-NLP/opus-tatoeba-en-tr" # path of base model (English-Turkish); also used to load the Marian tokenizers
pretrained_model_checkpoint = "CLAck/en-kk" # path of pretrained model, loaded only when USE_PRETRAINED_MODEL is True

## Select the desired type of finetuning:
##   "pure-finetuning"  -> the training set contains English-Kazakh pairs only
##   "mixed-finetuning" -> English-Turkish pairs are added to the training set
TRAINING_TYPE = "pure-finetuning"
#TRAINING_TYPE = "mixed-finetuning"

## Set to True to start with the model in 'pretrained_model_checkpoint',
## False for the model in 'model_checkpoint' (whose embeddings are then
## resized to the extended tokenizer)
USE_PRETRAINED_MODEL = False

## Folder for checkpoints during training (passed to the training arguments
## as output directory)
checkpoint_dir = "ckp_dir"

In [None]:
###
# Clone the LASER repository: its tasks/WikiMatrix/extract.py script is used
# in the next cell to extract the sentence pairs from the downloaded archives
###

! git clone https://github.com/facebookresearch/LASER.git

In [None]:
###
# Get sentences for en-kk and sentences for en-tr.
# For each language pair, extract.py reads the gzipped TSV given by --tsv and
# writes the bitext as files named <bitext>.<lang> (e.g.
# WikiMatrix.en-kk.txt.en and WikiMatrix.en-kk.txt.kk), which are the files
# read by the following cells. --threshold is the score cut-off handed to the
# script.
# NOTE(review): the script path assumes the repository was cloned into
# /content (presumably the Colab working directory) -- adjust when running
# elsewhere.
###

! python3 /content/LASER/tasks/WikiMatrix/extract.py \
  --tsv WikiMatrix.en-kk.tsv.gz \
  --bitext WikiMatrix.en-kk.txt \
  --src-lang en --trg-lang kk \
  --threshold 1.04

! python3 /content/LASER/tasks/WikiMatrix/extract.py \
  --tsv WikiMatrix.en-tr.tsv.gz \
  --bitext WikiMatrix.en-tr.txt \
  --src-lang en --trg-lang tr \
  --threshold 1.04

In [None]:
###
# Build the English-Kazakh dataframe: one row per sentence pair, with the
# English sentence in column "en" and the Kazakh sentence in column "kk"
###

import pandas as pd

# Options shared by both reads. The separator is presumably a string that never
# occurs in the text, so that each line ends up in one single column -- TODO confirm
kazakh_read_options = dict(sep="sf,jgb", header=None)

df_en = pd.read_csv("/content/WikiMatrix.en-kk.txt.en", names=["en"], **kazakh_read_options)
df_kk = pd.read_csv("/content/WikiMatrix.en-kk.txt.kk", names=["kk"], **kazakh_read_options)

# Joining on the row index pairs row i of the English file with row i of the
# Kazakh file
df_en_kk = df_en.join(df_kk)
df_en_kk.head()

In [None]:
###
# Build the English-Turkish dataframe, reading at most as many rows as there
# are English-Kazakh pairs
###

import pandas as pd

# Options shared by both reads (same single-column trick as for en-kk);
# nrows caps the number of rows at the size of the Kazakh dataframe
turkish_read_options = dict(sep="sf,jgb", header=None, nrows=df_en_kk.shape[0])

df_en = pd.read_csv("/content/WikiMatrix.en-tr.txt.en", names=["en"], **turkish_read_options)
# The Turkish column is named "kk" on purpose: the split cell concatenates the
# ["en", "kk"] columns of this dataframe with the Kazakh training data
df_tr = pd.read_csv("/content/WikiMatrix.en-tr.txt.tr", names=["kk"], **turkish_read_options)

df_en_tr = df_en.join(df_tr)
df_en_tr.head()

In [None]:
###
# Split dataset into training, evaluation and test
###

from sklearn.model_selection import train_test_split
from datasets import Dataset

## TRAINING_TYPE comes from the configuration cell at the top of the notebook.
## It is deliberately NOT re-assigned here: doing so would silently override
## the configured value.
## For mixed finetuning, Turkish sentences are also added to the training set.

def add_target_language_tag(df, lang):
  """Prefix the English sentences of `df` with the tag "<2{lang}> ", in place.

  Rows that already start with the tag are left untouched, so re-running the
  cell does not stack several tags on the same sentence. Missing values stay
  missing.

  Args:
    df: dataframe with an "en" column of sentences (modified in place).
    lang: language code placed inside the tag, e.g. "kk" gives "<2kk> ".
  """
  tag = f"<2{lang}> "
  untagged = ~df["en"].str.startswith(tag, na=False)
  df.loc[untagged, "en"] = tag + df.loc[untagged, "en"]


if TRAINING_TYPE not in ("pure-finetuning", "mixed-finetuning"):
  # Fail here rather than with a NameError on train_df in a later cell
  raise ValueError(f"Unknown TRAINING_TYPE: {TRAINING_TYPE!r}")

## Add special token to target language dataset ("<2zz>" for language zz).
## df_en_kk is tagged in place because later cells read it again.
add_target_language_tag(df_en_kk, "kk")

# 80% training, 10% evaluation, 10% test. Evaluation and test sets only ever
# contain the target language.
train_df, test_df = train_test_split(df_en_kk, test_size=0.2, random_state=42)
eval_df, test_df = train_test_split(test_df, test_size=0.5, random_state=42)

if TRAINING_TYPE == "mixed-finetuning": # add also Turkish sentences

  ## Add special token to Turkish dataset
  add_target_language_tag(df_en_tr, "tr")

  # Select subset of Turkish dataset to balance it with target language
  df_en_tr_train, _ = train_test_split(df_en_tr, test_size=0.2, random_state=21)
  train_df = pd.concat([train_df[["en", "kk"]], df_en_tr_train[["en", "kk"]]])


## Tokenization
Load the tokenizers, extend the Marian tokenizer with the tokens that the mBART tokenizer produces for the Kazakh sentences, and tokenize the dataset.

In [None]:
####
# Load tokenizers: basic Marian tokenizer + pretrained mBart tokenizer
####

from transformers import AutoTokenizer, MBart50TokenizerFast, MarianTokenizer
import random, transformers

# Only let CRITICAL messages from transformers through
transformers.logging.set_verbosity(transformers.logging.CRITICAL)

# mBART-50 tokenizer configured with English as source and Kazakh as target
# language; later cells use it to tokenize the sentences of the dataset
mbart_tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="kk_KZ")
# Marian tokenizer of the base model; a later cell extends it with new tokens
marian_tokenizer = MarianTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Report, for each side of the corpus, how many sentences exceed the token
# limit when tokenized with the mBART tokenizer

num_tokens = 64
num_sentences = df_en_kk.shape[0]

# English side
tokenized_en_sentences = mbart_tokenizer(df_en_kk["en"].tolist())["input_ids"]
longer_en = sum(1 for token_ids in tokenized_en_sentences if len(token_ids) > num_tokens)
print(f"Out of {num_sentences} English sentences, {longer_en} ({longer_en/num_sentences * 100}%) have more than {num_tokens} tokens")

# Kazakh side (despite the "vi" in their names, these variables hold the
# values computed from the "kk" column)
tokenized_vi_sentences = mbart_tokenizer(df_en_kk["kk"].tolist())["input_ids"]
longer_vi = sum(1 for token_ids in tokenized_vi_sentences if len(token_ids) > num_tokens)
print(f"Out of {num_sentences} Kazakh sentences, {longer_vi} ({longer_vi/num_sentences * 100}%) have more than {num_tokens} tokens")

In [None]:
####
# Extend Marian tokenizer by adding tokens from target language according
# to how mBart tokenizes sentences in the dataset
####

import transformers
from tqdm import tqdm

transformers.logging.set_verbosity(transformers.logging.CRITICAL)

# Distinct mBART tokens in order of first appearance, with the leading "▁"
# word-boundary markers stripped. A dict serves as an insertion-ordered set,
# so each membership test is O(1) instead of a scan over a growing list; the
# resulting tokens and their order are the same as with the list-based version.
# NOTE(review): mbart_tokenizer(sentence) is called with its default settings,
# so any special tokens it inserts are presumably collected as well -- confirm
# this is intended.
seen_tokens = {}

for sentence in tqdm(list(df_en_kk["kk"])):
  tokenized_sentence = mbart_tokenizer(sentence)
  for t in mbart_tokenizer.convert_ids_to_tokens(tokenized_sentence["input_ids"]):
    stripped = t.lstrip("▁")
    if stripped != "":
      seen_tokens.setdefault(stripped, None)

tokensZoo = list(seen_tokens)

print(f"{len(tokensZoo)} tokens to be added.")
print(f"initial vocab size: {len(marian_tokenizer)}")
initial_len = len(marian_tokenizer)
marian_tokenizer.add_tokens(tokensZoo, special_tokens=True)
# Language tags are registered after the mBART-derived tokens
marian_tokenizer.add_tokens(["<2zh>", "<2kk>"], special_tokens=True)
print(f"final vocab size: {len(marian_tokenizer)}")
# Number of entries actually added to the tokenizer by the two calls above
added_tokens = len(marian_tokenizer) - initial_len

In [None]:
####
# Prepare everything needed to tokenize the dataset: length limits, the
# tokenizer for the English side and the Dataset objects
####

from transformers import AutoTokenizer, MBart50TokenizerFast, MarianTokenizer
import random
from tqdm import tqdm
import transformers


transformers.logging.set_verbosity(transformers.logging.CRITICAL)

# Maximum lengths (in tokens) for inputs and targets, and the batch size that
# is used both for the tokenization and for training
max_input_length = 64
max_target_length = 64
batch_size = 16

# To tokenize English, use a basic Marian tokenizer (the extended version
# has some problems when tokenizing English).
# NOTE(review): the mixed-finetuning data is tagged with "<2tr>", which is not
# among the tags registered here, and the tags are added directly after the
# base vocabulary whereas marian_tokenizer adds them after the mBART-derived
# tokens -- the tag ids of the two tokenizers presumably differ; confirm this
# is intended before changing it (the vocabulary layout affects checkpoints).
language_tags = ["<2zh>", "<2kk>"]
pure_marian_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
pure_marian_tokenizer.add_tokens(language_tags, special_tokens=True)

# Wrap the three pandas splits into Dataset objects
train_ds, eval_ds, test_ds = (
    Dataset.from_pandas(split_df) for split_df in (train_df, eval_df, test_df)
)

def preprocess_function(examples):
    """Tokenize one batch of sentence pairs for sequence-to-sequence training.

    Args:
        examples: batch given as a dict of lists, with the source sentences
            under "en" and the target sentences under "kk".

    Returns:
        The output of ``pure_marian_tokenizer`` on the source sentences
        (padded/truncated to ``max_input_length``) with an additional
        ``"labels"`` entry: the target token ids produced by
        ``marian_tokenizer`` (padded/truncated to ``max_target_length``),
        where every padding position is set to -100.
    """
    inputs = list(examples["en"])
    targets = list(examples["kk"])

    model_inputs = pure_marian_tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")
    # Setup the tokenizer for targets
    with marian_tokenizer.as_target_tokenizer():
        labels = marian_tokenizer(targets, max_length=max_target_length, truncation=True, padding="max_length")

    # Sanity check: all inputs of the batch must have the same length. Written
    # over the whole batch so that it also works for a batch holding a single
    # example (indexing element [1] would raise IndexError there).
    if len({len(token_ids) for token_ids in model_inputs["input_ids"]}) > 1:
        print("Error!")

    # The labels are padded to a fixed length, so mark the padding with -100:
    # positions holding -100 are ignored by the loss, and compute_metrics maps
    # -100 back to the pad token before decoding.
    pad_id = marian_tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]

    return model_inputs

# Columns exposed as torch tensors once a split has been tokenized
columns_to_return = ['input_ids', 'labels', 'attention_mask']

def _tokenize_split(split_ds):
    """Apply preprocess_function to a split in batches and set its torch format."""
    tokenized_ds = split_ds.map(preprocess_function, batched=True, batch_size=batch_size)
    tokenized_ds.set_format(type='torch', columns=columns_to_return)
    return tokenized_ds

train_ds = _tokenize_split(train_ds)
eval_ds = _tokenize_split(eval_ds)
test_ds = _tokenize_split(test_ds)


## Prepare training
Define evaluation metric, download initial model and set training arguments

In [None]:
####
# Define the function to compute the BLEU score during training
####
from datasets import load_metric
import numpy as np
# sacreBLEU metric object; compute_metrics below calls metric.compute on the
# decoded predictions and references
metric = load_metric("sacrebleu")

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Args:
        preds: iterable of decoded prediction strings.
        labels: iterable of decoded reference strings.

    Returns:
        A pair ``(preds, labels)``: the stripped predictions as a flat list,
        and the stripped references each wrapped in a one-element list.
    """
    cleaned_preds = []
    for prediction in preds:
        cleaned_preds.append(prediction.strip())

    wrapped_labels = []
    for reference in labels:
        wrapped_labels.append([reference.strip()])

    return cleaned_preds, wrapped_labels
    
def compute_metrics(eval_preds):
    """Compute the BLEU score and the mean generated length for an evaluation.

    Args:
        eval_preds: pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            itself be a tuple, in which case its first element is used.

    Returns:
        dict with ``"bleu"`` (the ``"score"`` returned by the metric) and
        ``"gen_len"`` (mean number of non-padding tokens per prediction),
        both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = marian_tokenizer.pad_token_id

    # Replace -100 by the pad token as we can't decode it. This is done for
    # the predictions as well as for the labels: predictions may also carry
    # -100 as filler, and such entries must not be counted as generated tokens.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = marian_tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = marian_tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    print("Decoded preds: ", decoded_preds[0:3])
    print("Decoded labels: ", decoded_labels[0:3])

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Length of a prediction = number of its non-padding token ids
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

Downloading:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

In [None]:
####
# Load the translation model that training starts from
####
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Either the pretrained checkpoint or the base checkpoint, depending on the
# USE_PRETRAINED_MODEL switch
initial_checkpoint = pretrained_model_checkpoint if USE_PRETRAINED_MODEL else model_checkpoint
model = AutoModelForSeq2SeqLM.from_pretrained(initial_checkpoint)

if not USE_PRETRAINED_MODEL:
  # Only the base model is resized, to the size of the extended Marian tokenizer
  model.resize_token_embeddings(len(marian_tokenizer))

In [None]:
# Training configuration, data collator and trainer

# Keyword options for the training arguments, kept together in one place
training_options = dict(
  evaluation_strategy="epoch",   # evaluate once per epoch
  save_strategy="epoch",         # save a checkpoint once per epoch
  learning_rate=2e-4,
  per_device_train_batch_size=batch_size,
  per_device_eval_batch_size=batch_size,
  weight_decay=0.01,
  save_total_limit=1,
  num_train_epochs=10,
  predict_with_generate=True,
  fp16=True, #CUDA purposes,
  disable_tqdm=False,
)

# checkpoint_dir is the output directory of the run
args = Seq2SeqTrainingArguments(checkpoint_dir, **training_options)

data_collator = DataCollatorForSeq2Seq(marian_tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=data_collator,
    tokenizer=marian_tokenizer,
    compute_metrics=compute_metrics,
)


## Train and evaluate

In [None]:
# Fine-tune the model with the trainer configured above; being the last
# expression of the cell, the value returned by train() is displayed
trainer.train()

In [None]:
# Run the trained model on the test split and print what predict() returns
test_output = trainer.predict(test_ds)
print(test_output)