# Get dataset
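Download the WikiMatrix datasets (the documentation can be found at [this link](https://github.com/facebookresearch/LASER/tree/main/tasks/WikiMatrix)). In particular, we will use the English–Kazakh (en-kk) sentence pairs, plus the English–Turkish (en-tr) pairs, which serve as auxiliary data for mixed finetuning.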

In [None]:
# Download dataset en-kk
!wget https://dl.fbaipublicfiles.com/laser/WikiMatrix/v1/WikiMatrix.en-kk.tsv.gz
# Download dataset en-tr
!wget https://dl.fbaipublicfiles.com/laser/WikiMatrix/v1/WikiMatrix.en-tr.tsv.gz

In [None]:
# Mount Google Drive (Colab only); checkpoints are later written under
# /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Finetuning

In [None]:
%%capture
# Install the libraries used by the cells below; %%capture hides pip's output.
# NOTE(review): versions are not pinned — consider pinning them so that the
# notebook keeps running against the APIs it was written for.
!pip install transformers datasets metrics sacrebleu

In [None]:
%%capture
# Install transformers with its sentencepiece extra.
# NOTE(review): presumably required by the Marian / mBart tokenizers loaded
# further down — confirm, and pin the version.
! pip install transformers[sentencepiece]

Start from a MarianMT model pretrained for English-to-Turkish translation (Turkish belongs to the same language family as Kazakh).

In [None]:
# Hub id of the pretrained English->Turkish Marian checkpoint; the Marian
# tokenizers and the initial translation model are loaded from it below.
model_checkpoint = "Helsinki-NLP/opus-tatoeba-en-tr"

In [None]:
###
# Repo containing useful files to manipulate the dataset
###

! git clone https://github.com/facebookresearch/LASER.git

Cloning into 'LASER'...
remote: Enumerating objects: 807, done.[K
remote: Counting objects: 100% (20/20), done.[K
remote: Compressing objects: 100% (20/20), done.[K
remote: Total 807 (delta 10), reused 7 (delta 0), pack-reused 787[K
Receiving objects: 100% (807/807), 2.79 MiB | 9.64 MiB/s, done.
Resolving deltas: 100% (313/313), done.


In [None]:
###
# Get sentences for en-kk and sentences for en-tr
###

# extract.py reads each gzipped TSV and writes the bitext to
# <bitext>.<src-lang> and <bitext>.<trg-lang> (e.g. WikiMatrix.en-kk.txt.en
# and WikiMatrix.en-kk.txt.kk), which are the files loaded into dataframes below.
# NOTE(review): --threshold is presumably the minimum alignment score a pair
# needs in order to be kept — confirm against the WikiMatrix README.
! python3 /content/LASER/tasks/WikiMatrix/extract.py \
  --tsv WikiMatrix.en-kk.tsv.gz \
  --bitext WikiMatrix.en-kk.txt \
  --src-lang en --trg-lang kk \
  --threshold 1.04

# Same extraction for the English-Turkish data.
! python3 /content/LASER/tasks/WikiMatrix/extract.py \
  --tsv WikiMatrix.en-tr.tsv.gz \
  --bitext WikiMatrix.en-tr.txt \
  --src-lang en --trg-lang tr \
  --threshold 1.04

Tool to extract bitext from the WikiMatrix
Processing WikiMatrix.en-kk.tsv.gz
 - wrote 20053 lines
 - with 234204 source and 182143 target words
 - last threshold is 1.0400
Tool to extract bitext from the WikiMatrix
Processing WikiMatrix.en-tr.tsv.gz
 - wrote 477735 lines
 - with 8579379 source and 6939653 target words
 - last threshold is 1.0400


In [None]:
###
# Create dataframe with sentence pairs en-kk
###

import pandas as pd

# The bitext files hold one sentence per line, and line i of the ".en" file is
# the translation of line i of the ".kk" file. Read them as plain text instead
# of going through pd.read_csv with a dummy separator: that route falls back to
# the slow python engine (with a warning), drops blank lines and converts
# strings such as "NA" to NaN, any of which can shift one side of the bitext
# and silently misalign the pairs.
kk_columns = {}
for lang in ("en", "kk"):
  with open(f"/content/WikiMatrix.en-kk.txt.{lang}", encoding="utf-8") as handle:
    kk_columns[lang] = [line.strip() for line in handle]

# A length mismatch means the files are not parallel; fail loudly.
if len(kk_columns["en"]) != len(kk_columns["kk"]):
  raise ValueError(
      f"en-kk bitext is not parallel: {len(kk_columns['en'])} English vs "
      f"{len(kk_columns['kk'])} Kazakh lines"
  )

df_en = pd.DataFrame({"en": kk_columns["en"]})
df_kk = pd.DataFrame({"kk": kk_columns["kk"]})
df_en_kk = df_en.join(df_kk)
df_en_kk.head()

  return func(*args, **kwargs)


Unnamed: 0,en,kk
0,"""Surely your Lord is ever watchful.”",Раббыңның үкіміне сабыр ет.
1,Be respectful to your Lord.,Раббыңды ұлықта!
2,Opsaridium ubangiense.,Ұлы Раббыңның атын дәріпте.
3,It will destroy everything at the bidding of i...,Раббыларыңның қай нығметтерін өтірік дейсіңдер.
4,We proclaim the presence of the risen Lord in ...,Раббымызға ешбіреуді ортақ қоспаймыз».


In [None]:
###
# Create dataframe with sentence pairs en-tr
# (Same number of examples as for kazakh)
###

from itertools import islice

import pandas as pd

# Keep only as many Turkish pairs as there are Kazakh pairs.
n_pairs = df_en_kk.shape[0]

# Read the parallel files as plain text (one sentence per line) instead of
# pd.read_csv with a dummy separator, which drops blank lines and converts
# strings such as "NA" to NaN and can therefore misalign the two sides.
tr_columns = {}
for lang in ("en", "tr"):
  with open(f"/content/WikiMatrix.en-tr.txt.{lang}", encoding="utf-8") as handle:
    tr_columns[lang] = [line.strip() for line in islice(handle, n_pairs)]

if len(tr_columns["en"]) != len(tr_columns["tr"]):
  raise ValueError(
      f"en-tr bitext is not parallel: {len(tr_columns['en'])} English vs "
      f"{len(tr_columns['tr'])} Turkish lines"
  )

df_en = pd.DataFrame({"en": tr_columns["en"]})
# The Turkish sentences are stored under the column name "kk" on purpose: the
# mixed-finetuning branch concatenates the ["en", "kk"] columns of this frame
# with the Kazakh training frame.
df_tr = pd.DataFrame({"kk": tr_columns["tr"]})
df_en_tr = df_en.join(df_tr)
df_en_tr.head()

  return func(*args, **kwargs)


Unnamed: 0,en,kk
0,And now the guidance from their Lord hath come...,"Andolsun ki, kendilerine, Rableri katından yol..."
1,"Is the religion of the Israelites, or that of ...",İsrailoğullarının dinleri mi yoksa Hristiyanla...
2,This is the day the Lord hath made!,"İşte o gün sevk, ancak Rabbinedir."
3,"Among them is Olar (Andre Paras), Mumay (Ryzza...","Don Omar, (El Rey olarak da bilinen)(d."
4,"Read, for your Lord is most beneficent.","Oku ve senin Rabbin, sonsuz kerem sahibidir."


In [None]:
###
# Split dataset into training, evaluation and test
###

from sklearn.model_selection import train_test_split
from datasets import Dataset

## For mixed finetuning, Turkish sentences are also added to the training dataset

TRAINING_TYPE = "pure-finetuning"
#TRAINING_TYPE = "mixed-finetuning"

# Fail early on a typo instead of leaving train_df/eval_df/test_df undefined.
if TRAINING_TYPE not in ("pure-finetuning", "mixed-finetuning"):
  raise ValueError(f"Unknown TRAINING_TYPE: {TRAINING_TYPE!r}")


def _add_language_tag(sentences, tag):
  """Prefix every sentence of a pandas Series with ``tag`` and a space.

  Sentences that already start with the tag are left untouched, so that
  re-running this cell does not stack the tag several times.
  """
  prefix = tag + " "
  already_tagged = sentences.str.startswith(prefix, na=False)
  return sentences.where(already_tagged, prefix + sentences)


## Add special token to target language dataset ("<2zz>" for language zz)
df_en_kk["en"] = _add_language_tag(df_en_kk["en"], "<2kk>")

# 80% training, 10% evaluation, 10% test.
# Evaluation and test sets only contain the target language (Kazakh).
train_df, test_df = train_test_split(df_en_kk, test_size=0.2, random_state=42)
eval_df, test_df = train_test_split(test_df, test_size=0.5, random_state=42)

if TRAINING_TYPE == "mixed-finetuning": # add also Turkish sentences

  ## Add special token to Turkish dataset
  df_en_tr["en"] = _add_language_tag(df_en_tr["en"], "<2tr>")

  # Select subset of Turkish dataset to balance it with target language
  df_en_tr_train, _unused_tr = train_test_split(df_en_tr, test_size=0.2, random_state=21)
  train_df = pd.concat([train_df[["en", "kk"]], df_en_tr_train[["en", "kk"]]])

  print("Kazakh dataset size: ", df_en_kk.shape)
  print("Turkish dataset size: ", df_en_tr.shape)
  print("Kazakh + Turkish dataset size: ", train_df.shape)




In [None]:
####
# Load tokenizers: basic Marian tokenizer + pretrained mBart tokenizer
####

from transformers import AutoTokenizer, MBart50TokenizerFast, MarianTokenizer
import random, transformers

# Only let critical messages from transformers through.
transformers.logging.set_verbosity(transformers.logging.CRITICAL)

# Marian tokenizer belonging to the checkpoint that gets fine-tuned.
marian_tokenizer = MarianTokenizer.from_pretrained(model_checkpoint)

# mBart-50 tokenizer configured for English -> Kazakh.
mbart_tokenizer = MBart50TokenizerFast.from_pretrained(
    "facebook/mbart-large-50",
    src_lang="en_XX",
    tgt_lang="kk_KZ",
)


In [None]:
# Check how many sentences are longer than the length limit
# (lengths are counted in mBart tokens).
# NOTE(review): if the split cell has already run, the English sentences carry
# the "<2kk> " language tag, which is included in these counts.

num_tokens = 64

tokenized_en_sentences = mbart_tokenizer(list(df_en_kk["en"]))["input_ids"]
longer_en = sum(len(ids) > num_tokens for ids in tokenized_en_sentences)
print(f"Out of {df_en_kk.shape[0]} English sentences, {longer_en} ({longer_en/df_en_kk.shape[0] * 100}%) have more than {num_tokens} tokens")

# The target column holds Kazakh text, so it is reported as such.
tokenized_kk_sentences = mbart_tokenizer(list(df_en_kk["kk"]))["input_ids"]
longer_kk = sum(len(ids) > num_tokens for ids in tokenized_kk_sentences)
print(f"Out of {df_en_kk.shape[0]} Kazakh sentences, {longer_kk} ({longer_kk/df_en_kk.shape[0] * 100}%) have more than {num_tokens} tokens")

Out of 20053 English sentences, 286 (1.426220515633571%) have more than 64 tokens
Out of 20053 Vietnamese sentences, 247 (1.231735899865357%) have more than 64 tokens


In [None]:
####
# Extend Marian tokenizer by adding tokens from target language according
# to how mBart tokenizes sentences in the dataset
####

import transformers
from tqdm import tqdm

transformers.logging.set_verbosity(transformers.logging.CRITICAL)

tokensZoo = []       # new tokens, in first-seen order (the order fixes their ids)
seen_tokens = set()  # constant-time duplicate check instead of scanning the list

for sentence in tqdm(list(df_en_kk["kk"])):
  # add_special_tokens=False keeps mBart's own control tokens (language code,
  # end-of-sentence marker) out of the harvested vocabulary.
  token_ids = mbart_tokenizer(sentence, add_special_tokens=False)["input_ids"]
  for t in mbart_tokenizer.convert_ids_to_tokens(token_ids):
    piece = t.lstrip("▁")  # drop the sentencepiece word-boundary marker
    if piece != "" and piece not in seen_tokens:
      seen_tokens.add(piece)
      tokensZoo.append(piece)


print(f"{len(tokensZoo)} tokens to be added.")
print(f"initial vocab size: {len(marian_tokenizer)}")
initial_len = len(marian_tokenizer)
# Register the language tags BEFORE the harvested tokens. The source side is
# tokenized by a separate base Marian tokenizer that adds the tags straight
# after the base vocabulary; adding them first here gives them the same ids,
# so a tag no longer shares an embedding row with a Kazakh token.
# NOTE: this changes the id layout, so checkpoints trained with the previous
# layout are not compatible with a tokenizer built by this cell.
marian_tokenizer.add_tokens(["<2zh>", "<2kk>", "<2tr>"], special_tokens=True)
marian_tokenizer.add_tokens(tokensZoo, special_tokens=True)
print(f"final vocab size: {len(marian_tokenizer)}")
added_tokens = len(marian_tokenizer) - initial_len

100%|██████████| 20053/20053 [00:41<00:00, 487.25it/s]


16850 tokens to be added.
initial vocab size: 59994
final vocab size: 74151


In [None]:
####
# Tokenize the sentences in the dataset
####

from transformers import AutoTokenizer, MBart50TokenizerFast, MarianTokenizer
import random
from tqdm import tqdm
import transformers


transformers.logging.set_verbosity(transformers.logging.CRITICAL)

# create dataset objects
train_ds = Dataset.from_pandas(train_df)
eval_ds = Dataset.from_pandas(eval_df)
test_ds = Dataset.from_pandas(test_df)


# Sentences are truncated / padded to these many tokens.
max_input_length = 64
max_target_length = 64
batch_size = 16

# To tokenize English, use a basic Marian tokenizer (the extended version
# has some problems when tokenizing English)
pure_marian_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Register every language tag that can be prefixed to a source sentence.
# "<2tr>" is used by the mixed-finetuning setup; without registering it the
# tag would be broken into sub-word pieces instead of being a single token.
pure_marian_tokenizer.add_tokens(["<2zh>", "<2kk>", "<2tr>"], special_tokens=True)

def preprocess_function(examples):
    """Tokenize a batch of sentence pairs for seq2seq training.

    Args:
        examples: batch with an "en" list (source sentences) and a "kk" list
            (target sentences), as passed by ``Dataset.map(batched=True)``.

    Returns:
        The source encoding produced by ``pure_marian_tokenizer`` (padded and
        truncated to ``max_input_length``) with an added "labels" entry: the
        target ids from ``marian_tokenizer`` (padded and truncated to
        ``max_target_length``), where padding positions are set to -100.
    """
    inputs = list(examples["en"])
    targets = list(examples["kk"])

    model_inputs = pure_marian_tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")
    # Setup the tokenizer for targets
    with marian_tokenizer.as_target_tokenizer():
        labels = marian_tokenizer(targets, max_length=max_target_length, truncation=True, padding="max_length")

    # Sanity check: every padded input must have the same length. Comparing all
    # rows (instead of rows 0 and 1) also works for a single-example batch.
    if len({len(ids) for ids in model_inputs["input_ids"]}) > 1:
        print ("Error!", )

    # Labels are padded to a fixed length here, so the padding has to be masked
    # with -100 (the index ignored by the loss); otherwise the model is also
    # trained and scored on predicting padding tokens.
    pad_id = marian_tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token if token != pad_id else -100 for token in label_ids]
        for label_ids in labels["input_ids"]
    ]

    return model_inputs

columns_to_return = ['input_ids', 'labels', 'attention_mask']


def _tokenize_split(split_ds):
    """Apply preprocess_function to a split and expose the model columns as torch tensors."""
    tokenized = split_ds.map(preprocess_function, batched=True, batch_size=batch_size)
    tokenized.set_format(type='torch', columns=columns_to_return)
    return tokenized


train_ds = _tokenize_split(train_ds)
eval_ds = _tokenize_split(eval_ds)
test_ds = _tokenize_split(test_ds)


  0%|          | 0/1003 [00:00<?, ?ba/s]

  0%|          | 0/126 [00:00<?, ?ba/s]

  0%|          | 0/126 [00:00<?, ?ba/s]

In [None]:
####
# Define the function to compute the BLEU score during training
####
from datasets import load_metric
import numpy as np
# sacreBLEU metric object; compute_metrics below calls metric.compute on the
# decoded predictions and references.
metric = load_metric("sacrebleu")

def postprocess_text(preds, labels):
    """Trim whitespace from decoded text and shape it for the BLEU metric.

    Returns a tuple ``(preds, labels)`` where each prediction is a stripped
    string and each label is a one-element list holding the stripped
    reference.
    """
    cleaned_preds = []
    for pred in preds:
        cleaned_preds.append(pred.strip())

    wrapped_labels = []
    for label in labels:
        wrapped_labels.append([label.strip()])

    return cleaned_preds, wrapped_labels
    
def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for an evaluation pass.

    Args:
        eval_preds: pair ``(preds, labels)`` of token-id arrays; ``preds`` may
            be a tuple, in which case its first element is used.

    Returns:
        Dict with "bleu" (sacreBLEU score) and "gen_len" (mean number of
        non-padding tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    preds = preds[0] if isinstance(preds, tuple) else preds
    pad_id = marian_tokenizer.pad_token_id

    decoded_preds = marian_tokenizer.batch_decode(preds, skip_special_tokens=True)

    # -100 marks ignored label positions and cannot be decoded: map it to padding.
    labels = np.where(labels != -100, labels, pad_id)
    decoded_labels = marian_tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Strip whitespace and wrap the references as the metric expects.
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    print("Decoded preds: ", decoded_preds[0:3])
    print("Decoded labels: ", decoded_labels[0:3])

    bleu = metric.compute(predictions=decoded_preds, references=decoded_labels)
    generated_lengths = [np.count_nonzero(pred != pad_id) for pred in preds]

    scores = {"bleu": bleu["score"], "gen_len": np.mean(generated_lengths)}
    return {name: round(value, 4) for name, value in scores.items()}

Downloading:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

In [None]:
####
# Download the initial translation model
####
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# False: start from the public checkpoint; True: reload a model saved on Drive.
USE_PRETRAINED = False

if not USE_PRETRAINED:
  model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
  # Grow the embedding matrix to cover the tokens added to marian_tokenizer.
  model.resize_token_embeddings(len(marian_tokenizer))
else:
  model = AutoModelForSeq2SeqLM.from_pretrained("/content/drive/MyDrive/mixed_ckp_final")

Downloading:   0%|          | 0.00/990 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/202M [00:00<?, ?B/s]

# Define training parameters and train

In [None]:
# define training arguments
# The model to train is the `model` object created in the model-loading cell.
# The USE_PRETRAINED flag is deliberately not re-assigned here: resetting it
# to False made the reload branch that followed unreachable and silently
# overrode the choice made when the model was loaded.
import torch

args = Seq2SeqTrainingArguments(
  "/content/drive/MyDrive/pure_en_kk",  # output directory for checkpoints
  evaluation_strategy = "epoch",
  save_strategy="epoch",
  learning_rate=2e-4,
  per_device_train_batch_size=batch_size,
  per_device_eval_batch_size=batch_size,
  weight_decay=0.01,
  save_total_limit=1,  # keep only the most recent checkpoint
  num_train_epochs=10,
  predict_with_generate=True,  # evaluate on generated text so BLEU can be computed
  # Mixed precision needs a CUDA device; fall back to full precision elsewhere
  # instead of failing when the arguments are created.
  fp16=torch.cuda.is_available(),
  disable_tqdm=False,
)

data_collator = DataCollatorForSeq2Seq(marian_tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=data_collator,
    tokenizer=marian_tokenizer,
    compute_metrics=compute_metrics
)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using amp half precision backend


In [None]:
import torch
#torch.cuda.empty_cache()
#trainer.train(resume_from_checkpoint = "final_checkpoint")
# Run the fine-tuning; with the arguments defined for the trainer, evaluation
# and checkpointing happen once per epoch.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `MarianMTModel.forward` and have been ignored: en, __index_level_0__, kk.
***** Running training *****
  Num examples = 16042
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 10030


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,1.6009,1.506271,4.0241,15.9332


The following columns in the evaluation set  don't have a corresponding argument in `MarianMTModel.forward` and have been ignored: en, __index_level_0__, kk.
***** Running Evaluation *****
  Num examples = 2005
  Batch size = 16


Decoded preds:  ['1952 жылы « Қазақ »( ағы л.)', 'Қала ның тұрақты тұрғындары ның саны 26 адамды құрайды (2008).', 'Қала ның тұрақты тұрғындары ның саны 26 адамды құрайды (2008).']
Decoded labels:  [['А., 1932 ; Тү йе тұқым ын асы лда нды ру туралы.'], ['Да ун синдром ымен балалар ды көбі не жасы келген әйелдер боса над ы.'], ['Бұл адамды не ын тал анд ыр ады?']]


Saving model checkpoint to /content/drive/MyDrive/pure_en_kk/checkpoint-1003
Configuration saved in /content/drive/MyDrive/pure_en_kk/checkpoint-1003/config.json
Model weights saved in /content/drive/MyDrive/pure_en_kk/checkpoint-1003/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/pure_en_kk/checkpoint-1003/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/pure_en_kk/checkpoint-1003/special_tokens_map.json
added tokens file saved in /content/drive/MyDrive/pure_en_kk/checkpoint-1003/added_tokens.json


In [None]:
# Save the final model to Google Drive.
# NOTE(review): the destination is hard-coded to "pure_ckp_final" whatever the
# value of TRAINING_TYPE — confirm the path before a mixed-finetuning run.
trainer.save_model("/content/drive/MyDrive/pure_ckp_final")

Saving model checkpoint to /content/drive/MyDrive/pure_ckp_final
Configuration saved in /content/drive/MyDrive/pure_ckp_final/config.json
Model weights saved in /content/drive/MyDrive/pure_ckp_final/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/pure_ckp_final/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/pure_ckp_final/special_tokens_map.json
added tokens file saved in /content/drive/MyDrive/pure_ckp_final/added_tokens.json


In [None]:
# Evaluate on the held-out test split. Only the metrics are displayed:
# printing the whole PredictionOutput also dumps the raw prediction and label
# id arrays, which floods the notebook without being readable.
test_output = trainer.predict(test_ds)
test_output.metrics

The following columns in the test set  don't have a corresponding argument in `MarianMTModel.forward` and have been ignored: en, kk, __index_level_0__.
***** Running Prediction *****
  Num examples = 2006
  Batch size = 16


Decoded preds:  ['Өз ен са ға сы Кі ші өз ені нің оң жа ға лау ынан 53 км қа шықты қ та орналасқан.', 'Қала ның тұрақты тұрғындары ның саны 281 адамды құрайды (31 желтоқсан 2010 жыл ).', 'ISO / IEC JSC1/SC34( ISA / IO C Common Technic alCommittee1Subcomitee34– Belg eseltanımlamaveişlemedilleri) tara fındangeliştiri lm iştir.']
Decoded labels:  [['Өз ен са ға сы Север ка өз ені нің оң жа ға лау ынан 53 км қа шықты қ та орналасқан.'], ['Сол жылы ол қазіргі Демократ иялық сенатор Э ван Бай қарсы шығу ға шақыр ылды, бірақ Бай х күт пе ген жерден зе йн ет кер лік ке шыққан ын мәлім деген нен кейін де жарыс тан шық па уға шешім қабылда ды.butoptednottoentertherace,evenafterBayh unexpected ly announced thathe'], ['TeX ISO / IEC JTC 1/ WG 4 Information Technology — Document Description and Process ing Language s( ағы л.)']]
PredictionOutput(predictions=array([[59993, 60696, 60256, ..., 59993, 59993, 59993],
       [59993, 60237, 59997, ..., 59993, 59993, 59993],
       [59993, 65530,   799, ..