We want to build the model off of HuggingFace libraries instead of fairseq scripts. It is easier to debug and understand what you're doing when building the model.
We are starting from absolute scratch. We need to train a sentencepiece tokenizer on joint corpus. That's kinda done in the previous notebook.

However I never got a hf model to train with the tokenizer. Let's try that.

In [2]:
from pathlib import Path

import pandas as pd

### Load Pretrained Model from hub

In [3]:
TOKENIZER_BATCH_SIZE = 256  # Batch-size to train the tokenizer on
TOKENIZER_VOCABULARY = 25000  # Total number of unique subwords the tokenizer can have

BLOCK_SIZE = 128  # Maximum number of tokens in an input sample
NSP_PROB = 0.50  # Probability that the next sentence is the actual next sentence in NSP
SHORT_SEQ_PROB = 0.1  # Probability of generating shorter sequences to minimize the mismatch between pretraining and fine-tuning.
MAX_LENGTH = 512  # Maximum number of tokens in an input sample after padding

MLM_PROB = 0.2  # Probability with which tokens are masked in MLM

TRAIN_BATCH_SIZE = 2  # Batch-size for pretraining the model on
MAX_EPOCHS = 1  # Maximum number of epochs to train the model for
LEARNING_RATE = 1e-4  # Learning rate for training the model

MODEL_CHECKPOINT = "mbart-large-50"  # Name of pretrained model from 🤗 Model Hub

In [4]:
paths = [str(x) for x in Path("../experiments/data/").glob("**/*.jsonl")]
paths_df = [pd.read_json(x,lines=True) for x in paths]

In [5]:
all_str = ''
for i in paths_df:
    add_str = ''.join(i['input'].values + i['target'].values)
    all_str += add_str

In [9]:
sentences = all_str.split(". ")

with open("all_sentences.txt", "w") as file:
    for sentence in sentences:
        file.write(sentence + ".\n")

In [10]:
!spm_train --input="/mnt/disk/yrajcoomar/kreol-benchmark/pipelines/all_sentences.txt" --model_prefix=kreol --vocab_size=20000 --model_type=bpe

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: /mnt/disk/yrajcoomar/kreol-benchmark/pipelines/all_sentences.txt
  input_format: 
  model_prefix: kreol
  model_type: BPE
  vocab_size: 20000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface

bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=23412 min_freq=7
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=5562 size=20 all=2500 active=1831 piece=ar
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=3045 size=40 all=3302 active=2633 piece=et
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=2380 size=60 all=4262 active=3593 piece=il
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=1600 size=80 all=5317 active=4648 piece=ion
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=1226 size=100 all=6083 active=5414 piece=▁lor
bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=1226 min_freq=98
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=957 size=120 all=7048 active=1957 piece=ent
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=804 size=140 all=7832 active=2741 piece=▁se
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=638 size=160 all=8744 active=3653 piece=▁tu
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=582 size=180 all=9364 active=4273 piece=▁R


In [15]:
def batch_iterator():
    for i in range(0, len(all_str), TOKENIZER_BATCH_SIZE):
        yield all_str[i : i + TOKENIZER_BATCH_SIZE]

In [16]:
from tokenizers.implementations import SentencePieceBPETokenizer

In [17]:
tokenizer = SentencePieceBPETokenizer()

In [18]:
tokenizer.train_from_iterator(batch_iterator(), vocab_size=TOKENIZER_BATCH_SIZE, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])







In [21]:
!mkdir tok
tokenizer.save_model("./tok/")
#.save_model("tokenizer")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


1392.18s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


['./tok/vocab.json', './tok/merges.txt']

In [5]:
from tokenizers.implementations import SentencePieceBPETokenizer
from tokenizers.processors import BertProcessing


tokenizer = SentencePieceBPETokenizer(
    "tokenizer/vocab.json",
    "tokenizer/merges.txt",
)

In [6]:
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

### Training

In [2]:
import torch
torch.cuda.is_available() ## Current issue with cuda, will seek to resolve.

False

In [4]:
from transformers import MBartConfig

config = MBartConfig(vocab_size=TOKENIZER_VOCABULARY,max_position_embeddings=512)

In [1]:
from transformers import MBart50Tokenizer

tokenizer = MBart50Tokenizer.from_pretrained("./tok/",max_len=256)

  from .autonotebook import tqdm as notebook_tqdm


{'vocab_file': 'sentencepiece.bpe.model', 'added_tokens_file': 'added_tokens.json', 'special_tokens_map_file': 'special_tokens_map.json', 'tokenizer_config_file': 'tokenizer_config.json', 'tokenizer_file': 'tokenizer.json'}


In [2]:
tokenizer

MBart50Tokenizer(name_or_path='./tok/', vocab_size=20054, model_max_length=256, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>', 'additional_special_tokens': ['ar_AR', 'cs_CZ', 'de_DE', 'en_XX', 'es_XX', 'et_EE', 'fi_FI', 'fr_XX', 'gu_IN', 'hi_IN', 'it_IT', 'ja_XX', 'kk_KZ', 'ko_KR', 'lt_LT', 'lv_LV', 'my_MM', 'ne_NP', 'nl_XX', 'ro_RO', 'ru_RU', 'si_LK', 'tr_TR', 'vi_VN', 'zh_CN', 'af_ZA', 'az_AZ', 'bn_IN', 'fa_IR', 'he_IL', 'hr_HR', 'id_ID', 'ka_GE', 'km_KH', 'mk_MK', 'ml_IN', 'mn_MN', 'mr_IN', 'pl_PL', 'ps_AF', 'pt_XX', 'sv_SE', 'sw_KE', 'ta_IN', 'te_IN', 'th_TH', 'tl_XX', 'uk_UA', 'ur_PK', 'xh_ZA', 'gl_ES', 'sl_SI']}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False

In [22]:
''.join(paths_df[0]['input'].values + paths_df[0]['target'].values)

'I did not come to do away with them, but to give them their full meaning.Mo pa finn vini pou aboli me pou donn zot zot vre sinifikasion.The fact is, at the time, you had to pay the teacher in order to go to school.Anverite sa lepok la pou al lekol ti ena enn fiz pou pey profeser.Angina can be described as a discomfort, heaviness, pressure, aching, burning.Nou capav dekrir anzinn couma enn sensasion inkonfortab, lourder, presion.The boy said he would, but he didn\'t go.Garson-la reponn wi papa, li pou ale me li pa ale.Was it God in heaven or merely some human being?Eski sa ti sorti depi dan lesiel ouswa dimoun ki finn invant sa?After finding a very valuable one, the owner goes and sells everything in order to buy that pearl.Ler li trouv enn ki ena enn bel valer, li al vann tou seki li ena pou al aste sa perl la.They paid it for a potter\'s field, as the Lord had commanded me.Zot finn servi sa kas la pou aste later potie kouma Lesegner finn donn lord.So when you go to a town or a villag

In [2]:
from transformers import MBartTokenizer

tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-50")

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
tokenizer_config.json: 100%|██████████| 531/531 [00:00<00:00, 3.13MB/s]
sentencepiece.bpe.model: 100%|██████████| 5.07M/5.07M [00:00<00:00, 152MB/s]
special_tokens_map.json: 100%|██████████| 649/649 [00:00<00:00, 4.81MB/s]
config.json: 100%|██████████| 1.42k/1.42k [00:00<00:00, 10.4MB/s]
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBart50Tokenizer'. 
The class this function is called from is 'MBartTokenizer'.


### Load Dataset

In [13]:
import pandas as pd
import os

def read_concat_jsonl(files,home_dir):
    dfs = [pd.read_json(os.path.join(home_dir,file), lines=True) for file in files]
    return pd.concat(dfs)

def concatenate_language_pairs(language_pairs_paths,home_dir):
    concatenated_dfs = {}
    for key, paths in language_pairs_paths.items():
        dfs = [read_concat_jsonl(paths,home_dir)]
        concatenated_dfs[key] = pd.concat(dfs)
    return concatenated_dfs

In [14]:
language_pairs_paths ={
    'en_cr': ['kreol-benchmark\experiments\data\en-cr\en-cr_dev.jsonl','kreol-benchmark\experiments\data\en-cr\en-cr_train.jsonl','kreol-benchmark\experiments\data\en-cr\en-cr_test.jsonl'],
    'cr':['kreol-benchmark\experiments\data\cr\cr_dev.jsonl','kreol-benchmark\experiments\data\cr\cr_train.jsonl','kreol-benchmark\experiments\data\cr\cr_test.jsonl']
}

In [15]:
data_all_dict = concatenate_language_pairs(language_pairs_paths,home_dir=r'C:\Users\yush\OneDrive\Desktop\papers')

In [16]:
data_all_dict

{'en_cr':                                                  input  \
 0    I did not come to do away with them, but to gi...   
 1    The fact is, at the time, you had to pay the t...   
 2    Angina can be described as a discomfort, heavi...   
 3             The boy said he would, but he didn't go.   
 4     Was it God in heaven or merely some human being?   
 ..                                                 ...   
 995  Any kingdom where people fight each other will...   
 996  And I am not good enough even to stoop down an...   
 997  Who among you, if your son asks for bread, you...   
 998  If that person listens, you have won back a fo...   
 999  Then he pointed to his disciples and said, the...   
 
                                                 target  
 0    Mo pa finn vini pou aboli me pou donn zot zot ...  
 1    Anverite sa lepok la pou al lekol ti ena enn f...  
 2    Nou capav dekrir anzinn couma enn sensasion in...  
 3    Garson-la reponn wi papa, li pou ale me li 