We want to build the model with the Hugging Face libraries instead of the fairseq scripts, because that makes it easier to debug and to understand what each step of building the model does.
We are starting from absolute scratch, so we first need a SentencePiece tokenizer trained on the joint corpus. That was mostly done in the previous notebook.

However, I never got a Hugging Face model to train with that tokenizer. Let's try that here.

In [1]:
from pathlib import Path
from tokenizers.implementations import SentencePieceBPETokenizer
import pandas as pd

### Load Pretrained Model from hub

In [2]:
# --- Tokenizer ---
# Batch-size to train the tokenizer on
TOKENIZER_BATCH_SIZE = 256
# Total number of unique subwords the tokenizer can have
TOKENIZER_VOCABULARY = 25_000

# --- Sequence construction ---
# Maximum number of tokens in an input sample
BLOCK_SIZE = 128
# Probability that the next sentence is the actual next sentence in NSP
NSP_PROB = 0.5
# Probability of generating shorter sequences to minimize the mismatch
# between pretraining and fine-tuning
SHORT_SEQ_PROB = 0.1
# Maximum number of tokens in an input sample after padding
MAX_LENGTH = 512

# --- Masking ---
# Probability with which tokens are masked in MLM
MLM_PROB = 0.2

# --- Training ---
# NOTE(review): the training-arguments cell further down hard-codes its own
# epoch count and batch sizes instead of reading these — confirm which wins.
# Batch-size for pretraining the model on
TRAIN_BATCH_SIZE = 2
# Maximum number of epochs to train the model for
MAX_EPOCHS = 1
# Learning rate for training the model
LEARNING_RATE = 1e-4

# Name of pretrained model from 🤗 Model Hub
MODEL_CHECKPOINT = "mbart-large-50"

In [3]:
# Concatenate all the data I have into one large text file to train the tokenizer on

# paths = [str(x) for x in Path("../experiments/data/").glob("**/*.jsonl")]
# paths_df = [pd.read_json(x,lines=True) for x in paths]

# all_str = ''
# for i in paths_df:
#     add_str = ''.join(i['input'].values + i['target'].values)
#     all_str += add_str

# sentences = all_str.split(". ")

# with open("all_sentences.txt", "w") as file:
#     for sentence in sentences:
#         file.write(sentence + ".\n")

# !spm_train --input="/mnt/disk/yrajcoomar/kreol-benchmark/pipelines/all_sentences.txt" --model_prefix=kreol --vocab_size=20000 --model_type=bpe

In [4]:
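## Train tokenizer (currently disabled)
# NOTE(review): the disabled call below passes vocab_size=TOKENIZER_BATCH_SIZE (256); TOKENIZER_VOCABULARY looks like the intended value.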

# def batch_iterator():
#     for i in range(0, len(all_str), TOKENIZER_BATCH_SIZE):
#         yield all_str[i : i + TOKENIZER_BATCH_SIZE]

# tokenizer = SentencePieceBPETokenizer()
# tokenizer.train_from_iterator(batch_iterator(), vocab_size=TOKENIZER_BATCH_SIZE, min_frequency=2, special_tokens=[
#     "<s>",
#     "<pad>",
#     "</s>",
#     "<unk>",
#     "<mask>",
# ])


# !mkdir tok
# tokenizer.save_model("./tok/")

In [5]:
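## Disabled experiment: load the tokenizer from vocab/merges files and attach BERT-style post-processing (not sure yet whether this is needed)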

# from tokenizers.implementations import SentencePieceBPETokenizer
# from tokenizers.processors import BertProcessing


# tokenizer = SentencePieceBPETokenizer(
#     "tokenizer/vocab.json",
#     "tokenizer/merges.txt",
# )
# tokenizer._tokenizer.post_processor = BertProcessing(
#     ("</s>", tokenizer.token_to_id("</s>")),
#     ("<s>", tokenizer.token_to_id("<s>")),
# )
# tokenizer.enable_truncation(max_length=512)

In [6]:
from transformers import MBart50Tokenizer

# Load the tokenizer files stored under ./tok/.
# `model_max_length` is the documented argument for the default maximum
# sequence length; `max_len` is only a backward-compatibility alias for it.
tokenizer = MBart50Tokenizer.from_pretrained("./tok/", model_max_length=256)

  from .autonotebook import tqdm as notebook_tqdm


### Preprocess and split Data

In [7]:
from datasets import load_dataset

In [8]:
from datasets import load_dataset

# Directory holding the en-cr JSONL splits. Kept in one place so the data can
# be relocated by editing a single line instead of three hard-coded paths.
EN_CR_DATA_DIR = "/mnt/disk/yrajcoomar/kreol-benchmark/experiments/data/en-cr"

# Load the three JSONL files as named splits; the "val" split is read from the
# *_dev.jsonl file.
dataset = load_dataset(
    "json",
    data_files={
        "train": f"{EN_CR_DATA_DIR}/en-cr_train.jsonl",
        "test": f"{EN_CR_DATA_DIR}/en-cr_test.jsonl",
        "val": f"{EN_CR_DATA_DIR}/en-cr_dev.jsonl",
    },
)


Using custom data configuration default-0cd2e6f29b60aba8
Found cached dataset json (/home/ubuntu/.cache/huggingface/datasets/json/default-0cd2e6f29b60aba8/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3/3 [00:00<00:00, 1278.75it/s]


In [9]:
def preprocess_function(examples, max_length=128, tok=None):
    """Tokenise a batch of source/target sentences for seq2seq training.

    Args:
        examples: Batch mapping with an ``'input'`` list (source sentences)
            and a ``'target'`` list (reference sentences) of equal length.
        max_length: Length every sequence is truncated and padded to.
        tok: Tokenizer to use. Defaults to the notebook-level ``tokenizer``.

    Returns:
        dict with ``input_ids`` and ``attention_mask`` for the encoder and
        ``labels`` for the loss. All three hold one list of ``max_length``
        integers per example.
    """
    tok = tokenizer if tok is None else tok

    # The tokenizer call already inserts its own special tokens. Adding BOS/EOS
    # by hand afterwards duplicated them and made input_ids two items longer
    # than attention_mask, so no manual tokens are added here.
    source = tok(examples["input"], max_length=max_length, truncation=True, padding="max_length")
    target = tok(examples["target"], max_length=max_length, truncation=True, padding="max_length")

    # The Trainer computes the loss from `labels`. Padding positions are set to
    # -100 so the loss ignores them. Decoder inputs are deliberately not
    # returned: they are derived downstream by shifting the labels, whereas
    # passing the unshifted targets would let the decoder see its own answer.
    labels = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(target["input_ids"], target["attention_mask"])
    ]

    return {
        "input_ids": source["input_ids"],
        "attention_mask": source["attention_mask"],
        "labels": labels,
    }

# Tokenise every split; with batched=True the function receives a batch of
# rows (lists of values per column) on each call.
dataset = dataset.map(
    function=preprocess_function,
    batched=True,
)

 95%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–Œ| 21/22 [00:04<00:00,  4.76ba/s]
  0%|          | 0/1 [00:00<?, ?ba/s]
  0%|          | 0/1 [00:00<?, ?ba/s]


In [10]:
# Pull the three splits out of `dataset` into their own names.
train_dataset, test_dataset, val_dataset = (
    dataset[split_name] for split_name in ("train", "test", "val")
)

### Training

In [11]:
import torch
torch.cuda.is_available()  # Check whether torch can see a CUDA device. CUDA has been an ongoing issue on this machine and still needs resolving.

True

In [12]:
from transformers import MBartConfig, MBartForConditionalGeneration, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments #, Seq2SeqTrainer

# Build the mBART architecture from a fresh config: the model is constructed
# directly from `config`, so its weights are newly initialised rather than
# loaded from a checkpoint.
# NOTE(review): vocab_size should be at least the tokenizer's vocabulary size —
# confirm against len(tokenizer).
config = MBartConfig(
    vocab_size=TOKENIZER_VOCABULARY,
    max_position_embeddings=512,
)
model = MBartForConditionalGeneration(config)

# Collator that assembles tokenised examples into batches for the model.
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [13]:
# from torch.utils.data import IterableDataset

In [16]:
from transformers import Seq2SeqTrainer

In [17]:
# Training configuration for the seq2seq trainer.
training_args = Seq2SeqTrainingArguments(
    output_dir='./checkpoint',
    num_train_epochs=1,
    # per_device_* replaces the deprecated per_gpu_* arguments; the batch sizes
    # themselves are unchanged.
    per_device_train_batch_size=64,
    per_device_eval_batch_size=4,
    # NOTE(review): a checkpoint every 2 steps is very frequent for a model of
    # this size — confirm this is intended beyond smoke-testing.
    save_steps=2,
    # Keep at most two checkpoints on disk.
    save_total_limit=2,
    # Evaluation reports only the loss.
    prediction_loss_only=True,
)

In [18]:
# Wire the model, arguments, collator and data splits into the trainer.
# Evaluation uses the dev split (`val_dataset`) so the test split stays held
# out for the final measurement instead of being seen during training runs.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [16]:
## UPDATELOG:
# 21/02/24 - Set up the whole pipeline. Issue: CUDA on the server is only version 9.1, so I can only use cudatoolkit 9.0 and torch==1.1.

In [None]:
# Run training with the trainer configured above.
trainer.train()

In [9]:
model.num_parameters()
# Roughly 380M parameters (exact count shown in the cell output)

379375616

In [10]:
from transformers import LineByLineTextDataset

In [11]:

# Build a line-by-line text dataset from ./all_sentences.txt using `tokenizer`,
# with block_size=128.
# NOTE(review): this rebinds `dataset`, the name an earlier cell uses for the
# en-cr train/test/val splits — confirm the overwrite is intended.
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./all_sentences.txt",
    block_size=128,
)



### Load Dataset

In [13]:
import pandas as pd
import os

def read_concat_jsonl(files,home_dir):
    """Read several JSON-lines files and stack them into one DataFrame.

    Args:
        files: Iterable of file paths, each joined onto ``home_dir``.
        home_dir: Directory the entries of ``files`` are relative to.

    Returns:
        A single DataFrame with the rows of every file, in the order given.
        Each file keeps its own row index, so index values can repeat.
    """
    frames = []
    for relative_path in files:
        full_path = os.path.join(home_dir, relative_path)
        frames.append(pd.read_json(full_path, lines=True))
    return pd.concat(frames)

def concatenate_language_pairs(language_pairs_paths,home_dir):
    """Load the files of every language pair into one DataFrame per pair.

    Args:
        language_pairs_paths: Mapping from a language-pair key to the list of
            JSON-lines file paths belonging to that pair.
        home_dir: Directory the file paths are relative to.

    Returns:
        dict mapping each key of ``language_pairs_paths`` to the DataFrame
        produced by ``read_concat_jsonl`` for that pair's files.
    """
    # read_concat_jsonl already returns one concatenated frame, so wrapping it
    # in a single-element list and concatenating again only made an extra copy.
    return {
        key: read_concat_jsonl(paths, home_dir)
        for key, paths in language_pairs_paths.items()
    }

In [14]:
# Files belonging to each language pair, relative to the home directory passed
# to concatenate_language_pairs. Raw strings are used because sequences such as
# "\e", "\d" and "\c" in a normal string literal are invalid escapes: they only
# work by accident and trigger warnings. The resulting values are unchanged.
language_pairs_paths = {
    'en_cr': [
        r'kreol-benchmark\experiments\data\en-cr\en-cr_dev.jsonl',
        r'kreol-benchmark\experiments\data\en-cr\en-cr_train.jsonl',
        r'kreol-benchmark\experiments\data\en-cr\en-cr_test.jsonl',
    ],
    'cr': [
        r'kreol-benchmark\experiments\data\cr\cr_dev.jsonl',
        r'kreol-benchmark\experiments\data\cr\cr_train.jsonl',
        r'kreol-benchmark\experiments\data\cr\cr_test.jsonl',
    ],
}

In [15]:
# Load the files of every language pair into one DataFrame per pair.
# NOTE(review): home_dir is an absolute Windows path tied to one machine, while
# earlier cells read from /mnt/disk/... — confirm which environment this cell targets.
data_all_dict = concatenate_language_pairs(language_pairs_paths,home_dir=r'C:\Users\yush\OneDrive\Desktop\papers')

In [16]:
# Display the loaded frames (the output is the repr of the whole dict).
data_all_dict

{'en_cr':                                                  input  \
 0    I did not come to do away with them, but to gi...   
 1    The fact is, at the time, you had to pay the t...   
 2    Angina can be described as a discomfort, heavi...   
 3             The boy said he would, but he didn't go.   
 4     Was it God in heaven or merely some human being?   
 ..                                                 ...   
 995  Any kingdom where people fight each other will...   
 996  And I am not good enough even to stoop down an...   
 997  Who among you, if your son asks for bread, you...   
 998  If that person listens, you have won back a fo...   
 999  Then he pointed to his disciples and said, the...   
 
                                                 target  
 0    Mo pa finn vini pou aboli me pou donn zot zot ...  
 1    Anverite sa lepok la pou al lekol ti ena enn f...  
 2    Nou capav dekrir anzinn couma enn sensasion in...  
 3    Garson-la reponn wi papa, li pou ale me li 