# Continue Pretraining on T5

In [1]:
from datasets import load_from_disk

dataset = load_from_disk('/home/j/Desktop/MLotsawa/Notebooks/Models/TibetanToEnglishTranslation/TibetanToEnglishTranslationv2/Ablation/400k/RawData/400k-raw-ds')
dataset = dataset['train']

In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('google-t5/t5-small')
# Generate a list of all Tibetan Unicode characters (U+0F00 to U+0FFF)
tibetan_chars = [chr(codepoint) for codepoint in range(0x0F00, 0x0FFF)]

# Add the Tibetan characters to the tokenizer's vocabulary
new_tokens = [char for char in tibetan_chars if char not in tokenizer.get_vocab()]

# Add new tokens to the tokenizer
tokenizer.add_tokens(new_tokens)

255

In [4]:
import random

def bo_corrupt_text_batch(examples):
    input_texts = []
    target_texts = []

    for text in examples["bo"]:
        words = text.split('་')
        num_masks = max(1, len(words) // 6)
        masked_indices = sorted(random.sample(range(len(words)), num_masks))

        new_text = []
        labels = []
        current_mask = 0

        for i, word in enumerate(words):
            if i in masked_indices:
                if not new_text or new_text[-1] != f"<extra_id_{current_mask}>":
                    new_text.append(f"<extra_id_{current_mask}>")
                    labels.append(f"<extra_id_{current_mask}> {word}")
                    current_mask += 1
                else:
                    labels[-1] += f" {word}"
            else:
                new_text.append(word)

        input_texts.append(" ".join(new_text))
        target_texts.append(" ".join(labels))

    return {"input_text": input_texts, "target_text": target_texts}

bo_train_dataset = dataset.map(bo_corrupt_text_batch, batched=True)

In [5]:
import random

def en_corrupt_text_batch(examples):
    input_texts = []
    target_texts = []

    for text in examples["en"]:
        words = text.split()
        num_masks = max(1, len(words) // 6)
        masked_indices = sorted(random.sample(range(len(words)), num_masks))

        new_text = []
        labels = []
        current_mask = 0

        for i, word in enumerate(words):
            if i in masked_indices:
                if not new_text or new_text[-1] != f"<extra_id_{current_mask}>":
                    new_text.append(f"<extra_id_{current_mask}>")
                    labels.append(f"<extra_id_{current_mask}> {word}")
                    current_mask += 1
                else:
                    labels[-1] += f" {word}"
            else:
                new_text.append(word)

        input_texts.append(" ".join(new_text))
        target_texts.append(" ".join(labels))

    return {"input_text": input_texts, "target_text": target_texts}

en_train_dataset = dataset.map(en_corrupt_text_batch, batched=True)

In [6]:
from datasets import concatenate_datasets

processed_dataset = concatenate_datasets([en_train_dataset, bo_train_dataset]).shuffle(seed=42)

In [7]:
def tokenize_data(example):
    inputs = tokenizer(example["input_text"], max_length=256, truncation=True, padding="max_length")
    targets = tokenizer(example["target_text"], max_length=256, truncation=True, padding="max_length")
    return {
        "input_ids": inputs.input_ids,
        "attention_mask": inputs.attention_mask,
        "labels": targets.input_ids
    }

tokenized_dataset = processed_dataset.map(tokenize_data)

del dataset

## Train the Model

In [8]:
import gc
import torch

gc.collect()

with torch.no_grad():
    torch.cuda.empty_cache()

In [9]:
from transformers import TrainingArguments, Trainer, T5ForConditionalGeneration, EarlyStoppingCallback

model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small", device_map='cuda:0')
model.resize_token_embeddings(len(tokenizer))

training_args = TrainingArguments(
    output_dir="pretrain-checkpoints",
    auto_find_batch_size=True,
    learning_rate=3e-4,
    num_train_epochs=1
    )

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

trainer.train()

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mbillingsmoore[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
500,0.7447
1000,0.5151
1500,0.5065
2000,0.4268
2500,0.3415
3000,0.2876
3500,0.2814
4000,0.2609
4500,0.2629
5000,0.2579


TrainOutput(global_step=100000, training_loss=0.18743830932617186, metrics={'train_runtime': 13668.7242, 'train_samples_per_second': 58.528, 'train_steps_per_second': 7.316, 'total_flos': 5.41367205888e+16, 'train_loss': 0.18743830932617186, 'epoch': 1.0})