# Continued Pretraining of T5 with Span Corruption (English and Tibetan)

In [1]:
from datasets import load_from_disk

# Read the saved dataset dict from disk and keep only its 'train' split.
dataset = load_from_disk(
    '/home/j/Desktop/MLotsawa/Notebooks/Models/TibetanToEnglishTranslation/TibetanToEnglishTranslationv2/Ablation/100k/RawData/100k-raw-ds'
)['train']

Loading dataset from disk:   0%|          | 0/17 [00:00<?, ?it/s]

In [2]:
from transformers import AutoTokenizer

# Tokenizer shared by the tokenization step and the embedding resize further down.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path='billingsmoore/getok-v0'
)

In [None]:
import random

def bo_corrupt_text_batch(examples: dict) -> dict:
    """Build span-corruption (input, target) pairs from a batch of Tibetan text.

    Each string in ``examples["bo"]`` is split on the tsheg ('་'); empty pieces
    (from a leading, trailing, or doubled tsheg) are discarded.
    ``max(1, n // 6)`` syllable positions are sampled at random. Every run of
    adjacent masked syllables is replaced in the input by ONE sentinel
    ``<extra_id_k>``; the target lists each sentinel followed by the syllables
    it hides.

    Args:
        examples: A batch mapping that contains a ``"bo"`` list of strings.

    Returns:
        ``{"input_text": [...], "target_text": [...]}`` with one entry per
        input string, in order. A string with no syllables yields ``""`` for
        both fields so the output stays aligned with the batch.
    """
    input_texts = []
    target_texts = []

    for text in examples["bo"]:
        # Drop empty pieces so a sentinel is never spent on "nothing".
        words = [piece for piece in text.split('་') if piece]

        if not words:
            # Nothing to mask; keep the row so batch lengths still match.
            input_texts.append("")
            target_texts.append("")
            continue

        num_masks = max(1, len(words) // 6)
        # A set gives O(1) membership tests in the loop below.
        masked_indices = set(random.sample(range(len(words)), num_masks))

        new_text = []
        labels = []
        current_mask = 0
        previous_masked = False  # was the syllable just before this one masked?

        for i, word in enumerate(words):
            if i in masked_indices:
                if previous_masked:
                    # Extend the open span instead of emitting a new sentinel.
                    labels[-1] += f" {word}"
                else:
                    sentinel = f"<extra_id_{current_mask}>"
                    new_text.append(sentinel)
                    labels.append(f"{sentinel} {word}")
                    current_mask += 1
                previous_masked = True
            else:
                new_text.append(word)
                previous_masked = False

        # NOTE(review): syllables are re-joined with spaces rather than the
        # original tsheg -- confirm this is what the tokenizer expects.
        input_texts.append(" ".join(new_text))
        target_texts.append(" ".join(labels))

    return {"input_text": input_texts, "target_text": target_texts}

# The corruption function draws mask positions from Python's global RNG;
# seed it so the generated examples are reproducible after a kernel restart.
random.seed(42)
bo_train_dataset = dataset.map(bo_corrupt_text_batch, batched=True)

In [None]:
import random

def en_corrupt_text_batch(examples: dict) -> dict:
    """Build span-corruption (input, target) pairs from a batch of English text.

    Each string in ``examples["en"]`` is split on whitespace.
    ``max(1, n // 6)`` word positions are sampled at random. Every run of
    adjacent masked words is replaced in the input by ONE sentinel
    ``<extra_id_k>``; the target lists each sentinel followed by the words it
    hides.

    Args:
        examples: A batch mapping that contains an ``"en"`` list of strings.

    Returns:
        ``{"input_text": [...], "target_text": [...]}`` with one entry per
        input string, in order. An empty or whitespace-only string yields
        ``""`` for both fields so the output stays aligned with the batch.
    """
    input_texts = []
    target_texts = []

    for text in examples["en"]:
        words = text.split()

        if not words:
            # random.sample would raise on an empty population; emit an empty
            # pair so batch lengths still match.
            input_texts.append("")
            target_texts.append("")
            continue

        num_masks = max(1, len(words) // 6)
        # A set gives O(1) membership tests in the loop below.
        masked_indices = set(random.sample(range(len(words)), num_masks))

        new_text = []
        labels = []
        current_mask = 0
        previous_masked = False  # was the word just before this one masked?

        for i, word in enumerate(words):
            if i in masked_indices:
                if previous_masked:
                    # Extend the open span instead of emitting a new sentinel.
                    labels[-1] += f" {word}"
                else:
                    sentinel = f"<extra_id_{current_mask}>"
                    new_text.append(sentinel)
                    labels.append(f"{sentinel} {word}")
                    current_mask += 1
                previous_masked = True
            else:
                new_text.append(word)
                previous_masked = False

        input_texts.append(" ".join(new_text))
        target_texts.append(" ".join(labels))

    return {"input_text": input_texts, "target_text": target_texts}

# The corruption function draws mask positions from Python's global RNG;
# seed it so the generated examples are reproducible after a kernel restart.
random.seed(42)
en_train_dataset = dataset.map(en_corrupt_text_batch, batched=True)

In [None]:
from datasets import concatenate_datasets

# Pool the English and Tibetan corrupted examples, then shuffle with a fixed seed.
combined_dataset = concatenate_datasets([en_train_dataset, bo_train_dataset])
processed_dataset = combined_dataset.shuffle(seed=42)

In [None]:
def tokenize_data(example):
    """Tokenize corrupted input/target text into model-ready features.

    Both ``example["input_text"]`` and ``example["target_text"]`` are
    truncated/padded to 256 tokens with the notebook-level ``tokenizer``.
    Padding positions in the labels are replaced by -100 so the loss ignores
    them instead of training the model to predict pad tokens.

    Works for a single example (strings) and for a batch (lists of strings).

    Args:
        example: Mapping with ``"input_text"`` and ``"target_text"``.

    Returns:
        Dict with ``input_ids``, ``attention_mask`` and ``labels``.
    """
    inputs = tokenizer(example["input_text"], max_length=256, truncation=True, padding="max_length")
    targets = tokenizer(example["target_text"], max_length=256, truncation=True, padding="max_length")

    pad_id = tokenizer.pad_token_id

    def _ignore_padding(ids):
        # -100 is the label value the loss skips.
        return [-100 if token_id == pad_id else token_id for token_id in ids]

    label_ids = targets.input_ids
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one id sequence per example.
        labels = [_ignore_padding(row) for row in label_ids]
    else:
        labels = _ignore_padding(label_ids)

    return {
        "input_ids": inputs.input_ids,
        "attention_mask": inputs.attention_mask,
        "labels": labels
    }

# Tokenize in batches: same resulting columns, far fewer tokenizer calls than
# mapping one example at a time.
tokenized_dataset = processed_dataset.map(tokenize_data, batched=True)

## Train the Model

In [4]:
from transformers import TrainingArguments, Trainer, T5ForConditionalGeneration, EarlyStoppingCallback

# Start from the public T5-small checkpoint, placed on the first CUDA device.
# NOTE(review): 'cuda:0' is hard-coded, so this cell presumably needs a GPU -- confirm.
model = T5ForConditionalGeneration.from_pretrained(
    "google-t5/t5-small",
    device_map='cuda:0',
)
# Match the embedding table to the length of the tokenizer loaded above.
model.resize_token_embeddings(len(tokenizer))

# One epoch over the corrupted corpus; the Trainer is asked to find a batch size itself.
training_args = TrainingArguments(
    num_train_epochs=1,
    learning_rate=3e-4,
    auto_find_batch_size=True,
    output_dir="pretrain-checkpoints",
)

trainer = Trainer(train_dataset=tokenized_dataset, args=training_args, model=model)

trainer.train()

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]