# Continue Pretraining on T5

In [2]:
from datasets import load_from_disk

tokenized_dataset = load_from_disk('Data/tokenized-pretraining-ds').shuffle().select(range(250_000))

tokenized_dataset = tokenized_dataset.train_test_split(.1)

In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('../Models/my_tokenizer')

## Train the Model

In [5]:
%env WANDB_PROJECT=english-v2

env: WANDB_PROJECT=english-v2


In [13]:
from transformers import TrainingArguments, Trainer, T5ForConditionalGeneration, EarlyStoppingCallback

model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small", device_map='cuda:0')
model.resize_token_embeddings(len(tokenizer))

training_args = TrainingArguments(
    output_dir="t5-small-continue-pretrain",
    save_strategy="steps",
    eval_strategy='steps',
    auto_find_batch_size=True,
    learning_rate=3e-4,
    save_steps=500,
    eval_steps=500,
    logging_steps=25,
    push_to_hub=False,
    load_best_model_at_end=True
    )

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    callbacks=[EarlyStoppingCallback()]
)

In [14]:
trainer.train()

Step,Training Loss,Validation Loss
500,0.3902,0.349715
1000,0.3536,0.339664
1500,0.3218,0.335654
2000,0.3402,0.331978
2500,0.3472,0.328917
3000,0.3447,0.325949
3500,0.2969,0.323871
4000,0.3334,0.320998
4500,0.3276,0.319469
5000,0.3176,0.317655


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=22500, training_loss=0.3146777094946967, metrics={'train_runtime': 7720.6184, 'train_samples_per_second': 87.428, 'train_steps_per_second': 10.929, 'total_flos': 1.218076213248e+16, 'train_loss': 0.3146777094946967, 'epoch': 0.8})

In [15]:
model.save_pretrained('../Models/pretrained-small-model')