# Continue Pretraining on T5

In [1]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('../../Models/my_tokenizer')

In [2]:
%env WANDB_PROJECT=english-v2

env: WANDB_PROJECT=english-v2


In [3]:
from transformers import TrainingArguments, Trainer, T5ForConditionalGeneration, Adafactor
from accelerate import Accelerator

model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-large", device_map='cuda:0')
model.resize_token_embeddings(len(tokenizer))

accelerator = Accelerator()

optimizer = Adafactor(
    model.parameters(), 
    scale_parameter=False, 
    relative_step=False, 
    warmup_init=False, 
    lr=3e-4
)

model, optimizer = accelerator.prepare(model, optimizer)

training_args = TrainingArguments(
    output_dir="buddhist-large-pretrain",
    auto_find_batch_size=True,
    learning_rate=3e-4,
    num_train_epochs=1,
    save_strategy='no'
    )

In [None]:
from datasets import load_dataset
import gc

for i in range(0, 30):
    
    start = i * 60_000
    end = start + 60_000

    print(f'Training on {start} to {end}')

    tokenized_dataset = load_dataset('billingsmoore/temp2',split='train').select(range(start, end))
    gc.collect()

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        optimizers=(optimizer, None)
    )

    trainer.train()

    model.save_pretrained(f'chkpts/chkpt-{end}')

    del tokenized_dataset
    gc.collect()

Training on 0 to 60000


Resolving data files:   0%|          | 0/17 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/17 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/17 [00:00<?, ?it/s]

Step,Training Loss
500,0.2239
1000,0.2174
1500,0.2018
2000,0.2077
2500,0.2124
3000,0.1984
3500,0.1966
4000,0.2114
4500,0.192
5000,0.1895


Training on 60000 to 120000


Resolving data files:   0%|          | 0/17 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/17 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/17 [00:00<?, ?it/s]

Step,Training Loss
500,0.1821
1000,0.1902
1500,0.1882
2000,0.1905
2500,0.1816
3000,0.1823
3500,0.1833
4000,0.1754
4500,0.1757
5000,0.1744


Training on 120000 to 180000


Resolving data files:   0%|          | 0/17 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/17 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/17 [00:00<?, ?it/s]

Step,Training Loss


In [6]:
model.save_pretrained('../../Models/large-models/pretrained-large-model')