# args & validate dataset

In [1]:
# Filesystem locations used by the rest of the notebook.
# NOTE: the first two are absolute, machine-specific Windows paths.
model_path = r"E:\data\models\gpt-boost\gpt-2-layer-memory"  # checkpoint / output directory
data_path = r"E:\data\corpus\gpt-boost\novel.txt"            # plain-text training corpus
tokenizer_path = r".\tokenizer"                              # directory the tokenizer is loaded from
tb_path = r".\runs\gpt-2-layer-memory"                       # logging directory passed to the Trainer

In [2]:
# Architecture settings forwarded to GPT2Config when a fresh model is built.
n_embd = 512
n_head = 8
n_layer = 2
n_positions = 256
vocab_size = 20000

# When True the model is loaded from `model_path` instead of being built
# from the settings above.
finetuning_mode = False

In [3]:
# Optimisation settings forwarded to TrainingArguments further down.
learning_rate = 1e-4
gradient_accumulation_steps = 1
num_train_epochs = 10
per_device_train_batch_size = 1

# 1. create tokenizer

In [4]:
from transformers import GPT2TokenizerFast

# Load the tokenizer stored under `tokenizer_path`.
# The maximum sequence length is tied to `n_positions` (the context size given
# to the model config) instead of repeating the literal 256, so the two values
# cannot drift apart when the config cell is edited.
# `model_max_length` replaces the deprecated `max_len` keyword.
tokenizer = GPT2TokenizerFast.from_pretrained(
    tokenizer_path,
    model_max_length=n_positions,
)

Special tokens have been added in the vocabulary, make sure the associated word emebedding are fine-tuned or trained.


# 2. initialize model

In [5]:
import sys

# The project-local `models` package is imported relative to the parent
# directory, so that directory is added to the module search path first.
sys.path.append("../")
from models.memory.modeling_gpt2_memory import GPT2LMHeadModel
# Alternative (disabled): from transformers import GPT2LMHeadModel

if not finetuning_mode:
    # Build a new, untrained model from the hyperparameters in the config cell.
    from transformers import GPT2Config

    config = GPT2Config(
        vocab_size=vocab_size,
        n_positions=n_positions,
        n_embd=n_embd,
        n_layer=n_layer,
        n_head=n_head,
    )
    model = GPT2LMHeadModel(config=config)
else:
    # Resume from the checkpoint stored under `model_path`.
    model = GPT2LMHeadModel.from_pretrained(model_path)

FAISS library was not found.
FAISS not available. Switching to standard nearest neighbors search implementation.


In [6]:
model.num_parameters()
# => 148798464 parameters (about 149M)

148798464

# 3. build training Dataset

In [7]:
from transformers import TextDataset

# Build the training dataset from the text file at `data_path`.
# The block size follows `n_positions` (the context length given to the model
# config) rather than a second hard-coded 256, so changing the context length
# in the config cell keeps the dataset blocks consistent with the model.
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=data_path,
    block_size=n_positions,
)

Like in the [`run_language_modeling.py`](https://github.com/huggingface/transformers/blob/master/examples/language-modeling/run_language_modeling.py) script, we need to define a data_collator.

This is just a small helper that will help us batch different samples of the dataset together into an object that PyTorch knows how to perform backprop on.

In [8]:
from transformers import DataCollatorForLanguageModeling

# Collator that batches dataset samples for the Trainer; `mlm=False` means no
# masked-language-modelling objective is requested from it.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# 4. initialize our Trainer & Train & Save

In [17]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    # --- where checkpoints and logs go ---
    output_dir=model_path,
    overwrite_output_dir=True,
    logging_dir=tb_path,
    logging_steps=100,
    save_steps=1000,
    save_total_limit=2,
    # --- run mode / hardware ---
    do_train=True,
    no_cuda=False,
    fp16=True,
    # --- optimisation schedule (values come from the config cells) ---
    learning_rate=learning_rate,
    warmup_steps=10000,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
)

# Wire the model, the dataset and the collator together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
    prediction_loss_only=True,
)

In [18]:
# Run the training loop configured above.
# NOTE(review): the output saved with this cell shows
# `RuntimeError: SparseAdam does not support dense gradients, please consider Adam instead`.
# No optimizer is configured anywhere in this notebook, so the SparseAdam
# presumably comes from the project-local model code or a patched Trainer.
# TODO confirm and fix before relying on this run.
trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=183188.0, style=ProgressStyle(description…

RuntimeError: SparseAdam does not support dense gradients, please consider Adam instead

### 🎉 Save final model (+ tokenizer + config) to disk

In [None]:
# Persist the trained model to `model_path`.
trainer.save_model(model_path)

# The Trainer above was built without a tokenizer, so save the tokenizer
# explicitly; this makes `model_path` a self-contained checkpoint, as the
# section heading promises.
tokenizer.save_pretrained(model_path)