# args & validate dataset

In [1]:
# Inputs: the training corpus file and the directory holding the tokenizer files.
data_path = "E:\\data\\corpus\\gpt-boost\\novel.txt"
tokenizer_path = ".\\tokenizer"

# Outputs: model/checkpoint directory (also the load path when fine-tuning)
# and the directory handed to the trainer as its logging_dir.
model_path = "E:\\data\\models\\gpt-boost\\gpt-8-layer"
tb_path = ".\\runs\\gpt-8-layer"

In [2]:
# Architecture settings handed to GPT2Config when a fresh model is built.
n_embd = 512
n_head = 8
n_layer = 8
n_positions = 256
vocab_size = 20000

# True: load existing weights from model_path; False: initialise a new model.
finetuning_mode = False

In [3]:
# Optimisation settings forwarded unchanged to TrainingArguments.
num_train_epochs = 10
per_device_train_batch_size = 8
gradient_accumulation_steps = 4
learning_rate = 5e-4

# 1. create tokenizer

In [4]:
from transformers import GPT2TokenizerFast

# Load the tokenizer files stored under tokenizer_path. The length limit is
# taken from n_positions rather than a second hard-coded 256, so the tokenizer
# limit and the model's position-embedding count cannot drift apart when the
# configuration cell is edited.
tokenizer = GPT2TokenizerFast.from_pretrained(tokenizer_path, max_len=n_positions)

Special tokens have been added in the vocabulary, make sure the associated word emebedding are fine-tuned or trained.


# 2. initialize model

In [5]:
import sys

# Make the parent directory importable; presumably only required for the
# project-local GPT2LMHeadModel variant below, which is currently disabled.
sys.path.append("../")
# from models.memory.modeling_gpt2_memory import GPT2LMHeadModel
from transformers import GPT2LMHeadModel

if not finetuning_mode:
    # Fresh start: describe the architecture with the values from the
    # configuration cell and build a randomly initialised model from it.
    from transformers import GPT2Config

    config = GPT2Config(
        vocab_size=vocab_size,
        n_positions=n_positions,
        n_embd=n_embd,
        n_layer=n_layer,
        n_head=n_head,
    )
    model = GPT2LMHeadModel(config=config)
else:
    # Fine-tuning: continue from the weights already stored in model_path.
    model = GPT2LMHeadModel.from_pretrained(model_path)

In [6]:
model.num_parameters()
# => 35591168 parameters (about 36M), per the recorded output of this cell;
#    the previously noted 13524480 (about 14M) figure was stale.

35591168

# 3. build training Dataset

In [7]:
from transformers import TextDataset

# Build the training set from the corpus file at data_path.
# block_size follows n_positions instead of repeating the literal 256, so the
# length of each training example always fits the model's position embeddings
# even if n_positions is changed in the configuration cell.
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=data_path,
    block_size=n_positions,
)

Like in the [`run_language_modeling.py`](https://github.com/huggingface/transformers/blob/master/examples/language-modeling/run_language_modeling.py) script, we need to define a data_collator.

This is just a small helper that will help us batch different samples of the dataset together into an object that PyTorch knows how to perform backprop on.

In [8]:
from transformers import DataCollatorForLanguageModeling

# Batches dataset samples for the Trainer. mlm=False turns off masked-LM
# masking, since GPT-2 is trained as a causal language model.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# 4. initialize our Trainer & Train & Save

In [9]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir=model_path,
    overwrite_output_dir=True,
    logging_dir=tb_path,
    # Run mode and hardware.
    do_train=True,
    no_cuda=False,
    fp16=True,
    # Optimisation schedule, taken from the configuration cell.
    learning_rate=learning_rate,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # Log every 100 steps; checkpoint every 1000 steps with a limit of 2.
    logging_steps=100,
    save_steps=1000,
    save_total_limit=2,
)

# Wire the model, the dataset and the collator into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=dataset,
    data_collator=data_collator,
    prediction_loss_only=True,
)



In [None]:
# Run training with the Trainer configured earlier in this notebook. The
# recorded output shows one loss / learning-rate entry every 100 steps.
trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=22899.0, style=ProgressStyle(description_…

{'loss': 7.482940063476563, 'learning_rate': 0.0004991264849755416, 'epoch': 0.01746801170356784, 'step': 100}
{'loss': 7.119507446289062, 'learning_rate': 0.0004982529699510831, 'epoch': 0.03493602340713568, 'step': 200}
{'loss': 6.949014892578125, 'learning_rate': 0.0004973794549266247, 'epoch': 0.052404035110703524, 'step': 300}
{'loss': 6.817021484375, 'learning_rate': 0.0004965059399021664, 'epoch': 0.06987204681427137, 'step': 400}
{'loss': 6.65694091796875, 'learning_rate': 0.000495632424877708, 'epoch': 0.0873400585178392, 'step': 500}
{'loss': 6.4836767578125, 'learning_rate': 0.0004947589098532495, 'epoch': 0.10480807022140705, 'step': 600}
{'loss': 6.3166796875, 'learning_rate': 0.0004938853948287911, 'epoch': 0.12227608192497488, 'step': 700}
{'loss': 6.1766064453125, 'learning_rate': 0.0004930118798043327, 'epoch': 0.13974409362854273, 'step': 800}
{'loss': 6.093056640625, 'learning_rate': 0.0004921383647798742, 'epoch': 0.15721210533211058, 'step': 900}
{'loss': 6.0122558



{'loss': 5.92087890625, 'learning_rate': 0.0004903913347309573, 'epoch': 0.19214812873924625, 'step': 1100}
{'loss': 5.8588818359375, 'learning_rate': 0.0004895178197064989, 'epoch': 0.2096161404428141, 'step': 1200}
{'loss': 5.778486328125, 'learning_rate': 0.0004886443046820405, 'epoch': 0.22708415214638195, 'step': 1300}
{'loss': 5.721689453125, 'learning_rate': 0.00048777078965758216, 'epoch': 0.24455216384994977, 'step': 1400}
{'loss': 5.66572265625, 'learning_rate': 0.0004868972746331237, 'epoch': 0.26202017555351764, 'step': 1500}
{'loss': 5.580380859375, 'learning_rate': 0.0004860237596086653, 'epoch': 0.27948818725708546, 'step': 1600}
{'loss': 5.52337890625, 'learning_rate': 0.0004851502445842069, 'epoch': 0.2969561989606533, 'step': 1700}
{'loss': 5.4665234375, 'learning_rate': 0.00048427672955974845, 'epoch': 0.31442421066422116, 'step': 1800}
{'loss': 5.416162109375, 'learning_rate': 0.00048340321453529, 'epoch': 0.331892222367789, 'step': 1900}
{'loss': 5.372724609375, 'l

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=22899.0, style=ProgressStyle(description_…

{'loss': 4.44794921875, 'learning_rate': 0.00044933612858141157, 'epoch': 1.0132756888947116, 'step': 5800}
{'loss': 4.39857421875, 'learning_rate': 0.0004484626135569532, 'epoch': 1.0307437005982794, 'step': 5900}
{'loss': 4.395625, 'learning_rate': 0.00044758909853249474, 'epoch': 1.0482117123018473, 'step': 6000}
{'loss': 4.38373046875, 'learning_rate': 0.00044671558350803636, 'epoch': 1.065679724005415, 'step': 6100}
{'loss': 4.40021484375, 'learning_rate': 0.00044584206848357797, 'epoch': 1.083147735708983, 'step': 6200}
{'loss': 4.36955078125, 'learning_rate': 0.00044496855345911953, 'epoch': 1.1006157474125509, 'step': 6300}
{'loss': 4.38462890625, 'learning_rate': 0.0004440950384346611, 'epoch': 1.1180837591161186, 'step': 6400}
{'loss': 4.369375, 'learning_rate': 0.00044322152341020265, 'epoch': 1.1355517708196865, 'step': 6500}
{'loss': 4.380546875, 'learning_rate': 0.00044234800838574426, 'epoch': 1.1530197825232542, 'step': 6600}
{'loss': 4.3542578125, 'learning_rate': 0.00

### 🎉 Save final model (+ tokenizer + config) to disk

In [None]:
# Write the final model to model_path.
trainer.save_model(model_path)
# The Trainer was created without a tokenizer, so store it explicitly next to
# the model; this makes model_path usable on its own for later loading.
# The trailing semicolon hides the returned file paths from the cell output.
tokenizer.save_pretrained(model_path);