# args & validate dataset

In [3]:
model_path = "E:\\data\\models\\gpt-boost\\gpt-tiny-memory"
data_path = "E:\\data\\corpus\\gpt-boost\\novel.txt"
tokenizer_path = ".\\tokenizer"
tb_path = ".\\runs\\gpt-tiny-memory"

In [2]:
n_embd=512
n_head=8
n_layer=4
n_positions=256
vocab_size=20000
finetuning_mode=False

In [3]:
learning_rate=1e-3
gradient_accumulation_steps=4
num_train_epochs=40
per_device_train_batch_size=8

# 1. create tokenizer

In [4]:
from transformers import GPT2TokenizerFast
tokenizer = GPT2TokenizerFast.from_pretrained(tokenizer_path, max_len=256)

Special tokens have been added in the vocabulary, make sure the associated word emebedding are fine-tuned or trained.


# 2. initialize model

In [5]:
import sys
sys.path.append("../")
from models.memory.modeling_gpt2_memory import GPT2LMHeadModel

if finetuning_mode:
    model = GPT2LMHeadModel.from_pretrained(model_path)
else:
    from transformers import GPT2Config
    config = GPT2Config(
        n_embd=n_embd,
        n_head=n_head,
        n_layer=n_layer,
        n_positions=n_positions,
        vocab_size=vocab_size
    )
    model = GPT2LMHeadModel(config=config)

FAISS library was not found.
FAISS not available. Switching to standard nearest neighbors search implementation.


In [6]:
model.num_parameters()
# => 161405952 parameters (约1.6B)

161405952

# 3. build training Dataset

In [7]:
from transformers import TextDataset

dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=data_path,
    block_size=256,
)

Like in the [`run_language_modeling.py`](https://github.com/huggingface/transformers/blob/master/examples/language-modeling/run_language_modeling.py) script, we need to define a data_collator.

This is just a small helper that will help us batch different samples of the dataset together into an object that PyTorch knows how to perform backprop on.

In [8]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# 4. initialize our Trainer & Train & Save

In [9]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir=model_path,
    overwrite_output_dir=True,
    no_cuda=False,
    do_train=True,
    fp16=True,
    logging_dir=tb_path,
    logging_steps=100,
    learning_rate=learning_rate,
    gradient_accumulation_steps=gradient_accumulation_steps,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    save_steps=1000,
    save_total_limit=2
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    prediction_loss_only=True,
)



In [10]:
trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=40.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=22899.0, style=ProgressStyle(description_…

{'loss': 7.406942138671875, 'learning_rate': 0.000999563242487771, 'epoch': 0.01746801170356784, 'step': 100}
{'loss': 7.035367431640625, 'learning_rate': 0.0009991264849755416, 'epoch': 0.03493602340713568, 'step': 200}
{'loss': 6.83661376953125, 'learning_rate': 0.0009986897274633123, 'epoch': 0.052404035110703524, 'step': 300}
{'loss': 6.65123779296875, 'learning_rate': 0.0009982529699510832, 'epoch': 0.06987204681427137, 'step': 400}
{'loss': 6.44953369140625, 'learning_rate': 0.000997816212438854, 'epoch': 0.0873400585178392, 'step': 500}
{'loss': 6.2636328125, 'learning_rate': 0.0009973794549266246, 'epoch': 0.10480807022140705, 'step': 600}
{'loss': 6.12408447265625, 'learning_rate': 0.0009969426974143956, 'epoch': 0.12227608192497488, 'step': 700}
{'loss': 5.999541015625, 'learning_rate': 0.0009965059399021663, 'epoch': 0.13974409362854273, 'step': 800}
{'loss': 5.910390625, 'learning_rate': 0.0009960691823899372, 'epoch': 0.15721210533211058, 'step': 900}
{'loss': 5.8230078125



{'loss': 5.7264453125, 'learning_rate': 0.0009951956673654788, 'epoch': 0.19214812873924625, 'step': 1100}
{'loss': 5.66408203125, 'learning_rate': 0.0009947589098532495, 'epoch': 0.2096161404428141, 'step': 1200}
{'loss': 5.587314453125, 'learning_rate': 0.0009943221523410204, 'epoch': 0.22708415214638195, 'step': 1300}
{'loss': 5.53396484375, 'learning_rate': 0.000993885394828791, 'epoch': 0.24455216384994977, 'step': 1400}
{'loss': 5.48033203125, 'learning_rate': 0.0009934486373165618, 'epoch': 0.26202017555351764, 'step': 1500}
{'loss': 5.4062890625, 'learning_rate': 0.0009930118798043327, 'epoch': 0.27948818725708546, 'step': 1600}
{'loss': 5.356865234375, 'learning_rate': 0.0009925751222921034, 'epoch': 0.2969561989606533, 'step': 1700}
{'loss': 5.306728515625, 'learning_rate': 0.000992138364779874, 'epoch': 0.31442421066422116, 'step': 1800}
{'loss': 5.2624609375, 'learning_rate': 0.000991701607267645, 'epoch': 0.331892222367789, 'step': 1900}
{'loss': 5.2238671875, 'learning_ra

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=22899.0, style=ProgressStyle(description_…

{'loss': 4.577109375, 'learning_rate': 0.0009746680642907058, 'epoch': 1.0132756888947116, 'step': 5800}
{'loss': 4.5317578125, 'learning_rate': 0.0009742313067784766, 'epoch': 1.0307437005982794, 'step': 5900}
{'loss': 4.5296484375, 'learning_rate': 0.0009737945492662474, 'epoch': 1.0482117123018473, 'step': 6000}
{'loss': 4.52201171875, 'learning_rate': 0.0009733577917540181, 'epoch': 1.065679724005415, 'step': 6100}
{'loss': 4.540078125, 'learning_rate': 0.0009729210342417891, 'epoch': 1.083147735708983, 'step': 6200}
{'loss': 4.51197265625, 'learning_rate': 0.0009724842767295598, 'epoch': 1.1006157474125509, 'step': 6300}
{'loss': 4.5257421875, 'learning_rate': 0.0009720475192173306, 'epoch': 1.1180837591161186, 'step': 6400}
{'loss': 4.5166796875, 'learning_rate': 0.0009716107617051014, 'epoch': 1.1355517708196865, 'step': 6500}
{'loss': 4.528515625, 'learning_rate': 0.0009711740041928722, 'epoch': 1.1530197825232542, 'step': 6600}
{'loss': 4.504453125, 'learning_rate': 0.00097073

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=22899.0, style=ProgressStyle(description_…

{'loss': 4.333515625, 'learning_rate': 0.0009497728860936408, 'epoch': 2.0090833660858554, 'step': 11500}
{'loss': 4.271484375, 'learning_rate': 0.0009493361285814116, 'epoch': 2.0265513777894233, 'step': 11600}
{'loss': 4.2772265625, 'learning_rate': 0.0009488993710691824, 'epoch': 2.044019389492991, 'step': 11700}
{'loss': 4.2760546875, 'learning_rate': 0.0009484626135569531, 'epoch': 2.0614874011965587, 'step': 11800}
{'loss': 4.281875, 'learning_rate': 0.0009480258560447241, 'epoch': 2.0789554129001266, 'step': 11900}
{'loss': 4.2698828125, 'learning_rate': 0.0009475890985324948, 'epoch': 2.0964234246036946, 'step': 12000}
{'loss': 4.2681640625, 'learning_rate': 0.0009471523410202656, 'epoch': 2.1138914363072625, 'step': 12100}
{'loss': 4.261015625, 'learning_rate': 0.0009467155835080364, 'epoch': 2.13135944801083, 'step': 12200}
{'loss': 4.27296875, 'learning_rate': 0.0009462788259958072, 'epoch': 2.148827459714398, 'step': 12300}
{'loss': 4.2641796875, 'learning_rate': 0.00094584

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=22899.0, style=ProgressStyle(description_…

{'loss': 4.225703125, 'learning_rate': 0.0009248777078965758, 'epoch': 3.004891043276999, 'step': 17200}
{'loss': 4.12875, 'learning_rate': 0.0009244409503843466, 'epoch': 3.022359054980567, 'step': 17300}
{'loss': 4.1409375, 'learning_rate': 0.0009240041928721174, 'epoch': 3.0398270666841345, 'step': 17400}
{'loss': 4.151796875, 'learning_rate': 0.0009235674353598881, 'epoch': 3.0572950783877024, 'step': 17500}
{'loss': 4.139140625, 'learning_rate': 0.0009231306778476591, 'epoch': 3.0747630900912704, 'step': 17600}
{'loss': 4.136171875, 'learning_rate': 0.0009226939203354298, 'epoch': 3.0922311017948383, 'step': 17700}
{'loss': 4.14140625, 'learning_rate': 0.0009222571628232006, 'epoch': 3.109699113498406, 'step': 17800}
{'loss': 4.142890625, 'learning_rate': 0.0009218204053109714, 'epoch': 3.1271671252019737, 'step': 17900}
{'loss': 4.149453125, 'learning_rate': 0.0009213836477987422, 'epoch': 3.1446351369055416, 'step': 18000}
{'loss': 4.14765625, 'learning_rate': 0.0009209468902865

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=22899.0, style=ProgressStyle(description_…

{'loss': 4.146953125, 'learning_rate': 0.0008999825296995109, 'epoch': 4.000698720468143, 'step': 22900}
{'loss': 4.052890625, 'learning_rate': 0.0008995457721872816, 'epoch': 4.018166732171711, 'step': 23000}
{'loss': 4.05890625, 'learning_rate': 0.0008991090146750524, 'epoch': 4.035634743875279, 'step': 23100}
{'loss': 4.055546875, 'learning_rate': 0.0008986722571628231, 'epoch': 4.053102755578847, 'step': 23200}
{'loss': 4.07046875, 'learning_rate': 0.0008982354996505941, 'epoch': 4.0705707672824145, 'step': 23300}
{'loss': 4.065859375, 'learning_rate': 0.0008977987421383648, 'epoch': 4.088038778985982, 'step': 23400}
{'loss': 4.072421875, 'learning_rate': 0.0008973619846261356, 'epoch': 4.1055067906895495, 'step': 23500}
{'loss': 4.063984375, 'learning_rate': 0.0008969252271139064, 'epoch': 4.122974802393117, 'step': 23600}
{'loss': 4.063046875, 'learning_rate': 0.0008964884696016772, 'epoch': 4.140442814096685, 'step': 23700}
{'loss': 4.07078125, 'learning_rate': 0.000896051712089

KeyboardInterrupt: 

### 🎉 Save final model (+ tokenizer + config) to disk

In [None]:
trainer.save_model(model_path)

#### ！！！注意，这里需要把词表拷贝到模型文件夹！！！

# 5. Check the model

In [4]:
from transformers import pipeline

generate = pipeline(
    "text-generation",
    model=model_path,
)

Some weights of the model checkpoint at E:\data\models\gpt-boost\gpt-tiny-memory were not used when initializing GPT2LMHeadModel: ['transformer.h.0.memory_layer.keys', 'transformer.h.0.memory_layer.values.weight', 'transformer.h.0.memory_layer.query_proj.query_mlps.0.weight', 'transformer.h.0.memory_layer.query_proj.query_mlps.0.bias', 'transformer.h.0.memory_layer.query_proj.query_mlps.1.weight', 'transformer.h.0.memory_layer.query_proj.query_mlps.1.bias', 'transformer.h.0.memory_layer.query_proj.query_mlps.1.running_mean', 'transformer.h.0.memory_layer.query_proj.query_mlps.1.running_var', 'transformer.h.0.memory_layer.query_proj.query_mlps.1.num_batches_tracked', 'transformer.h.1.memory_layer.keys', 'transformer.h.1.memory_layer.values.weight', 'transformer.h.1.memory_layer.query_proj.query_mlps.0.weight', 'transformer.h.1.memory_layer.query_proj.query_mlps.0.bias', 'transformer.h.1.memory_layer.query_proj.query_mlps.1.weight', 'transformer.h.1.memory_layer.query_proj.query_mlps.1.b

In [5]:
import time
time0 = time.time()
result = generate("""第八十四章 满月的呓语
　　克莱恩刚披上双排扣长礼服，拿起半高丝绸礼帽，往门口走去，忽然听见了层层回荡的虚幻祈求声。
　　谁？他微皱眉头，侧耳倾听了一下，但只能确认祈求者是一位女士，而且嗓音断断续续，似乎蕴藏着极大的痛苦。
　　想着也没什么特别紧要的事情，新晋“魔术师”克莱恩随手一扔，让半高丝绸礼帽又准确无误地挂到了衣帽架上，自身则返回卧室，逆走四步，进入巍峨雄伟的宫殿。
　　这一次，""", max_length=256, repetition_penalty=1, do_sample=True, top_k=40, temperature=1.2, num_return_sequences=3)
print(time.time()-time0)
for item in result:
  print(item["generated_text"])
  print()
  print("="*50)
  print()

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


1.3519995212554932
第八十四章 满月的呓语
　　克莱恩刚披上双排扣长礼服，拿起半高丝绸礼帽，往门口走去，忽然听见了层层回荡的虚幻祈求声。
　　谁？他微皱眉头，侧耳倾听了一下，但只能确认祈求者是一位女士，而且嗓音断断续续，似乎蕴藏着极大的痛苦。
　　想着也没什么特别紧要的事情，新晋“魔术师”克莱恩随手一扔，让半高丝绸礼帽又准确无误地挂到了衣帽架上，自身则返回卧室，逆走四步，进入巍峨雄伟的宫殿。
　　这一次，他有足够的时间做一次的拜访了“愚者”先生。因为他因为没有别的能力和和意外得到了回应，有获得相应的馈赠，有的直接占卜，比如，比如阿蒙后，另外两个途径“0-02”，“安提哥努斯；
　　第一件事就是亚当亚当的亚当“命运将就交给你”。
　　紧接着，克莱恩的身影，一下联排。
　　一个一个半高丝绸礼帽，穿着镶斗篷的老仆恩·唐泰斯，位上一个银一个银，银两短。；四则两四个黑，三四个银。银两


第八十四章 满月的呓语
　　克莱恩刚披上双排扣长礼服，拿起半高丝绸礼帽，往门口走去，忽然听见了层层回荡的虚幻祈求声。
　　谁？他微皱眉头，侧耳倾听了一下，但只能确认祈求者是一位女士，而且嗓音断断续续，似乎蕴藏着极大的痛苦。
　　想着也没什么特别紧要的事情，新晋“魔术师”克莱恩随手一扔，让半高丝绸礼帽又准确无误地挂到了衣帽架上，自身则返回卧室，逆走四步，进入巍峨雄伟的宫殿。
　　这一次，克莱恩又将目光投向了古老宫殿大厅。
　他灵性直觉告诉他，道恩·唐泰斯的存在没有使用“偷盗”钥匙，也未有携带物品，仅是那点熟悉又又虚幻的气息。
　不过，他依旧只是表面粗糙，不和谐，却相当不错，相当不错，与格尔曼·斯帕罗的格尔曼·斯帕罗，属于正常，属于神秘组织的，“海神”级的是。
　当然，这不是代表没有灰雾之上那些神秘空间，也具备对应真神的力量和有相应的高层次特点？克莱恩表面微沉，有微沉，有微沉。



第八十四章 满月的呓语
　　克莱恩刚披上双排扣长礼服，拿起半高丝绸礼帽，往门口走去，忽然听见了层层回荡的虚幻祈求声。
　　谁？他微皱眉头，侧耳倾听了一下，但只能确认祈求者是一位女士，而且嗓音断断续续，似乎蕴藏着极大的痛苦。
　　想着也没什么特别紧要的事情，新晋“魔术师”克莱恩随手一扔，让半高丝绸礼帽又准确无误地挂到了衣帽架上，自身则返回卧室，逆走四步，进入巍峨雄伟的宫殿。
　　这一次，克莱恩又有了