In [2]:
import logging
from transformers import CONFIG_MAPPING, AutoTokenizer, AutoModelWithLMHead, AutoConfig
from transformers import DataCollatorForLanguageModeling, GPT2Config
from transformers import LineByLineTextDataset, TextDataset, Trainer, TrainingArguments

logger = logging.getLogger(__name__)

In [3]:
train_args = TrainingArguments(
    output_dir='C:/Users/bill/documents/projects/data/chatbot/gpt/model',
)
train_args.do_train = True
train_args.do_eval = True
#train_args.fp16 = True
train_args

TrainingArguments(output_dir='C:/Users/bill/documents/projects/data/chatbot/gpt/model', overwrite_output_dir=False, do_train=True, do_eval=True, do_predict=False, evaluate_during_training=False, per_gpu_train_batch_size=8, per_gpu_eval_batch_size=8, gradient_accumulation_steps=1, learning_rate=5e-05, weight_decay=0.0, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, warmup_steps=0, logging_dir=None, logging_first_step=False, logging_steps=500, save_steps=500, save_total_limit=None, no_cuda=False, seed=42, fp16=False, fp16_opt_level='O1', local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False)

In [4]:
model_name = 'gpt2'
train_file = 'C:/Users/bill/documents/projects/data/chatbot/gpt/train.txt'
test_file = 'C:/Users/bill/documents/projects/data/chatbot/gpt/test.txt'

In [6]:
#config = CONFIG_MAPPING['gpt2']()
config = GPT2Config(
    n_layer=2,
    n_head=2,
    n_embd=100,
    vocab_size=30000,
    bos_token_id=29999,
    eos_token_id=29999
)
config
print(config)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelWithLMHead.from_config(config)
model

GPT2Config {
  "activation_function": "gelu_new",
  "attn_pdrop": 0.1,
  "bos_token_id": 29999,
  "embd_pdrop": 0.1,
  "eos_token_id": 29999,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 100,
  "n_head": 2,
  "n_layer": 2,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "vocab_size": 30000
}



GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(30000, 100)
    (wpe): Embedding(1024, 100)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): Laye

In [7]:
config = CONFIG_MAPPING['gpt2']()
#config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

#model = AutoModelWithLMHead.from_config(config)
model = AutoModelWithLMHead.from_pretrained(model_name, config=config)

In [10]:
#model.resize_token_embeddings(len(tokenizer))
#model.resize_token_embeddings(50)
#model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50, 100)
    (wpe): Embedding(1024, 100)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNo

In [5]:
train_ds = TextDataset(
    tokenizer=tokenizer,
    file_path=train_file,
    block_size=tokenizer.max_len
)
eval_ds = TextDataset(
    tokenizer=tokenizer,
    file_path=test_file,
    block_size=tokenizer.max_len
)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    mlm_probability=0.15
)
trainer = Trainer(
    model=model,
    args=train_args,
    data_collator=data_collator,
    train_dataset=train_ds,
    eval_dataset=eval_ds
)

In [6]:
trainer.train(model_path=model_name)

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=180.0, style=ProgressStyle(description_wi…





	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha)


RuntimeError: CUDA out of memory. Tried to allocate 938.00 MiB (GPU 0; 8.00 GiB total capacity; 4.95 GiB already allocated; 324.04 MiB free; 5.90 GiB reserved in total by PyTorch) (malloc at ..\c10\cuda\CUDACachingAllocator.cpp:289)
(no backtrace available)

In [None]:
trainer.save_model()