In [1]:
import re
import json
import torch

from transformers import AutoTokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments, AutoModelWithLMHead, pipeline

from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
token_pretrained = "gpt2"
model_pretrained = "robowaifudev/megatron-gpt2-345m"

In [3]:
tokenizer = AutoTokenizer.from_pretrained(token_pretrained)

train_path = 'clean-data/clean-4-14-23.csv'

In [4]:
def load_dataset(train_path, tokenizer):
    train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=train_path,
        block_size=64,
    )

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset, data_collator

In [5]:
train_dataset, data_collator = load_dataset(train_path, tokenizer)



In [6]:
torch.backends.cuda.matmul.allow_tf32 = True

model = AutoModelWithLMHead.from_pretrained(model_pretrained, torch_dtype=torch.float32)

training_args = TrainingArguments(
    output_dir="./model-output",
    overwrite_output_dir=True,
    num_train_epochs=8,
    per_device_train_batch_size=12,
    save_steps=800,
    tf32=True,
    warmup_steps=500)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset)



In [7]:
trainer.train()

 48%|â–ˆâ–ˆâ–ˆâ–ˆâ–Š     | 500/1032 [04:29<04:47,  1.85it/s]

{'loss': 3.9802, 'learning_rate': 5e-05, 'epoch': 3.88}


 97%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–‹| 1000/1032 [09:18<00:17,  1.88it/s]

{'loss': 2.3689, 'learning_rate': 3.007518796992481e-06, 'epoch': 7.75}


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1032/1032 [09:36<00:00,  1.79it/s]

{'train_runtime': 576.5044, 'train_samples_per_second': 21.44, 'train_steps_per_second': 1.79, 'train_loss': 3.1315339184546653, 'epoch': 8.0}





TrainOutput(global_step=1032, training_loss=3.1315339184546653, metrics={'train_runtime': 576.5044, 'train_samples_per_second': 21.44, 'train_steps_per_second': 1.79, 'train_loss': 3.1315339184546653, 'epoch': 8.0})

In [8]:
input_text = 'hi, '
capacity = 2

generation_length = len(input_text.split()) * capacity * 2

generation_text = tokenizer.encode(input_text, return_tensors='pt').to("cuda")

response = model.generate(
    input_ids=generation_text, 
    max_length=generation_length, 
    do_sample=True, 
    output_scores=True,
    early_stopping=True,
    num_beams=10,
    no_repeat_ngram_size=4,
    top_k=50, 
    top_p=0.95, 
    temperature=0.99)

response = tokenizer.decode(response[0], skip_special_tokens=True)


response = response.replace("\n", "</EOL> ")

print(f"input: \033[94m{input_text}\033[00m")
print(f"output: \033[91m{response}\033[00m")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


input: [94mhi, [00m
output: [91mhi,?"[00m


In [9]:
trainer.save_model("model-output/model-4-21-23")

In [10]:
tokenizer.save_pretrained("model-output/model-4-21-23")

('model-output/model-4-21-23\\tokenizer_config.json',
 'model-output/model-4-21-23\\special_tokens_map.json',
 'model-output/model-4-21-23\\vocab.json',
 'model-output/model-4-21-23\\merges.txt',
 'model-output/model-4-21-23\\added_tokens.json',
 'model-output/model-4-21-23\\tokenizer.json')