In [1]:
import torch

from transformers import AutoTokenizer, TextDataset, DataCollatorForLanguageModeling, \
      Trainer, TrainingArguments, AutoModelWithLMHead

In [2]:
token_pretrained = "gpt2"
model_pretrained = "robowaifudev/megatron-gpt2-345m"

In [3]:
tokenizer = AutoTokenizer.from_pretrained(token_pretrained)

train_path = 'clean-data/clean-4-28-23.csv'

In [4]:
def load_dataset(train_path, tokenizer):
    train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=train_path,
        block_size=64,
    )

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset, data_collator

In [5]:
train_dataset, data_collator = load_dataset(train_path, tokenizer)

Token indices sequence length is longer than the specified maximum sequence length for this model (128736 > 1024). Running this sequence through the model will result in indexing errors


In [6]:
torch.backends.cuda.matmul.allow_tf32 = True

model = AutoModelWithLMHead.from_pretrained(model_pretrained, torch_dtype=torch.float32)

training_args = TrainingArguments(
    output_dir="./model-output",
    overwrite_output_dir=True,
    num_train_epochs=8,
    per_device_train_batch_size=12,
    save_steps=800,
    tf32=True,
    warmup_steps=500)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset)



In [7]:
trainer.train()



  0%|          | 0/1344 [00:00<?, ?it/s]

{'loss': 4.1467, 'learning_rate': 5e-05, 'epoch': 2.98}
{'loss': 2.8095, 'learning_rate': 2.037914691943128e-05, 'epoch': 5.95}
{'train_runtime': 632.7699, 'train_samples_per_second': 25.425, 'train_steps_per_second': 2.124, 'train_loss': 3.0838504972912015, 'epoch': 8.0}


TrainOutput(global_step=1344, training_loss=3.0838504972912015, metrics={'train_runtime': 632.7699, 'train_samples_per_second': 25.425, 'train_steps_per_second': 2.124, 'train_loss': 3.0838504972912015, 'epoch': 8.0})

In [28]:
input_text = 'omg are you'
capacity = 10

generation_length = len(input_text.split()) * capacity * 2

generation_text = tokenizer.encode(input_text, return_tensors='pt').to("cuda")

response = model.generate(
    input_ids=generation_text, 
    max_length=generation_length, 
    do_sample=True, 
    output_scores=True,
    early_stopping=True,
    num_beams=10,
    no_repeat_ngram_size=4,
    top_k=50, 
    typical_p=0.6,
    temperature=1.75)

response = tokenizer.decode(response[0], skip_special_tokens=True)
response = response.replace("\n", "</EOL> ")

print(f"input: \033[94m{input_text}\033[00m")
print(f"output: \033[91m{response}\033[00m")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


input: [94momg are you[00m
output: [91momg are you?</EOL> Fan good</EOL> if i donate u double the donation</EOL> IF DONATE u ONLY HAVE</EOL> tipped 3 to p</EOL> Fn IF DONATE u only have</EOL> if dont buy any u gonna get an edgar stuck in your skull</EOL> "IF DONATE, YOU[00m


In [29]:
trainer.save_model("model-output/model-4-28-23")

In [30]:
tokenizer.save_pretrained("model-output/model-4-28-23")

('model-output/model-4-28-23\\tokenizer_config.json',
 'model-output/model-4-28-23\\special_tokens_map.json',
 'model-output/model-4-28-23\\vocab.json',
 'model-output/model-4-28-23\\merges.txt',
 'model-output/model-4-28-23\\added_tokens.json',
 'model-output/model-4-28-23\\tokenizer.json')