In [1]:
import re
import json
import torch

from transformers import AutoTokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments, AutoModelWithLMHead, pipeline

from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
token_pretrained = "gpt2"
model_pretrained = "robowaifudev/megatron-gpt2-345m"

In [3]:
tokenizer = AutoTokenizer.from_pretrained(token_pretrained)
gpt2_tokenizer = AutoTokenizer.from_pretrained(token_pretrained)

data_path = 'data-generation/data.txt'

In [4]:
def load_dataset(train_path, tokenizer):
    train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=data_path,
        block_size=64,
    )

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset, data_collator

In [5]:
train_dataset, data_collator = load_dataset(data_path, tokenizer)



In [6]:
torch.backends.cuda.matmul.allow_tf32 = True

model = AutoModelWithLMHead.from_pretrained(model_pretrained, torch_dtype=torch.float32).to("cuda")

training_args = TrainingArguments(
    output_dir="./model-output",
    overwrite_output_dir=True,
    num_train_epochs=8,
    per_device_train_batch_size=12,
    save_steps=800,
    tf32=True,
    warmup_steps=500)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset)



In [7]:
trainer.train()

 26%|██▌       | 500/1952 [02:34<07:26,  3.25it/s]

{'loss': 3.4186, 'learning_rate': 5e-05, 'epoch': 2.05}


 51%|█████     | 1000/1952 [05:34<04:53,  3.24it/s] 

{'loss': 2.3193, 'learning_rate': 3.278236914600551e-05, 'epoch': 4.1}


 77%|███████▋  | 1500/1952 [08:09<02:20,  3.21it/s]

{'loss': 1.3542, 'learning_rate': 1.5564738292011018e-05, 'epoch': 6.15}


100%|██████████| 1952/1952 [10:55<00:00,  2.98it/s]

{'train_runtime': 656.01, 'train_samples_per_second': 35.658, 'train_steps_per_second': 2.976, 'train_loss': 2.0199268528672514, 'epoch': 8.0}





TrainOutput(global_step=1952, training_loss=2.0199268528672514, metrics={'train_runtime': 656.01, 'train_samples_per_second': 35.658, 'train_steps_per_second': 2.976, 'train_loss': 2.0199268528672514, 'epoch': 8.0})

In [8]:
input_text = 'Magnetic declination is' # (good) top_k=1
# input_text = 'The international geomagnetic reference model is' # (hallucinations) top_k=1
# input_text = 'In a total solar eclipse,' # (good) top_k=5 
# input_text = 'What is a spherical harmonic model?' # (good) top_k=1, typical_p=0.8
# input_text = "A compass needle points towards" (bad)

capacity = 15

generation_length = len(input_text.split()) * capacity * 2

generation_text = tokenizer.encode(input_text, return_tensors='pt').to("cuda")

response = model.generate(
    input_ids=generation_text, 
    max_length=generation_length, 
    do_sample=True, 
    early_stopping=True,
    num_beams=10,
    no_repeat_ngram_size=4,
    top_k=1, 
    typical_p=0.8,
    temperature=.8)

response = tokenizer.decode(response[0], skip_special_tokens=True)
response = response.replace("\n", "</EOL> ")

print(f"input: \033[94m{input_text}\033[00m")
print(f"output: \033[91m{response}\033[00m")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


input: [94mMagnetic declination is[00m
output: [91mMagnetic declination is the angle between magnetic north and true north. Declination is positive when this angle is east of true north and negative when it is west. Magnetic declination values are given in hours, minutes, and seconds. The Magsat satellite provides hourly and/or second-by-second values for the declination angle.,</EOL>  Magnetic declination is also shown on maps and charts as a vertical line that extends into local or virtual Geneva[00m


In [9]:
trainer.save_model("model-output/model-4-21-23")

In [10]:
tokenizer.save_pretrained("model-output/model-4-21-23")

('model-output/model-4-21-23\\tokenizer_config.json',
 'model-output/model-4-21-23\\special_tokens_map.json',
 'model-output/model-4-21-23\\vocab.json',
 'model-output/model-4-21-23\\merges.txt',
 'model-output/model-4-21-23\\added_tokens.json',
 'model-output/model-4-21-23\\tokenizer.json')