In [1]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from datasets import load_dataset
import warnings


In [2]:
print(torch.cuda.is_available())
torch.cuda.init()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
warnings.filterwarnings("ignore", message="The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.")

True
cuda


In [5]:
data_file_path = 'corpus/grimms/grimms.txt' # This has all of the story titles/headings removed
data_train_file_path = 'corpus/grimms/grimms_train_data.txt'
data_test_file_path = 'corpus/grimms/grimms_test_data.txt'

gpt_models = ['distilgpt2', 'sshleifer/tiny-gpt2']
gpt_model = 0
tokenizer_max_length=1024
needs_training = True

In [6]:
with open(data_file_path, 'r', encoding='utf-8') as file:
    data = file.readlines()

train_data, test_data = train_test_split(data, test_size=0.2, random_state=0)

with open(data_train_file_path, 'w', encoding='utf-8') as file:
    file.writelines(train_data)
with open(data_test_file_path, 'w', encoding='utf-8') as file:
    file.writelines(test_data)

In [7]:
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir="./logs")

def custom_logging_callback(log_args):
    global_step = log_args["global_step"]
    loss = log_args["loss"]

    # Log loss to TensorBoard
    writer.add_scalar("Loss", loss, global_step)

In [9]:
tokenizer = GPT2Tokenizer.from_pretrained(gpt_models[gpt_model])

# Set the padding token
tokenizer.pad_token = tokenizer.eos_token

dataset = load_dataset('text', data_files={'train': data_train_file_path, 'test': data_test_file_path})
# Tokenize the text data
tokenized_datasets = dataset.map(lambda examples: tokenizer(examples['text'], padding='max_length', truncation=True, max_length=tokenizer_max_length, return_tensors='pt'), batched=True)

# Generate labels from tokenized data
def generate_labels(examples):
    return {"labels": examples["input_ids"]}

tokenized_datasets = tokenized_datasets.map(generate_labels, batched=True)

Downloading and preparing dataset text/default to C:/Users/adamm/.cache/huggingface/datasets/text/default-7cbe681dfb4bd9f9/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset text downloaded and prepared to C:/Users/adamm/.cache/huggingface/datasets/text/default-7cbe681dfb4bd9f9/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/7595 [00:00<?, ? examples/s]

Map:   0%|          | 0/1899 [00:00<?, ? examples/s]

Map:   0%|          | 0/7595 [00:00<?, ? examples/s]

Map:   0%|          | 0/1899 [00:00<?, ? examples/s]

In [10]:
# Initialize model and args
model = GPT2LMHeadModel.from_pretrained(gpt_models[gpt_model])
batch_multiplier = ((gpt_model + 1)**2)  # This is a mathy hack to adjust batch params depending on which of the two models are used

training_args = TrainingArguments(
    output_dir="./results",          # Directory for saving model checkpoints and results
    overwrite_output_dir=True,       # Overwrite the output directory if it exists
    num_train_epochs=2,             # Number of training epochs
    per_device_train_batch_size=4 * batch_multiplier,   # Batch size per device
    gradient_accumulation_steps=8 * batch_multiplier,
    per_device_eval_batch_size=4 * batch_multiplier,    # Batch size for evaluation per device
    warmup_steps=50,               # Number of warmup steps for the learning rate scheduler
    # save_steps=200,                 # Save model checkpoints every N steps
    save_total_limit=10,             # Limit the number of saved checkpoints
    evaluation_strategy="epoch",    # Evaluation strategy (steps or epoch)
    save_strategy="epoch",
    # eval_steps=200,                 # Evaluate every N steps
    load_best_model_at_end=True,    # Load the best model checkpoint at the end of training
    metric_for_best_model="eval_loss",  # Metric to monitor for early stopping
    logging_dir="./logs",           # Directory for logs
    logging_steps=100,              # Log metrics every N steps
    log_level="info",
    report_to="tensorboard",        # Report metrics to TensorBoard
    run_name="run01",          # Name of the run
    # accelerator='ddp',
    save_safetensors=True,
)

In [11]:
# Train model
torch.cuda.empty_cache()

if needs_training:
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        # callbacks=[custom_logging_callback],
    )
    
    trainer.train()

The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 7,595
  Num Epochs = 2
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 8
  Total optimization steps = 474
  Number of trainable parameters = 81,912,576


  0%|          | 0/474 [00:00<?, ?it/s]

{'loss': 2.8664, 'learning_rate': 5e-05, 'epoch': 0.21}
{'loss': 0.0622, 'learning_rate': 4.410377358490566e-05, 'epoch': 0.42}
{'loss': 0.0557, 'learning_rate': 3.820754716981133e-05, 'epoch': 0.63}
{'loss': 0.0546, 'learning_rate': 3.2311320754716986e-05, 'epoch': 0.84}


The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1899
  Batch size = 4


  0%|          | 0/475 [00:00<?, ?it/s]

Saving model checkpoint to ./results\checkpoint-237
Configuration saved in ./results\checkpoint-237\config.json
Configuration saved in ./results\checkpoint-237\generation_config.json
Model weights saved in ./results\checkpoint-237\pytorch_model.bin


{'eval_loss': 0.05196129158139229, 'eval_runtime': 42.806, 'eval_samples_per_second': 44.363, 'eval_steps_per_second': 11.097, 'epoch': 1.0}


KeyboardInterrupt: 

In [12]:
# Run this to load the trained model and tokenizer if not autoloaded after training completion
torch.cuda.empty_cache()
model = GPT2LMHeadModel.from_pretrained("./results/checkpoint-237")  # Path to your chosen checkpoint
model = model.to('cuda')
tokenizer = GPT2Tokenizer.from_pretrained(gpt_models[gpt_model])  # Use the same tokenizer as during training

loading configuration file ./results/checkpoint-237\config.json
Model config GPT2Config {
  "_name_or_path": "distilgpt2",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dt

In [15]:
# Input text for generating continuation
prompts = ["Once upon a time, a clever orphan stole a magic dagger from an evil wizard. The child ran into a haunted forest and ",
           "A very strange thing happened one day as the baker woke up and began her day. She saw outside that "
]

temperature = 0.85

# Initialize an empty list to store prompts, parameters, and answers
prompt_params_answers = []

# Iterate through each input text
for prompt in prompts:  # Use the prompts list directly
    # Tokenize the input text
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    input_ids = input_ids.to('cuda')

    # Generate text continuation
    output = model.generate(
        input_ids,
        min_length=128,
        max_length=256,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        temperature=temperature,
        do_sample=True,
        no_repeat_ngram_size=10,  # Adjust the value as needed
    )

    # Decode and print the generated text
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # Store the prompt, parameters, and generated answer as a tuple
    prompt_params_answers.append((prompt, model.config, generated_text))

# Define a file path to save the results
file_path = "prompt_answer_params.txt"

# Save the prompts, parameters, and answers to a file
with open(file_path, "a", encoding="utf-8") as file:
    for prompt, params, answer in prompt_params_answers:
        file.write(f"Prompt:\n{prompt}\n\n"
                    f"Generated Answer:\n{answer}\n\n"
                    f"Model Parameters:\n{params}\n\n")
        print(f"{answer}\n")

print(f"Results saved to {file_path}")

Generate config GenerationConfig {
  "_from_model_config": true,
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "transformers_version": "4.32.1"
}

Generate config GenerationConfig {
  "_from_model_config": true,
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "transformers_version": "4.32.1"
}



Once upon a time, a clever orphan stole a magic dagger from an evil wizard. The child ran into a haunted forest and iced it out as a fairy, and he soon found a gold unicorn, all of which belonged to the fairy, who was in the middle of a gold ring, he said: ‘Yes, I are going to pick up your golden, and you can put it in the bottle of water.’ ‘Goodbye,’ said the prince, ‘that you are the best man, that you have a gift that can be bought now.’ ‘I must have it.’

A very strange thing happened one day as the baker woke up and began her day. She saw outside that urn was the same as this: at first she was not pleased, but when she was so tired that the baker had a cat, she asked: ‘What is she going to eat?’ So the cook told her: ‘I am not too much,’ in which she said: ‘you are going to die.’ But when she was quite tired, the cook told her: ‘I am so thirsty, I cannot not be thirsty, so I have a beautiful horse, and I will eat

Results saved to prompt_answer_params.txt
