In [1]:
import torch.cuda
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GPT2LMHeadModel
from torch import nn
from datasets import load_dataset
import datasets as ds
import pyarrow as pa
import torch, gc
gc.collect()
torch.cuda.empty_cache()

In [2]:
class GPT2Trainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

In [3]:
tokenizer = AutoTokenizer.from_pretrained("gpt2", use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained("gpt2")

In [4]:
train_path = './data/guide_context/'

train_names = ["FootballCommentariesGoalsTrain.jsonl", "FootballCommentariesRedCardsTrain.jsonl", "FootballCommentariesSubsTrain.jsonl", "FootballCommentariesYellowCardsTrain.jsonl"]

data_files_train = [train_path + file_name for file_name in train_names]

dataset = load_dataset('json', data_files=data_files_train).shuffle(42)

Using custom data configuration default-746aea814430b375
Reusing dataset json (/home/adria/.cache/huggingface/datasets/json/default-746aea814430b375/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached shuffled indices for dataset at /home/adria/.cache/huggingface/datasets/json/default-746aea814430b375/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde/cache-2fa71f2e0e83a150.arrow


In [5]:
def tokenize_function(dataset_batch):
    return tokenizer(dataset_batch["concepts"], dataset_batch["target"], padding="max_length", max_length=128, truncation=True)
    

In [6]:
def change_dataset(example):
  example['labels'] = example['input_ids'].copy()
  return example

In [7]:
dataset1 = dataset["train"].train_test_split(test_size=0.4, train_size=0.6)
dataset2 = dataset1["test"].train_test_split(test_size=0.5, train_size=0.5)
train_dataset = dataset1["train"]
eval_dataset = dataset2["train"]
test_dataset = dataset2["test"]

Loading cached split indices for dataset at /home/adria/.cache/huggingface/datasets/json/default-746aea814430b375/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde/cache-3d58c291e1bc29fa.arrow and /home/adria/.cache/huggingface/datasets/json/default-746aea814430b375/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde/cache-6fb8e58eb234774a.arrow


In [8]:
train_data_tokenized = train_dataset.map(tokenize_function, batched=True, remove_columns=["concepts"])
valid_data_tokenized = eval_dataset.map(tokenize_function, batched=True, remove_columns=["concepts"])
train_data_tokenized = train_data_tokenized.map(change_dataset, batched=True, remove_columns=["target"])
valid_data_tokenized = valid_data_tokenized.map(change_dataset, batched=True, remove_columns=["target"])

Loading cached processed dataset at /home/adria/.cache/huggingface/datasets/json/default-746aea814430b375/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde/cache-2e1ec19d2d87559c.arrow


  0%|          | 0/2 [00:00<?, ?ba/s]

Loading cached processed dataset at /home/adria/.cache/huggingface/datasets/json/default-746aea814430b375/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde/cache-7920a74cdae9bd2a.arrow


  0%|          | 0/2 [00:00<?, ?ba/s]

In [9]:
training_args = TrainingArguments(
    output_dir="./GPT2Finetuned8batch",
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./GPT2Finetuned8batch/logs",
    logging_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
 )

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data_tokenized,
    eval_dataset=valid_data_tokenized,
)

In [10]:
model.to('cuda')
res = trainer.train()

***** Running training *****
  Num examples = 3542
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 1100


Epoch,Training Loss,Validation Loss
0,1.1826,0.658675
1,0.6598,0.55643
2,0.5734,0.505976
3,0.5258,0.476723
4,0.4916,0.456934
5,0.4693,0.441646
6,0.4518,0.431614
7,0.4407,0.424966
8,0.4319,0.421328
9,0.4255,0.42001


***** Running Evaluation *****
  Num examples = 1181
  Batch size = 8
Saving model checkpoint to ./GPT2Finetuned8batch/checkpoint-110
Configuration saved in ./GPT2Finetuned8batch/checkpoint-110/config.json
Model weights saved in ./GPT2Finetuned8batch/checkpoint-110/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1181
  Batch size = 8
Saving model checkpoint to ./GPT2Finetuned8batch/checkpoint-220
Configuration saved in ./GPT2Finetuned8batch/checkpoint-220/config.json
Model weights saved in ./GPT2Finetuned8batch/checkpoint-220/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1181
  Batch size = 8
Saving model checkpoint to ./GPT2Finetuned8batch/checkpoint-330
Configuration saved in ./GPT2Finetuned8batch/checkpoint-330/config.json
Model weights saved in ./GPT2Finetuned8batch/checkpoint-330/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1181
  Batch size = 8
Saving model checkpoint to ./GPT2Finetuned8batch/checkpoint-440
Configuration 