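References:

- Kaggle notebook "Text generation with HuggingFace GPT-2": https://www.kaggle.com/code/tuckerarrants/text-generation-with-huggingface-gpt2
- Hugging Face Transformers GPT-2 documentation: https://huggingface.co/docs/transformers/model_doc/gpt2
- `gpt2-medium` model card on the Hugging Face Hub: https://huggingface.co/openai-community/gpt2-medium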


In [1]:
# Mount Google Drive at /content/drive (Colab only); the cells below copy
# their inputs from, and write their results to, paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!cp /content/drive/MyDrive/TFM-MUECIM/*.py /content
!cp /content/drive/MyDrive/TFM-MUECIM/*.txt /content
!cp /content/drive/MyDrive/TFM-MUECIM/50LabelsGPT2DataFrameForTextGen.csv /content
!cd /content/drive/MyDrive/TFM-MUECIM


In [3]:
!pip install transformers
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 491.2/491.2 kB 10.4 MB/s eta 0:00:00
Downloading dill-0.3.8-py3-none-any.whl (116 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.3/116.3 kB 11.1 MB/s eta 0:00:00
Downloading fsspec-2024.12.0-py3-none-any.

In [4]:
import os
import shutil
import torch
from datetime import datetime
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel, set_seed
from transformers import Trainer, TrainingArguments
from datasets import load_dataset

Background reading — "The Illustrated GPT-2" by Jay Alammar: https://jalammar.github.io/illustrated-gpt2/


In [5]:
def tokenize_function(trainDataFrame, tokenizer=None):
  """Tokenize the ``'text'`` entry of a batch for GPT-2 fine-tuning.

  Args:
    trainDataFrame: Mapping with a ``'text'`` entry holding the text(s) to
      encode, e.g. the batch dict handed over by
      ``Dataset.map(..., batched=True)``.
    tokenizer: Optional ready-to-use tokenizer. It is called as-is, so it must
      already have a padding token configured. Passing one avoids reloading
      the tokenizer for every batch and lets the caller pick the checkpoint.
      When omitted, a ``'gpt2-medium'`` ``GPT2Tokenizer`` is loaded and its
      EOS token is reused as padding token.

  Returns:
    The result of calling the tokenizer on the texts with
    ``padding="max_length"`` and ``truncation=True``.
  """
  if tokenizer is None:
    # Backward-compatible default: load the stock gpt2-medium tokenizer.
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
    # padding="max_length" needs a pad token; reuse EOS for that purpose.
    tokenizer.pad_token = tokenizer.eos_token
  return tokenizer(trainDataFrame['text'], padding="max_length", truncation=True)

def load_data_collator(tokenizer, mlm = False):
    """Create the language-modeling batch collator for ``tokenizer``.

    Args:
        tokenizer: Tokenizer handed to the collator.
        mlm: Forwarded unchanged to ``DataCollatorForLanguageModeling``;
            defaults to ``False``.

    Returns:
        The newly constructed ``DataCollatorForLanguageModeling``.
    """
    collator_kwargs = {'tokenizer': tokenizer, 'mlm': mlm}
    return DataCollatorForLanguageModeling(**collator_kwargs)

def train(train_file_path,
          model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          last_epoch_trained,
          num_train_epochs,
          save_steps):
  """Fine-tune a GPT-2 language model on the ``text`` column of a CSV file.

  Args:
    train_file_path: Path of the CSV file loaded with ``load_dataset('csv')``.
    model_name: Checkpoint name used for the tokenizer and, on a fresh run,
      for the initial model weights.
    output_dir: Directory for the tokenizer files, the initial model and the
      ``Trainer`` output. When ``last_epoch_trained > 0`` the model is
      reloaded from here instead of from ``model_name``.
    overwrite_output_dir: Forwarded to ``TrainingArguments``.
    per_device_train_batch_size: Forwarded to ``TrainingArguments``.
    last_epoch_trained: Number of epochs already trained; ``0`` starts from
      the pretrained checkpoint. Also used in the snapshot file name.
    num_train_epochs: Forwarded to ``TrainingArguments``.
    save_steps: Forwarded to ``TrainingArguments``.

  Returns:
    The fine-tuned ``GPT2LMHeadModel``.

  Side effects:
    Writes the final model to ``GPT2_trained_model_202504-5epochs-medium``
    (relative to the working directory), pickles the model to a dated ``.pt``
    file in the working directory and copies that file to
    ``/content/drive/MyDrive/TFM-MUECIM``.
  """
  # Directory (relative to the working directory) receiving the final model.
  export_dir = 'GPT2_trained_model_202504-5epochs-medium'

  tokenizer = GPT2Tokenizer.from_pretrained(model_name)
  # Set the pad token BEFORE saving so the tokenizer files written to
  # output_dir carry it as well; the collator shares this tokenizer object.
  tokenizer.pad_token = tokenizer.eos_token
  tokenizer.save_pretrained(output_dir)
  data_collator = load_data_collator(tokenizer)

  # NOTE(review): tokenize_function loads its own 'gpt2-medium' tokenizer by
  # default, independently of model_name - keep the two in sync.
  train_dataset = load_dataset('csv', data_files=train_file_path)
  train_dataset = train_dataset.map(tokenize_function, batched=True)
  # Only the token ids are kept; the collator derives the labels from them.
  train_dataset = train_dataset['train'].select_columns(['input_ids'])

  if last_epoch_trained > 0:
    # Continue from the model stored by a previous session.
    model = GPT2LMHeadModel.from_pretrained(output_dir)
  else:
    model = GPT2LMHeadModel.from_pretrained(model_name)
    model.save_pretrained(output_dir)

  device = 'cuda' if torch.cuda.is_available() else 'cpu'
  # Move the model to the correct device before training.
  model.to(device)

  training_args = TrainingArguments(
          output_dir=output_dir,
          report_to = 'none',
          overwrite_output_dir=overwrite_output_dir,
          per_device_train_batch_size=per_device_train_batch_size,
          num_train_epochs=num_train_epochs,
          # Previously accepted by train() but silently ignored.
          save_steps=save_steps,
      )

  trainer = Trainer(
          model=model,
          args=training_args,
          data_collator=data_collator,
          train_dataset=train_dataset
  )

  trainer.train()
  # One export is enough: the former extra model.save_pretrained() into the
  # same directory only rewrote the files produced here.
  trainer.save_model(output_dir=export_dir)

  baseDir = '.'
  prefixDate = datetime.today().strftime('%Y%m%d')
  # NOTE(review): the suffix is last_epoch_trained + 1 even though this call
  # trains num_train_epochs epochs - confirm the intended naming.
  fileName = f'{prefixDate}_50L_tfm_GPT2_medium_model_epoch_{last_epoch_trained + 1}.pt'
  modelFullPath = os.path.join(baseDir,fileName)
  drivePath = '/content/drive/MyDrive/TFM-MUECIM'
  destFullPath = os.path.join(drivePath,fileName)
  print('Save model after epoch train session')
  # Pickles the whole model object (not just a state_dict), then backs the
  # file up to Drive.
  torch.save(model, modelFullPath)
  shutil.copyfile(modelFullPath, destFullPath)

  return model

In [6]:
# Training configuration: input CSV, base checkpoint and Trainer settings.
dataFrame = '50LabelsGPT2DataFrameForTextGen.csv'
train_file_path = f'/content/{dataFrame}'
model_name = 'gpt2-medium'
output_dir = '/content/result'
overwrite_output_dir = False

per_device_train_batch_size = 4
num_train_epochs = 5
save_steps = 500

# Release cached GPU memory left over from earlier runs in this session.
torch.cuda.empty_cache()

# Fresh run (last_epoch_trained=0). The epoch count now comes from the
# configuration variable above instead of a duplicated literal.
model = train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    last_epoch_trained=0,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/28938 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
500,0.9033
1000,0.7392
1500,0.6868
2000,0.652
2500,0.5927
3000,0.6032
3500,0.572
4000,0.5719
4500,0.5671
5000,0.5541


Save model after epoch train session


In [7]:
!tar czf GPT2_trained_model_202504-5epochs-medium.tar.gz GPT2_trained_model_202504-5epochs-medium
!cp GPT2_trained_model_202504-5epochs-medium.tar.gz /content/drive/MyDrive/TFM-MUECIM