In [3]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load pre-trained model and tokenizer
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
# Add a padding token to the tokenizer
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [1]:
!pip install datasets
!pip install transformers[torch]
!pip install accelerate -U

Collecting accelerate>=0.21.0 (from transformers[torch])
  Using cached accelerate-0.32.1-py3-none-any.whl (314 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch->transformers[t

In [4]:
from datasets import load_dataset

# Load your custom dataset
dataset = load_dataset('text', data_files={'train': '/content/drive/MyDrive/fairy_tales.txt'})

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

In [5]:
# Set up data collator
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [6]:
# Define training arguments
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
)

In [7]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [8]:
# Fine-tune the model
trainer.train()

Step,Training Loss


TrainOutput(global_step=15, training_loss=2.485309600830078, metrics={'train_runtime': 460.211, 'train_samples_per_second': 0.059, 'train_steps_per_second': 0.033, 'total_flos': 14109769728000.0, 'train_loss': 2.485309600830078, 'epoch': 3.0})

In [9]:
# Save the fine-tuned model
model.save_pretrained('./fine-tuned-gpt2')
tokenizer.save_pretrained('./fine-tuned-gpt2')

('./fine-tuned-gpt2/tokenizer_config.json',
 './fine-tuned-gpt2/special_tokens_map.json',
 './fine-tuned-gpt2/vocab.json',
 './fine-tuned-gpt2/merges.txt',
 './fine-tuned-gpt2/added_tokens.json')

In [10]:
# Generate text using the fine-tuned model
from transformers import pipeline

model = GPT2LMHeadModel.from_pretrained('./fine-tuned-gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('./fine-tuned-gpt2')
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Once upon a time, a man named "Old Kefir" was living on a mountain in an unforgiving wilderness. He was an avid hunter and fisherman. When he awoke in the morning, he discovered a young young girl, beautiful and beautiful. The two had a close relationship and he could tell from her looks that she knew nothing of the world beyond the mountains. When it arrived, he found a young girl in the far forest, her head tucked inside a blanket. When asked whether she


In [11]:
prompt = "Avengers"
generated_text = generator(prompt, max_length=100, num_return_sequences=1)

print(generated_text[0]['generated_text'])

Avengers: Infinity Wars

Main article: Avengers: Infinity Wars

Tony Stark is back from a long, long journey to find and rescue kidnapped princess Gamora and a kidnapped blue dragon. It became deadly, and so were more of Tony's secrets. As the new heroes battled the evil Avengers, Stark used his own magical powers to find and save Gamora, who was taken captive by the powerful Mjolnir. When the Mjolnir destroyed its stronghold, the heroes


In [16]:
prompt = "when i was young"
generated_text = generator(prompt, max_length=100, num_return_sequences=1)

print(generated_text[0]['generated_text'])

when i was young…i was a pretty good student. I liked the music. I thought it was a little weird at first…but when my first love, the other man, was really into it, i was like…what's with all these different things like these things where it's a group of lovebirds in a forest and i'm like…they're all…sick and you have a lot of bad luck because everyone else lives off of them. And i had to just kind
