In [None]:
import torch
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import pipeline

In [2]:
# Load dataset
dataset = load_dataset("wikitext", "wikitext-103-v1")
train_data = dataset['train']['text']

In [3]:
# Create tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # Set padding token

In [4]:
# Encode data
train_encodings = tokenizer(train_data, return_tensors='pt', max_length=4, truncation=True, padding="max_length")

In [5]:
train_encodings

{'input_ids': tensor([[50256, 50256, 50256, 50256],
        [  796,   569, 18354,  7496],
        [50256, 50256, 50256, 50256],
        ...,
        [ 4176,   635, 12007, 47593],
        [  679,  3111,   355,  1486],
        [50256, 50256, 50256, 50256]]), 'attention_mask': tensor([[0, 0, 0, 0],
        [1, 1, 1, 1],
        [0, 0, 0, 0],
        ...,
        [1, 1, 1, 1],
        [1, 1, 1, 1],
        [0, 0, 0, 0]])}

In [6]:
# Create model config and model
config = GPT2Config.from_pretrained('gpt2')
model = GPT2LMHeadModel(config)

In [7]:
# Set device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [8]:
# Create optimizer and data loader
optimizer = AdamW(model.parameters(), lr=5e-5)
train_loader = DataLoader(train_encodings, batch_size=8, shuffle=True)

In [9]:
train_loader.dataset

{'input_ids': tensor([[50256, 50256, 50256, 50256],
        [  796,   569, 18354,  7496],
        [50256, 50256, 50256, 50256],
        ...,
        [ 4176,   635, 12007, 47593],
        [  679,  3111,   355,  1486],
        [50256, 50256, 50256, 50256]]), 'attention_mask': tensor([[0, 0, 0, 0],
        [1, 1, 1, 1],
        [0, 0, 0, 0],
        ...,
        [1, 1, 1, 1],
        [1, 1, 1, 1],
        [0, 0, 0, 0]])}

In [10]:
optimizer

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)

In [22]:
for epoch in range(5):
    model.train()
    total_loss = 0
    for batch in train_loader.dataset:
        a = train_loader.dataset[batch]
        input_ids = train_loader.dataset['input_ids'].to(device)
        attention_mask = train_loader.dataset['attention_mask'].to(device)
        # print(input_ids)
        # print(attention_mask)
        print(batch)

input_ids
attention_mask
input_ids
attention_mask
input_ids
attention_mask
input_ids
attention_mask
input_ids
attention_mask


In [12]:
bk = train_loader.dataset
print(bk)
print(train_loader)
for epoch in range(5):
    model.train()
    total_loss = 0
    for batch in bk:
        if batch == "input_ids":
            batch = bk['input_ids'][0].to(device)
            print(batch)
        if batch == "attention_mask":
            batch = bk['attention_mask'][0].to(device)
            print(batch)

{'input_ids': tensor([[50256, 50256, 50256, 50256],
        [  796,   569, 18354,  7496],
        [50256, 50256, 50256, 50256],
        ...,
        [ 4176,   635, 12007, 47593],
        [  679,  3111,   355,  1486],
        [50256, 50256, 50256, 50256]]), 'attention_mask': tensor([[0, 0, 0, 0],
        [1, 1, 1, 1],
        [0, 0, 0, 0],
        ...,
        [1, 1, 1, 1],
        [1, 1, 1, 1],
        [0, 0, 0, 0]])}
<torch.utils.data.dataloader.DataLoader object at 0x76e276f3f070>
tensor([50256, 50256, 50256, 50256], device='cuda:0')
tensor([0, 0, 0, 0], device='cuda:0')
tensor([50256, 50256, 50256, 50256], device='cuda:0')
tensor([0, 0, 0, 0], device='cuda:0')
tensor([50256, 50256, 50256, 50256], device='cuda:0')
tensor([0, 0, 0, 0], device='cuda:0')
tensor([50256, 50256, 50256, 50256], device='cuda:0')
tensor([0, 0, 0, 0], device='cuda:0')
tensor([50256, 50256, 50256, 50256], device='cuda:0')
tensor([0, 0, 0, 0], device='cuda:0')


In [24]:
for epoch in range(5):
    model.train()
    total_loss = 0
    for batch in train_loader.dataset:

        input_ids = train_loader.dataset['input_ids'][0].to(device)
        print(input_ids)
        attention_mask = train_loader.dataset['attention_mask'][0].to(device)
        print(attention_mask)
        

        labels = input_ids.clone()
        labels[labels == tokenizer.pad_token_id] = -100  # Ignore padding in loss calculation

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch}: Loss {total_loss / len(train_loader)}")

tensor([50256, 50256, 50256, 50256], device='cuda:0')
tensor([0, 0, 0, 0], device='cuda:0')
tensor([50256, 50256, 50256, 50256], device='cuda:0')
tensor([0, 0, 0, 0], device='cuda:0')
Epoch 0: Loss nan
tensor([50256, 50256, 50256, 50256], device='cuda:0')
tensor([0, 0, 0, 0], device='cuda:0')
tensor([50256, 50256, 50256, 50256], device='cuda:0')
tensor([0, 0, 0, 0], device='cuda:0')
Epoch 1: Loss nan
tensor([50256, 50256, 50256, 50256], device='cuda:0')
tensor([0, 0, 0, 0], device='cuda:0')
tensor([50256, 50256, 50256, 50256], device='cuda:0')
tensor([0, 0, 0, 0], device='cuda:0')
Epoch 2: Loss nan
tensor([50256, 50256, 50256, 50256], device='cuda:0')
tensor([0, 0, 0, 0], device='cuda:0')
tensor([50256, 50256, 50256, 50256], device='cuda:0')
tensor([0, 0, 0, 0], device='cuda:0')
Epoch 3: Loss nan
tensor([50256, 50256, 50256, 50256], device='cuda:0')
tensor([0, 0, 0, 0], device='cuda:0')
tensor([50256, 50256, 50256, 50256], device='cuda:0')
tensor([0, 0, 0, 0], device='cuda:0')
Epoch 4:

In [14]:
# Save model
model.save_pretrained('./meuModeloGPT2')

In [15]:
# Create generator pipeline
generator = pipeline('text-generation', model='./meuModeloGPT2', tokenizer='gpt2')

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [16]:
# Generate text
prompt = "Qual é o significado da vida, do universo e tudo mais?"
response = generator(prompt, max_length=50, num_return_sequences=1)
print(response[0]['generated_text'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Qual é o significado da vida, do universo e tudo mais? toll Og Kelley1991 sailriusュ crowizzizzFu civilizedpythonFar Beautifulpher monthfect Einstein skatecknowledAbstractFun Lange clerkPROMess rmolicy Pediatrics movements
