# Preparing the dataset for unsupervised pre-training

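Models like GPT2 and BERT are pre-trained on large amounts of text data. This is done in an unsupervised (more precisely, self-supervised) manner, meaning that the model is not given any human-written labels: the training targets come from the text itself. A GPT2-style model is given only the text data and learns to predict the next token in a sequence. This is done by maximizing the likelihood of the next token given the previous tokens in the sequence. (BERT is trained differently: it learns to predict masked-out tokens from the surrounding context.)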

For this example we will use a smaller dataset: the IMDB movie reviews.

# Download the dataset

Using HuggingFace's Datasets library, we can download the dataset easily.

In [1]:
# Create a PyTorch dataset for IMDB movie reviews using HuggingFace's datasets library
# https://huggingface.co/datasets/imdb

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import AutoTokenizer, DataCollatorForLanguageModeling
from datasets import load_dataset, concatenate_datasets


# Define the tokenizer
tokenizer = AutoTokenizer.from_pretrained('gpt2')
# Register '[PAD]' as the padding token. This grows the tokenizer's vocabulary by
# one entry, which the model's vocab_size has to account for.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Load the IMDB dataset using HuggingFace's load_dataset
imdb_dataset = load_dataset('imdb')

print(imdb_dataset)

# Select the 'unsupervised' split for pre-training; the 'train' and 'test'
# splits are not used here (nothing is concatenated).
unsupervised_dataset = imdb_dataset['unsupervised']

# Define a function to tokenize the text
def tokenize_function(example):
    """Run the notebook-level `tokenizer` over the 'text' field of `example`.

    `example` is indexed with the key 'text'; whatever the tokenizer returns
    for that value is passed straight back to the caller.
    """
    return tokenizer(example['text'])


# Apply the tokenization to the dataset.
# With batched=True, `map` hands tokenize_function a batch of examples at a time,
# so example['text'] is a list of review strings rather than a single string.
tokenized_imdb_dataset = unsupervised_dataset.map(tokenize_function, batched=True)


Found cached dataset imdb (/home/alex/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /home/alex/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-8cf8377e8c7bb09f.arrow


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [2]:
# Concatenate all reviews into a single flat list of token ids.
# An end-of-text token is appended after every review so that the language model
# can see where one review stops and the next one starts; without a separator the
# sliding windows built later would blend unrelated reviews into one sentence.
# Iterating the 'input_ids' column avoids materialising the other columns of each row.
eos_token_id = tokenizer.eos_token_id
all_text = []
for review_ids in tokenized_imdb_dataset['input_ids']:
    all_text.extend(review_ids)
    all_text.append(eos_token_id)

In [3]:
class IMDBDataset(Dataset):
    """Sliding-window next-token dataset over one flat sequence of token ids.

    Item ``i`` is the pair ``(tokens[i : i + block_size],
    tokens[i + 1 : i + 1 + block_size])``: the labels are the inputs shifted
    one position to the left, so every position is trained to predict the
    token that follows it.

    Args:
        dataset: A sliceable sequence of integer token ids (e.g. a list).
        block_size: Number of tokens in each input/label window.

    Raises:
        ValueError: If ``block_size`` is smaller than 1.
    """

    def __init__(self, dataset, block_size=128):
        if block_size < 1:
            raise ValueError(f"block_size must be at least 1, got {block_size}")
        self.dataset = dataset
        self.block_size = block_size

    def __len__(self):
        # A window needs block_size inputs plus one extra token for the last
        # label. Clamp at zero so a too-short token stream yields an empty
        # dataset instead of a negative (invalid) length.
        return max(0, len(self.dataset) - self.block_size)

    def __getitem__(self, idx):
        """Return ``(input_ids, labels)`` as two ``torch.long`` tensors of length ``block_size``.

        Negative indices count from the end. An out-of-range index raises
        ``IndexError`` rather than silently returning short or mismatched
        tensors.
        """
        num_windows = len(self)
        if idx < 0:
            idx += num_windows
        if not 0 <= idx < num_windows:
            raise IndexError(f"index {idx} is out of range for a dataset of {num_windows} windows")

        stop = idx + self.block_size
        input_ids = torch.tensor(self.dataset[idx:stop], dtype=torch.long)
        labels = torch.tensor(self.dataset[idx + 1:stop + 1], dtype=torch.long)
        return input_ids, labels

# Create the PyTorch dataset
# NOTE(review): this rebinds `imdb_dataset`, which earlier held the HuggingFace
# DatasetDict returned by load_dataset('imdb'). From here on the name refers to
# the sliding-window IMDBDataset built from `all_text`.
imdb_dataset = IMDBDataset(all_text)

In [4]:
import math
import torch
import torch.nn as nn


class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to batch-first embeddings.

    The input to ``forward`` is expected to be shaped
    ``(batch, seq_len, embedding_dim)``, which is the layout the attention
    layers in this notebook unpack. The encoding for sequence position ``p`` is
    added to every batch element at position ``p``.

    Args:
        embedding_dim: Size of the embedding vectors (odd sizes are supported).
        max_seq_len: Number of positions pre-computed and stored in the ``pe``
            buffer. Longer inputs are still handled; their table is computed on
            the fly.
    """

    def __init__(self, embedding_dim, max_seq_len):
        super().__init__()
        # Stored as (max_seq_len, 1, embedding_dim): the buffer keeps the name
        # and shape it always had, so previously saved state dicts still load.
        pe = self._build_table(max_seq_len, embedding_dim).unsqueeze(1)
        self.register_buffer('pe', pe)

    @staticmethod
    def _build_table(num_positions, embedding_dim):
        """Return a ``(num_positions, embedding_dim)`` table of sin/cos encodings."""
        table = torch.zeros(num_positions, embedding_dim)
        position = torch.arange(0, num_positions, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embedding_dim, 2).float() * (-math.log(10000.0) / embedding_dim))

        table[:, 0::2] = torch.sin(position * div_term)
        # With an odd embedding_dim there is one cosine column fewer than sine columns.
        table[:, 1::2] = torch.cos(position * div_term[: embedding_dim // 2])
        return table

    def forward(self, x):
        # Index the table by sequence length (dim 1), not by batch size (dim 0):
        # slicing by x.size(0) would give every token in a sample the same
        # encoding, chosen by the sample's position inside the batch.
        seq_len = x.size(1)
        if seq_len <= self.pe.size(0):
            table = self.pe[:seq_len, 0]
        else:
            # Sequence longer than the stored table: build the encodings on demand
            # rather than failing; the registered buffer is left unchanged.
            table = self._build_table(seq_len, self.pe.size(-1)).to(device=x.device, dtype=self.pe.dtype)
        return x + table.unsqueeze(0)


class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention over batch-first input.

    Args:
        embedding_dim: Size of the input and output feature dimension. Must be
            divisible by ``num_heads``.
        num_heads: Number of attention heads the feature dimension is split into.
        causal: When True (the default), position ``i`` may only attend to
            positions ``<= i``. A next-token language model needs this: without
            the mask each position can read the very token it is trained to
            predict. Pass False for bidirectional attention.

    Raises:
        ValueError: If ``embedding_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self, embedding_dim, num_heads, causal=True):
        super().__init__()
        if embedding_dim % num_heads != 0:
            raise ValueError(
                f"embedding_dim ({embedding_dim}) must be divisible by num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        self.causal = causal
        self.query = nn.Linear(embedding_dim, embedding_dim)
        self.key = nn.Linear(embedding_dim, embedding_dim)
        self.value = nn.Linear(embedding_dim, embedding_dim)
        self.softmax = nn.Softmax(dim=-1)
        self.dropout = nn.Dropout(p=0.1)
        self.out = nn.Linear(embedding_dim, embedding_dim)

    def forward(self, x):
        """Attend over ``x`` of shape ``(batch, seq_len, embedding_dim)``; the output has the same shape."""
        bsz, seq_len, embedding_dim = x.size()
        head_dim = embedding_dim // self.num_heads

        def split_heads(projected):
            # (batch, seq, dim) -> (batch, heads, seq, head_dim)
            return projected.view(bsz, seq_len, self.num_heads, head_dim).transpose(1, 2)

        k = split_heads(self.key(x))
        q = split_heads(self.query(x))
        v = split_heads(self.value(x))

        att_scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(head_dim)
        if self.causal:
            # True above the diagonal marks "future" key positions. The mask is
            # built on the fly (not registered as a buffer) so the module's
            # state dict stays identical to earlier checkpoints.
            future = torch.triu(
                torch.ones(seq_len, seq_len, dtype=torch.bool, device=x.device), diagonal=1
            )
            att_scores = att_scores.masked_fill(future, float('-inf'))
        att_probs = self.softmax(att_scores)
        att_probs = self.dropout(att_probs)

        att_output = torch.matmul(att_probs, v)
        # (batch, heads, seq, head_dim) -> (batch, seq, dim)
        att_output = att_output.transpose(1, 2).contiguous().view(bsz, seq_len, embedding_dim)

        return self.out(att_output)


class FeedForward(nn.Module):
    """Two-layer perceptron applied to the last dimension: Linear -> ReLU -> Linear.

    The first layer maps ``embedding_dim`` to ``hidden_dim``; the second maps
    back to ``embedding_dim``, so the output has the same shape as the input.
    """

    def __init__(self, embedding_dim, hidden_dim):
        super().__init__()
        self.fc1 = nn.Linear(in_features=embedding_dim, out_features=hidden_dim)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=embedding_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)


class TransformerBlock(nn.Module):
    """One transformer layer: self-attention and feed-forward, each with a residual and LayerNorm.

    Normalisation is applied after each residual addition, and dropout
    (p=0.1) is applied once, to the block's final normalised output.
    """

    def __init__(self, embedding_dim, num_heads, hidden_dim):
        super().__init__()
        self.attention = MultiHeadAttention(embedding_dim, num_heads)
        self.norm1 = nn.LayerNorm(embedding_dim)
        self.feedforward = FeedForward(embedding_dim, hidden_dim)
        self.norm2 = nn.LayerNorm(embedding_dim)
        self.dropout = nn.Dropout(p=0.1)

    def forward(self, x):
        # Attention sub-layer: residual connection, then LayerNorm.
        attended = self.norm1(self.attention(x) + x)
        # Feed-forward sub-layer: residual connection, then LayerNorm.
        transformed = self.norm2(self.feedforward(attended) + attended)
        return self.dropout(transformed)


class GPT2(nn.Module):
    """Small GPT-style language model: embedding -> positional encoding -> transformer blocks -> vocabulary projection.

    Args:
        vocab_size: Number of rows in the token embedding and number of output
            logits per position.
        embedding_dim: Width of the token embeddings and of every block.
        num_heads: Attention heads per transformer block.
        hidden_dim: Hidden width of each block's feed-forward network.
        num_layers: Number of stacked transformer blocks.
        max_seq_len: Number of positions handed to the positional encoding.
            Defaults to 512, the value that used to be hard-coded.
    """

    def __init__(self, vocab_size, embedding_dim, num_heads, hidden_dim, num_layers, max_seq_len=512):
        super().__init__()
        self.max_seq_len = max_seq_len
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.positional_encoding = PositionalEncoding(embedding_dim, max_seq_len=max_seq_len)
        self.layers = nn.ModuleList([TransformerBlock(embedding_dim, num_heads, hidden_dim) for _ in range(num_layers)])
        self.fc = nn.Linear(embedding_dim, vocab_size)

    def forward(self, x):
        """Map token ids ``x`` to per-position scores over the vocabulary (last dimension is ``vocab_size``)."""
        hidden = self.embedding(x)
        hidden = self.positional_encoding(hidden)
        for layer in self.layers:
            hidden = layer(hidden)
        # Final linear projection from embedding space to one score per vocabulary entry.
        return self.fc(hidden)

In [18]:
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from transformers import GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling
import pytorch_lightning as pl
import torchmetrics

# define model hyperparameters
# Take the vocabulary size from the tokenizer itself: len(tokenizer) counts the
# base GPT-2 vocabulary plus the padding token added earlier (50257 + 1), so the
# embedding table cannot drift out of sync with the tokenizer.
vocab_size = len(tokenizer)
embedding_dim = 256
num_heads = 8
hidden_dim = 3072
num_layers = 4

# create the data loader (each item is already an (input_ids, labels) pair of equal
# length, so the default collation is sufficient and no data collator is needed)
data_loader = DataLoader(imdb_dataset, batch_size=32, shuffle=True)

class GPT2LightningModule(pl.LightningModule):
    """PyTorch Lightning wrapper that trains the GPT2 model on next-token prediction.

    Args:
        vocab_size, embedding_dim, num_heads, hidden_dim, num_layers:
            Forwarded unchanged to ``GPT2``.
    """

    def __init__(self, vocab_size, embedding_dim, num_heads, hidden_dim, num_layers):
        super().__init__()
        self.model = GPT2(vocab_size, embedding_dim, num_heads, hidden_dim, num_layers)
        self.perp = torchmetrics.Perplexity()

    def forward(self, x):
        return self.model(x)

    def generate(self, x, max_len=512):
        """Sample ``max_len`` new tokens after the prompt ``x``.

        Args:
            x: Prompt token ids of shape ``(batch, prompt_len)``.
            max_len: Number of tokens to append.

        Returns:
            Tensor of shape ``(batch, prompt_len + max_len)`` holding the prompt
            followed by the sampled tokens, on the module's device.
        """
        # Remember the current mode so that calling generate() in the middle of
        # training does not leave the model stuck in eval mode afterwards.
        was_training = self.model.training
        self.model.eval()
        try:
            with torch.no_grad():
                # Put the prompt on the same device as the model's weights.
                generated = x.to(self.device)
                for _ in range(max_len):
                    outputs = self.model(generated)
                    logits = outputs[:, -1, :]
                    probs = F.softmax(logits, dim=-1)
                    # Sample from the predicted distribution (rather than taking the argmax).
                    next_token = torch.multinomial(probs, num_samples=1)
                    generated = torch.cat((generated, next_token), dim=1)
        finally:
            self.model.train(was_training)
        return generated

    def training_step(self, batch, batch_idx):
        """Compute the cross-entropy loss for one ``(inputs, labels)`` batch and log loss and perplexity."""
        inputs, labels = batch
        outputs = self.model(inputs)
        # Read the vocabulary size from the logits instead of a notebook-level
        # global, so the module does not depend on hidden state.
        loss = F.cross_entropy(outputs.reshape(-1, outputs.size(-1)), labels.reshape(-1))
        self.perp(outputs, labels)
        self.log('train_loss', loss)
        self.log('train_perplexity', self.perp, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        return loss

    def configure_optimizers(self):
        return optim.Adam(self.model.parameters(), lr=1e-4)

In [6]:
# Create a trainer and train the model for 1000 steps.
# accelerator='auto' uses the GPU when one is available and falls back to the CPU
# otherwise, so the cell no longer fails on machines without CUDA.
# The module is bound to a name so the trained weights stay reachable after fit().
model = GPT2LightningModule(vocab_size, embedding_dim, num_heads, hidden_dim, num_layers)
trainer = pl.Trainer(accelerator='auto', max_steps=1000)
trainer.fit(model, data_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type       | Params
-------------------------------------
0 | model | GPT2       | 33.1 M
1 | perp  | Perplexity | 0     
-------------------------------------
33.1 M    Trainable params
0         Non-trainable params
33.1 M    Total params
132.576   Total estimated model params size (MB)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_steps=1000` reached.


In [20]:
# Use the trained model to generate text
# NOTE(review): this path points at a checkpoint written by one specific earlier
# training run; adjust it to a checkpoint that exists in your lightning_logs.
checkpoint_path = 'lightning_logs/version_11/checkpoints/epoch=0-step=10000.ckpt'

model = GPT2LightningModule(vocab_size, embedding_dim, num_heads, hidden_dim, num_layers)
# map_location='cpu' lets a checkpoint saved during GPU training load on a machine
# without CUDA. torch.load unpickles the file, so only load checkpoints you trust.
checkpoint = torch.load(checkpoint_path, map_location='cpu')
model.load_state_dict(checkpoint['state_dict'])
model.eval()

# Encode the prompt
prompt = "the movie was good because"
encoded_prompt = tokenizer.encode(prompt, add_special_tokens=False, return_tensors='pt')

# Generate text
generated = model.generate(encoded_prompt, max_len=128)
print(generated.shape)

# Decode the generated text
generated_text = tokenizer.decode(generated[0], skip_special_tokens=True)
print(generated_text)

torch.Size([1, 133])
the movie was good because bad among good film because they ain't help they made why Romero said up i did because unfortunately i guess i know i feel Cannonies i was good movie was fine movieobaados i did i. Maybe i can't bad /><br /><br />iddies i couldn't been improved the movie didn't know why they does not actually have broken didn't degenerated with obviously i can't help they did did. honestly didn't know when i can't beautiful and i didn't know why i always did i simply didn't ever to do with funny at CHO didn't know i didn't been a TV movie wasn't seem he


: 

In [15]:
# Continue training the model.
# ckpt_path makes fit() restore the saved training state before continuing, so
# max_steps=10000 is the total step budget, not 10000 additional steps.
# accelerator='auto' uses the GPU when available and otherwise falls back to the CPU.
trainer = pl.Trainer(accelerator='auto', max_steps=10000)
trainer.fit(model, data_loader, ckpt_path="lightning_logs/version_10/checkpoints/epoch=0-step=2000.ckpt")

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Restoring states from the checkpoint path at lightning_logs/version_10/checkpoints/epoch=0-step=2000.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type       | Params
-------------------------------------
0 | model | GPT2       | 33.1 M
1 | perp  | Perplexity | 0     
-------------------------------------
33.1 M    Trainable params
0         Non-trainable params
33.1 M    Total params
132.576   Total estimated model params size (MB)
Restored all states from the checkpoint at lightning_logs/version_10/checkpoints/epoch=0-step=2000.ckpt
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

  rank_zero_warn(
`Trainer.fit` stopped: `max_steps=10000` reached.
