In [15]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens({'sep_token': '[SEP]'})
id = tokenizer.convert_tokens_to_ids('[SEP]')

In [14]:
import numpy as np
from torchvision import transforms
from datasets import load_dataset
import torch
from torch.utils.data import DataLoader

ds = load_dataset("alpayariyak/IAM_Sentences")

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

ds = ds["train"].select(range(100))
ds = ds.map(lambda x: {"image": transform(x["image"]), "text": x["text"]})

ds

Dataset({
    features: ['image', 'text'],
    num_rows: 100
})

In [None]:
ds["image"][0]

AttributeError: 'list' object has no attribute 'shape'

In [33]:
import torch
import torch.nn as nn
from transformers import GPT2Model, GPT2Config, GPT2Tokenizer

# Define patch embedding module
class PatchEmbedding(nn.Module):
    def __init__(self, img_size=(224, 224), patch_size=(8, 4), embed_dim=768):
        super(PatchEmbedding, self).__init__()
        self.img_size = img_size
        self.patch_size = patch_size
        self.num_patches = (img_size[0] // patch_size[0]) * (img_size[1] // patch_size[1])
        self.proj = nn.Conv2d(3, embed_dim, kernel_size=patch_size, stride=patch_size)
        self.position_embeddings = nn.Parameter(torch.zeros(1, self.num_patches, embed_dim))

    def forward(self, x):
        # Convert the image to patches
        x = self.proj(x)  # Shape: (batch_size, embed_dim, num_patches_height, num_patches_width)
        x = x.flatten(2).transpose(1, 2)  # Shape: (batch_size, num_patches, embed_dim)
        x = x + self.position_embeddings  # Add positional encoding
        return x

# Define DTrOCR model class
class DTrOCR(nn.Module):
    def __init__(self, embed_dim=768, max_seq_len=2568):
        super(DTrOCR, self).__init__()
        self.patch_embedding = PatchEmbedding(embed_dim=embed_dim).to("mps")

        # Load a pre-trained GPT-2 model as the decoder-only Transformer
        config = GPT2Config(vocab_size=50257, n_positions=max_seq_len, n_embd=embed_dim, n_layer=12, n_head=12)
        self.decoder = GPT2Model(config).to("mps")

        # Define special tokens
        self.sep_token = torch.tensor(tokenizer.convert_tokens_to_ids('[SEP]')).to("mps")
        self.eos_token = torch.tensor(tokenizer.convert_tokens_to_ids('[EOS]')).to("mps")

        # Output layer for generating token probabilities
        self.lm_head = nn.Linear(embed_dim, config.vocab_size, bias=False)

    def forward(self, images, max_length=50):
        # Start with the patch embeddings for the image and the [SEP] token
        x = self.patch_embedding(images.to("mps"))
        
        # Append the [SEP] token embedding to the sequence
        sep_token_embed = self.decoder.wte(self.sep_token).unsqueeze(0).unsqueeze(0)
        x = torch.cat((x, sep_token_embed), dim=1)
        
        generated_tokens = []

        for i in range(max_length):
            print(i)
            # Get the decoder output logits for the current sequence
            outputs = self.decoder(inputs_embeds=x)
            logits = self.lm_head(outputs.last_hidden_state)

            # Select the last token's logits and get the most likely next token
            next_token_logits = logits[:, -1, :]  # Shape: (batch_size, vocab_size)
            next_token = torch.argmax(next_token_logits, dim=-1)  # Shape: (batch_size,)

            # Append the token to the generated sequence
            generated_tokens.append(next_token)

            # Break if the EOS token is generated
            if next_token.item() == self.eos_token.item():
                break

            # Update `x` by appending the embedding of the next token
            next_token_embed = self.decoder.wte(next_token).unsqueeze(1)  # Embed the token
            x = torch.cat((x, next_token_embed), dim=1)  # Append to the sequence

        # Convert list of tokens to a tensor and return
        return torch.stack(generated_tokens, dim=1)

# Model instantiation
model = DTrOCR().to("mps")

# Example inputs
images = torch.randn(1, 3, 224, 224)  # Batch of 2 images, each 224x224 RGB
labels = torch.randint(0, 50257, (1, 20))  # Example labels (batch of 2 sequences)

# Generate text
output = model.forward(images, max_length=3)
tokenizer.decode(output[0].tolist())

ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.

In [13]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import GPT2Tokenizer
from datasets import load_dataset
from PIL import Image
import torchvision.transforms as transforms

# Load the dataset
dataset = load_dataset("alpayariyak/IAM_Sentences")

# Define a custom dataset class
class IAMSentencesDataset(Dataset):
    def __init__(self, dataset, tokenizer, img_size=(224, 224)):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.img_size = img_size
        self.transform = transforms.Compose([
            transforms.Resize(img_size),
            transforms.ToTensor()
        ])

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        # Load image and label
        image = Image.open(self.dataset[idx]['image']).convert('RGB')
        label = self.dataset[idx]['text']

        # Preprocess image
        image = self.transform(image)

        # Tokenize label
        label_ids = self.tokenizer.encode(label, return_tensors='pt').squeeze()

        return image, label_ids

# Initialize tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Create dataset and dataloader
train_dataset = IAMSentencesDataset(dataset['train'], tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Example usage
for images, labels in train_dataloader:
    images = images.to("mps")
    labels = labels.to("mps")
    output = model.forward(images, max_length=50)
    decoded_text = tokenizer.decode(output[0].tolist())
    print(decoded_text)
    break

AttributeError: 'PngImageFile' object has no attribute 'read'