In [12]:
# Step 1: Load necessary libraries and datasets

from transformers import Swinv2ForImageClassification, GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, AutoFeatureExtractor
from datasets import load_dataset
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from PIL import Image 

# Load the dataset
dataset = load_dataset("jxie/flickr8k")

# Step 2: Preprocess the Flickr8k dataset
# This involves loading the images, resizing them, and preprocessing the captions.


# Initialize Swin's feature extractor and GPT-2's tokenizer
swin_feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Image Preprocessing
def process_images(image_path): 
    
    # Convert image to features suitable for Swin
    image_features = swin_feature_extractor(image_path)
    return image_features['pixel_values']

# Text Preprocessing
def process_captions(caption):
    # Tokenize the caption using GPT-2 tokenizer and return input_ids
    return gpt2_tokenizer(caption, return_tensors='pt', padding='max_length', max_length=50, truncation=True).input_ids[0]

# Apply the preprocessing functions to the dataset
dataset = dataset.map(lambda x: {'image': process_images(x['image']), 'input_ids': process_captions(x['caption_0'])}, batched=False)

# Add attention masks for the captions
dataset = dataset.map(lambda x: {'attention_mask': [1] * len(x['input_ids']) + [0] * (50 - len(x['input_ids']))}, batched=False)


# Step 3: Set up the Swin Transformer model

swin_model = Swinv2ForImageClassification.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")

# Remove the classification head to get the image embeddings
swin_model = nn.Sequential(*list(swin_model.children())[:-1]) 

# Step 4: Set up the GPT-2 model

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_config = GPT2Config.from_pretrained("gpt2")
gpt2_model = GPT2LMHeadModel(gpt2_config)

# Step 5: Combine Swin and GPT-2

class ImageCaptioningModel(nn.Module):
    def __init__(self, swin_model, gpt2_model):
        super(ImageCaptioningModel, self).__init__()
        self.swin_model = swin_model
        self.gpt2_model = gpt2_model
        
    def forward(self, images, input_ids, attention_mask):
        # Extract features from images using Swin
        image_features = self.swin_model(images)
        
        # Use the image features as the initial embedding for GPT-2
        gpt2_outputs = self.gpt2_model(input_ids=input_ids, attention_mask=attention_mask, inputs_embeds=image_features)
        return gpt2_outputs.logits

model = ImageCaptioningModel(swin_model, gpt2_model)

# Step 6: Train the combined model
# [Implement training loop here]


Found cached dataset parquet (/home/bintangkevin29/.cache/huggingface/datasets/jxie___parquet/jxie--flickr8k-3226b79d510846ed/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)
100%|██████████| 3/3 [00:00<00:00, 1127.91it/s]
Map:   0%|          | 0/6000 [00:00<?, ? examples/s]Using pad_token, but it is not set yet.
                                                    

ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.

: 

In [None]:
# Continuing from the previous code...

from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, default_data_collator

# Define a custom data_collator
class CustomDataCollator(DataCollatorForSeq2Seq):
    def __init__(self, tokenizer, model):
        super().__init__(tokenizer, model=model, padding=True, max_length=256)
        
    def __call__(self, batch):
        # Separate out the image tensors from the text tensors
        images = torch.stack([item['image'] for item in batch])
        input_ids = [item['input_ids'] for item in batch]
        attention_mask = [item['attention_mask'] for item in batch]

        # Default data collator will handle padding of text sequences
        text_inputs = default_data_collator({'input_ids': input_ids, 'attention_mask': attention_mask})
        
        # Merge the image and text tensors
        batch = {'images': images, 'input_ids': text_inputs['input_ids'], 'attention_mask': text_inputs['attention_mask']}
        return batch

# Instantiate the custom data collator
data_collator = CustomDataCollator(tokenizer, model)

# Training arguments
training_args = Seq2SeqTrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    logging_dir='./logs',
    logging_steps=100,
    save_steps=500,
    save_total_limit=3,
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
    output_dir="./test-model",
    logging_first_step=True, 
    metric_for_best_model="bleu",  # Assuming you'd use BLEU for evaluation, but you can choose another metric
)

# Define training and validation data loaders
train_dataloader = DataLoader(dataset['train'], shuffle=True, batch_size=8, collate_fn=data_collator)
val_dataloader = DataLoader(dataset['validation'], batch_size=8, collate_fn=data_collator)

# Define a metric for evaluation (e.g., BLEU). You can skip this step or modify it based on your requirements.

# Initialize Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    # compute_metrics=compute_metrics  # Uncomment and define compute_metrics if needed
)

# Start training
trainer.train()






IndexError: Invalid key: 5638 is out of bounds for size 0