In [60]:
import pandas as pd
from transformers import AutoTokenizer
import os
from datasets import load_from_disk

In [28]:


tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

text = "A dog is playing with a ball"
tokens = tokenizer(text)

print(tokens["input_ids"])

[101, 1037, 3899, 2003, 2652, 2007, 1037, 3608, 102]


In [29]:
tokenizer

BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [7]:
import json

In [8]:
with open('archive/annotations_trainval2014/annotations/captions_train2014.json', 'r') as f:
    data = json.load(f)

In [9]:
import pandas as pd

In [10]:
images_data = pd.DataFrame(data['images'])
annotation_data = pd.DataFrame(data['annotations'])

In [11]:
data = annotation_data.merge(images_data, how='left', left_on='image_id', right_on='id')

In [12]:
data = data.drop(columns=['id_x', 'license', 'height', 'width', 'date_captured', 'flickr_url','coco_url', 'id_y', 'image_id'])

In [71]:
data = pd.read_csv('processed_caption.csv')

In [65]:
data

Unnamed: 0,caption,file_name
0,A very clean and well decorated empty bathroom,COCO_train2014_000000318556.jpg
1,A panoramic view of a kitchen and all of its a...,COCO_train2014_000000116100.jpg
2,A blue and white bathroom with butterfly theme...,COCO_train2014_000000318556.jpg
3,A panoramic photo of a kitchen and dining room,COCO_train2014_000000116100.jpg
4,A graffiti-ed stop sign across the street from...,COCO_train2014_000000379340.jpg
...,...,...
414108,a slice of bread is covered with a sour cream ...,COCO_train2014_000000133071.jpg
414109,A long plate hold some fries with some sliders...,COCO_train2014_000000410182.jpg
414110,Two women sit and pose with stuffed animals.,COCO_train2014_000000180285.jpg
414111,White Plate with a lot of guacamole and an ext...,COCO_train2014_000000133071.jpg


In [None]:
# convert to huggingface dataset for easy manipulation

In [59]:

load_from_disk('caption_dataset')

DatasetDict({
    train: Dataset({
        features: ['caption', 'file_name', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 414113
    })
})

In [72]:
data['file_name'] = data['file_name'].apply(lambda x: os.path.join('archive/train2014/train2014', x))

In [73]:
data.to_csv('processed_caption.csv', index=False)

In [2]:
import datasets

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import 

In [74]:
data = datasets.load_dataset('csv', data_files='processed_caption.csv')
# data = datasets.load_dataset('caption_dataset')

Generating train split: 414113 examples [00:00, 1402181.45 examples/s]


In [75]:
data['train'][0]

{'caption': 'A very clean and well decorated empty bathroom',
 'file_name': 'archive/train2014/train2014/COCO_train2014_000000318556.jpg'}

In [96]:
data

DatasetDict({
    train: Dataset({
        features: ['caption', 'file_name', 'input_ids', 'attention_mask'],
        num_rows: 414113
    })
})

In [97]:
def tokenize(sample):
    tokens = tokenizer(sample['caption'], padding='max_length', truncation=True, max_length=32, add_special_tokens=False)
    return {'input_ids': tokens['input_ids'], "attention_mask": tokens['attention_mask']}

In [98]:
data = data.map(tokenize, batched=True, num_proc=3, batch_size=5000)

Map (num_proc=3): 100%|██████████| 414113/414113 [00:04<00:00, 95179.25 examples/s] 


In [101]:
len(data['train'][0]['input_ids'])

32

In [102]:
data.save_to_disk('caption_dataset')

PermissionError: Tried to overwrite /Users/abdulvajid/AI/PROJECTS/Image-Caption-Generation-VIT-From-Scratch/caption_dataset/train but a dataset can't overwrite itself.

In [104]:
data

DatasetDict({
    train: Dataset({
        features: ['caption', 'file_name', 'input_ids', 'attention_mask'],
        num_rows: 414113
    })
})

In [105]:
data.save_to_disk('caption_data')

Saving the dataset (1/1 shards): 100%|██████████| 414113/414113 [00:00<00:00, 3791126.14 examples/s]


In [91]:
data = load_from_disk('caption_dataset')

In [106]:
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from model_args import Arguments
from torchvision import transforms
from PIL import Image
import torch
from datasets import load_dataset

args = Arguments()


class ImageCaptionData(Dataset):
    def __init__(self, path='caption_data'):
        super().__init__()
        self.path = path
        self.data = load_from_disk(path)['train']
        self.data.set_format('torch')
        
    
    def __len__(self):
        return self.data.shape[0]
    
    def __getitem__(self, index):
        print(self.data[index])
        return transform(self.data[index]['file_name']), self.data[index]['input_ids']
        

def get_dataloader():
    dataloader = DataLoader(dataset=ImageCaptionData(), batch_size=args.batch_size, num_workers=args.num_workers, shuffle=True, pin_memory=args.pin_memory)
    return dataloader


def transform(img_path):
    transformer = transforms.Compose([
        transforms.ToTensor(),
        transforms.Resize((args.img_size, args.img_size)),
        transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
    ])
    
    
    img = Image.open(img_path)
    transformed_img = transformer(img)
    
    return transformed_img

In [107]:
dataloader = get_dataloader()

In [108]:
for img, label in dataloader:
    print(img.shape)
    print(len(label))
    break

{'caption': 'A man wearing a safety vest performs a maintenance task on a fire hydrant.', 'file_name': 'archive/train2014/train2014/COCO_train2014_000000077533.jpg', 'input_ids': tensor([ 1037,  2158,  4147,  1037,  3808, 17447, 10438,  1037,  6032,  4708,
         2006,  1037,  2543, 26018,  3372,  1012,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])}
{'caption': 'A MAN IS SNOW BOARDING IN THE SNOW JUMPING HIGH', 'file_name': 'archive/train2014/train2014/COCO_train2014_000000016344.jpg', 'input_ids': tensor([1037, 2158, 2003, 4586, 9405, 1999, 1996, 4586, 8660, 2152,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,