In [8]:
import os
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW

from torchvision import datasets, transforms
from transformers import CLIPProcessor, CLIPModel, AutoTokenizer, AutoModelForCausalLM
import albumentations as A
from pycocotools.coco import COCO
from PIL import Image

In [33]:
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

# Define the dataset and data loader
transform = transforms.Compose([
    transforms.Resize(224),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# Load the CLIP model and processor
# model_name = "openai/clip-vit-base-patch32"
model_name = "EleutherAI/polyglot-ko-1.3b"
# processor = CLIPProcessor.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = CLIPModel.from_pretrained(model_name).to(device)

# Freeze all layers except the projection layer
for name, param in model.named_parameters():
    if "visual" in name and "projection" not in name:
        param.requires_grad = False
    elif "text" in name and "projection" not in name:
        param.requires_grad = False

optimizer = AdamW(
    [
        {'params':model.visual_projection.parameters()},
        {'params':model.text_projection.parameters()}
    ],
    lr=0.0001
    )

You are using a model of type gpt_neox to instantiate a model of type clip. This is not supported for all configurations of models and can yield errors.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of the model checkpoint at EleutherAI/polyglot-ko-1.3b were not used when initializing CLIPModel: ['gpt_neox.layers.16.mlp.dense_4h_to_h.bias', 'gpt_neox.layers.6.post_attention_layernorm.bias', 'gpt_neox.layers.12.attention.dense.weight', 'gpt_neox.layers.6.attention.dense.weight', 'gpt_neox.layers.2.attention.rotary_emb.inv_freq', 'gpt_neox.layers.7.mlp.dense_4h_to_h.weight', 'gpt_neox.layers.21.attention.masked_bias', 'gpt_neox.layers.11.mlp.dense_h_to_4h.bias', 'gpt_neox.layers.23.attention.query_key_value.bias', 'gpt_neox.layers.4.attention.bias', 'gpt_neox.layers.18.attention.bias', 'gpt_neox.layers.17.mlp.dense_h_to_4h.bias', 'gpt_neox.layers.4.input_layernorm.bias', 'gpt_neox.layers.17.attention.dense.bias', 'gpt_neox.layers.11.post_attention_layernorm.weight', 'gpt_neox.final_layer_norm.bias', 'gpt_neox.layers.19.mlp.dense_h_to_4h.weight', 'gpt_neox.layers.11.attention.query_key_value.weight', 'gpt_neox.layers.13.attention.dense.weight', 'gpt_neox.layers.4.mlp.den

In [34]:

class AIHub_data(Dataset):
    """COCO Custom Dataset compatible with torch.utils.data.DataLoader."""
    def __init__(self, root, json,  transform=None, tokenizer=None):
        """Set the path for images, captions and vocabulary wrapper.
        
        Args:
            root: image directory.
            json: coco annotation file path.
            vocab: vocabulary wrapper.
            transform: image transformer.
        """
        self.root = root
        self.coco = COCO(json)
        self.ids = list(self.coco.anns.keys())
        
        self.transform = transform
        self.tokenizer = tokenizer

    def __getitem__(self, index):
        """Returns one data pair (image and caption)."""
        coco = self.coco
        ann_id = self.ids[index]
        caption = coco.anns[ann_id]['caption']
        img_id = coco.anns[ann_id]['image_id']
        path = coco.loadImgs(img_id)[0]['file_name']

        image = Image.open(os.path.join(self.root, path))
        image = image.convert('RGB') if image.mode != 'RGB' else image
        
        if self.transform is not None:
            # image = self.transform(image)
            image = self.transform(image=np.asarray(image))
            image = np.transpose(image['image'],(2, 0, 1))
        

        # Convert caption (string) to word ids.
        # tokens = nltk.tokenize.word_tokenize(str(caption).lower())
        inputs = self.tokenizer(caption, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
        del inputs['token_type_ids']
        for k in ['input_ids', 'attention_mask']:
            inputs[k] = torch.squeeze(inputs[k],0)
        
        inputs['pixel_values'] = image
        
        
        return inputs
    
    def fix_img(self, img):
        return img.convert('RGB') if img.mode != 'RGB' else img

    def __len__(self):
        return len(self.ids)

In [35]:
def get_transforms(mode="train"):
    if mode == "train":
        return A.Compose(
            [
                A.Resize(224, 224, always_apply=True),
                A.Normalize(max_pixel_value=255.0, always_apply=True),
                # A.Transpose((2, 0, 1))
            ]
        )
    else:
        return A.Compose(
            [
                A.Resize(224, 224, always_apply=True),
                A.Normalize(max_pixel_value=255.0, always_apply=True),
                # A.Transpose((2, 0, 1))
            ]
        )
        
def build_loaders(mode, tokenizer):
    transforms = get_transforms(mode=mode)
    dataset = AIHub_data(
        root='/data/aihub/Training/images/',
        json='/data/aihub/Training/labels/labels.json',
        transform=transforms,
        tokenizer=tokenizer)
    
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=4,
        num_workers=0,
        shuffle=True if mode == "train" else False,
        # collate_fn=collate_fn
    )
    return dataloader

In [36]:
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/polyglot-ko-1.3b")

In [37]:
train_loader = build_loaders('train', tokenizer)


loading annotations into memory...
Done (t=0.16s)
creating index...
index created!


In [43]:

# Training loop
model.train()
for epoch in range(10):
    total_loss = 0.0
    for batch, inputs in enumerate(train_loader):
        inputs = {k:v.to('cuda:1') for k, v in inputs.items()}
        # inputs['pixel_values'] = inputs['pixel_values'].to(device)
        

        optimizer.zero_grad()
        # Preprocess images
        
        # Forward pass
        outputs = model(**inputs)
        print('for loop')
        break
        
        batch_size = images.size(0)
        logits_per_image = outputs.logits_per_image

        # Generate random labels for contrastive learning
        targets = torch.arange(batch_size).to(device)

        # Compute loss
        loss = torch.nn.functional.cross_entropy(logits_per_image / 0.1, targets)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        if (batch + 1) % 10 == 0:
            print(f"Epoch [{epoch+1}/{10}], Step [{batch+1}/{len(data_loader)}], Loss: {loss.item():.4f}")

    print(f"Epoch [{epoch+1}/{10}], Total Loss: {total_loss:.4f}")
    break

# Save the trained model
model.save_pretrained("path/to/save/model")

RuntimeError: The size of tensor a (128) must match the size of tensor b (77) at non-singleton dimension 1

In [39]:
inputs.keys()

dict_keys(['input_ids', 'attention_mask', 'pixel_values'])

In [42]:
inputs['pixel_values'].shape

torch.Size([4, 3, 224, 224])