In [2]:
import torch
from transformers import CLIPModel, CLIPProcessor, AdamW
from torchvision import transforms

In [2]:
# Pre-trained CLIP ViT-B/32: the model and its matching processor come from the same checkpoint.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"

clip_model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
# NOTE: despite its name, `tokenizer` is a full CLIPProcessor; later cells only feed it text.
tokenizer = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

In [6]:
# Normalization statistics used when the OpenAI CLIP checkpoints were pre-trained.
# The ImageNet statistics used previously shift the input distribution away from
# what the pre-trained vision tower expects.
CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
CLIP_STD = [0.26862954, 0.26130258, 0.27577711]

# Augmentation + preprocessing pipeline producing a normalized 3x224x224 tensor per image.
data_transforms = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(),
    transforms.ToTensor(),
    # Grayscale images come out of ToTensor with one channel; replicate it to three
    # so that the per-channel Normalize below (and the model) accept them.
    transforms.Lambda(lambda img: img.repeat(3, 1, 1) if img.shape[0] == 1 else img),
    transforms.Normalize(mean=CLIP_MEAN, std=CLIP_STD),
])

In [8]:
!pip install datasets
from datasets import load_dataset

coco_dataset = load_dataset("lmms-lab/COCO-Caption2017")
#coco_dataset = ds["test"].shuffle(seed=42).select(range(10000))
coco_dataset.set_transform(lambda examples: {'image': [data_transforms(img) for img in examples['image']],'labels': examples['answer']})
coco_dataloader = torch.utils.data.DataLoader(coco_dataset['test'], batch_size=256, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(coco_dataset['val'], batch_size=256, shuffle=True)





In [1]:
import torch.nn.functional as F

# NOTE: this cell needs `torch`, `clip_model`, `tokenizer`, `coco_dataloader` and
# `val_dataloader` from the cells above; run the notebook top to bottom.
num_epochs = 2
chunk_size = 32  # image-text pairs per forward/backward pass; tune to the GPU memory
# Fine-tuning pre-trained weights needs a small step size; 1e-3 quickly destroys them.
optimizer = torch.optim.Adam(clip_model.parameters(), lr=1e-5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
clip_model.to(device)


def _select_captions(batch_labels, training):
    """Choose one caption per image.

    Joining all captions of an image would mostly be cut off by truncation, so a
    single caption is used: a random one while training, the first one otherwise.

    Assumes each entry of `batch_labels` holds the caption(s) of ONE image (a list
    of strings or a single string) -- confirm against the DataLoader's collate_fn.

    Returns:
        list of str, one per image; "" for images without a usable caption.
    """
    chosen = []
    for captions in batch_labels:
        if isinstance(captions, str):
            captions = [captions]
        captions = [caption for caption in captions if caption]
        if not captions:
            chosen.append("")
        elif training:
            chosen.append(captions[torch.randint(len(captions), (1,)).item()])
        else:
            chosen.append(captions[0])
    return chosen


def _contrastive_loss(pixel_values, captions):
    """Symmetric image<->text cross-entropy for aligned (image, caption) pairs.

    Row i of the similarity logits belongs to image i and column i to caption i,
    so the correct "class" for row i is simply i.
    """
    # `text=` is passed by keyword because the processor's positional argument
    # order is not the same across transformers releases.
    text_inputs = tokenizer(text=captions, return_tensors="pt", padding=True, truncation=True).to(device)
    outputs = clip_model(
        pixel_values=pixel_values,
        input_ids=text_inputs["input_ids"],
        attention_mask=text_inputs["attention_mask"],
    )
    targets = torch.arange(pixel_values.size(0), device=device)
    image_loss = F.cross_entropy(outputs.logits_per_image, targets)
    text_loss = F.cross_entropy(outputs.logits_per_text, targets)
    return (image_loss + text_loss) / 2


def _run_epoch(dataloader, training):
    """Run one pass over `dataloader`; return the mean loss over processed chunks.

    When `training` is true the model is put in train mode and the optimizer takes
    one step per chunk; otherwise the model is in eval mode and no gradients are
    computed.
    """
    clip_model.train(training)
    total_loss = 0.0
    num_chunks = 0
    with torch.set_grad_enabled(training):
        for batch in dataloader:
            captions = _select_captions(batch['labels'], training)
            # Drop images that have no caption instead of feeding empty text.
            keep = [idx for idx, caption in enumerate(captions) if caption]
            if not keep:
                continue
            images = batch['image'][keep].to(device)
            captions = [captions[idx] for idx in keep]

            for start in range(0, len(captions), chunk_size):
                captions_chunk = captions[start:start + chunk_size]
                # A single pair has nothing to contrast against (loss is always 0).
                if len(captions_chunk) < 2:
                    continue
                images_chunk = images[start:start + chunk_size]

                if training:
                    optimizer.zero_grad()
                loss = _contrastive_loss(images_chunk, captions_chunk)
                if training:
                    loss.backward()
                    optimizer.step()

                total_loss += loss.item()
                num_chunks += 1
    # Average over what was actually accumulated (chunks), not over batches.
    return total_loss / max(num_chunks, 1)


for epoch in range(num_epochs):
    avg_train_loss = _run_epoch(coco_dataloader, training=True)
    avg_val_loss = _run_epoch(val_dataloader, training=False)

    # Print epoch and both average training and validation losses
    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

NameError: name 'torch' is not defined

In [14]:
    # Validation loop
    # Relies on `clip_model`, `tokenizer`, `device`, `F`, `val_dataloader`, `epoch`
    # and `avg_train_loss` already existing in the kernel from the cells above.
    clip_model.eval()
    val_loss = 0.0
    val_batches = 0
    with torch.no_grad():  # Disable gradient calculation during validation
        for batch in val_dataloader:
            # One caption per image (the first non-empty one).
            # Assumes each entry of batch['labels'] holds the caption(s) of ONE
            # image -- confirm against the DataLoader's collate_fn.
            captions = []
            for image_captions in batch['labels']:
                if isinstance(image_captions, str):
                    image_captions = [image_captions]
                captions.append(next((caption for caption in image_captions if caption), ""))

            # Drop images without a caption; a batch needs at least two pairs
            # for the contrastive loss to be meaningful.
            keep = [idx for idx, caption in enumerate(captions) if caption]
            if len(keep) < 2:
                continue
            inputs = batch['image'][keep].to(device)
            captions = [captions[idx] for idx in keep]

            # Tokenize THIS batch's captions (previously a stale `text_inputs_chunk`
            # left over from the training cell was reused for every batch).
            text_inputs = tokenizer(text=captions, return_tensors="pt", padding=True, truncation=True).to(device)
            outputs = clip_model(pixel_values=inputs, input_ids=text_inputs["input_ids"], attention_mask=text_inputs["attention_mask"])

            # Image i matches caption i, so the target index for row i is i.
            targets = torch.arange(inputs.size(0), device=device)
            loss = (
                F.cross_entropy(outputs.logits_per_image, targets)
                + F.cross_entropy(outputs.logits_per_text, targets)
            ) / 2
            val_loss += loss.item()
            val_batches += 1

    # Calculate average validation loss over the batches that were evaluated
    avg_val_loss = val_loss / max(val_batches, 1)

    # Print epoch and both average training and validation losses
    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

RuntimeError: each element in list of batch should be of equal size