In [1]:
import os
import json
import torch
from PIL import Image
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms
from tqdm import tqdm
from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer
from sklearn.model_selection import train_test_split

# Load and parse JSON annotations
def load_annotations(json_path):
    with open(json_path, 'r') as f:
        annotations = [json.loads(line.strip()) for line in f]
    return annotations

# Custom Dataset class
class CustomCLIPDataset(Dataset):
    def __init__(self, image_dir, annotations, transform=None):
        self.image_dir = image_dir
        self.annotations = annotations
        self.transform = transform
    
    def __len__(self):
        return len(self.annotations)
    
    def __getitem__(self, idx):
        data = self.annotations[idx]
        image_path = os.path.join(self.image_dir, data['image'])
        image = Image.open(image_path).convert("RGB")
        
        # Generate cropped images and corresponding captions
        crops = []
        captions = []
        for ann in data['annotations']:
            bbox = ann['bbox']
            caption = ann['caption']
            cropped_image = image.crop((bbox[0], bbox[1], bbox[0] + bbox[2], bbox[1] + bbox[3]))
            if self.transform:
                cropped_image = self.transform(cropped_image)
            crops.append(cropped_image)
            captions.append(caption)
        
        return crops, captions

In [2]:
# Define transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])

# Load annotations and create dataset
json_path = '/home/jupyter/advanced/vlm.jsonl'
image_dir = '/home/jupyter/advanced/images'
annotations = load_annotations(json_path)
# Split annotations into training and testing sets
train_annotations, test_annotations = train_test_split(annotations, test_size=0.2, random_state=42)

# Create datasets
train_dataset = CustomCLIPDataset(image_dir, train_annotations, transform)
test_dataset = CustomCLIPDataset(image_dir, test_annotations, transform)

# Create dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [3]:
# Load CLIP model and processor
model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")

In [4]:
# Finetuning function
def finetune_clip(model, dataloader, epochs=3, lr=1e-5):
    model.train()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    
    for epoch in range(epochs):
        total_loss = 0
        for batch in tqdm(dataloader, desc=f"Training Epoch {epoch+1}/{epochs}"):
            images, texts = batch
            
            images = torch.cat(images).to(model.device)
            #images = (images - images.min()) / (images.max() - images.min())
            # texts = sum(texts, [])
            texts = [item for sublist in texts for item in sublist]
            inputs = processor(text=texts, images=images, return_tensors="pt", padding=True).to(model.device)
            
            outputs = model(**inputs)
            logits_per_image = outputs.logits_per_image
            logits_per_text = outputs.logits_per_text
            
            # Define contrastive loss
            ground_truth = torch.arange(len(logits_per_image)).long().to(model.device)
            loss = (torch.nn.functional.cross_entropy(logits_per_image, ground_truth) + torch.nn.functional.cross_entropy(logits_per_text, ground_truth)) / 2
            total_loss += loss.item()
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
        print(f"Epoch {epoch+1} Loss: {total_loss / len(dataloader)}")
        
    # Save the model, processor, and tokenizer
    model_name = "CLIP-large-finetuned"
    model.save_pretrained(f"/home/jupyter/{model_name}")
    processor.save_pretrained(f"/home/jupyter/{model_name}")
    tokenizer.save_pretrained(f"/home/jupyter/{model_name}")

In [5]:
def evaluate_clip(model, dataloader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            images, texts = batch
            
            # Concatenate images and ensure they are in the range [0, 1]
            images = torch.cat(images).to(model.device)
            #images = (images - images.min()) / (images.max() - images.min())
            
            # Flatten the list of captions
            texts = [item for sublist in texts for item in sublist]
            
            # Process the images and texts
            inputs = processor(text=texts, images=images, return_tensors="pt", padding=True).to(device)
            
            outputs = model(**inputs)
            logits_per_image = outputs.logits_per_image
            logits_per_text = outputs.logits_per_text
            
            # Define contrastive loss
            ground_truth = torch.arange(len(logits_per_image)).long().to(model.device)
            loss = (torch.nn.functional.cross_entropy(logits_per_image, ground_truth) + torch.nn.functional.cross_entropy(logits_per_text, ground_truth)) / 2
            total_loss += loss.item()
    
    average_loss = total_loss / len(dataloader)
    print(f"Evaluation Loss: {average_loss}")

In [6]:
# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Finetune the model
finetune_clip(model, train_dataloader, 1)

Training Epoch 1/3:   0%|          | 0/4085 [00:00<?, ?it/s]It looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
Training Epoch 1/3: 100%|██████████| 4085/4085 [1:09:30<00:00,  1.02s/it]


Epoch 1 Loss: 1.6475635025463313


Training Epoch 2/3: 100%|██████████| 4085/4085 [1:09:15<00:00,  1.02s/it]


Epoch 2 Loss: 1.6463951499254934


Training Epoch 3/3: 100%|██████████| 4085/4085 [1:08:48<00:00,  1.01s/it]


Epoch 3 Loss: 1.6463986473538739


NameError: name 'CLIP' is not defined

In [8]:
# Evaluate the model on the test set
evaluate_clip(model, test_dataloader)

Evaluating: 100%|██████████| 1022/1022 [06:07<00:00,  2.78it/s]

Evaluation Loss: 1.6436608403396233



