In [1]:
import torch
import torch.nn as nn
from transformers import CLIPImageProcessor, CLIPModel
import torchvision.transforms as transforms
from datasets import Dataset, load_dataset
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader
from transformers import DefaultDataCollator

In [2]:
# Load the CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")

dataset = load_dataset("imagefolder", data_dir='/data/upb/users/b/bakshit/profiles/unix/cs/FraudDetectionThesis/Dataset1')


Resolving data files:   0%|          | 0/86646 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/300 [00:00<?, ?it/s]

## Perform transformations on images
#### Without transformations, the pixel_values obtained from image_processor is not converted in tensors, and also there is a dimensionality issues when training is performed.

In [3]:
from torchvision.transforms import RandomResizedCrop, Compose, Normalize, ToTensor

normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
size = (
    image_processor.size["shortest_edge"]
    if "shortest_edge" in image_processor.size
    else (image_processor.size["height"], image_processor.size["width"])
)
_transforms = Compose([RandomResizedCrop(size), ToTensor(), normalize])

In [4]:
def transforms(examples):
    examples["pixel_values"] = [_transforms(img.convert("RGB")) for img in examples["image"]]
    del examples["image"]
    return examples
    
dataset = dataset.with_transform(transforms)

# Define data loaders with batch size
train_loader = DataLoader(dataset['train'], batch_size=32, shuffle=True)
test_loader = DataLoader(dataset['test'], batch_size=32, shuffle=False)

### Define all training related parameters

In [5]:
from transformers import DefaultDataCollator
# dataset_transformed = dataset.with_transform(transforms)
data_collator = DefaultDataCollator()

In [6]:
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [7]:
from transformers import AutoModelForImageClassification, TrainingArguments, Trainer

class CLIPModelClassifier(nn.Module):
    def __init__(self, num_classes=2):
        super(CLIPModelClassifier, self).__init__()
        self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)       
        self.fc = nn.Linear(512, num_classes)

    def forward(self, x):       
        features = self.model.get_image_features(pixel_values=x)      
        x = self.fc(features)
        return x

In [8]:
from torch.optim import Adam
model = CLIPModelClassifier().to(device)
image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
optimizer = Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [9]:
from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

### Start Training and Evaluation

In [11]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        
        outputs = model(batch['pixel_values'])
        print(outputs.shape, batch['label'].shape)
        loss = criterion(outputs, batch['label'])
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/8124 [00:00<?, ?it/s]

torch.Size([32, 512])
torch.Size([32, 2])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 512])
torch.Size([32, 2])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 512])
torch.Size([32, 2])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 512])
torch.Size([32, 2])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 512])
torch.Size([32, 2])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 512])
torch.Size([32, 2])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 512])
torch.Size([32, 2])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 512])
torch.Size([32, 2])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 512])
torch.Size([32, 2])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 512])
torch.Size([32, 2])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 512])
torch.Size([32, 2])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 512])
torch.Size([32, 2])
torch.Size([32, 2]) torch.Size([32])
torch.Size([32, 512])
torch.Size([32, 2])
torch.Size

KeyboardInterrupt: 

In [92]:
import evaluate

metric = evaluate.load("accuracy")
model.eval()
for batch in test_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(batch['pixel_values'])
    predictions = torch.argmax(outputs, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["label"])

metric.compute()

{'accuracy': 0.9833333333333333}