In [1]:
import torch
import matplotlib.pyplot as plt
import torch.nn.functional as F
import torch.nn as nn
import numpy as np
import os
import clip
    
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Subset

In [5]:
    model, preprocess = clip.load('ViT-B/32', device)

In [75]:
path_dataset_10 = "/projectnb/ec523kb/projects/chopped/data/food101_10percent/"
transform = transforms.Compose([
    transforms.Resize((512, 512)),
    transforms.ToTensor()
    ])
train_dataset_10 = datasets.ImageFolder(path_dataset_10, transform=transform)
dataloader_10 = DataLoader(train_dataset_10, batch_size=10, shuffle=True)

In [76]:
path_dataset_test = "/projectnb/ec523kb/projects/chopped/data/food101_test/"
test_dataset = datasets.ImageFolder(path_dataset_10, transform=transform)
dataloader_test = DataLoader(test_dataset, batch_size=10)

In [25]:
img, label = next(iter(dataloader_10))
print(img.shape)

img = transforms.Resize((224, 224))(img)
img = img.to(device)
img = img.half()
print(img.shape)

out = model.visual(img)
print(out.shape)
# fig, ax = plt.subplots(1, 3)
# ax[0].imshow(img.permute(1, 2, 0))
# print(label)

torch.Size([10, 3, 512, 512])
torch.Size([10, 3, 224, 224])
torch.Size([10, 512])


In [41]:
class ClipClassify(nn.Module):
    def __init__(self):
        super().__init__()
        self.vit = model.visual
        self.classification_head = nn.Linear(512, 101, dtype=torch.float32)

    def forward(self, x):
        x = self.vit(x)
        x = x.float()
        x = self.classification_head(x)
        x = F.softmax(x, dim=1)
        return x

In [63]:
def preprocess_images(inputs):
    inputs = transforms.Resize((224, 224))(inputs)
    inputs = inputs.to(device)
    inputs = inputs.half()
    return inputs

## Fine Tune With Various LLRD

In [56]:
parameters = []
net = ClipClassify().to(device)

# Freeze Vit
for name, param in net.vit.named_parameters():
    param.requires_grad = False

epochs = 10
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=1e-4)
print_n = 100

for epoch in range(epochs):
    running_loss = 0.
    for i, data in enumerate(dataloader_10):
        inputs, labels = data
        
        inputs = transforms.Resize((224, 224))(inputs)
        inputs = inputs.to(device)
        inputs = inputs.half()

        labels = labels.to(device)
        
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % print_n == print_n - 1:    # print every 2000 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / print_n:.3f}')
            running_loss = 0.0

[1,   100] loss: 4.605
[1,   200] loss: 4.308
[1,   300] loss: 3.800
[1,   400] loss: 3.700
[1,   500] loss: 3.673
[1,   600] loss: 3.663
[1,   700] loss: 3.651
[2,   100] loss: 3.647
[2,   200] loss: 3.643
[2,   300] loss: 3.641
[2,   400] loss: 3.640
[2,   500] loss: 3.639
[2,   600] loss: 3.638
[2,   700] loss: 3.637
[3,   100] loss: 3.637
[3,   200] loss: 3.636
[3,   300] loss: 3.636
[3,   400] loss: 3.635
[3,   500] loss: 3.635
[3,   600] loss: 3.634
[3,   700] loss: 3.634
[4,   100] loss: 3.634
[4,   200] loss: 3.634
[4,   300] loss: 3.634
[4,   400] loss: 3.633
[4,   500] loss: 3.634
[4,   600] loss: 3.633
[4,   700] loss: 3.634
[5,   100] loss: 3.633
[5,   200] loss: 3.633
[5,   300] loss: 3.633
[5,   400] loss: 3.633
[5,   500] loss: 3.633
[5,   600] loss: 3.633
[5,   700] loss: 3.633
[6,   100] loss: 3.633
[6,   200] loss: 3.633
[6,   300] loss: 3.633
[6,   400] loss: 3.633
[6,   500] loss: 3.633
[6,   600] loss: 3.632
[6,   700] loss: 3.633
[7,   100] loss: 3.632
[7,   200] 

In [19]:
clip_classify = ClipClassify().to(device)
print(clip_classify.parameters)

<bound method Module.parameters of ClipClassify(
  (vit): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (1): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768,

In [71]:
correct = 0
total = 0
dataloader_test = DataLoader(test_dataset, batch_size=10, shuffle=True)
# since we're not training, we don't need to calculate the gradients for our outputs
with torch.no_grad():
    for data in dataloader_test:
        images, labels = data
        # calculate outputs by running images through the network
        labels = labels.to(device)
        images = preprocess_images(images)
        
        outputs = net(images)
        # the class with the highest energy is what we choose as prediction
        _, predicted = torch.max(outputs.data, 1)
        print("predicted: ", predicted)
        print("labels: ", labels)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Accuracy of the network on the 10000 test images: {100 * correct // total} %')

predicted:  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
labels:  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
predicted:  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
labels:  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
predicted:  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
labels:  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
predicted:  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
labels:  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
predicted:  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
labels:  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
predicted:  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
labels:  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
predicted:  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
labels:  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
predicted:  tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
l

In [None]:
dataloader_test = DataLoader(test_dataset, batch_size=10, shuffle=True)
dataloader_10 = DataLoader(train_dataset_10, batch_size=10, shuffle=True)
for data in dataloader_10:
    img, label = data
    print(label)

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0,