In [1]:
import importlib

import os
import clip
import torch
from torchvision import transforms, models

import numpy as np
from sklearn.linear_model import LogisticRegression
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm

import argparse
from omegaconf import OmegaConf

import json

device = "cuda" if torch.cuda.is_available() else "cpu"
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity
import seaborn as sn

from columnar import columnar
from nltk.corpus import wordnet as wn
clip_model, clip_preprocess = clip.load("ViT-B/32", )


import datasets #import the module here, so that it can be reloaded.
importlib.reload(datasets)
from datasets import *


In /nethome/bdevnani3/anaconda3/envs/p3/lib/python3.8/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The text.latex.preview rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In /nethome/bdevnani3/anaconda3/envs/p3/lib/python3.8/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The mathtext.fallback_to_cm rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In /nethome/bdevnani3/anaconda3/envs/p3/lib/python3.8/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: Support for setting the 'mathtext.fallback_to_cm' rcParam is deprecated since 3.3 and will be removed two minor releases later; use 'mathtext.fallback : 'cm' instead.
In /nethome/bdevnani3/anaconda3/envs/p3/lib/python3.8/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The validate_bool_maybe_none function was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In /

In [30]:
import json
def pp(d):
    print(json.dumps(d,sort_keys=True, indent=4))

In [9]:
smallflowers = SmallFlowers102(4, 1)
smallflowers.classes = [x.replace("_", " ").title() for x in smallflowers.classes]
sf_train_loader, _ = smallflowers.get_train_loaders(transform_fn=clip_preprocess)
sf_test_loader = smallflowers.get_test_loader(transform_fn=clip_preprocess)

pets = OxfordPets(4,1)
pets.classes = [x.replace("_", " ").title() for x in pets.classes]
p_train_loader, _ = pets.get_train_loaders(transform_fn=clip_preprocess)
p_test_loader = pets.get_test_loader(transform_fn=clip_preprocess)

cifar100 = Cifar100(4,1, root=None)
c_train_loader, _ = cifar100.get_train_loaders(transform_fn=clip_preprocess)
cifar100.classes = [x.replace("_", " ").title() for x in cifar100.classes]
c_test_loader = cifar100.get_test_loader(transform_fn=clip_preprocess)

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


In [10]:
def clip_zero_shot(
    loader,
    classes,
    zeroshot_weights,
    clip_model_name="ViT-B/32",
):

    global clip_model, clip_preprocess
    device = "cuda" if torch.cuda.is_available() else "cpu"

    def accuracy(output, target, topk=(1,)):
        pred = output.topk(max(topk), 1, True, True)[1].t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))
        return [
            float(correct[:k].reshape(-1).float().sum(0, keepdim=True).cpu().numpy())
            for k in topk
        ]

    # lazy load
    if clip_model == None:
        clip_model, clip_preprocess = clip.load(clip_model_name, device)
        
    per_class_accuracy_top1 = { k:[0,0, classes[k]] for k in range(len(classes))} # correct, total, class_name
    per_class_accuracy_top5 = { k:[0,0, classes[k]] for k in range(len(classes))} 

    with torch.no_grad():
        top1, top5, n = 0.0, 0.0, 0.0
        for i , (images, target) in enumerate(tqdm(loader)):
            images = images.cuda()
            target = target.cuda()

            # predict
            image_features = clip_model.encode_image(images)
            image_features /= image_features.norm(dim=-1, keepdim=True)
            logits = 100.0 * image_features @ zeroshot_weights

            # measure accuracy
            acc1, acc5 = accuracy(logits, target, topk=(1, 5))
            top1 += acc1
            top5 += acc5
            n += images.size(0)
            
            per_class_accuracy_top1[target.cpu().detach().numpy()[0]][0]+= acc1
            per_class_accuracy_top1[target.cpu().detach().numpy()[0]][1]+= 1

    top1 = (top1 / n) * 100
    top5 = (top5 / n) * 100

    return top1, per_class_accuracy_top1

In [11]:
phrase_file = "/nethome/bdevnani3/vis_lang/efficient_finetuning/configs/phrases/flowers.txt"

templates = ["{}"]
    
def zeroshot_classifier(classnames, templates):
    with torch.no_grad():
        zeroshot_weights = []
        for classname in tqdm(classnames):
            texts = [
                template.format(classname) for template in templates
            ]  # format with class
            texts = clip.tokenize(texts).cuda()  # tokenize
            class_embeddings = clip_model.encode_text(texts)  # embed with text encoder
            class_embeddings /= class_embeddings.norm(dim=-1, keepdim=True)
            class_embedding = class_embeddings.mean(dim=0)
            class_embedding /= class_embedding.norm()
            zeroshot_weights.append(class_embedding)
        zeroshot_weights = torch.stack(zeroshot_weights, dim=1).cuda()
    return zeroshot_weights

This notebook will explore characteristics particular to the structure of every query to understand what makes a good query.

### Baselines

In [19]:
sf_zw = zeroshot_classifier(smallflowers.classes,templates)

sf_czs = clip_zero_shot(
    sf_train_loader,
    smallflowers.classes,
    sf_zw
)
print("Small Flowers: ", sf_czs[0])

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/819 [00:00<?, ?it/s]

Small Flowers:  44.44444444444444


In [20]:
p_zw = zeroshot_classifier(pets.classes,templates)

p_czs = clip_zero_shot(
    p_test_loader,
    pets.classes,
    p_zw
)
print("Pets: ", p_czs[0])

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

Pets:  82.28400109021533


In [21]:
c_zw = zeroshot_classifier(cifar100.classes,templates)

c_czs = clip_zero_shot(
    c_test_loader,
    cifar100.classes,
    c_zw
)
print("Cifar100: ", c_czs[0])

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

Cifar100:  60.38


### Query Length

In [25]:
phrase_file = "/nethome/bdevnani3/vis_lang/efficient_finetuning/configs/phrases/flowers.txt"

templates = ["{}", 
             "                               {}                                ", 
             "{}                                                               ",
             "This is a {}",
             "This is a photo of a {}",
             "This image is a photo of a {}",
             "I see that this image is a photo of a {}",
             "One observes that this image contains within it a {}"]


In [31]:
dummy = {}
for template in templates:
    sf_zw = zeroshot_classifier(smallflowers.classes,[template])

    sf_czs = clip_zero_shot(
        sf_train_loader,
        smallflowers.classes,
        sf_zw
    )
    dummy[template]= sf_czs[0]
print("Small Flowers:")
pp(dummy)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/819 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/819 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/819 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/819 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/819 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/819 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/819 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/819 [00:00<?, ?it/s]

Small Flowers:
{
    "                               {}                                ": 44.44444444444444,
    "I see that this image is a photo of a {}": 56.776556776556774,
    "One observes that this image contains within it a {}": 38.82783882783883,
    "This image is a photo of a {}": 60.68376068376068,
    "This is a photo of a {}": 63.85836385836385,
    "This is a {}": 65.07936507936508,
    "{}": 44.44444444444444,
    "{}                                                               ": 44.44444444444444
}


In [34]:
dummy = {}
for template in templates:
    p_zw = zeroshot_classifier(pets.classes,[template])

    p_czs = clip_zero_shot(
        p_test_loader,
        pets.classes,
        p_zw
    )
    dummy[template]= p_czs[0]
print("Pets: ")
pp(dummy)

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

Pets: 
{
    "                               {}                                ": 82.28400109021533,
    "I see that this image is a photo of a {}": 80.04905968928864,
    "One observes that this image contains within it a {}": 75.98800763150723,
    "This image is a photo of a {}": 85.09130553284272,
    "This is a photo of a {}": 88.1984191877896,
    "This is a {}": 88.90705914418098,
    "{}": 82.28400109021533,
    "{}                                                               ": 82.28400109021533
}


In [35]:
dummy = {}
for template in templates:
    c_zw = zeroshot_classifier(cifar100.classes,[template])

    c_czs = clip_zero_shot(
        c_test_loader,
        cifar100.classes,
        c_zw
    )
    dummy[template]= c_czs[0]
print("Cifar100: ")
print(dummy)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

Cifar100: 
{'{}': 60.38, '                               {}                                ': 60.38, '{}                                                               ': 60.38, 'This is a {}': 62.72, 'This is a photo of a {}': 63.13999999999999, 'This image is a photo of a {}': 62.81, 'I see that this image is a photo of a {}': 62.18, 'One observes that this image contains within it a {}': 59.040000000000006}


### Many templates or one?

In [38]:
# Flowers
templates = [["This is a {} with leaves"],
             ["This is a {} with petals"],
             ["This is a {} with petals and leaves"], 
             ["This is a {} with leaves", "This is a {} with petals"]]


for template in templates:
    sf_zw = zeroshot_classifier(smallflowers.classes,template)

    sf_czs = clip_zero_shot(
        sf_train_loader,
        smallflowers.classes,
        sf_zw
    )
    print(template, sf_czs[0])
print("Small Flowers:")

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/819 [00:00<?, ?it/s]

['This is a {} with leaves'] 66.30036630036629


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/819 [00:00<?, ?it/s]

['This is a {} with petals'] 70.45177045177046


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/819 [00:00<?, ?it/s]

['This is a {} with petals and leaves'] 71.30647130647131


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/819 [00:00<?, ?it/s]

['This is a {} with leaves', 'This is a {} with petals'] 70.6959706959707
Small Flowers:


In [None]:
# Flowers
templates = [["This is a white {}"],
             ["This is a pink {}"],
             ["This is a {} white or pink flower"], 
             ["Many beatiful white and pink flowers"], 
             ["This is a white {}","This is a pink {}", "Many beatiful {} flowers"]]

for template in templates:
    sf_zw = zeroshot_classifier(smallflowers.classes,template)

    sf_czs = clip_zero_shot(
        sf_train_loader,
        smallflowers.classes,
        sf_zw
    )
    print(template, sf_czs[0])
print("Small Flowers:")

In [39]:
# Pets
templates = [["This is a cute {}"],
             ["This is a fluffy {}"],
             ["This is a small {}"],
             ["This is a {} cute, fluffy and small"], 
             ["This is a cute {}","This is a fluffy {}","This is a small {}"]]

for template in templates:
    p_zw = zeroshot_classifier(pets.classes,template)

    p_czs = clip_zero_shot(
        p_test_loader,
        pets.classes,
        p_zw
    )
    print(template, p_czs[0])
print("Small Flowers:")

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

['This is a cute {}'] 89.07059144180975


  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

['This is a fluffy {}'] 82.63832106841102


  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

['This is a small {}'] 87.32624693376943


  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

['This is a {} cute, fluffy and small'] 84.35541019351321


  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

['This is a cute {}', 'This is a fluffy {}', 'This is a small {}'] 88.28018533660398
Small Flowers:
