In [6]:
import os
import time

import clip
import faiss
import numpy as np
import pandas as pd
import torch
from imagenetv2_pytorch import ImageNetV2Dataset
from sklearn.linear_model import LogisticRegression
from torch.utils.data import DataLoader
from torchvision import transforms, datasets
from tqdm import tqdm

In [13]:
# --- Paths and run selection -------------------------------------------------
ROOT = "../IMAGENET/"     # dataset root; "train/" and "val/" are appended to it when loading
OUTPUT = "embeddings/"    # prefix prepended to every saved .npy file name
DATASET = 'IN1K'          # tag used in output file names; 'INV2' selects ImageNetV2 as the test set

# --- Loader / image geometry -------------------------------------------------
BATCH_SIZE = 256
IMG_SIZE = 256
CENTER_CROP_SIZE = 224

# Resize -> center crop -> tensor -> per-channel normalisation.
# NOTE(review): the extraction loop in this notebook hands CLIP's own
# `preprocess` to the datasets, so `test_transform` looks unused here --
# confirm before removing it.
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)
test_transform = transforms.Compose(
    [
        transforms.Resize(IMG_SIZE),
        transforms.CenterCrop(CENTER_CROP_SIZE),
        transforms.ToTensor(),
        normalize,
    ]
)

In [14]:
# CLIP checkpoints to process. Only uncommented entries are run; uncomment a
# name to include that backbone.
backbone = [
    "RN50",
    # "RN101",
    # "RN50x4",
    # "RN50x16",
    # "RN50x64",
    # "ViT-B/32",
    # "ViT-B/16",
    # "ViT-L/14",
    # "ViT-L/14@336px",
]

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [15]:
# Preview the file-name-safe tag of every selected backbone ('/' removed).
for raw_name in backbone:
    b = raw_name.replace('/', '')
    print(b)

RN50


In [16]:
def get_features(dataset, model, batch_size=None, num_workers=0):
    """Encode every image in ``dataset`` with ``model`` and collect the labels.

    Args:
        dataset: Map-style dataset yielding ``(image_tensor, label)`` pairs that
            the default ``DataLoader`` collate function can batch.
        model: Object exposing ``encode_image(batch)``. Batches are moved to the
            notebook-level ``device`` before being passed in, so the model is
            expected to live there already.
        batch_size: Images per forward pass. ``None`` (default) falls back to
            the notebook-level ``BATCH_SIZE``.
        num_workers: Number of ``DataLoader`` worker processes. ``0`` (default)
            loads in the main process, matching the previous behaviour.

    Returns:
        Tuple ``(features, labels)`` of NumPy arrays in dataset order
        (the loader never shuffles), with one row / entry per sample.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE

    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
    )

    feature_chunks = []
    label_chunks = []

    with torch.no_grad():
        for images, labels in tqdm(loader):
            features = model.encode_image(images.to(device))

            # Offload each batch to host memory right away. Keeping every
            # batch on the accelerator until the final concatenation makes
            # device memory grow with the dataset size.
            feature_chunks.append(features.cpu())
            label_chunks.append(labels.cpu())

    return torch.cat(feature_chunks).numpy(), torch.cat(label_chunks).numpy()

In [17]:
# Extract CLIP image embeddings for every selected backbone and save them as
# <OUTPUT><DATASET>_<backbone>_{test,train}-{X,y}.npy.

# Create the output directory up front so a long encoding pass cannot be lost
# to a missing folder when np.save finally runs.
os.makedirs(os.path.dirname(OUTPUT) or ".", exist_ok=True)

for b in backbone:
    model, preprocess = clip.load(b, device)

    # '/' is not usable inside a file name, so strip it from e.g. 'ViT-B/32'.
    tag = b.replace('/', '')
    print(tag)
    prefix = OUTPUT + DATASET + "_" + tag + "_"

    # Build only the datasets this run needs: ImageNetV2 has a test split only,
    # and indexing the ImageNet train/val folders is wasted work in that case.
    if DATASET == 'INV2':
        test_dataset = ImageNetV2Dataset("matched-frequency", transform=preprocess)
    else:
        test_dataset = datasets.ImageFolder(ROOT + "val/", transform=preprocess)

    # Calculate the image features
    test_features, test_labels = get_features(test_dataset, model)
    print(test_features.shape, test_labels.shape)
    np.save(prefix + 'test-X.npy', test_features)
    np.save(prefix + 'test-y.npy', test_labels)

    if DATASET != 'INV2':
        train_dataset = datasets.ImageFolder(ROOT + "train/", transform=preprocess)
        train_features, train_labels = get_features(train_dataset, model)
        print(train_features.shape, train_labels.shape)
        np.save(prefix + 'train-X.npy', train_features)
        np.save(prefix + 'train-y.npy', train_labels)

RN50


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 196/196 [21:07<00:00,  6.47s/it]


(50000, 1024) (50000,)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5005/5005 [9:05:16<00:00,  6.54s/it]


(1281167, 1024) (1281167,)


In [7]:
# # Perform logistic regression
# classifier = LogisticRegression(random_state=0, C=0.316, max_iter=1000, verbose=1)
# classifier.fit(train_features, train_labels)

# # Evaluate using the logistic regression classifier
# predictions = classifier.predict(test_features)
# accuracy = np.mean((test_labels == predictions).astype(float)) * 100.
# print(f"Accuracy = {accuracy:.3f}")

# CLIP k-NN Index Build

In [None]:
# Build (or load) an exact-L2 faiss index over the IN1K train embeddings of each
# backbone, search it with the DATASET test embeddings, and write the indices of
# the k nearest database rows for every query to a CSV file.
clip_backbone = [
    'RN50',
#     'RN101',
#     'RN50x4',
#     'RN50x16',
#     'RN50x64',
#     'ViT-B/32',
#     'ViT-B/16',
#     'ViT-L/14',
#     'ViT-L/14@336px'
]

ROOT = '../../../CLIP/embeddings/'
DATASET = 'INV2'
D = 512
k = 2048

for b in clip_backbone:
    b = b.replace('/', '')
    print(b)

    # Memory-map the database: only its width is needed when a cached index is
    # loaded, so the full matrix is read only if the index has to be built.
    database = np.load(ROOT+"IN1K_"+str(b)+"_"+'train-X.npy', mmap_mode='r')
    queries = np.load(ROOT+DATASET+"_"+str(b)+"_"+'test-X.npy')
    if database.shape[1] != queries.shape[1]:
        raise ValueError(
            "database and query embeddings differ in width: %d vs %d"
            % (database.shape[1], queries.shape[1])
        )

    os.makedirs(ROOT+'index_files/'+b, exist_ok=True)
    # NOTE(review): there is no '/' between the backbone name and the file
    # name, so the index is written next to (not inside) the directory created
    # above. The path is left unchanged so previously cached index files are
    # still found.
    index_file = ROOT+'index_files/'+b+'IN1K_'+str(queries.shape[1])+'dim_exactl2.index'

    # Load or build index
    if os.path.exists(index_file):
        print("Loading index file: " + index_file)
        cpu_index = faiss.read_index(index_file)

    else:
        print("Generating index file: " + index_file)

        # np.array always copies, giving a writable float32 C-contiguous array
        # that normalize_L2 can modify in place (the memory map is read-only).
        xb = np.array(database, dtype=np.float32, order='C')
        faiss.normalize_L2(xb)
        d = xb.shape[1]                    # dimension
        print("database: ", xb.shape)

        start = time.time()
        print("Building Exact L2 Index")
        cpu_index = faiss.IndexFlatL2(d)   # build the index
        cpu_index.add(xb)                  # add vectors to the index
        faiss.write_index(cpu_index, index_file)
        print("Index build time= %0.3f sec" % (time.time() - start))
        del xb

    # Search on a GPU when faiss can see one. Device 1 is preferred (as
    # before); fall back to device 0 on single-GPU hosts and to the CPU index
    # when no GPU is available.
    n_gpus = faiss.get_num_gpus()
    if n_gpus > 0:
        gpu_id = 1 if n_gpus > 1 else 0
        res = faiss.StandardGpuResources()
        index = faiss.index_cpu_to_gpu(res, gpu_id, cpu_index)
    else:
        index = cpu_index

    # Load the queries
    xq = np.ascontiguousarray(queries, dtype=np.float32)
    faiss.normalize_L2(xq)
    print("queries: ", xq.shape)

    Dist, Ind = index.search(xq, k)
    print("neighbors: ", Ind.shape)
    os.makedirs(ROOT+"neighbors/"+b, exist_ok=True)
    # NOTE(review): same missing '/' as for index_file. Kept as-is because the
    # evaluation cell reads the neighbours back from exactly this path.
    nn_dir = ROOT+"neighbors/"+b+"mrl_exactl2_"+str(database.shape[1])+"dim_"+str(k)+"shortlist_"+DATASET+".csv"
    pd.DataFrame(Ind).to_csv(nn_dir, header=None, index=None)

    del index, Dist, Ind

# CLIP k-NN Eval

In [None]:
## CLIP
# 1-NN evaluation: for every backbone, label each query with the class of its
# nearest database neighbour and print the fraction of correct predictions.
clip_backbone = [
    'RN50',
    'RN101',
    'RN50x4',
    'RN50x16',
    'RN50x64',
    'ViT-B/32',
    'ViT-B/16',
    'ViT-L/14',
    'ViT-L/14@336px'
]

ROOT = '../../../CLIP/embeddings/'
DATASET = 'IN1K'
D = 512
k = 2048

for b in clip_backbone:
    b = b.replace('/', '')
#     print(b)

    db_labels_clip = np.load(ROOT+"IN1K_"+str(b)+"_"+'train-y.npy')
    query_labels_clip = np.load(ROOT+DATASET+"_"+str(b)+"_"+'test-y.npy')

    # Only the embedding width is needed (it is part of the neighbours file
    # name), so memory-map the query matrix instead of loading it.
    d = np.load(ROOT+DATASET+"_"+str(b)+"_"+'test-X.npy', mmap_mode='r').shape[1]

    neighbors_path = ROOT+"neighbors/"+b+"mrl_exactl2_"+str(d)+"dim_"+str(k)+"shortlist_"+DATASET+".csv"
    neighbors_clip = pd.read_csv(neighbors_path, header=None).to_numpy()

    # Fail loudly when the neighbours file does not belong to these labels;
    # comparing arrays of different length would not give a meaningful accuracy.
    if neighbors_clip.shape[0] != query_labels_clip.shape[0]:
        raise ValueError(
            "%s has %d rows but there are %d query labels"
            % (neighbors_path, neighbors_clip.shape[0], query_labels_clip.shape[0])
        )

    # Column 0 holds the index of the closest database vector for each query.
    top1_clip = db_labels_clip[neighbors_clip[:, 0]]
    print(np.mean(top1_clip == query_labels_clip))