# **Téléchargement et installation de CLIP**

In [None]:
! pip install -q ftfy regex tqdm
! pip install -q git+https://github.com/openai/CLIP.git
import clip
import numpy as np

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/53.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.4/53.4 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for clip (setup.py) ... [?25l[?25hdone


# **Téléchargement d'ImageNet100 depuis Kaggle**

In [None]:
from google.colab import files
uploaded = files.upload()

%mkdir ~/.kaggle
%cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download ambityga/imagenet100 -p 'data/imagenet/'

!unzip -q 'data/imagenet/imagenet100.zip' -d 'data/imagenet/'
%rm -r 'data/imagenet/imagenet100.zip'
%mv data/imagenet/train.X1 data/imagenet/train
%mv data/imagenet/val.X data/imagenet/val

Saving kaggle.json to kaggle.json
Downloading imagenet100.zip to data/imagenet
100% 16.1G/16.1G [01:52<00:00, 178MB/s]
100% 16.1G/16.1G [01:52<00:00, 153MB/s]


# **Génération d'ImageNet10 depuis ImageNet100 (script du TP 5 d'analyse d'images, 13000 données, 10 classes)**

In [None]:
import shutil
import sys
import os
import os.path as osp

import numpy as np
from tqdm import tqdm

from torchvision.datasets import ImageFolder


# Build a smaller dataset by copying a random subset of class folders from
# data/imagenet into data/imagenet<n_classes>.
n_classes = 10
subset_seed = 0  # fixed seed so that every run selects the same classes
print('Creating a subset of ImageNet with {} classes'.format(n_classes))

dset_dir = osp.join('data', 'imagenet')
dset = ImageFolder(osp.join(dset_dir, 'train'))
classes = dset.classes

new_dset_dir = osp.join('data', 'imagenet{}'.format(n_classes))
# A seeded generator makes the draw reproducible; without it a re-run would
# pick different classes and add them on top of the folders copied earlier.
rng = np.random.default_rng(subset_seed)
classes_subset = rng.choice(classes, size=n_classes, replace=False)

splits = ('train', 'val')
for split in splits:
    # exist_ok: do not fail when the cell is executed again.
    os.makedirs(osp.join(new_dset_dir, split), exist_ok=True)

for c in tqdm(classes_subset):
    for split in splits:
        src = osp.join(dset_dir, split, c)
        dst = osp.join(new_dset_dir, split, c)
        # dirs_exist_ok: overwrite an earlier copy instead of raising.
        shutil.copytree(src, dst, dirs_exist_ok=True)


Creating a subset of ImageNet with 10 classes


100%|██████████| 10/10 [00:25<00:00,  2.59s/it]


# **Traduction des noms des classes ImageNet (ID -> label sous forme de chaîne de caractères)**

In [None]:
import pandas as pd
import os

# Load the file without a header row; its single column is named full_text.
df = pd.read_csv('imagenet_names.txt', header=None)
df.columns = ['full_text']

# Split every line once on single spaces: the first token is the identifier,
# the last token is the name (None when the line holds a single token).
line_tokens = [text.split(' ') for text in df['full_text']]
df['identifier'] = [tokens[0] for tokens in line_tokens]
df['name'] = [tokens[-1] if len(tokens) > 1 else None for tokens in line_tokens]

# Discard the rows for which no name could be extracted.
df = df.dropna(subset=['name'])

# Lookup table: identifier -> name.
id_to_name = {identifier: name for identifier, name in zip(df['identifier'], df['name'])}

def rename_images(base_path, name_map=None):
    """Rename files under ``base_path`` by translating their leading identifier.

    The identifier of a file is the part of its name before the first ``_``.
    When ``name_map`` has an entry for it, that leading identifier is replaced
    by the mapped name; files with an unknown identifier are left untouched.

    Args:
        base_path: root directory, walked recursively.
        name_map: dict mapping identifier -> new name. Defaults to the
            notebook-level ``id_to_name`` table.
    """
    if name_map is None:
        name_map = id_to_name
    for root, _dirs, file_names in os.walk(base_path):
        for file_name in file_names:
            identifier = file_name.split('_')[0]
            new_name = name_map.get(identifier, identifier)
            if new_name == identifier:
                # Nothing to translate: skip the no-op rename.
                continue
            # Swap only the leading identifier; a later occurrence of the same
            # text elsewhere in the file name must not be rewritten.
            renamed = new_name + file_name[len(identifier):]
            os.rename(os.path.join(root, file_name), os.path.join(root, renamed))

# Rename, in place, the image files found under data/imagenet10/train.
rename_images('data/imagenet10/train')


In [None]:
# Display the parsed table (full_text, identifier and name columns).
df

Unnamed: 0,full_text,identifier,name
0,n02119789 1 kit_fox,n02119789,kit_fox
1,n02100735 2 English_setter,n02100735,English_setter
2,n02110185 3 Siberian_husky,n02110185,Siberian_husky
3,n02096294 4 Australian_terrier,n02096294,Australian_terrier
4,n02102040 5 English_springer,n02102040,English_springer
...,...,...,...
995,n03063599 996 coffee_mug,n03063599,coffee_mug
996,n04116512 997 rubber_eraser,n04116512,rubber_eraser
997,n04325704 998 stole,n04325704,stole
998,n07831146 999 carbonara,n07831146,carbonara


# **Création d'une classe Dataset pour notre jeu de données ImageNet10 afin de l'utiliser dans un DataLoader. Utilisation des 13000 images d'ImageNet10 en association avec les 1000 labels du vocabulaire d'ImageNet1000.**

In [None]:
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from PIL import Image
import os

class ImageNet10Dataset(Dataset):
    """Image dataset over a ``directory/<class folder>/<image file>`` tree.

    Each sample is ``(image, label)`` where ``label`` is the class name
    obtained from the folder that contains the image.

    Attributes set by the constructor:
        image_paths: path of every image, grouped by (sorted) class folder.
        labels: class name of each image, aligned with ``image_paths``.
        labels_to_tokenize: label vocabulary without duplicates -- the
            candidate names first, then any class name found on disk that
            was not among them.
        transform: optional callable applied to each opened image.
    """

    def __init__(self, directory, class_name_map, transform=None,
                 vocabulary=None, max_vocabulary=999):
        """Index the images under ``directory`` and build the label vocabulary.

        Args:
            directory: root folder holding one sub-folder per class.
            class_name_map: dict mapping a sub-folder name to its class name;
                folders without an entry keep their own name as class name.
            transform: optional callable applied to each image in
                ``__getitem__``.
            vocabulary: iterable of candidate label names. Defaults to the
                ``name`` column of the notebook-level ``df`` table.
            max_vocabulary: maximum number of distinct names taken from
                ``vocabulary`` (in their order of first appearance).
        """
        if vocabulary is None:
            vocabulary = df['name']
        # dict.fromkeys removes duplicates while keeping first-seen order.
        unique_names = list(dict.fromkeys(vocabulary))
        self.labels_to_tokenize = unique_names[:max_vocabulary]
        # Set mirror of the vocabulary for constant-time membership tests.
        known_labels = set(self.labels_to_tokenize)

        self.image_paths = []
        self.labels = []
        for subdir in sorted(os.listdir(directory)):
            class_dir = os.path.join(directory, subdir)
            if not os.path.isdir(class_dir):
                # Stray files next to the class folders are not classes.
                continue
            class_name = class_name_map.get(subdir, subdir)  # Map subdir to class name
            # Sorted so that the sample order does not depend on the file system.
            file_names = sorted(os.listdir(class_dir))
            if not file_names:
                # An empty folder contributes neither samples nor a label.
                continue
            if class_name not in known_labels:
                known_labels.add(class_name)
                self.labels_to_tokenize.append(class_name)
            for file_name in file_names:
                self.image_paths.append(os.path.join(class_dir, file_name))
                self.labels.append(class_name)
        self.transform = transform

    def __len__(self):
        """Return the number of indexed images."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for sample ``idx``.

        The image is opened with PIL and passed through ``transform`` when one
        was given; the label is the class name as a string.
        """
        image = Image.open(self.image_paths[idx])
        label = self.labels[idx]
        if self.transform is not None:
            image = self.transform(image)
        return image, label

# **Prédictions zero-shot sur la totalité d'ImageNet10 (similarité cosinus entre les 13000 images d'ImageNet10 et les 1000 classes d'ImageNet1000, puis calcul de l'accuracy top 1 et top 5)**

In [None]:
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR100
import torchvision.transforms as transforms
from tqdm import tqdm
import os
import torch
import itertools

def zero_shot(model_name, batch_size):
    """Zero-shot classify the images of data/imagenet10/train with CLIP.

    Every image is compared with the prompt "This is a photo of a <label>"
    built for each label of the dataset vocabulary; the labels are ranked by
    cosine similarity between image and text embeddings.

    Args:
        model_name: name of the CLIP model handed to ``clip.load``.
        batch_size: number of images encoded per forward pass.

    Returns:
        ``[accuracy_top1, accuracy_top5]`` as a list of two floats.

    Raises:
        ValueError: if no image is found in the dataset directory.
    """
    # Use the GPU when there is one, otherwise run on the CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, preprocess = clip.load(model_name, device=device)
    model.eval()

    imagenet10_dataset = ImageNet10Dataset('data/imagenet10/train', id_to_name, transform=preprocess)
    if len(imagenet10_dataset) == 0:
        raise ValueError("No image found under 'data/imagenet10/train'")
    imagenet10_loader = DataLoader(imagenet10_dataset, batch_size=batch_size, shuffle=False)

    vocabulary = imagenet10_dataset.labels_to_tokenize
    print(vocabulary)

    # Encode the prompts once; they are identical for every batch.
    text_descriptions = [f"This is a photo of a {label}" for label in vocabulary]
    text_tokens = clip.tokenize(text_descriptions).to(device)
    with torch.no_grad():
        text_features = model.encode_text(text_tokens).float()
        text_features /= text_features.norm(dim=-1, keepdim=True)

    # topk cannot request more candidates than there are labels.
    top_k = min(5, len(vocabulary))
    total = 0
    correct_top1 = 0
    correct_top5 = 0
    # Batch processing
    for batch_images, batch_labels in tqdm(imagenet10_loader):
        with torch.no_grad():
            image_features = model.encode_image(batch_images.to(device)).float()
            image_features /= image_features.norm(dim=-1, keepdim=True)
            # Both sides are unit vectors, so the product is the cosine
            # similarity. A softmax would not change the ranking, so the raw
            # similarities are ranked directly.
            similarity = image_features @ text_features.T

        # topk returns candidates sorted best-first: column 0 is the top-1.
        ranked = similarity.topk(top_k, dim=-1).indices.cpu().tolist()
        for predicted_indices, true_label in zip(ranked, batch_labels):
            predicted_names = [vocabulary[index] for index in predicted_indices]
            total += 1
            correct_top1 += predicted_names[0] == true_label
            correct_top5 += true_label in predicted_names

    print("\nScore top 1 : ", correct_top1," / ",total)
    print("\nScore top 5 : ", correct_top5," / ",total)
    accuracy_top1 = correct_top1/total
    accuracy_top5 = correct_top5/total
    print("\nAccuracy top 1 : ", accuracy_top1)
    print("\nAccuracy top 5 : ", accuracy_top5)
    return [accuracy_top1,accuracy_top5]

In [None]:
import gc
# Run a garbage collection; the value shown by the cell is what gc.collect() returns.
gc.collect()

5951

# **Application de la fonction zero_shot, affichage des résultats**

*Anciens résultats, lorsque nous n'utilisions que 10 mots de vocabulaire au lieu de 1000 : ~0.92 (top 1) et 0.999 (top 5).*

Les résultats seraient sans doute encore inférieurs sur la totalité des images d'imagenet1000

In [None]:
# Evaluate the 'ViT-B/32' model with a batch size of 1000; the cell shows the returned [top-1, top-5] accuracies.
zero_shot('ViT-B/32',1000)

['kit_fox', 'English_setter', 'Siberian_husky', 'Australian_terrier', 'English_springer', 'grey_whale', 'lesser_panda', 'Egyptian_cat', 'ibex', 'Persian_cat', 'cougar', 'gazelle', 'porcupine', 'sea_lion', 'malamute', 'badger', 'Great_Dane', 'Walker_hound', 'Welsh_springer_spaniel', 'whippet', 'Scottish_deerhound', 'killer_whale', 'mink', 'African_elephant', 'Weimaraner', 'soft-coated_wheaten_terrier', 'Dandie_Dinmont', 'red_wolf', 'Old_English_sheepdog', 'jaguar', 'otterhound', 'bloodhound', 'Airedale', 'hyena', 'meerkat', 'giant_schnauzer', 'titi', 'three-toed_sloth', 'sorrel', 'black-footed_ferret', 'dalmatian', 'black-and-tan_coonhound', 'papillon', 'skunk', 'Staffordshire_bullterrier', 'Mexican_hairless', 'Bouvier_des_Flandres', 'weasel', 'miniature_poodle', 'Cardigan', 'malinois', 'bighorn', 'fox_squirrel', 'colobus', 'tiger_cat', 'Lhasa', 'impala', 'coyote', 'Yorkshire_terrier', 'Newfoundland', 'brown_bear', 'red_fox', 'Norwegian_elkhound', 'Rottweiler', 'hartebeest', 'Saluki', '

100%|██████████| 13/13 [05:19<00:00, 24.56s/it]


Score top 1 :  8779  /  13000

Score top 5 :  12051  /  13000

Accuracy top 1 :  0.6753076923076923

Accuracy top 5 :  0.927





[0.6753076923076923, 0.927]