---
title: "ZERO-Shot Image Classification"
author: "이정민"
date: "02/09/2024"
categories:
  - Deep Learning
  - zero-shot image classification
---

# 1. import

In [5]:
import pandas as pd
from tqdm import tqdm
from natsort import natsorted
import os, json, open_clip, torch
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader

# 2. open clip

In [9]:
## List the checkpoints open_clip knows about; each entry is a ('model_name', 'pretrained') pair.
open_clip.list_pretrained()[:5]

[('RN50', 'openai'),
 ('RN50', 'yfcc15m'),
 ('RN50', 'cc12m'),
 ('RN50-quickgelu', 'openai'),
 ('RN50-quickgelu', 'yfcc15m')]

# image classification 기본 코드

In [18]:
import pandas as pd
from tqdm import tqdm
from natsort import natsorted
import os, json, open_clip, torch
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader

# 0. Settings
device = 'cuda:0'
# These two settings now drive the loader and the tokenizer below. They hold
# the checkpoint this cell previously hard-coded in the loader call.
model_name = 'ViT-bigG-14-CLIPA'
pretrained = 'datacomp1b'
root = './'
dataset_name = 'Scene'

# 1. Load CLIP model.
# NOTE(review): eval() is never called. The open_clip README says models are
# returned in train mode by default; if so, force_patch_dropout stays active
# during inference and predictions are not deterministic - confirm intended.
model, _, preprocess = open_clip.create_model_and_transforms(
    model_name=model_name,
    pretrained=pretrained,
    force_patch_dropout=0.05
)
tokenizer = open_clip.get_tokenizer(model_name)
# Move the model to the GPU up front so the text and image towers both run
# there, as the later model cells do.
model = model.to(device)

# 2. Load test dataset.
ds = ImageFolder(os.path.join(root, dataset_name), transform=preprocess)
# Natural sort so that row order (and therefore id_idx) follows the file names.
ds.samples = natsorted(ds.samples)
dl = DataLoader(ds, shuffle=False, batch_size=32, num_workers=2)

# 3. Load class name list.
with open(os.path.join(root, 'classes.json'), 'r') as j:
    class_names = json.load(j)

# 4. Perform zero-shot classification.
labels = []
with torch.no_grad(), torch.cuda.amp.autocast():
    # The raw class names are used as the text prompts.
    text = tokenizer([f"{class_name}" for class_name in class_names]).to(device)
    text_features = model.encode_text(text)
    text_features /= text_features.norm(dim=-1, keepdim=True)
    for x, _ in tqdm(dl):
        x = x.to(device)
        image_features = model.encode_image(x)
        image_features /= image_features.norm(dim=-1, keepdim=True)
        # Scaled cosine similarity against every class prompt -> class probabilities.
        zero_shot_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
        labels += zero_shot_probs.argmax(dim=-1).tolist()

# Derive the ids from the number of predictions instead of a fixed 8100, so
# the frame is always consistent with the dataset that was actually read.
submission = {'id_idx': list(range(len(labels))), 'label': labels}

# 5. Save prediction as submission.csv file.
pd.DataFrame(submission).to_csv(os.path.join(root, 'submission.csv'), index=False)

100%|███████████████████████████████████████████████████████████████████| 254/254 [01:10<00:00,  3.60it/s]


- SCORE : 0.88790

## 이미지 증강

### CROP_transform

In [None]:
# `transforms` is not imported by any of the cells shown above, so import it
# here; without it this cell raises NameError.
from torchvision import transforms

# Centre-crop to 95% of 224 px (int(224 * 0.95) == 212) before the model's own
# preprocessing pipeline runs.
# NOTE(review): per the torchvision docs CenterCrop zero-pads inputs smaller
# than the crop size - confirm the source images are at least this large.
crop_size = int(224 * 0.95)
crop_transform = transforms.Compose([
    transforms.CenterCrop((crop_size, crop_size)),
    preprocess
])

# Load dataset
ds = ImageFolder(os.path.join(root, dataset_name), transform=crop_transform)

## 점수 제일 많이 나왔던 코드(검증 파일)

### classes
classes :

["Buildings","Forests","Glacier","Mountains","Sea","Street"]

### classes2 :

["Building","Forest","Glacier","Mountains","Sea","Street"]

### - model 1 

In [20]:
# Runtime settings for this cell.
device = 'cuda:0'
root = './'
dataset_name = 'Scene'

# Model 1: ViT-bigG-14-CLIPA with the datacomp1b weights, fed with the
# model's stock preprocessing only.
model, _, preprocess = open_clip.create_model_and_transforms(
    model_name='ViT-bigG-14-CLIPA',
    pretrained='datacomp1b'
)
tokenizer = open_clip.get_tokenizer('ViT-bigG-14-CLIPA')
model = model.to(device)

ds = ImageFolder(os.path.join(root, dataset_name), transform=preprocess)
# Natural sort so that row order (and therefore id_idx) follows the file names.
ds.samples = natsorted(ds.samples)
dl = DataLoader(ds, shuffle=False, batch_size=32, num_workers=2)

with open(os.path.join(root, 'classes.json'), 'r') as j:
    class_names = json.load(j)

submission = {'id_idx': [], 'label': [], 'probabilities': []}

with torch.no_grad(), torch.cuda.amp.autocast():
    # The raw class names are used as the text prompts.
    text = tokenizer([f"{class_name}" for class_name in class_names]).to(device)
    text_features = model.encode_text(text)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    for x, _ in dl:
        x = x.to(device)  # Move image tensors to GPU
        image_features = model.encode_image(x)
        image_features /= image_features.norm(dim=-1, keepdim=True)

        # Scaled cosine similarity against every class prompt -> class probabilities.
        zero_shot_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

        # Convert the whole batch at once instead of calling .item()/.tolist()
        # per image; the full probability rows are kept for soft voting.
        submission['label'] += zero_shot_probs.argmax(dim=-1).tolist()
        submission['probabilities'] += zero_shot_probs.tolist()

# The loader is not shuffled, so the running position is the image id.
submission['id_idx'] = list(range(len(submission['label'])))

# Save predictions to basicBigG14.csv
pd.DataFrame(submission).to_csv(os.path.join(root, 'basicBigG14.csv'), index=False)

KeyboardInterrupt: 

### - model 2

In [None]:
# model 2
# `transforms` is not imported by any of the cells shown above, so import it
# here; without it this cell raises NameError.
from torchvision import transforms

# NOTE(review): eval() is never called. The open_clip README says models are
# returned in train mode by default; if so, force_patch_dropout stays active
# during inference and predictions are not deterministic - confirm intended.
model, _, preprocess = open_clip.create_model_and_transforms(
    model_name='ViT-L-14-CLIPA',
    pretrained='datacomp1b',
    force_patch_dropout=0.05
)
tokenizer = open_clip.get_tokenizer('ViT-L-14-CLIPA')
model = model.to(device)

# Centre-crop to 95% of 224 px (212 px) before the model's own preprocessing.
crop_size = int(224 * 0.95)
crop_transform = transforms.Compose([
    transforms.CenterCrop((crop_size, crop_size)),
    preprocess
])

# Load dataset
ds = ImageFolder(os.path.join(root, dataset_name), transform=crop_transform)
# Natural sort so that row order (and therefore id_idx) follows the file names.
ds.samples = natsorted(ds.samples)
dl = DataLoader(ds, shuffle=False, batch_size=64, num_workers=2)

# Load class names
with open(os.path.join(root, 'classes2.json'), 'r') as j:
    class_names = json.load(j)

# Perform zero-shot classification
submission = {'id_idx': [], 'label': [], 'probabilities': []}
with torch.no_grad(), torch.cuda.amp.autocast():
    # The raw class names are used as the text prompts.
    text = tokenizer([f"{class_name}" for class_name in class_names]).to(device)
    text_features = model.encode_text(text)
    text_features /= text_features.norm(dim=-1, keepdim=True)
    for x, _ in dl:
        x = x.to(device)
        image_features = model.encode_image(x)
        image_features /= image_features.norm(dim=-1, keepdim=True)
        # Scaled cosine similarity against every class prompt -> class probabilities.
        zero_shot_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
        # Convert the whole batch at once instead of calling .item()/.tolist()
        # per image; the full probability rows are kept for soft voting.
        submission['label'] += zero_shot_probs.argmax(dim=-1).tolist()
        submission['probabilities'] += zero_shot_probs.tolist()

# The loader is not shuffled, so the running position is the image id.
submission['id_idx'] = list(range(len(submission['label'])))

# Save predictions to z_L14_cl2_do_64_crop.csv
pd.DataFrame(submission).to_csv(os.path.join(root, 'z_L14_cl2_do_64_crop.csv'), index=False)


### - model 3

In [None]:
# `transforms` is not imported by any of the cells shown above, so import it
# here; without it this cell raises NameError.
from torchvision import transforms

# NOTE(review): eval() is never called. The open_clip README says models are
# returned in train mode by default; if so, force_patch_dropout stays active
# during inference and predictions are not deterministic - confirm intended.
model, _, preprocess = open_clip.create_model_and_transforms(
    model_name='EVA02-L-14',
    pretrained='merged2b_s4b_b131k',
    force_patch_dropout=0.05
)
tokenizer = open_clip.get_tokenizer('EVA02-L-14')
model = model.to(device)

# Centre-crop to 95% of 224 px (212 px) before the model's own preprocessing.
crop_size = int(224 * 0.95)
crop_transform = transforms.Compose([
    transforms.CenterCrop((crop_size, crop_size)),
    preprocess
])

# Load dataset
ds = ImageFolder(os.path.join(root, dataset_name), transform=crop_transform)
# Natural sort so that row order (and therefore id_idx) follows the file names.
ds.samples = natsorted(ds.samples)
dl = DataLoader(ds, shuffle=False, batch_size=64, num_workers=2)

# Load class names
with open(os.path.join(root, 'classes2.json'), 'r') as j:
    class_names = json.load(j)

# Perform zero-shot classification
submission = {'id_idx': [], 'label': [], 'probabilities': []}
with torch.no_grad(), torch.cuda.amp.autocast():
    # The raw class names are used as the text prompts.
    text = tokenizer([f"{class_name}" for class_name in class_names]).to(device)
    text_features = model.encode_text(text)
    text_features /= text_features.norm(dim=-1, keepdim=True)
    for x, _ in dl:
        x = x.to(device)
        image_features = model.encode_image(x)
        image_features /= image_features.norm(dim=-1, keepdim=True)
        # Scaled cosine similarity against every class prompt -> class probabilities.
        zero_shot_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
        # Convert the whole batch at once instead of calling .item()/.tolist()
        # per image; the full probability rows are kept for soft voting.
        submission['label'] += zero_shot_probs.argmax(dim=-1).tolist()
        submission['probabilities'] += zero_shot_probs.tolist()

# The loader is not shuffled, so the running position is the image id.
submission['id_idx'] = list(range(len(submission['label'])))

# Save predictions to z_EVA02_cl2_do_64_crop.csv
pd.DataFrame(submission).to_csv(os.path.join(root, 'z_EVA02_cl2_do_64_crop.csv'), index=False)

## 소프트 보팅

In [7]:
import pandas as pd
import numpy as np
import ast
# Per-model prediction files; the file names match the outputs of the
# model 1-3 cells.
df1 = pd.read_csv('/root/ajou/basicBigG14.csv')
df2 = pd.read_csv('/root/ajou/z_L14_cl2_do_64_crop.csv')
df3 = pd.read_csv('/root/ajou/z_EVA02_cl2_do_64_crop.csv')

# Put the other two models' probability columns next to df1's own columns
# (pandas aligns the assigned Series on the row index).
data = df1.assign(
    probabilities2=df2.probabilities,
    probabilities3=df3.probabilities,
)

# The probability lists come back from the CSV as text; parse each one back
# into a Python list.
for prob_column in ('probabilities', 'probabilities2', 'probabilities3'):
    data[prob_column] = data[prob_column].apply(ast.literal_eval)

# Average the class probabilities of the three models
def soft_voting(row):
    """Return the element-wise mean of the three probability vectors in *row*.

    *row* is indexed by 'probabilities', 'probabilities2' and
    'probabilities3'; the result is a NumPy array.
    """
    first, second, third = (
        np.array(row[column])
        for column in ('probabilities', 'probabilities2', 'probabilities3')
    )
    return (first + second + third) / 3

# Run soft_voting on every row; the new column holds one averaged probability array per row.
data['soft_voting'] = data.apply(soft_voting, axis=1)

# Pick the final predicted class
def final_prediction(row):
    """Return the index of the largest value in row['soft_voting']."""
    averaged = np.asarray(row['soft_voting'])
    return averaged.argmax()

# Run final_prediction on every row to get the ensemble's class index.
data['final_prediction'] = data.apply(final_prediction, axis=1)

# Drop the label column loaded from the first CSV and the intermediate
# probability columns, then publish the ensemble prediction under the name 'label'.
helper_columns = ['label', 'probabilities', 'probabilities2', 'probabilities3', 'soft_voting']
data1 = data.drop(columns=helper_columns)
data1 = data1.rename(columns={'final_prediction': 'label'})

data1.to_csv("test5.csv", index=False)