In [1]:
import torch
import timm
import albumentations as A
from albumentations.pytorch import ToTensorV2
import pandas as pd


  check_for_updates()


#### 유사도에 사용할 모델 가져오기

In [2]:
ckpt = torch.load("result/resnet50-256-0.0005-AdamW-O-augv1_09-16_0/epoch=34-step=1645.ckpt")
state_dict = {'.'.join(key.split('.')[2:]): val for key, val in ckpt['state_dict'].items()}

#### 학습된 weight와 동일한 모델 필요

In [3]:
model = timm.create_model('resnet50', pretrained=False,num_classes=500)
model.load_state_dict(state_dict)

<All keys matched successfully>

In [4]:
common_transforms = [
    A.Resize(224, 224),  # 이미지를 224x224 크기로 리사이즈
    A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # 정규화
    ToTensorV2()  # albumentations에서 제공하는 PyTorch 텐서 변환
]
transform = A.Compose(common_transforms)

In [5]:
from torch.utils.data import Dataset,DataLoader
from typing import Callable, Union, Tuple
import cv2
import os

class CustomDataset(Dataset):
    def __init__(
        self, 
        root_dir, 
        info_df, 
        transform,
        is_inference
    ):
        # 데이터셋의 기본 경로, 이미지 변환 방법, 이미지 경로 및 레이블을 초기화합니다.
        self.root_dir = root_dir  # 이미지 파일들이 저장된 기본 디렉토리
        self.transform = transform  # 이미지에 적용될 변환 처리
        self.is_inference = is_inference # 추론인지 확인
        self.info_df = info_df
        # self.info_df = info_df.iloc[:100] #TODO 
        self.image_paths = self.info_df['image_path'].tolist()  # 이미지 파일 경로 목록
        
        if not self.is_inference:
            self.targets = self.info_df['target'].tolist()  # 각 이미지에 대한 레이블 목록

    def __len__(self) -> int:
        # 데이터셋의 총 이미지 수를 반환합니다.
        return len(self.image_paths)

    def __getitem__(self, index: int) -> Union[Tuple[torch.Tensor, int], torch.Tensor]:
        # 주어진 인덱스에 해당하는 이미지를 로드하고 변환을 적용한 후, 이미지와 레이블을 반환합니다.
        img_path = os.path.join(self.root_dir, self.image_paths[index])  # 이미지 경로 조합
        image = cv2.imread(img_path, cv2.IMREAD_COLOR)  # 이미지를 BGR 컬러 포맷의 numpy array로 읽어옵니다.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # BGR 포맷을 RGB 포맷으로 변환합니다.
        image = self.transform(image=image)  # 설정된 이미지 변환을 적용합니다.

        if self.is_inference:
            return image
        else:
            target = self.targets[index]  # 해당 이미지의 레이블
            return image['image'], target, index# 변환된 이미지와 레이블을 튜플 형태로 반환합니다. 

In [6]:
info_df = pd.read_csv('./data/train.csv')

In [7]:
dup_dataset = CustomDataset('./data/train',info_df,transform,False)

In [8]:
dup_dataloader = DataLoader(dup_dataset, batch_size=512, num_workers=4, shuffle=False)

In [9]:
model.cuda()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act1): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act1): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (drop_block): Identity()
      (act2): ReLU(inplace=True)
      (aa): Identity()
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
     

#### 이미지 embedding 추출

In [10]:
feats = []
ys = []
idxes = []
with torch.no_grad():
    for batch in dup_dataloader:
        x,y,idx = batch
        x = x.cuda()
        feat = model(x)
        feats.append(feat)
        ys.append(y)
        idxes.append(idx)

feats = torch.concat(feats)
ys = torch.concat(ys)      
idxes = torch.concat(idxes)  

In [11]:
print(ys.shape)
print(idxes.shape)
print(feats.shape)

torch.Size([15021])
torch.Size([15021])
torch.Size([15021, 500])


In [12]:
import torch.nn.functional as F


In [13]:
feats = F.normalize(feats, p=2, dim=1)


In [14]:
cosine_similarity_matrix = torch.matmul(feats, feats.T)


In [15]:
cosine_similarity_matrix.shape

torch.Size([15021, 15021])

In [16]:
# Upper triangular matrix에서 threshold 이상인 값의 index 찾기
indices = torch.triu(cosine_similarity_matrix, diagonal=1) > 0.999



In [17]:
# 조건을 만족하는 i, j 쌍의 index 추출
pairs = torch.nonzero(indices, as_tuple=False)


In [18]:
pairs.shape

torch.Size([2817, 2])

In [19]:
from tqdm import tqdm

In [20]:
high_similarity_pairs = []
for i,j in tqdm(pairs):
    high_similarity_pairs.append((idxes[i].item(), idxes[j].item(), cosine_similarity_matrix[i, j].item()) )

100%|██████████| 2817/2817 [00:00<00:00, 6396.63it/s]


In [21]:
len(high_similarity_pairs)

2817

In [22]:
paths = info_df['image_path'].tolist()

In [23]:
high_similarity_pairs[0]

(2, 5210, 0.9997597336769104)

In [24]:
train_root = './data/train'

In [67]:
import os
import cv2
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import clear_output

# 데이터 준비 (이 부분은 실제 코드에 맞게 조정해야 합니다)
# miss_df, val_data_dir, class_name_to_idx 등의 데이터와 디렉토리 설정이 필요합니다

def plot_image(value):
    value = int(value)  # 슬라이더 값 추출
    clear_output(wait=True)  # 이전 출력 제거
    fig, ax = plt.subplots(1, 2, figsize=(6, 12))  # 새 그림 생성

    x1,x2,sim = high_similarity_pairs[value]
    # 잘못 예측된 sample (miss_df에서 잘못 예측된 이미지 경로)
    img_path = os.path.join(train_root, paths[x1])
    image = cv2.imread(img_path, cv2.IMREAD_COLOR)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    ax[0].imshow(image)
    x1_label = ys[x1].item()
    ax[0].set_title(f'{x1_label},sim:{sim}')
    ax[0].axis('off')
    

    img_path = os.path.join(train_root, paths[x2])
    image = cv2.imread(img_path, cv2.IMREAD_COLOR)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    ax[1].imshow(image)
    x2_label = ys[x2].item()
    ax[1].set_title(f'{x2_label}')
    ax[1].axis('off')


    plt.tight_layout()
    plt.show()

# 슬라이더 생성
slider = widgets.IntSlider(min=0, max=len(high_similarity_pairs) - 1, step=1, description='Index')

# 슬라이더와 plot_image 함수 연결
widgets.interact(plot_image, value=slider)


interactive(children=(IntSlider(value=0, description='Index', max=2816), Output()), _dom_classes=('widget-inte…

<function __main__.plot_image(value)>

## 특정 class 만 

In [74]:
path_dict = {}
for idx,(i,j,sim) in enumerate(high_similarity_pairs):
    path_dict[idx] = (paths[i],paths[j],sim,ys[i].item(),ys[j].item())

In [83]:
len(path_dict)

2817

In [75]:
res_df = pd.DataFrame(path_dict).T

In [76]:
len(res_df[res_df[3]==res_df[4]]) # 최종 이미지 pair 

2644

In [38]:
len(set(res_df[res_df[3]==res_df[4]][1]))

2027

In [43]:
class_df = res_df[res_df[3]==res_df[4]]

In [77]:
class_df[class_df['class']=='n02110958']

Unnamed: 0,0,1,2,3,4,class
1190,n02110958/sketch_48.JPEG,n02110958/sketch_2.JPEG,0.999292,150,150,n02110958


In [53]:
class_df['class'] = class_df[0].apply(lambda x: x.split('/')[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  class_df['class'] = class_df[0].apply(lambda x: x.split('/')[0])


In [78]:
class_df[class_df['class'] == 'n02110958']

Unnamed: 0,0,1,2,3,4,class
1190,n02110958/sketch_48.JPEG,n02110958/sketch_2.JPEG,0.999292,150,150,n02110958


In [54]:
tt = res_df[res_df[3]!=res_df[4]]

Unnamed: 0,0,1,2,3,4,class
0,n02106166/sketch_3.JPEG,n02106166/sketch_29.JPEG,0.99976,138,138,n02106166
1,n02056570/sketch_40.JPEG,n02056570/sketch_5.JPEG,0.999002,80,80,n02056570
2,n01748264/sketch_1.JPEG,n01748264/sketch_44.JPEG,0.999205,38,38,n01748264
3,n03857828/sketch_26.JPEG,n03857828/sketch_21.JPEG,0.999569,330,330,n03857828
4,n03857828/sketch_26.JPEG,n03857828/sketch_36.JPEG,0.999555,330,330,n03857828
...,...,...,...,...,...,...
2811,n02971356/sketch_13.JPEG,n02971356/sketch_39.JPEG,0.999075,258,258,n02971356
2812,n01828970/sketch_2.JPEG,n01828970/sketch_49.JPEG,0.999214,52,52,n01828970
2814,n07583066/sketch_0.JPEG,n07583066/sketch_45.JPEG,0.999514,463,463,n07583066
2815,n07742313/sketch_3.JPEG,n07742313/sketch_42.JPEG,0.99961,474,474,n07742313


In [32]:
tt.to_csv('diff_cls_pair.csv')

In [99]:
remain = info_df[(~info_df.image_path.isin(set(res_df[res_df[3]==res_df[4]][1])))]

In [100]:
remain.to_csv('remain.csv')

In [101]:
res2 = remain['class_name'].value_counts()

In [97]:
res2.to_csv('value_counts.csv', header=True)

In [58]:
cv_df = pd.read_csv('./train_reduce.csv')

In [60]:
cv_df['image_path'] = cv_df['image_path'].apply(lambda x: '/'.join(x.split('/')[3:]))

In [66]:
info_df[(~info_df['image_path'].isin(set(cv_df['image_path']))) & (info_df['class_name'] == 'n02110958')]


Unnamed: 0,class_name,image_path,target
8541,n02110958,n02110958/sketch_44.JPEG,150


In [60]:
remain['image_path'] = remain['image_path'].apply(lambda x: "./data/train/"+x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  remain['image_path'] = remain['image_path'].apply(lambda x: "./data/train/"+x)


In [62]:
res = remain[remain['image_path'].isin(set(cv_df['image_path']))]

In [64]:
res.to_csv('del_dup_train.csv')

In [63]:
res

Unnamed: 0,class_name,image_path,target
0,n01872401,./data/train/n01872401/sketch_50.JPEG,59
1,n02417914,./data/train/n02417914/sketch_11.JPEG,202
2,n02106166,./data/train/n02106166/sketch_3.JPEG,138
3,n04235860,./data/train/n04235860/sketch_2.JPEG,382
4,n02056570,./data/train/n02056570/sketch_40.JPEG,80
...,...,...,...
15011,n02108915,./data/train/n02108915/sketch_3.JPEG,145
15012,n02107683,./data/train/n02107683/sketch_7.JPEG,142
15016,n02108089,./data/train/n02108089/sketch_32.JPEG,143
15017,n02129604,./data/train/n02129604/sketch_7.JPEG,172


In [None]:
res_df.to_csv('res.csv')

#### n07860988class 중복 보기

In [174]:
remain = info_df[(info_df.class_name == 'n07860988') & (~info_df.image_path.isin(set(res_df[res_df[3] == 481][1])))]

In [138]:
import shutil
import os


In [141]:
image_paths = remain['image_path'].tolist()
destination_dir = './image_remain'
os.makedirs(destination_dir, exist_ok=True)


In [148]:
for image_path in image_paths:
    # 이미지 파일 이름을 추출
    file_name = os.path.basename(image_path)
    # 대상 경로로 복사
    shutil.copy(os.path.join(train_root,image_path), os.path.join(destination_dir, file_name))


#### 다른 class 유사도 높은 샘플 보기/

In [176]:
check = res_df[res_df[3]!=res_df[4]]

In [178]:
import os
import cv2
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import clear_output

# 데이터 준비 (이 부분은 실제 코드에 맞게 조정해야 합니다)
# miss_df, val_data_dir, class_name_to_idx 등의 데이터와 디렉토리 설정이 필요합니다

def plot_image(value):
    value = int(value)  # 슬라이더 값 추출
    clear_output(wait=True)  # 이전 출력 제거
    fig, ax = plt.subplots(1, 2, figsize=(6, 12))  # 새 그림 생성

    check_img = check.iloc[value]
    # 잘못 예측된 sample (miss_df에서 잘못 예측된 이미지 경로)
    img_path = os.path.join(train_root, check_img[0])
    image = cv2.imread(img_path, cv2.IMREAD_COLOR)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    ax[0].imshow(image)
    x1_label = check_img[3]
    ax[0].set_title(f'{x1_label}')
    ax[0].axis('off')
    

    img_path = os.path.join(train_root, check_img[1])
    image = cv2.imread(img_path, cv2.IMREAD_COLOR)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    ax[1].imshow(image)
    x2_label = check_img[4]
    ax[1].set_title(f'{x2_label}')
    ax[1].axis('off')


    plt.tight_layout()
    plt.show()

# 슬라이더 생성
slider = widgets.IntSlider(min=0, max=len(check) - 1, step=1, description='Index')

# 슬라이더와 plot_image 함수 연결
widgets.interact(plot_image, value=slider)


interactive(children=(IntSlider(value=0, description='Index', max=172), Output()), _dom_classes=('widget-inter…

<function __main__.plot_image(value)>

In [162]:
len(check)

173