In [1]:
# !pip install torch torchvision

# !pip install -U transformers
# !pip install -U albumentations
# !pip install -U opencv-python
# !pip install -U scikit-learn
# !pip install -U Pillow
# !pip install -U tqdm
# !pip install -U pandas
# !pip install -U torchsummary
# !pip install timm
# !pip install ipywidgets

![image](https://miro.medium.com/v2/resize:fit:1400/format:webp/1*XbuW8WuRrAY5pC4t-9DZAQ.jpeg)

In [2]:
import os
import random
import pandas as pd
import numpy as np
from PIL import Image, ImageOps
from tqdm.notebook import tqdm
from itertools import product

from sklearn.metrics import f1_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

import albumentations as A
from albumentations.pytorch import ToTensorV2

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from torchsummary import summary

import timm

In [4]:
def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()

# Подготовка данных для обучения

In [5]:
root_dir = 'dataset'
batch_size = 16
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [6]:
data = pd.DataFrame([
    {'image_path': os.path.join(directory, filename), 'label': os.path.basename(directory)}
    for directory, _, filenames in os.walk(root_dir)
    for filename in filenames
    if os.path.basename(directory).isdigit()
])

In [7]:
data

Unnamed: 0,image_path,label
0,dataset/8443/4e5f5bdd-5d3d-45b3-9408-03f8d1f33...,8443
1,dataset/8443/7b530781-7900-4ae2-b387-a1efdb521...,8443
2,dataset/8443/5af9cd6e-518b-43f8-8f8d-aebf0cdec...,8443
3,dataset/8443/65b140e2-a0b5-4ee6-8c45-23bcfcf20...,8443
4,dataset/8443/4d00724c-a300-48ab-91e6-7f2ba698b...,8443
...,...,...
8995,dataset/13866/78fcc893-7d03-4aff-bf27-5c07ad4c...,13866
8996,dataset/13866/a716dcbd-91b7-4f75-adf3-30404bbc...,13866
8997,dataset/13866/97418098-5215-4863-8dcd-7ba48935...,13866
8998,dataset/13866/e7f08476-fa39-47ff-ba38-ff59d6d5...,13866


In [8]:
data.label.value_counts()

label
8443       600
9617       600
14508      600
14167      600
8909       600
9000316    600
12518      600
14302      600
14498      600
19513      600
12981      600
24303      600
14057      600
9000395    600
13866      600
Name: count, dtype: int64

In [9]:
data.label.nunique()

15

In [10]:
label2id = {value: i for i, value in enumerate(data['label'].unique())}
data['label_id'] = data['label'].map(label2id)

In [11]:
label2id

{'8443': 0,
 '9617': 1,
 '14508': 2,
 '14167': 3,
 '8909': 4,
 '9000316': 5,
 '12518': 6,
 '14302': 7,
 '14498': 8,
 '19513': 9,
 '12981': 10,
 '24303': 11,
 '14057': 12,
 '9000395': 13,
 '13866': 14}

In [12]:
train, val, _, _ = train_test_split(data, data['label_id'], test_size=0.1)

In [13]:
class ProductDataset(Dataset):

    def __init__(self, meta, transform=None):
        self.meta = meta
        self.transform = transform
        self.preprocessing = A.Compose([
            A.Resize(height=224, width=224),
            A.Normalize(),
            ToTensorV2()
        ])

    def __len__(self):
        return len(self.meta)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_name = self.meta['image_path'].iloc[idx]
        image = Image.open(img_name).convert('RGB')
        image = np.asarray(image)

        if self.transform:
            image = self.transform(image = image)['image']

        image = self.preprocessing(image=image)['image']
            
        cls = self.meta['label_id'].iloc[idx]

        return image, cls

In [14]:
train.label.value_counts()

label
12981      550
8443       548
9000395    548
14057      544
14302      544
14508      543
24303      542
8909       541
14167      538
9617       537
19513      537
13866      536
9000316    535
14498      530
12518      527
Name: count, dtype: int64

In [15]:
train_dataset = ProductDataset(meta=train)

In [16]:
img_name = train['image_path'].iloc[0]
img_name
image = Image.open(img_name).convert('RGB')
image = np.asarray(image)
image.shape

# image = self.preprocessing(image=image)['image']

(2048, 1251, 3)

In [17]:
image.min(), image.max()

(np.uint8(0), np.uint8(255))

In [18]:
preprocessing = A.Compose([
    A.Resize(height=224, width=224),
    A.Normalize(),
    ToTensorV2()
])

In [19]:
preprocessing

Compose([
  Resize(p=1.0, height=224, width=224, interpolation=1, mask_interpolation=0),
  Normalize(p=1.0, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), max_pixel_value=255.0, normalization='standard'),
  ToTensorV2(p=1.0, transpose_mask=False),
], p=1.0, bbox_params=None, keypoint_params=None, additional_targets={}, is_check_shapes=True)

In [20]:
new_image = preprocessing(image=image)['image']
new_image

tensor([[[-0.5082, -0.2856, -0.4054,  ..., -1.5528, -1.2959, -1.6384],
         [-0.8849, -0.2342, -0.0972,  ..., -1.4843, -1.5870, -1.6898],
         [-0.8849, -0.8335, -0.8164,  ..., -1.6727, -1.6727, -1.6727],
         ...,
         [-1.0562, -1.2103, -1.0904,  ..., -1.0048, -0.9363, -0.8164],
         [-1.1247, -0.7650, -0.6794,  ..., -0.8678, -0.8507, -0.7479],
         [-0.9363, -1.0562, -1.1760,  ..., -0.7308, -0.6965, -0.7137]],

        [[-0.4076, -0.1800, -0.3025,  ..., -1.4230, -1.2129, -1.5630],
         [-0.7927, -0.1275,  0.0126,  ..., -1.3704, -1.3529, -1.3880],
         [-0.8452, -0.7927, -0.7577,  ..., -1.4055, -1.3354, -1.3354],
         ...,
         [-1.2829, -1.3529, -1.2129,  ..., -0.9678, -0.8803, -0.8102],
         [-1.2654, -0.8978, -0.8102,  ..., -0.8627, -0.8452, -0.7927],
         [-1.0728, -1.1954, -1.3179,  ..., -0.8277, -0.7927, -0.7402]],

        [[-0.4973, -0.2707, -0.3927,  ..., -1.2641, -1.0724, -1.3339],
         [-0.8807, -0.2184, -0.0790,  ..., -1

In [21]:
image_tensor, label = train_dataset[0]

In [22]:
image_tensor.shape

torch.Size([3, 224, 224])

In [23]:
train_dataset = ProductDataset(meta=train)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=12, pin_memory=True)

val_dataset = ProductDataset(meta=val)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=12, pin_memory=True)

# Создание модели и обучение

In [24]:
timm.list_models(pretrained=True)

['aimv2_1b_patch14_224.apple_pt',
 'aimv2_1b_patch14_336.apple_pt',
 'aimv2_1b_patch14_448.apple_pt',
 'aimv2_3b_patch14_224.apple_pt',
 'aimv2_3b_patch14_336.apple_pt',
 'aimv2_3b_patch14_448.apple_pt',
 'aimv2_huge_patch14_224.apple_pt',
 'aimv2_huge_patch14_336.apple_pt',
 'aimv2_huge_patch14_448.apple_pt',
 'aimv2_large_patch14_224.apple_pt',
 'aimv2_large_patch14_224.apple_pt_dist',
 'aimv2_large_patch14_336.apple_pt',
 'aimv2_large_patch14_336.apple_pt_dist',
 'aimv2_large_patch14_448.apple_pt',
 'bat_resnext26ts.ch_in1k',
 'beit_base_patch16_224.in22k_ft_in22k',
 'beit_base_patch16_224.in22k_ft_in22k_in1k',
 'beit_base_patch16_384.in22k_ft_in22k_in1k',
 'beit_large_patch16_224.in22k_ft_in22k',
 'beit_large_patch16_224.in22k_ft_in22k_in1k',
 'beit_large_patch16_384.in22k_ft_in22k_in1k',
 'beit_large_patch16_512.in22k_ft_in22k_in1k',
 'beitv2_base_patch16_224.in1k_ft_in1k',
 'beitv2_base_patch16_224.in1k_ft_in22k',
 'beitv2_base_patch16_224.in1k_ft_in22k_in1k',
 'beitv2_large_patc

In [25]:
model = timm.create_model('resnet50.a1_in1k', pretrained=True, num_classes=0)
model.to(device)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act1): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act1): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (drop_block): Identity()
      (act2): ReLU(inplace=True)
      (aa): Identity()
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
     

In [26]:
summary(model, (3, 224, 224), batch_size=32)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [32, 64, 112, 112]           9,408
       BatchNorm2d-2         [32, 64, 112, 112]             128
              ReLU-3         [32, 64, 112, 112]               0
         MaxPool2d-4           [32, 64, 56, 56]               0
            Conv2d-5           [32, 64, 56, 56]           4,096
       BatchNorm2d-6           [32, 64, 56, 56]             128
              ReLU-7           [32, 64, 56, 56]               0
            Conv2d-8           [32, 64, 56, 56]          36,864
       BatchNorm2d-9           [32, 64, 56, 56]             128
         Identity-10           [32, 64, 56, 56]               0
             ReLU-11           [32, 64, 56, 56]               0
         Identity-12           [32, 64, 56, 56]               0
           Conv2d-13          [32, 256, 56, 56]          16,384
      BatchNorm2d-14          [32, 256,

In [27]:
model.eval()


train_predictions = []
train_targets = []
for batch, targets in tqdm(train_dataloader):
    with torch.no_grad():

        batch = batch.to(device)

        predictions = model(batch)

        train_predictions.extend(predictions.cpu().numpy())
        train_targets.extend(targets.tolist())

train_predictions = np.array(train_predictions)
train_targets = np.array(train_targets)

  0%|          | 0/507 [00:00<?, ?it/s]

In [28]:
train_predictions

array([[0.01146584, 0.        , 0.        , ..., 0.01758988, 0.00593074,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.05723911, 0.02901906,
        0.        ],
       [0.03410833, 0.12590829, 0.00156066, ..., 0.04027033, 0.01835243,
        0.        ],
       ...,
       [0.        , 0.        , 0.11589691, ..., 0.04084621, 0.05453984,
        0.        ],
       [0.00964396, 0.        , 0.0086084 , ..., 0.        , 0.03760785,
        0.00049861],
       [0.05349359, 0.02408644, 0.04418667, ..., 0.05916491, 0.00179488,
        0.        ]], shape=(8100, 2048), dtype=float32)

In [29]:
model.eval()

val_predictions = []
val_targets = []
for batch, targets in tqdm(val_dataloader):

    with torch.no_grad():

        batch = batch.to(device)
        predictions = model(batch)


        val_predictions.extend(predictions.cpu().numpy())
        val_targets.extend(targets.tolist())

val_predictions = np.array(val_predictions)

  0%|          | 0/57 [00:00<?, ?it/s]

In [30]:
val_predictions.shape

(900, 2048)

In [31]:
cosine_similarity(val_predictions, train_predictions).shape

(900, 8100)

In [32]:
val_predictions = train_targets[cosine_similarity(val_predictions, train_predictions).argmax(axis=1)]

In [33]:
print('Val f1:', f1_score(val_targets, val_predictions, average='weighted'))

Val f1: 0.8527672043110576
