In [1]:
# !pip install torch torchvision

# !pip install -U transformers
# !pip install -U albumentations
# !pip install -U opencv-python
# !pip install -U scikit-learn
# !pip install -U Pillow
# !pip install -U tqdm
# !pip install -U pandas
# !pip install -U torchsummary
# !pip install timm
# !pip install ipywidgets

In [2]:
import os
import random
import pandas as pd
import numpy as np
from PIL import Image, ImageOps
from tqdm import tqdm
from itertools import product

from sklearn.metrics import roc_auc_score, f1_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

import albumentations as A
from albumentations.pytorch import ToTensorV2

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from transformers import ViTFeatureExtractor, ViTModel
from torchsummary import summary

In [4]:
def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()

In [5]:
encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=True)
transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6)
src = torch.rand(32, 10, 512)
out = transformer_encoder(src)
out.shape

torch.Size([32, 10, 512])

# Подготовка данных для обучения

In [6]:
root_dir = 'dataset'
batch_size = 32
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [7]:
data = pd.DataFrame([
    {'image_path': os.path.join(directory, filename), 'label': os.path.basename(directory)}
    for directory, _, filenames in os.walk(root_dir)
    for filename in filenames
    if os.path.basename(directory).isdigit()
])

In [8]:
data

Unnamed: 0,image_path,label
0,dataset/8443/4e5f5bdd-5d3d-45b3-9408-03f8d1f33...,8443
1,dataset/8443/7b530781-7900-4ae2-b387-a1efdb521...,8443
2,dataset/8443/5af9cd6e-518b-43f8-8f8d-aebf0cdec...,8443
3,dataset/8443/65b140e2-a0b5-4ee6-8c45-23bcfcf20...,8443
4,dataset/8443/4d00724c-a300-48ab-91e6-7f2ba698b...,8443
...,...,...
8995,dataset/13866/78fcc893-7d03-4aff-bf27-5c07ad4c...,13866
8996,dataset/13866/a716dcbd-91b7-4f75-adf3-30404bbc...,13866
8997,dataset/13866/97418098-5215-4863-8dcd-7ba48935...,13866
8998,dataset/13866/e7f08476-fa39-47ff-ba38-ff59d6d5...,13866


In [9]:
label2id = {value: i for i, value in enumerate(data['label'].unique())}
data['label_id'] = data['label'].map(label2id)

In [10]:
train, val, _, _ = train_test_split(data, data['label_id'], test_size=0.1)

In [11]:
class ProductDataset(Dataset):

    def __init__(self, meta, transform=None):
        self.meta = meta
        self.transform = transform

    def __len__(self):
        return len(self.meta)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_name = self.meta['image_path'].iloc[idx]
        image = Image.open(img_name).convert('RGB')

        if self.transform:
            image = self.transform(image = np.asarray(image))['image']
            image = Image.fromarray(image)

        image = feature_extractor(images=image, return_tensors="pt")['pixel_values'][0]
            
        cls = self.meta['label_id'].iloc[idx]

        return image, cls

In [12]:
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')



In [13]:
train_dataset = ProductDataset(meta=train)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=12, pin_memory=True)

val_dataset = ProductDataset(meta=val)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=12, pin_memory=True)

# Создание модели и обучение

![clip](https://assets-global.website-files.com/5d7b77b063a9066d83e1209c/639b1df59b5ec8f6e5fdb8cf_transformer%20gif.gif)

In [14]:
model = ViTModel.from_pretrained('google/vit-base-patch16-224')
model.to(device)

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ViTModel(
  (embeddings): ViTEmbeddings(
    (patch_embeddings): ViTPatchEmbeddings(
      (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): ViTEncoder(
    (layer): ModuleList(
      (0-11): 12 x ViTLayer(
        (attention): ViTSdpaAttention(
          (attention): ViTSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): ViTSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (intermediate): ViTIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (intermediate_act_fn): GELUAct

In [15]:
model.eval()


train_predictions = []
train_targets = []
for batch, targets in tqdm(train_dataloader):
    with torch.no_grad():

        batch = batch.to(device)

        predictions = model(batch).pooler_output

        train_predictions.extend(predictions.cpu().numpy())
        train_targets.extend(targets.tolist())

train_predictions = np.array(train_predictions)
train_targets = np.array(train_targets)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 254/254 [01:29<00:00,  2.83it/s]


In [16]:
model.eval()

val_predictions = []
val_targets = []
for batch, targets in tqdm(val_dataloader):

    with torch.no_grad():

        batch = batch.to(device)
        predictions = model(batch).pooler_output


        val_predictions.extend(predictions.cpu().numpy())
        val_targets.extend(targets.tolist())

val_predictions = np.array(val_predictions)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:10<00:00,  2.88it/s]


In [17]:
val_predictions = train_targets[cosine_similarity(val_predictions, train_predictions).argmax(axis=1)]

In [18]:
print('Val f1:', f1_score(val_targets, val_predictions, average='weighted'))

Val f1: 0.8585437390817071
