In [1]:
import torch
import torch.nn as nn
import pandas as pd
import torch.optim as optim
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import random_split
from sklearn.model_selection import train_test_split

from typing import Iterable

def normalize_vector(vector):
    norm = np.sqrt(np.sum(np.square(vector)))
    if norm > 0.001:
        return vector / norm
    else:
        return vector

# Чтение данных, преобразование в нужный формат, а также нормализация

In [2]:
images_embeds_df = pd.read_parquet('/kaggle/input/resnet/images_embed.parquet')

In [3]:
images_embeds_df["main_pic_embeddings_resnet_v1"] = images_embeds_df["main_pic_embeddings_resnet_v1"].apply(lambda x: normalize_vector(x[0]))

In [4]:
texts_embeds_df = pd.read_parquet("/kaggle/input/ozon-for-hse/text_and_bert.parquet")[["variantid", "name_bert_64"]]
texts_embeds_df["name_bert_64"] = texts_embeds_df["name_bert_64"].apply(normalize_vector)

In [5]:
embeddings_df = texts_embeds_df.merge(images_embeds_df, on="variantid")

In [6]:
del images_embeds_df, texts_embeds_df

In [7]:
embeddings_df.head()

Unnamed: 0,variantid,name_bert_64,main_pic_embeddings_resnet_v1
0,47920382,"[-0.05876269060643361, 0.1514294495403412, 0.0...","[0.14319316885121922, 0.16504079909482125, 0.0..."
1,49801845,"[-0.1543456495446977, 0.08829657672401005, 0.1...","[-0.0920594657957012, -0.036786389998702776, -..."
2,49853444,"[-0.06491635238860302, 0.08863572598744762, 0....","[0.021624698388432357, -0.06500051838396713, -..."
3,49893028,"[-0.13959296579755773, 0.10721153735083641, 0....","[0.030369836841500398, 0.040941960232568256, 0..."
4,49987483,"[-0.07627172262796215, 0.10504576447490983, 0....","[0.11102656253931156, 0.024067826844113762, -0..."


Далее загрузим тренировочный и тестовый дата-сеты с variantid_1 и variantid_2, а также target'ом

In [8]:
train_df, test_df = train_test_split(pd.read_parquet("/kaggle/input/ozon-for-hse/train.parquet"), test_size=0.2, random_state=42)

In [9]:
# Переведем в словарь, чтобы было легче работать
embed_dict = embeddings_df.set_index("variantid").to_dict()

In [10]:
class VariantPairDataset(Dataset):
    def __init__(
        self,
        variant_pairs: Iterable[tuple[int, int]],
        text_embeddings: dict[int, np.ndarray],
        img_embeddings: dict[int, np.ndarray],
        targets: np.ndarray
    ) -> None:
        self.variant_pairs = variant_pairs
        self.text_embeddings = text_embeddings
        self.img_embeddings = img_embeddings
        self.targets = targets

    def __len__(self) -> int:
        return len(self.variant_pairs)

    def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
        variantid1, variantid2 = self.variant_pairs[idx]

        text_emb1 = self.text_embeddings[variantid1]
        img_emb1 = self.img_embeddings[variantid1]
        text_emb2 = self.text_embeddings[variantid2]
        img_emb2 = self.img_embeddings[variantid2]

        target = self.targets[idx]

        sample = {
            'text_emb1': torch.tensor(text_emb1, dtype=torch.float32),
            'img_emb1': torch.tensor(img_emb1, dtype=torch.float32),
            'text_emb2': torch.tensor(text_emb2, dtype=torch.float32),
            'img_emb2': torch.tensor(img_emb2, dtype=torch.float32),
            'target': torch.tensor(target, dtype=torch.float32)
        }

        return sample

## Создание экземпляров VariantPairDataset
Теперь, когда данные подготовлены, можно создать экземпляры класса VariantPairDataset для тренировочного и тестового наборов. Эти экземпляры можно использовать для обучения и валидации модели.

In [11]:
train_dataset = VariantPairDataset(
    train_df[["variantid1", "variantid2"]].values,
    embed_dict["name_bert_64"],
    embed_dict["main_pic_embeddings_resnet_v1"],
    train_df["target"].values
)

test_dataset = VariantPairDataset(
    test_df[["variantid1", "variantid2"]].values,
    embed_dict["name_bert_64"],
    embed_dict["main_pic_embeddings_resnet_v1"],
    test_df["target"].values
)

In [12]:
train_dataloader = DataLoader(train_dataset, batch_size=4096, shuffle=True)
test_dataloader = DataLoader(train_dataset, batch_size=4096, shuffle=False)

# Инициализация архитектуры модели

Входная размерность: 2 * (text_emb_size + img_emb_size) - по сути делаем конкатенацию эмбедингов двух товаров с учетом эмбедингов текстов и картинок

### Слои сети
Многоуровневая полносвязная часть:

Для каждого слоя используется линейное преобразование с активацией PReLU

Слой нормализации (BatchNorm)

Количество скрытых слоёв (nlayers) задаётся гиперпараметром. Каждый слой имеет размерность hidden_size

На первом слое размер входных данных равен 2 * (text_emb_size + img_emb_size), а на остальных слоях — hidden_size.

### Выход
на выходе применяется сигмоида для получения значения от 0 до 1, что соответствует вероятности принадлежности пары к положительному классу.

Модель обучается с использованием стандартной функции потерь для бинарной классификации BCEWithLogitsLoss

In [13]:
class PairwiseBinaryClassifier(nn.Module):
    def __init__(
        self,
        text_emb_size: int,
        img_emb_size: int,
        hidden_size: int,
        nlayers: int
    ) -> None:
        super(PairwiseBinaryClassifier, self).__init__()
        input_size = 2 * (text_emb_size + img_emb_size)
        layers = []
        for i in range(nlayers):
            layers.extend(
                [
                    nn.Linear(input_size if i == 0 else hidden_size, hidden_size),
                    nn.BatchNorm1d(hidden_size),
                    nn.PReLU()
                ]
            )
        self.layers = nn.Sequential(*layers)
        self.scorer = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()
        self._init_params()

    def _init_params(self):
        for param in self.parameters():
            if param.dim() > 1:
                nn.init.xavier_uniform_(param)

    def forward(self, text_emb1, img_emb1, text_emb2, img_emb2):
        x = torch.cat((text_emb1, img_emb1, text_emb2, img_emb2), dim=-1)
        x = self.layers(x)
        x = self.sigmoid(self.scorer(x))
        return x

## Инициализация модели с гиперпараметрами

In [14]:
model = PairwiseBinaryClassifier(text_emb_size=64, img_emb_size=128, hidden_size=512, nlayers=5)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [15]:
def evaluate_model(model: nn.Module, dataloader: DataLoader) -> tuple[list[float], list[float], list[float], float]:
    model.eval()
    all_targets = []
    all_predictions = []
    all_probas = []

    with torch.no_grad():
        for batch in dataloader:
            text_emb1 = batch['text_emb1']
            img_emb1 = batch['img_emb1']
            text_emb2 = batch['text_emb2']
            img_emb2 = batch['img_emb2']
            targets = batch['target']

            outputs = model(text_emb1, img_emb1, text_emb2, img_emb2)
            loss = criterion(outputs, batch["target"].view(-1, 1))
            predictions = (outputs > 0.5).float()

            all_targets.extend(targets.cpu().numpy().tolist())
            all_predictions.extend(predictions.squeeze().cpu().numpy().tolist())
            all_probas.extend(outputs.squeeze().cpu().numpy().tolist())

    return all_targets, all_predictions, all_probas, loss.item()

## Обучение модели на 10 эпохах

In [16]:
num_epochs = 10

for epoch in range(num_epochs):
    model.train()

    for batch in train_dataloader:
        outputs = model(*list(batch.values())[:-1])
        loss = criterion(outputs, batch["target"].view(-1, 1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    _, _, _, eval_loss = evaluate_model(model, test_dataloader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {loss.item():.4f}, Eval Loss: {eval_loss:.4f}')

Epoch [1/10], Train Loss: 0.4810, Eval Loss: 0.4963
Epoch [2/10], Train Loss: 0.4017, Eval Loss: 0.4201
Epoch [3/10], Train Loss: 0.3998, Eval Loss: 0.3667
Epoch [4/10], Train Loss: 0.3453, Eval Loss: 0.3646
Epoch [5/10], Train Loss: 0.3350, Eval Loss: 0.3444
Epoch [6/10], Train Loss: 0.3577, Eval Loss: 0.3130
Epoch [7/10], Train Loss: 0.3097, Eval Loss: 0.3014
Epoch [8/10], Train Loss: 0.3380, Eval Loss: 0.2668
Epoch [9/10], Train Loss: 0.3198, Eval Loss: 0.2510
Epoch [10/10], Train Loss: 0.2655, Eval Loss: 0.2326


In [17]:
torch.save(model.state_dict(), 'pairwise_binary_classifier.pth')

## Замеряем метрику на тестовом дата-сете

In [18]:
real, preds, probas, loss = evaluate_model(model, test_dataloader)

In [19]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_curve, auc

# Compute evaluation metrics
accuracy = accuracy_score(real, preds)
precision = precision_score(real, preds)
recall = recall_score(real, preds)
prauc_precision, prauc_recall, _ = precision_recall_curve(real, probas)
prauc = auc(prauc_recall, prauc_precision)
f1 = f1_score(real, preds)


print(f'Loss: {loss:.4f}')
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'PR-AUC: {prauc:.4f}')
print(f'F1 Score: {f1:.4f}')

Loss: 0.2326
Accuracy: 0.9066
Precision: 0.8865
Recall: 0.9241
PR-AUC: 0.9683
F1 Score: 0.9049


In [36]:
import pandas as pd
import numpy as np

sampled_df = test_df.sample(n=15, random_state=42)

sampled_variant_pairs = sampled_df[["variantid1", "variantid2", "target"]]

sampled_variant_pairs["name_bert_64_1"] = sampled_variant_pairs["variantid1"].map(embed_dict["name_bert_64"])
sampled_variant_pairs["name_bert_64_2"] = sampled_variant_pairs["variantid2"].map(embed_dict["name_bert_64"])
sampled_variant_pairs["main_pic_embeddings_resnet_v1_1"] = sampled_variant_pairs["variantid1"].map(embed_dict["main_pic_embeddings_resnet_v1"])
sampled_variant_pairs["main_pic_embeddings_resnet_v1_2"] = sampled_variant_pairs["variantid2"].map(embed_dict["main_pic_embeddings_resnet_v1"])

sampled_variant_pairs.to_csv("sampled_test_data_with_embeddings.csv", index=False)
sampled_variant_pairs.to_pickle("sampled_test_data_with_embeddings.pkl")
print("CSV файл с 15 семплами и embeddings создан!")



CSV файл с 15 семплами и embeddings создан!


In [37]:
import torch
import pandas as pd
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

sampled_df = pd.read_pickle("/kaggle/working/sampled_test_data_with_embeddings.pkl")

text_embeddings_sampled = embed_dict["name_bert_64"]
img_embeddings_sampled = embed_dict["main_pic_embeddings_resnet_v1"]

sampled_variant_pairs = sampled_df[["variantid1", "variantid2"]].values
targets_sampled = sampled_df["target"].values

sampled_dataset = VariantPairDataset(
    variant_pairs=sampled_variant_pairs,
    text_embeddings=text_embeddings_sampled,
    img_embeddings=img_embeddings_sampled,
    targets=targets_sampled
)

# Даталоадер для предсказания
sampled_dataloader = DataLoader(sampled_dataset, batch_size=15, shuffle=False)

# Оценка модели
def evaluate_model(model: nn.Module, dataloader: DataLoader) -> tuple[list[float], list[float], list[float], float]:
    model.eval()
    all_targets = []
    all_predictions = []
    all_probas = []

    with torch.no_grad():
        for batch in dataloader:
            text_emb1 = batch['text_emb1']
            img_emb1 = batch['img_emb1']
            text_emb2 = batch['text_emb2']
            img_emb2 = batch['img_emb2']
            targets = batch['target']

            # Прогоняем данные через модель
            outputs = model(text_emb1, img_emb1, text_emb2, img_emb2)
            loss = criterion(outputs, batch["target"].view(-1, 1))
            predictions = (outputs > 0.5).float()

            all_targets.extend(targets.cpu().numpy().tolist())
            all_predictions.extend(predictions.squeeze().cpu().numpy().tolist())
            all_probas.extend(outputs.squeeze().cpu().numpy().tolist())

    return all_targets, all_predictions, all_probas, loss.item()

# Получаем результаты предсказаний
real, preds, probas, loss = evaluate_model(model, sampled_dataloader)

# Метрики
accuracy = accuracy_score(real, preds)
f1 = f1_score(real, preds)
roc_auc = roc_auc_score(real, probas)

print(f"Loss: {loss}")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"ROC AUC: {roc_auc}")

Loss: 0.2778693437576294
Accuracy: 0.8666666666666667
F1 Score: 0.8000000000000002
ROC AUC: 0.96


In [39]:
sampled_df

Unnamed: 0,variantid1,variantid2,target,name_bert_64_1,name_bert_64_2,main_pic_embeddings_resnet_v1_1,main_pic_embeddings_resnet_v1_2
586673,447116673,447276570,0,"[-0.11546845693788657, 0.11879767000499351, 0....","[-0.12460829199407925, 0.12305943289675993, 0....","[-0.052273153188848204, 0.10378567635655774, -...","[-0.052273153188848204, 0.10378567635655774, -..."
110057,588412848,1169982188,0,"[-0.08414979233219054, 0.1594500221920774, 0.1...","[-0.06564694978218584, 0.16773359051108175, 0....","[0.0028043567225936416, 0.08737022878496849, 0...","[0.020681329515193053, 0.09606176851262922, 0...."
402656,1538045212,1260308147,1,"[-0.06026564376806319, 0.06466471119274592, 0....","[-0.06026564376806319, 0.06466471119274592, 0....","[0.013333064400492953, 0.04302197479255666, -0...","[0.027618132749886747, 0.043863263628167844, 0..."
603778,624543430,643995467,1,"[-0.10795983864026687, 0.11143058981914807, 0....","[-0.10795983864026687, 0.11143058981914807, 0....","[-0.1444987461840761, 0.012642815029058228, 0....","[-0.1444987461840761, 0.012642815029058228, 0...."
990986,549717139,770507056,0,"[-0.05780336186312863, 0.10871770418453706, 0....","[-0.10695534967774047, 0.10274007141308132, 0....","[-0.11563188984453558, -0.0300171418274068, 0....","[-0.14109791208323372, -0.09175968737285321, 0..."
409047,649797520,879623472,0,"[-0.12076712039491183, 0.07852172420462776, 0....","[-0.06267111313383358, 0.10102431998347174, 0....","[0.026176950310800598, 0.07444865815055808, 0....","[0.09139589627693984, 0.0473513305769275, -0.0..."
702968,520370280,519611246,0,"[-0.11797401983607729, 0.10413861452454366, 0....","[-0.10429043371336227, 0.15591572990719407, 0....","[0.15868388728853233, -0.12677397737166818, 0....","[0.15863902201190758, -0.12766167187419677, 0...."
159390,1378854094,1479216673,0,"[-0.15941642796337185, 0.09956636748152727, 0....","[-0.14322157790637444, 0.09359274834060663, 0....","[-0.09069716768735359, 0.0842488682283678, 0.0...","[-0.034625219577910406, 0.03753115899603996, -..."
669154,559634167,1068285114,1,"[-0.19295330385439521, 0.1060335238520659, 0.1...","[-0.1421385807586221, 0.10561073868838532, 0.1...","[0.095310759569876, 0.020021074148214812, -0.0...","[0.095310759569876, 0.020021074148214812, -0.0..."
962862,1198430731,1320472216,1,"[-0.16471791832722063, 0.13700978647391915, 0....","[-0.14285859017240532, 0.10894745194399306, 0....","[0.15620091271218106, -0.09836465236844918, -0...","[0.15622335486895064, -0.09833839443225638, -0..."


In [42]:
for col in ["name_bert_64_1", "name_bert_64_2", "main_pic_embeddings_resnet_v1_1", "main_pic_embeddings_resnet_v1_2"]:
    print(f"Проверка {col}: {sampled_df[col].apply(lambda x: isinstance(x, np.ndarray)).all()}")


Проверка name_bert_64_1: True
Проверка name_bert_64_2: True
Проверка main_pic_embeddings_resnet_v1_1: True
Проверка main_pic_embeddings_resnet_v1_2: True


In [41]:
for col in ["name_bert_64_1", "name_bert_64_2", "main_pic_embeddings_resnet_v1_1", "main_pic_embeddings_resnet_v1_2"]:
    print(f"{col}: размеры массивов")
    print(sampled_df[col].apply(lambda x: x.shape if isinstance(x, np.ndarray) else "Некорректное значение").value_counts())


name_bert_64_1: размеры массивов
name_bert_64_1
(64,)    15
Name: count, dtype: int64
name_bert_64_2: размеры массивов
name_bert_64_2
(64,)    15
Name: count, dtype: int64
main_pic_embeddings_resnet_v1_1: размеры массивов
main_pic_embeddings_resnet_v1_1
(128,)    15
Name: count, dtype: int64
main_pic_embeddings_resnet_v1_2: размеры массивов
main_pic_embeddings_resnet_v1_2
(128,)    15
Name: count, dtype: int64


In [52]:
text_emb1 = torch.tensor(sampled_df["name_bert_64_1"].tolist(), dtype=torch.float32)
text_emb2 = torch.tensor(sampled_df["name_bert_64_2"].tolist(), dtype=torch.float32)
img_emb1 = torch.tensor(sampled_df["main_pic_embeddings_resnet_v1_1"].tolist(), dtype=torch.float32)
img_emb2 = torch.tensor(sampled_df["main_pic_embeddings_resnet_v1_2"].tolist(), dtype=torch.float32)

# Прогоняем данные через модель
model.eval()  # Устанавливаем модель в режим оценки
with torch.no_grad():
    outputs = model(text_emb1, img_emb1, text_emb2, img_emb2)
    probas = outputs.squeeze().cpu().numpy()
    predictions = (outputs > 0.5).float().squeeze().cpu().numpy()

# Добавляем предсказания в DataFrame
sampled_df["predictions"] = predictions
sampled_df["probas"] = probas

In [53]:
sampled_df

Unnamed: 0,variantid1,variantid2,target,name_bert_64_1,name_bert_64_2,main_pic_embeddings_resnet_v1_1,main_pic_embeddings_resnet_v1_2,predictions,probas
586673,447116673,447276570,0,"[-0.11546845693788657, 0.11879767000499351, 0....","[-0.12460829199407925, 0.12305943289675993, 0....","[-0.052273153188848204, 0.10378567635655774, -...","[-0.052273153188848204, 0.10378567635655774, -...",0.0,0.165524
110057,588412848,1169982188,0,"[-0.08414979233219054, 0.1594500221920774, 0.1...","[-0.06564694978218584, 0.16773359051108175, 0....","[0.0028043567225936416, 0.08737022878496849, 0...","[0.020681329515193053, 0.09606176851262922, 0....",1.0,0.662866
402656,1538045212,1260308147,1,"[-0.06026564376806319, 0.06466471119274592, 0....","[-0.06026564376806319, 0.06466471119274592, 0....","[0.013333064400492953, 0.04302197479255666, -0...","[0.027618132749886747, 0.043863263628167844, 0...",1.0,0.726535
603778,624543430,643995467,1,"[-0.10795983864026687, 0.11143058981914807, 0....","[-0.10795983864026687, 0.11143058981914807, 0....","[-0.1444987461840761, 0.012642815029058228, 0....","[-0.1444987461840761, 0.012642815029058228, 0....",1.0,0.821573
990986,549717139,770507056,0,"[-0.05780336186312863, 0.10871770418453706, 0....","[-0.10695534967774047, 0.10274007141308132, 0....","[-0.11563188984453558, -0.0300171418274068, 0....","[-0.14109791208323372, -0.09175968737285321, 0...",0.0,0.1351
409047,649797520,879623472,0,"[-0.12076712039491183, 0.07852172420462776, 0....","[-0.06267111313383358, 0.10102431998347174, 0....","[0.026176950310800598, 0.07444865815055808, 0....","[0.09139589627693984, 0.0473513305769275, -0.0...",0.0,0.008954
702968,520370280,519611246,0,"[-0.11797401983607729, 0.10413861452454366, 0....","[-0.10429043371336227, 0.15591572990719407, 0....","[0.15868388728853233, -0.12677397737166818, 0....","[0.15863902201190758, -0.12766167187419677, 0....",0.0,0.038423
159390,1378854094,1479216673,0,"[-0.15941642796337185, 0.09956636748152727, 0....","[-0.14322157790637444, 0.09359274834060663, 0....","[-0.09069716768735359, 0.0842488682283678, 0.0...","[-0.034625219577910406, 0.03753115899603996, -...",0.0,0.393728
669154,559634167,1068285114,1,"[-0.19295330385439521, 0.1060335238520659, 0.1...","[-0.1421385807586221, 0.10561073868838532, 0.1...","[0.095310759569876, 0.020021074148214812, -0.0...","[0.095310759569876, 0.020021074148214812, -0.0...",0.0,0.490262
962862,1198430731,1320472216,1,"[-0.16471791832722063, 0.13700978647391915, 0....","[-0.14285859017240532, 0.10894745194399306, 0....","[0.15620091271218106, -0.09836465236844918, -0...","[0.15622335486895064, -0.09833839443225638, -0...",1.0,0.595072


In [54]:
def make_predictions(model, sampled_df):
    # Подготовка данных для предсказания
    text_emb1 = torch.tensor(sampled_df["name_bert_64_1"].tolist(), dtype=torch.float32)
    text_emb2 = torch.tensor(sampled_df["name_bert_64_2"].tolist(), dtype=torch.float32)
    img_emb1 = torch.tensor(sampled_df["main_pic_embeddings_resnet_v1_1"].tolist(), dtype=torch.float32)
    img_emb2 = torch.tensor(sampled_df["main_pic_embeddings_resnet_v1_2"].tolist(), dtype=torch.float32)

    # Прогоняем данные через модель
    model.eval()  # Устанавливаем модель в режим оценки
    with torch.no_grad():
        outputs = model(text_emb1, img_emb1, text_emb2, img_emb2)
        probas = outputs.squeeze().cpu().numpy()
        predictions = (outputs > 0.5).float().squeeze().cpu().numpy()

    # Добавляем предсказания в DataFrame
    sampled_df["predictions"] = predictions
    sampled_df["probas"] = probas
    
    return sampled_df

# Применяем функцию для получения предсказаний
sampled_df = make_predictions(model, sampled_df)