In [None]:
import os
import re
from collections import Counter
from collections.abc import Iterable
from typing import Any, Dict, List

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from catboost import CatBoostRegressor, Pool
from IPython.display import display_html
from PIL import Image
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms as T
from torchvision.models import efficientnet_v2_s
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
train = pd.read_parquet("/kaggle/input/aaa-exam-data/train.parquet")
train = train.set_index("item_id")
test = pd.read_parquet("/kaggle/input/aaa-exam-data/test.parquet")
test = test.set_index("item_id")

target_columns = ["real_weight", "real_length", "real_width", "real_height"]

У меня появилась гипотеза о том, что длина >= ширина >= высота

In [4]:
dim_data = train[["real_height", "real_length", "real_width"]].values

temp_analyze = train[["microcat_name"]].copy()
temp_analyze["real_min"] = np.min(dim_data, axis=1)
temp_analyze["real_mid"] = np.median(dim_data, axis=1)
temp_analyze["real_max"] = np.max(dim_data, axis=1)


In [5]:
temp_analyze.groupby(by="microcat_name")[["real_min", "real_mid", "real_max"]].mean()

Unnamed: 0_level_0,real_min,real_mid,real_max
microcat_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"CD, DVD и Blu-ray приводы",10.171717,17.707071,22.575758
GPS-навигаторы,10.755102,16.474490,21.852041
MP3-плееры,10.103321,15.442804,20.693727
Автоакустика,17.773181,26.845934,36.281027
Автосвет,20.250000,29.076923,42.625000
...,...,...,...
Электронные книги,10.033654,19.173077,24.961538
Электрооборудование,11.977264,18.324899,27.255226
Эпиляторы,11.071429,17.914286,22.871429
"Этикетки, бутылки, пробки",14.445876,21.007732,30.201031


In [6]:
train.groupby(by="microcat_name")[["real_height", "real_width", "real_length"]].mean()

Unnamed: 0_level_0,real_height,real_width,real_length
microcat_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"CD, DVD и Blu-ray приводы",10.171717,17.707071,22.575758
GPS-навигаторы,10.755102,16.474490,21.852041
MP3-плееры,10.103321,15.442804,20.693727
Автоакустика,17.773181,26.845934,36.281027
Автосвет,20.250000,29.076923,42.625000
...,...,...,...
Электронные книги,10.033654,19.173077,24.961538
Электрооборудование,11.977264,18.324899,27.255226
Эпиляторы,11.071429,17.914286,22.871429
"Этикетки, бутылки, пробки",14.445876,21.007732,30.201031


In [None]:
np.all(temp_analyze.groupby(by="microcat_name")[["real_min", "real_mid", "real_max"]].mean().values == train.groupby(by="microcat_name")[["real_height", "real_width", "real_length"]].mean().values)

np.True_

Средние по всем микрокатегориям совпадают, так что скорее всего гипотеза верна

In [8]:
def remove_outliers(df: pd.DataFrame, cat_col: str = "microcat_name") -> pd.DataFrame:
    """
    Удаления экстремальных выбросов из датафрейма.
    Группирует и считает верхние и нижние границы выбросов по колонке cat_col.

    Args:
        df (pd.DataFrame): Исходный датафрейм
        cat_col (str, optional): Колонка для группировки. Defaults to "microcat_name".

    Returns:
        pd.DataFrame: Датафрейм, очищенный от выбросов
    """
    df_clean = df.copy()

    target_cols = ["real_weight", "real_length", "real_width", "real_height"]

    for col in target_cols:
        upper_limit = df_clean.groupby(cat_col)[col].transform(lambda x: x.quantile(0.99))
        lower_limit = df_clean.groupby(cat_col)[col].transform(lambda x: x.quantile(0.01))

        mask = (df_clean[col] < upper_limit) & (df_clean[col] > lower_limit)
        mask = mask | (upper_limit == lower_limit)
        df_clean = df_clean[mask]

        print(mask.sum())

    # return df_clean # закомментируем на время

In [9]:
remove_outliers(train)

305560
287105
273759
241035


При полном последовательном очищении таргетов мы теряем большое число наблюдений.

Для каждого таргета будем использовать свою маску валидного датасета, тк для обучения предсказания длины, высота в принципе может являться выбросом

In [10]:
def get_valid_mask(df: pd.DataFrame, target: str, cat_col: str = "microcat_name") -> List:
    """
    Поиск выбросов по конкретному таргету в датафрейме.
    1 - валидная строка, 0 - выброс по target

    Args:
        df (pd.DataFrame): Исходный датафрейм
        target (str): Таргет
        cat_col (str, optional): Колонка для группировки. Defaults to "microcat_name".

    Returns:
        List: Бинарную маску валидных строк датафрейм
    """
    gb = df.groupby(cat_col)[target]
    upper = gb.transform(lambda x: x.quantile(0.999))
    lower = gb.transform(lambda x: x.quantile(0.001))

    quantile_mask = (df[target] <= upper) & (df[target] >= lower)

    mask = quantile_mask | (upper == lower)

    return mask

In [11]:
def text_clean(text: str) -> str:
    """
    Форматирование и очистка текста при помощи регулярного выражения

    Args:
        text (str): Исходный текст

    Returns:
        str: Очищенная и отформатированная строка
    """
    if pd.isna(text):
        return ""

    text = str(text).lower()
    text = text.replace("\n", " ").replace("\r", " ")
    text = re.sub(r"[^а-яёa-z0-9\s\.\,\-\*]", " ", text)

    text = " ".join(text.split())

    return text


def extract_weight(text: str) -> float:
    """Извлечение веса из текста при помощи регулярного выражения

    Args:
        text (str): Исходный текст

    Returns:
        float: Извлеченной значение веса в кг
    """
    if pd.isna(text):
        return None

    text = text.lower()
    pattern = r"(\d+[.,]?\d*)\s*(кг|kg|г|гр|g|ml|мл)"

    match = re.search(pattern, text)
    if match:
        value_str = match.group(1).replace(",", ".")
        unit = match.group(2)

        try:
            value = float(value_str)
        except ValueError:
            return None

        if unit in ["г", "гр", "g", "ml", "мл"]:
            return value / 1000.0
        return value

    return None


def extract_dimensions(text: str) -> tuple[float, float, float]:
    """Извлечение Длины, Ширины и Высоты из текста при помощи регулярного выражения

    Args:
        text (str): Исходный текст

    Returns:
        tuple[float, float, float]: Извлеченные Длина, Ширина и Высота
    """
    if pd.isna(text) or text == "":
        return -1, -1, -1

    text = str(text).lower()
    text = text.replace(",", ".")

    units_map = {"мм": 0.1, "mm": 0.1, "см": 1.0, "cm": 1.0, "м": 100.0, "m": 100.0}

    pattern_3d = r"(\d+(?:\.\d+)?)\s*[xх*]\s*(\d+(?:\.\d+)?)\s*[xх*]\s*(\d+(?:\.\d+)?)\s*(мм|mm|см|cm|м|m)?"
    match = re.search(pattern_3d, text)
    dims = []

    if match:
        try:
            d1 = float(match.group(1))
            d2 = float(match.group(2))
            d3 = float(match.group(3))
            unit = match.group(4)

            dims = [d1, d2, d3]

            if unit in units_map:
                factor = units_map[unit]
                dims = [d * factor for d in dims]

        except ValueError:
            return -1, -1, -1
    else:
        return -1, -1, -1

    dims.sort()  # [Min, Mid, Max]
    return dims[0], dims[1], dims[2]


In [12]:
def data_prepare(df: pd.DataFrame) -> pd.DataFrame:
    """Подготовка датафрейма к работе.
    Заполняет пропуски, форматирует и чистит текст, извлекает из текста фичи, дропает ненужные колонки

    Args:
        df (pd.DataFrame): Исходный датафрейм

    Returns:
        pd.DataFrame: Подготовленный датафрейм
    """
    df["item_condition"] = df["item_condition"].fillna("Unknown")

    df["text"] = df["title"].fillna("") + " " + df["description"].fillna("")
    df["text"] = df["text"].apply(text_clean)

    df["extracted_weight"] = df["text"].apply(extract_weight)

    dims_list = df["text"].apply(extract_dimensions).tolist()
    dims_df = pd.DataFrame(dims_list, columns=["extracted_length", "extracted_width", "extracted_height"], index=df.index)
    df = pd.concat([df, dims_df], axis=1)

    for col in ["extracted_weight", "extracted_length", "extracted_width", "extracted_height"]:
        df[col] = df[col].fillna(-1)

    df = df.drop(columns=["order_date", "seller_id", "buyer_id", "description", "title"])

    return df


train = data_prepare(train)
test = data_prepare(test)


Добавим логарифмированные таргеты. Будем обучать модель именно по ним, для соответствия с пространством метрики

In [13]:
for target in target_columns:
    train["log_" + target] = np.log(1 + train[target])

log_target_columns = ["log_" + target for target in target_columns]
all_targets = target_columns + log_target_columns

In [14]:
train.head()

Unnamed: 0_level_0,item_condition,item_price,category_name,subcategory_name,microcat_name,image_name,real_weight,real_height,real_length,real_width,text,extracted_weight,extracted_length,extracted_width,extracted_height,log_real_weight,log_real_length,log_real_width,log_real_height
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
185689,Б/у,3000.0,Транспорт,Запчасти и аксессуары,Салон,185689.jpg,0.37,10.0,23.0,19.0,ручка акпп mercedes w203 avangarde ручка рычаг...,-1.0,-1.0,-1.0,-1.0,0.314811,3.178054,2.995732,2.397895
1914373,Новое с биркой,5990.0,Личные вещи,"Одежда, обувь, аксессуары",Зимние куртки и пуховики,1914373.jpg,2.486,14.0,37.0,24.0,пуховик moncler голубой 52 размер объявление д...,-1.0,-1.0,-1.0,-1.0,1.248755,3.637586,3.218876,2.70805
361626,Новое,1200.0,Транспорт,Запчасти и аксессуары,Двигатель,361626.jpg,0.64,7.0,23.0,18.0,"запчасти на ford фокус1 опора задняя,двигатель...",-1.0,-1.0,-1.0,-1.0,0.494696,3.178054,2.944439,2.079442
534927,Б/у,13000.0,Электроника,"Игры, приставки и программы",Игровые приставки и аксессуары,534927.jpg,7.1,20.0,35.0,20.0,ps3 cechc 08 скальпирована hen полностью испра...,-1.0,-1.0,-1.0,-1.0,2.091864,3.583519,3.044522,3.044522
199043,Отличное,300.0,Личные вещи,"Одежда, обувь, аксессуары","Джемперы, свитеры, кардиганы",199043.jpg,0.4,7.0,23.0,11.0,свитер трикотаж 44 р-р reserved свитер pull be...,-1.0,-1.0,-1.0,-1.0,0.336472,3.178054,2.484907,2.079442


In [15]:
X_train, X_val, y_train, y_val = train_test_split(train.drop(columns=all_targets), train[all_targets], test_size=0.2, random_state=42)

temp_train = pd.concat([X_train, y_train], axis=1)
temp_val = pd.concat([X_val, y_val], axis=1)

Добавим новые фичи - средние значения логарифма таргетов по их микрокатегории. Посчитаем средние значения на train, их же будем использовать в val и test, во избежание утечки

In [17]:
for log_target in log_target_columns:
    gb = temp_train.groupby(by="microcat_name")[log_target].mean()
    mapper = dict(gb.items())
    X_train["mean_microcat_" + log_target] = X_train["microcat_name"].apply(lambda x: mapper[x])
    X_val["mean_microcat_" + log_target] = X_val["microcat_name"].apply(lambda x: mapper[x])
    test["mean_microcat_" + log_target] = test["microcat_name"].apply(lambda x: mapper[x])

В качестве бейзлайна возьмем это же среднее логарифмов таргета по микрокатегориям

In [18]:
scores_baseline = {log_target: mean_absolute_error(y_train[log_target], X_train["mean_microcat_" + log_target]) for log_target in log_target_columns}
results_targets = pd.DataFrame(
    {
        "Target": target_columns,
        "Baseline (mean_microcat)": [scores_baseline["log_" + t] for t in target_columns],
    }
)
results_targets = results_targets.set_index("Target")

results_targets

Unnamed: 0_level_0,Baseline (mean_microcat)
Target,Unnamed: 1_level_1
real_weight,0.350348
real_length,0.319958
real_width,0.325912
real_height,0.552994


В качестве модели я решил использовать CatBoost, тк он из коробки отлично справляется с обработкой категориальных фичей (мне кажется категории имеют большое влияние на таргет) и текста, который также несет в себя большую часть информации по объекту, а так же умеет работать с эмбеддингами (для использования изображений товаров)

Попробуем обучать CatBoost независимо под каждый таргет, пока без эмбеддингов изображений

In [19]:
cat_feat = ["item_condition", "category_name", "subcategory_name", "microcat_name"]

In [None]:
preds_independent = {}
cb_independent = {}
scores_indep = {}

for target, log_target in zip(target_columns, log_target_columns):
    print(f"Training for {target}...")
    train_mask = get_valid_mask(temp_train, target)
    print(f"Clean Train size: {train_mask.sum()}")

    cb_train = Pool(X_train[train_mask].drop(columns=["image_name"]), label=y_train.loc[train_mask, log_target], cat_features=cat_feat, text_features=["text"])
    cb_val = Pool(X_val.drop(columns=["image_name"]), label=y_val[log_target], cat_features=cat_feat, text_features=["text"])

    cb = CatBoostRegressor(learning_rate=0.05, iterations=10000, use_best_model=True, loss_function="MAE", eval_metric="MAE", task_type="GPU", devices="0:1")

    cb.fit(cb_train, eval_set=cb_val, early_stopping_rounds=50, verbose=500)
    cb_independent[target] = cb

    full_val_pool = Pool(X_val.drop(columns=["image_name"]), cat_features=cat_feat, text_features=["text"])

    full_preds = cb.predict(full_val_pool)
    preds_independent[target] = full_preds

    real_score = mean_absolute_error(y_val[log_target], full_preds)
    scores_indep[target] = real_score

    print(f"\n  > REAL Score for {target}: {real_score:.5f}\n")

avg_score_indep = np.mean(list(scores_indep.values()))
print(f"=== ИТОГ Independent (w/o Image embeddings): {avg_score_indep:.5f} ===\n")

Training for real_weight...
Clean Train size: 249661


Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 0.4011205	test: 0.4019997	best: 0.4019997 (0)	total: 108ms	remaining: 18m
500:	learn: 0.2793553	test: 0.2833484	best: 0.2833484 (500)	total: 5.87s	remaining: 1m 51s
1000:	learn: 0.2709423	test: 0.2765048	best: 0.2765048 (1000)	total: 10.7s	remaining: 1m 35s
1500:	learn: 0.2659616	test: 0.2732406	best: 0.2732406 (1500)	total: 15.6s	remaining: 1m 28s
2000:	learn: 0.2621929	test: 0.2712421	best: 0.2712421 (2000)	total: 20.5s	remaining: 1m 21s
2500:	learn: 0.2591141	test: 0.2699129	best: 0.2699129 (2500)	total: 25.4s	remaining: 1m 16s
3000:	learn: 0.2563970	test: 0.2689241	best: 0.2689241 (3000)	total: 30.4s	remaining: 1m 10s
3500:	learn: 0.2540267	test: 0.2682523	best: 0.2682523 (3500)	total: 35.3s	remaining: 1m 5s
4000:	learn: 0.2517327	test: 0.2676015	best: 0.2676010 (3975)	total: 40.2s	remaining: 1m
4500:	learn: 0.2496143	test: 0.2670656	best: 0.2670656 (4500)	total: 45.3s	remaining: 55.3s
5000:	learn: 0.2476580	test: 0.2667139	best: 0.2667139 (5000)	total: 50.2s	remaining: 5

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 0.3493462	test: 0.3530040	best: 0.3530040 (0)	total: 27.1ms	remaining: 4m 31s
500:	learn: 0.2935348	test: 0.2994328	best: 0.2994328 (500)	total: 5.62s	remaining: 1m 46s
1000:	learn: 0.2893679	test: 0.2974066	best: 0.2974066 (999)	total: 10.6s	remaining: 1m 34s
1500:	learn: 0.2864638	test: 0.2964193	best: 0.2964193 (1500)	total: 15.4s	remaining: 1m 26s
2000:	learn: 0.2839125	test: 0.2958708	best: 0.2958703 (1999)	total: 20.2s	remaining: 1m 20s
2500:	learn: 0.2816804	test: 0.2954551	best: 0.2954545 (2499)	total: 25.1s	remaining: 1m 15s
bestTest = 0.2952892056
bestIteration = 2734
Shrink model to first 2735 iterations.

  > REAL Score for real_length: 0.29529

Training for real_width...
Clean Train size: 249706


Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 0.3566468	test: 0.3590276	best: 0.3590276 (0)	total: 29.3ms	remaining: 4m 52s
500:	learn: 0.3015442	test: 0.3045787	best: 0.3045787 (500)	total: 5.85s	remaining: 1m 50s
1000:	learn: 0.2975935	test: 0.3026059	best: 0.3026059 (1000)	total: 10.8s	remaining: 1m 37s
1500:	learn: 0.2946035	test: 0.3015562	best: 0.3015562 (1500)	total: 15.7s	remaining: 1m 28s
2000:	learn: 0.2921006	test: 0.3009815	best: 0.3009815 (2000)	total: 20.7s	remaining: 1m 22s
2500:	learn: 0.2898571	test: 0.3005375	best: 0.3005375 (2500)	total: 25.4s	remaining: 1m 16s
3000:	learn: 0.2877718	test: 0.3002338	best: 0.3002338 (3000)	total: 30.3s	remaining: 1m 10s
3500:	learn: 0.2857746	test: 0.2999712	best: 0.2999680 (3475)	total: 35.2s	remaining: 1m 5s
4000:	learn: 0.2839284	test: 0.2997675	best: 0.2997645 (3997)	total: 40s	remaining: 59.9s
4500:	learn: 0.2821689	test: 0.2996386	best: 0.2996335 (4487)	total: 44.8s	remaining: 54.7s
bestTest = 0.2995648459
bestIteration = 4790
Shrink model to first 4791 iterations

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 0.5844734	test: 0.5845014	best: 0.5845014 (0)	total: 33.1ms	remaining: 5m 30s
500:	learn: 0.5076215	test: 0.5122960	best: 0.5122960 (500)	total: 5.86s	remaining: 1m 51s
1000:	learn: 0.5021291	test: 0.5089391	best: 0.5089391 (1000)	total: 10.7s	remaining: 1m 36s
1500:	learn: 0.4984938	test: 0.5073207	best: 0.5073207 (1500)	total: 15.6s	remaining: 1m 28s
2000:	learn: 0.4954781	test: 0.5062817	best: 0.5062817 (2000)	total: 20.5s	remaining: 1m 22s
2500:	learn: 0.4927753	test: 0.5055125	best: 0.5055125 (2500)	total: 25.5s	remaining: 1m 16s
3000:	learn: 0.4902923	test: 0.5049479	best: 0.5049456 (2998)	total: 30.3s	remaining: 1m 10s
3500:	learn: 0.4880175	test: 0.5044865	best: 0.5044865 (3500)	total: 35.2s	remaining: 1m 5s
4000:	learn: 0.4858451	test: 0.5040747	best: 0.5040747 (4000)	total: 40.1s	remaining: 1m
4500:	learn: 0.4837408	test: 0.5037896	best: 0.5037882 (4492)	total: 45.1s	remaining: 55.1s
5000:	learn: 0.4817707	test: 0.5035877	best: 0.5035865 (4980)	total: 50.1s	remainin

In [21]:
results_targets["Independent CatBoost (w/o Image embeddings)"] = [scores_indep[t] for t in target_columns]

results_targets

Unnamed: 0_level_0,Baseline (mean_microcat),Independent CatBoost (w/o Image embeddings)
Target,Unnamed: 1_level_1,Unnamed: 2_level_1
real_weight,0.350348,0.265358
real_length,0.319958,0.295289
real_width,0.325912,0.299565
real_height,0.552994,0.503075


Далее идут блоки кода извлечения эмбеддингов из изображений при помощи EfficientNet_v2_S. Выполнял я их в колабе, прикрепив гугл диск с zip файлами изображений. Остальную же работу я выполнял на каггле, поэтому я выгрузил результаты в файлы и пользовался ими

In [None]:
# !unzip /content/drive/MyDrive/train.zip -d /content

In [None]:
# !unzip /content/drive/MyDrive/test.zip -d /content

In [27]:
# MEAN = np.array([0.485, 0.456, 0.406])
# STD = np.array([0.229, 0.224, 0.225])

# TRAIN_IMAGES = 'train'
# TEST_IMAGES = 'test'

In [28]:
# class ImageDataset(Dataset):
#     def __init__(self, images, images_folder):
#         self.images = images.values if hasattr(images, 'values') else images
#         self.images_folder = images_folder
#         self.transform = T.Compose([
#             T.Resize((384, 384), interpolation=T.InterpolationMode.BILINEAR),
#             T.CenterCrop(384),
#             T.ToTensor(),
#             T.Normalize(MEAN, STD)
#         ])

#     def __len__(self):
#         return len(self.images)

#     def __getitem__(self, index):
#         path = os.path.join(self.images_folder, self.images[index])
#         try:
#             img = Image.open(path).convert('RGB')
#             return self.transform(img)
#         except Exception as e:
#             print(f"Ошибка загрузки {path}: {e}")
#             return torch.zeros((3, 384, 384))

# train_ds = ImageDataset(train['image_name'], TRAIN_IMAGES)
# test_ds = ImageDataset(test['image_name'], TEST_IMAGES)


Необходимо заменить классификатор (последний блок) модели (тк она обучалась на ImageNet) на тождественное преобразование для получения фичей изображения

In [29]:
# vmodel = efficientnet_v2_s(weights='IMAGENET1K_V1')
# vmodel.classifier = nn.Identity()
# vmodel.to(device)
# optimized_vmodel = torch.compile(vmodel)

Downloading: "https://download.pytorch.org/models/efficientnet_v2_s-dd5fe13b.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_v2_s-dd5fe13b.pth


100%|██████████| 82.7M/82.7M [00:00<00:00, 197MB/s]


In [30]:
# def image_extract(dataset, model):
#     loader = DataLoader(dataset, batch_size=128, shuffle=False, num_workers=2, pin_memory=True)
#     all_features = []

#     with torch.inference_mode():
#         for batch in tqdm(loader):
#             batch = batch.to(device, non_blocking=True)

#             with torch.amp.autocast('cuda'):
#                 features = model(batch)

#             all_features.append(features.cpu())

#     return torch.cat(all_features).numpy()

# train_emb = image_extract(train_ds, optimized_vmodel)
# test_emb = image_extract(test_ds, optimized_vmodel)

  0%|          | 0/2445 [00:00<?, ?it/s]W0103 13:39:30.852000 160 torch/_inductor/utils.py:1558] [0/0] Not enough SMs to use max_autotune_gemm mode
100%|██████████| 2445/2445 [47:10<00:00,  1.16s/it]
100%|██████████| 550/550 [10:42<00:00,  1.17s/it]


In [None]:
# train_emb_df = pd.DataFrame(train_emb, index=train.index).add_prefix('emb_')
# test_emb_df = pd.DataFrame(test_emb, index=test.index).add_prefix('emb_')

# train_emb_df.to_parquet("contest_data/train_embeddings.parquet")
# test_emb_df.to_parquet("contest_data/test_embeddings.parquet")

Также я посчитал решил посчитать отношение ширины к высоте изображения, так как при извлечении эмбеддингов оно приводится к квадрату 384х384 и соотношение сторон теряется

In [None]:
# def get_image_aspect_ratio(image_names, folder_path):
#     """
#     Считывает размеры изображений и возвращает их отношение (Width / Height).
#     """
#     ratios = []

#     for name in image_names:
#         path = os.path.join(folder_path, name)
#         try:
#             with Image.open(path) as img:
#                 w, h = img.size
#                 ratios.append(w / h)
#         except Exception as e:
#             ratios.append(1.0)

#     return np.array(ratios, dtype=np.float32)


# train_aspect_ratios = get_image_aspect_ratio(train['image_name'], TRAIN_IMAGES)
# test_aspect_ratios = get_image_aspect_ratio(test['image_name'], TEST_IMAGES)

# pd.DataFrame(train_aspect_ratios).to_parquet("train_aspect_ratios.parquet")
# pd.DataFrame(test_aspect_ratios).to_parquet("test_aspect_ratios.parquet")

In [None]:
train_embs = pd.read_parquet("/kaggle/input/aaa-exam-data/train_embeddings.parquet")
test_embs = pd.read_parquet("/kaggle/input/aaa-exam-data/test_embeddings.parquet")

train_aspect_ratios = pd.read_parquet("/kaggle/input/aaa-exam-data/train_aspect_ratios.parquet").set_index(train.index).rename(columns={0: "aspect_ratio"})
test_aspect_ratios = pd.read_parquet("/kaggle/input/aaa-exam-data/test_aspect_ratios.parquet").set_index(test.index).rename(columns={0: "aspect_ratio"})

train = train.join(train_aspect_ratios).join(train_embs).copy()
test = test.join(test_aspect_ratios).join(test_embs).copy()

train = train.drop(columns=["image_name"])
test = test.drop(columns=["image_name"])

Для каждого изображения мы получили вектор из 1280 компонент. Это слишком много для сырой подачи в нашу модель, эмбеддинги могут быть слишком разрежены и скореллированы.

Применим к ним PCA, чтобы сократить использование памяти и время обучения, оставив только самые важные признаки.

In [23]:
emb_cols = [f"emb_{i}" for i in range(1280)]
pca = PCA(n_components=96, random_state=42)

train_pca = pca.fit_transform(train[emb_cols])
test_pca = pca.transform(test[emb_cols])

pca_cols = [f"img_pca_{i}" for i in range(96)]
train_pca_df = pd.DataFrame(train_pca, columns=pca_cols, index=train.index)
test_pca_df = pd.DataFrame(test_pca, columns=pca_cols, index=test.index)

train = train.join(train_pca_df).copy()
test = test.join(test_pca_df).copy()


train.drop(columns=emb_cols, inplace=True)
test.drop(columns=emb_cols, inplace=True)

Я решил добавить еще фичей, которые могут быть полезны

In [24]:
train["log_price"] = np.log1p(train["item_price"])
price_stats = train.groupby("microcat_name")["log_price"].transform("mean")
train["price_rel_to_category"] = train["log_price"] - price_stats

test["log_price"] = np.log1p(test["item_price"])
price_stats = test.groupby("microcat_name")["log_price"].transform("mean")
test["price_rel_to_category"] = test["log_price"] - price_stats

In [25]:
X_train, X_val, y_train, y_val = train_test_split(train.drop(columns=all_targets), train[all_targets], test_size=0.2, random_state=42)
temp_train = pd.concat([X_train, y_train], axis=1)
temp_val = pd.concat([X_val, y_val], axis=1)

X_test = test.copy()

In [None]:
for log_target in log_target_columns:
    gb = temp_train.groupby(by="microcat_name")[log_target].mean()
    mapper = dict(gb.items())
    X_train["mean_microcat_" + log_target] = X_train["microcat_name"].apply(lambda x: mapper[x])
    X_val["mean_microcat_" + log_target] = X_val["microcat_name"].apply(lambda x: mapper[x])
    X_test["mean_microcat_" + log_target] = X_test["microcat_name"].apply(lambda x: mapper[x])

Снова обучим CatBoost независимо под каждый таргет с новыми добавленными фичами

In [None]:
preds_independent_w_img = {}
cb_independent_w_img = {}
scores_indep_w_img = {}

for target, log_target in zip(target_columns, log_target_columns):
    print(f"Training for {target}...")
    train_mask = get_valid_mask(temp_train, target)
    print(f"Clean Train size: {train_mask.sum()}")

    cb_train = Pool(X_train[train_mask], label=y_train.loc[train_mask, log_target], cat_features=cat_feat, text_features=["text"])
    cb_val = Pool(X_val, label=y_val[log_target], cat_features=cat_feat, text_features=["text"])

    cb = CatBoostRegressor(learning_rate=0.05, iterations=10000, use_best_model=True, loss_function="MAE", eval_metric="MAE", task_type="GPU", devices="0:1")

    cb.fit(cb_train, eval_set=cb_val, early_stopping_rounds=50, verbose=500)
    cb_independent_w_img[target] = cb

    full_val_pool = Pool(X_val, cat_features=cat_feat, text_features=["text"])

    full_preds = cb.predict(full_val_pool)
    preds_independent_w_img[target] = full_preds

    real_score = mean_absolute_error(y_val[log_target], full_preds)
    scores_indep_w_img[target] = real_score

    print(f"\n  > REAL Score for {target}: {real_score:.5f}\n")


avg_score_indep_w_img = np.mean(list(scores_indep_w_img.values()))
print(f"=== ИТОГ Independent: {avg_score_indep_w_img:.5f} ===\n")

Training for real_weight...
Clean Train size: 249661


Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 0.4008669	test: 0.4017280	best: 0.4017280 (0)	total: 32.1ms	remaining: 5m 20s
500:	learn: 0.2675166	test: 0.2732651	best: 0.2732651 (500)	total: 6.75s	remaining: 2m 7s
1000:	learn: 0.2581424	test: 0.2667336	best: 0.2667336 (1000)	total: 12.4s	remaining: 1m 51s
1500:	learn: 0.2519958	test: 0.2637888	best: 0.2637888 (1500)	total: 18.2s	remaining: 1m 43s
2000:	learn: 0.2469597	test: 0.2621194	best: 0.2621194 (2000)	total: 24.1s	remaining: 1m 36s
2500:	learn: 0.2424642	test: 0.2609167	best: 0.2609167 (2500)	total: 30.1s	remaining: 1m 30s
3000:	learn: 0.2383436	test: 0.2600415	best: 0.2600415 (3000)	total: 36s	remaining: 1m 24s
3500:	learn: 0.2344490	test: 0.2593418	best: 0.2593397 (3499)	total: 42.2s	remaining: 1m 18s
4000:	learn: 0.2307766	test: 0.2588550	best: 0.2588550 (4000)	total: 48.3s	remaining: 1m 12s
4500:	learn: 0.2271883	test: 0.2584698	best: 0.2584575 (4473)	total: 54.6s	remaining: 1m 6s
bestTest = 0.2584574819
bestIteration = 4473
Shrink model to first 4474 iteration

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 0.3490948	test: 0.3527501	best: 0.3527501 (0)	total: 35.1ms	remaining: 5m 51s
500:	learn: 0.2850625	test: 0.2924172	best: 0.2924172 (500)	total: 7.11s	remaining: 2m 14s
1000:	learn: 0.2791696	test: 0.2901644	best: 0.2901644 (1000)	total: 13s	remaining: 1m 56s
1500:	learn: 0.2745098	test: 0.2891611	best: 0.2891611 (1500)	total: 18.9s	remaining: 1m 47s
2000:	learn: 0.2702640	test: 0.2885678	best: 0.2885678 (2000)	total: 25.1s	remaining: 1m 40s
2500:	learn: 0.2662923	test: 0.2881773	best: 0.2881706 (2493)	total: 31.3s	remaining: 1m 33s
3000:	learn: 0.2625229	test: 0.2879200	best: 0.2879200 (3000)	total: 37.5s	remaining: 1m 27s
bestTest = 0.2877942889
bestIteration = 3211
Shrink model to first 3212 iterations.

  > REAL Score for real_length: 0.28779

Training for real_width...
Clean Train size: 249706


Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 0.3562959	test: 0.3586633	best: 0.3586633 (0)	total: 31ms	remaining: 5m 9s
500:	learn: 0.2938633	test: 0.2987571	best: 0.2987571 (500)	total: 7.2s	remaining: 2m 16s
1000:	learn: 0.2881989	test: 0.2968472	best: 0.2968472 (1000)	total: 13.2s	remaining: 1m 58s
1500:	learn: 0.2836370	test: 0.2960053	best: 0.2960051 (1498)	total: 19.2s	remaining: 1m 48s
2000:	learn: 0.2794393	test: 0.2955019	best: 0.2955019 (2000)	total: 25.3s	remaining: 1m 41s
2500:	learn: 0.2754796	test: 0.2951328	best: 0.2951328 (2500)	total: 31.3s	remaining: 1m 33s
3000:	learn: 0.2716177	test: 0.2948908	best: 0.2948816 (2983)	total: 37.5s	remaining: 1m 27s
bestTest = 0.2948816153
bestIteration = 2983
Shrink model to first 2984 iterations.

  > REAL Score for real_width: 0.29488

Training for real_height...
Clean Train size: 249889


Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 0.5844155	test: 0.5844358	best: 0.5844358 (0)	total: 31.8ms	remaining: 5m 17s
500:	learn: 0.4997584	test: 0.5061510	best: 0.5061510 (500)	total: 7.14s	remaining: 2m 15s
1000:	learn: 0.4927963	test: 0.5028432	best: 0.5028432 (1000)	total: 13.1s	remaining: 1m 57s
1500:	learn: 0.4874950	test: 0.5011686	best: 0.5011673 (1499)	total: 19s	remaining: 1m 47s
2000:	learn: 0.4828854	test: 0.5002560	best: 0.5002560 (2000)	total: 25s	remaining: 1m 40s
2500:	learn: 0.4786293	test: 0.4995497	best: 0.4995497 (2500)	total: 31s	remaining: 1m 32s
3000:	learn: 0.4745321	test: 0.4991697	best: 0.4991694 (2998)	total: 37.1s	remaining: 1m 26s
3500:	learn: 0.4705619	test: 0.4987374	best: 0.4987363 (3491)	total: 43.2s	remaining: 1m 20s
bestTest = 0.4984985636
bestIteration = 3810
Shrink model to first 3811 iterations.

  > REAL Score for real_height: 0.49850

=== ИТОГ Independent: 0.33491 ===



In [28]:
results_targets["Independent CatBoost"] = [scores_indep_w_img[t] for t in target_columns]

results_targets

Unnamed: 0_level_0,Baseline (mean_microcat),Independent CatBoost (w/o Image embeddings),Independent CatBoost
Target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
real_weight,0.350348,0.265358,0.258457
real_length,0.319958,0.295289,0.287794
real_width,0.325912,0.299565,0.294882
real_height,0.552994,0.503075,0.498499


Результат стал действительно лучше, особенно в предсказании длины и ширины

Посмотрим на влияние фичей на предсказания моделей

In [29]:
cb_models = [cb_independent_w_img[target] for target in target_columns]

all_importances = pd.DataFrame({"Feature": cb_models[0].feature_names_})

for target, model in zip(target_columns, cb_models):
    all_importances[target] = model.get_feature_importance()

all_importances = all_importances.set_index("Feature")

In [30]:
html_str = ""
for col in target_columns[:4]:
    df_slice = all_importances[[col]].sort_values(by=col, ascending=False).head(10)
    html_str += df_slice.style.set_table_attributes("style='display:inline; margin-right:20px;'")._repr_html_()

display_html(html_str, raw=True)

Unnamed: 0_level_0,real_weight
Feature,Unnamed: 1_level_1
mean_microcat_log_real_weight,19.256811
text,17.652679
price_rel_to_category,7.666386
img_pca_8,6.121401
item_price,3.220253
item_condition,2.956574
log_price,2.836974
subcategory_name,2.554452
mean_microcat_log_real_width,1.739531
category_name,1.383248

Unnamed: 0_level_0,real_length
Feature,Unnamed: 1_level_1
text,15.768886
mean_microcat_log_real_length,14.228564
img_pca_8,6.026024
price_rel_to_category,5.097316
mean_microcat_log_real_width,3.137174
item_price,1.979972
item_condition,1.845732
img_pca_1,1.825936
mean_microcat_log_real_weight,1.767614
log_price,1.621581

Unnamed: 0_level_0,real_width
Feature,Unnamed: 1_level_1
mean_microcat_log_real_width,15.510462
text,15.123537
price_rel_to_category,8.517669
img_pca_8,6.224542
mean_microcat_log_real_length,2.225331
img_pca_1,1.594938
item_condition,1.580517
subcategory_name,1.423684
mean_microcat_log_real_weight,1.402347
img_pca_6,1.353318

Unnamed: 0_level_0,real_height
Feature,Unnamed: 1_level_1
mean_microcat_log_real_height,18.861499
text,18.789685
price_rel_to_category,6.471033
img_pca_8,3.82739
item_condition,2.265083
img_pca_2,2.264941
item_price,2.10245
log_price,1.949451
mean_microcat_log_real_width,1.857025
subcategory_name,1.697377


In [31]:
html_str = ""
for col in target_columns[:4]:
    df_slice = all_importances[[col]].sort_values(by=col, ascending=True).head(10)
    html_str += df_slice.style.set_table_attributes("style='display:inline; margin-right:20px;'")._repr_html_()

display_html(html_str, raw=True)


Unnamed: 0_level_0,real_weight
Feature,Unnamed: 1_level_1
extracted_length,0.043929
extracted_height,0.05092
extracted_width,0.092096
img_pca_50,0.169123
img_pca_87,0.170142
img_pca_95,0.172075
img_pca_88,0.173572
img_pca_57,0.189569
img_pca_44,0.189848
img_pca_83,0.191843

Unnamed: 0_level_0,real_length
Feature,Unnamed: 1_level_1
extracted_length,0.031269
extracted_width,0.033076
microcat_name,0.072412
extracted_height,0.200472
img_pca_75,0.239882
img_pca_79,0.254828
img_pca_60,0.258272
img_pca_78,0.268321
img_pca_48,0.272452
img_pca_83,0.278653

Unnamed: 0_level_0,real_width
Feature,Unnamed: 1_level_1
extracted_length,0.038246
extracted_height,0.050875
microcat_name,0.056721
extracted_width,0.208459
img_pca_62,0.234589
img_pca_77,0.245924
img_pca_60,0.255676
img_pca_83,0.257881
img_pca_88,0.264585
img_pca_38,0.271464

Unnamed: 0_level_0,real_height
Feature,Unnamed: 1_level_1
extracted_width,0.008652
extracted_height,0.013771
extracted_length,0.10281
microcat_name,0.19732
img_pca_69,0.201089
img_pca_68,0.212482
img_pca_50,0.222909
img_pca_95,0.232772
img_pca_60,0.233427
img_pca_92,0.237455


CatBoost отлично справился с извлечением информации из текста и ему вообще не пригодились извлеченные мной размерности

Также можно отметить, что огромную роль сыграли средние значения таргета по микрокатегориям

Некоторые фичи изображений имеют неплохое влияние

Попробую обучить CatBoost на предсказание сразу всех 4 таргетов

In [None]:
train_mask_m = get_valid_mask(temp_train, "real_weight") & get_valid_mask(temp_train, "real_height") & get_valid_mask(temp_train, "real_length") & get_valid_mask(temp_train, "real_width")

cb_train_m = Pool(
    X_train[train_mask_m],
    label=y_train.loc[train_mask_m, log_target_columns],
    cat_features=["item_condition", "category_name", "subcategory_name", "microcat_name"],
    text_features=["text"],
)
cb_val_m = Pool(
    X_val,
    label=y_val[log_target_columns],
    cat_features=["item_condition", "category_name", "subcategory_name", "microcat_name"],
    text_features=["text"],
)

cb_m = CatBoostRegressor(learning_rate=0.05, iterations=10000, use_best_model=True, loss_function="MultiRMSE", eval_metric="MultiRMSE", task_type="GPU", devices="0:1")
cb_m.fit(cb_train_m, eval_set=cb_val_m, early_stopping_rounds=50, verbose=500)

full_val_m = Pool(
    X_val,
    cat_features=cat_feat,
    text_features=["text"],
)

preds_m = cb_m.predict(full_val_m)

scores_multi = {}
for i, target in enumerate(log_target_columns):
    p = preds_m[:, i]

    score = mean_absolute_error(y_val[target], p)
    scores_multi[target] = score
    print(f"\n  > Score for {target}: {score:.5f}")

avg_score_multi = np.mean(list(scores_multi.values()))
print(f"\n=== ИТОГ MultiRMSE: {avg_score_multi:.5f} ===")

0:	learn: 1.1691725	test: 1.1822326	best: 1.1822326 (0)	total: 149ms	remaining: 24m 50s
500:	learn: 0.9800124	test: 1.0041901	best: 1.0041901 (500)	total: 11.8s	remaining: 3m 43s
1000:	learn: 0.9681333	test: 0.9976474	best: 0.9976474 (1000)	total: 22.8s	remaining: 3m 24s
1500:	learn: 0.9602025	test: 0.9945357	best: 0.9945357 (1500)	total: 33.1s	remaining: 3m 7s
2000:	learn: 0.9536521	test: 0.9925655	best: 0.9925655 (2000)	total: 43.2s	remaining: 2m 52s
2500:	learn: 0.9476920	test: 0.9912177	best: 0.9912177 (2500)	total: 53.3s	remaining: 2m 39s
3000:	learn: 0.9423214	test: 0.9902370	best: 0.9902370 (3000)	total: 1m 3s	remaining: 2m 27s
3500:	learn: 0.9372624	test: 0.9894870	best: 0.9894870 (3500)	total: 1m 13s	remaining: 2m 15s
4000:	learn: 0.9324028	test: 0.9888814	best: 0.9888759 (3998)	total: 1m 23s	remaining: 2m 5s
4500:	learn: 0.9276553	test: 0.9883356	best: 0.9883335 (4494)	total: 1m 34s	remaining: 1m 55s
5000:	learn: 0.9228782	test: 0.9878944	best: 0.9878908 (4987)	total: 1m 44s	

In [33]:
results_targets["Multi CatBoost"] = [scores_multi[t] for t in log_target_columns]

results_targets

Unnamed: 0_level_0,Baseline (mean_microcat),Independent CatBoost (w/o Image embeddings),Independent CatBoost,Multi CatBoost
Target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
real_weight,0.350348,0.265358,0.258457,0.268604
real_length,0.319958,0.295289,0.287794,0.290994
real_width,0.325912,0.299565,0.294882,0.296282
real_height,0.552994,0.503075,0.498499,0.513128


In [34]:
print(f"\nFinal Score Diff: {avg_score_indep_w_img} vs {avg_score_multi}")


Final Score Diff: 0.3349079916976393 vs 0.3422520639619967


Ожидаемо результат оказался хуже, так как для каждого таргета будет свой набор важных фичей

Далее я решил отойти от CatBoost и попробовать обучить нейросеть. Возможно она сможет лучше уловить зависимости

Для начала подготовим данные

In [None]:
num_feat = [
    "item_price",
    "extracted_weight",
    "extracted_length",
    "extracted_width",
    "extracted_height",
    "aspect_ratio",
    "log_price",
    "price_rel_to_category",
    "mean_microcat_log_real_weight",
    "mean_microcat_log_real_length",
    "mean_microcat_log_real_width",
    "mean_microcat_log_real_height",
]

X_train_num = X_train[num_feat].fillna(0).values.astype(np.float32)
X_val_num = X_val[num_feat].fillna(0).values.astype(np.float32)
X_test_num = X_test[num_feat].fillna(0).values.astype(np.float32)

# Важно нормировать все численные фичи
scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train_num)
X_val_num = scaler.transform(X_val_num)
X_test_num = scaler.transform(X_test_num)

# Также закодируем категориальные переменные LabelEncoding-ом. Я выбрал его, а не OHE, так как получим слишком много колонок
cat_feat = ["item_condition", "category_name", "subcategory_name", "microcat_name"]
cat_encoders = {}
X_train_cat = []
X_val_cat = []
X_test_cat = []

for target in cat_feat:
    le = LabelEncoder()
    all_values = pd.concat([X_train[target], X_val[target], X_test[target]]).astype(str).fillna("Unknown")
    le.fit(all_values)

    X_train_cat.append(le.transform(X_train[target].astype(str).fillna("Unknown")))
    X_val_cat.append(le.transform(X_val[target].astype(str).fillna("Unknown")))
    X_test_cat.append(le.transform(X_test[target].astype(str).fillna("Unknown")))
    cat_encoders[target] = le

X_train_cat = np.stack(X_train_cat, axis=1).astype(np.int64)
X_val_cat = np.stack(X_val_cat, axis=1).astype(np.int64)
X_test_cat = np.stack(X_test_cat, axis=1).astype(np.int64)

Я решил использовать предобученную модель RuBERT-tiny2 для более качественного извлечения эмбеддингов текста

In [None]:
MODEL_NAME = "cointegrated/rubert-tiny2"


def get_text_embeddings(texts: List, model_name: str = MODEL_NAME) -> np.ndarray:
    """
    Генерирует эмбеддинги текста с помощью предобученного трансформера.
    """
    print(f"Loading {model_name}...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()

    embeddings = []

    with torch.no_grad():
        for i in tqdm(range(0, len(texts), 256), desc="Extracting Text Embeddings"):
            batch_texts = texts[i : i + 256]

            encoded_input = tokenizer(batch_texts, padding=True, truncation=True, max_length=64, return_tensors="pt")
            encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
            model_output = model(**encoded_input)
            cls_embeddings = model_output.last_hidden_state[:, 0, :]
            embeddings.append(cls_embeddings.cpu().numpy())

    return np.vstack(embeddings)


all_texts_list = pd.concat([X_train["text"], X_val["text"], X_test["text"]]).tolist()

text_embs = get_text_embeddings(all_texts_list)

X_train_text_embs = text_embs[: len(X_train)]
X_val_text_embs = text_embs[len(X_train) : len(X_train) + len(X_val)]
X_test_text_embs = text_embs[len(X_train) + len(X_val) :]

Loading cointegrated/rubert-tiny2...


tokenizer_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

2026-01-15 18:28:27.145716: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1768501707.329626      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1768501707.381406      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1768501707.821471      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768501707.821499      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768501707.821502      55 computation_placer.cc:177] computation placer alr

model.safetensors:   0%|          | 0.00/118M [00:00<?, ?B/s]

Extracting Text Embeddings: 100%|██████████| 1497/1497 [01:33<00:00, 15.98it/s]


In [37]:
pca_cols = [f"img_pca_{i}" for i in range(96)]
X_train_img = X_train[pca_cols].values
X_val_img = X_val[pca_cols].values
X_test_img = X_test[pca_cols].values

In [None]:
# для обучения нейросети, которая будет выдавать все 4 таргета, придется убрать все выбросы сразу
train_mask = get_valid_mask(temp_train, "real_weight") & get_valid_mask(temp_train, "real_height") & get_valid_mask(temp_train, "real_length") & get_valid_mask(temp_train, "real_width")

X_train_num_m = X_train_num[train_mask]
X_train_cat_m = X_train_cat[train_mask]
X_train_text_embs_m = X_train_text_embs[train_mask]
X_train_img_m = X_train_img[train_mask]
y_train_m = y_train[train_mask]

In [None]:
class WaDDataset(Dataset):
    def __init__(self, nums, cats, text, imgs, targets=None):
        self.nums = torch.FloatTensor(nums)
        self.cats = torch.LongTensor(cats)
        self.text = torch.FloatTensor(text)
        self.imgs = torch.FloatTensor(imgs)
        self.targets = targets

    def __len__(self):
        return len(self.nums)

    def __getitem__(self, idx):
        inputs = {"num": self.nums[idx], "cat": self.cats[idx], "text": self.text[idx], "img": self.imgs[idx]}

        if self.targets is not None:
            return inputs, self.targets[idx]

        return inputs


In [None]:
train_ds_m = WaDDataset(X_train_num_m, X_train_cat_m, X_train_text_embs_m, X_train_img_m, y_train_m[log_target_columns].values)
val_ds_m = WaDDataset(X_val_num, X_val_cat, X_val_text_embs, X_val_img, y_val[log_target_columns].values)
test_ds = WaDDataset(X_test_num, X_test_cat, X_test_text_embs, X_test_img, None)


train_loader_m = DataLoader(train_ds_m, batch_size=256, shuffle=True)
val_loader_m = DataLoader(val_ds_m, batch_size=256, shuffle=False)
test_loader = DataLoader(test_ds, batch_size=256, shuffle=False)

In [None]:
class MultiFusionNet(nn.Module):
    def __init__(
        self,
        num_feat_dim,
        cat_counts,  # [кол-во уникальных для cat1, cat2...]
        text_input_dim=312,
        img_input_dim=1280,
    ):
        super().__init__()

        # --- 1. Ветка Категорий ---
        self.cat_embs = nn.ModuleList([nn.Embedding(c, min(50, (c + 1) // 2)) for c in cat_counts])

        # --- 2. Ветка Текста ---
        self.text_fc = nn.Sequential(nn.Linear(text_input_dim, 128), nn.BatchNorm1d(128), nn.ReLU(), nn.Dropout(0.1))

        # --- 3. Ветка Картинок ---
        self.img_fc = nn.Sequential(nn.Linear(img_input_dim, 256), nn.BatchNorm1d(256), nn.ReLU(), nn.Dropout(0.2))

        # --- 4. Ветка Чисел ---
        self.num_fc = nn.Sequential(nn.Linear(num_feat_dim, 64), nn.ReLU())

        # --- 5. Fusion (Слияние) ---
        fusion_dim = sum(e.embedding_dim for e in self.cat_embs) + 128 + 256 + 64

        self.head = nn.Sequential(nn.Linear(fusion_dim, 512), nn.BatchNorm1d(512), nn.ReLU(), nn.Dropout(0.3), nn.Linear(512, 128), nn.ReLU(), nn.Linear(128, 4))

    def forward(self, x):
        # 1. Categories
        cat_out = [emb(x["cat"][:, i]) for i, emb in enumerate(self.cat_embs)]
        cat_out = torch.cat(cat_out, dim=1)

        # 2. Text
        text_out = self.text_fc(x["text"])

        # 3. Image
        img_out = self.img_fc(x["img"])

        # 4. Numerical
        num_out = self.num_fc(x["num"])

        # Concat
        combined = torch.cat([cat_out, text_out, img_out, num_out], dim=1)

        return self.head(combined)

In [None]:
def train_nn(model: nn.Module, train_loader: DataLoader, val_loader: DataLoader, epochs: int = 15, lr: float = 1e-3, device: str = "cuda") -> nn.Module:
    """Функция тренировки нейросети, сохраняет веса с наименьшим лоссом и загружает их в конце обучения

    Args:
        model (nn.Module): Модель
        train_loader (DataLoader): Тренировочный даталоадер
        val_loader (DataLoader): Валидационный даталоадер
        epochs (int, optional): Кол-во эпох обучения. Defaults to 15.
        lr (float, optional): Learning Rate. Defaults to 1e-3.
        device (str, optional): device. Defaults to "cuda".

    Returns:
        nn.Module: Обученная модель
    """
    criterion = nn.L1Loss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs, eta_min=1e-6)

    best_loss = float("inf")
    best_weights = None

    print(f"Start training on {device}...")

    for epoch in tqdm(range(epochs)):
        model.train()
        train_loss = 0

        for inputs, targets in train_loader:
            inputs = {k: v.to(device) for k, v in inputs.items()}
            targets = targets.to(device)

            # Если таргет одномерный (Independent), делаем (Batch, 1)
            # Если многомерный (Multi), он уже (Batch, 4), ничего не меняется
            if targets.ndim == 1:
                targets = targets.view(-1, 1)

            optimizer.zero_grad()
            outputs = model(inputs)

            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        avg_train_loss = train_loss / len(train_loader)

        model.eval()
        val_loss = 0

        with torch.no_grad():
            for inputs, targets in val_loader:
                inputs = {k: v.to(device) for k, v in inputs.items()}
                targets = targets.to(device)

                if targets.ndim == 1:
                    targets = targets.view(-1, 1)

                outputs = model(inputs)
                loss = criterion(outputs, targets)
                val_loss += loss.item()

        avg_val_loss = val_loss / len(val_loader)

        scheduler.step(avg_val_loss)

        saved_msg = ""
        if best_loss - avg_val_loss >= 0.0001:
            best_loss = avg_val_loss
            best_weights = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            saved_msg = "-> Saved Best!"

        print(f"Epoch {epoch + 1}/{epochs} | Train MAE: {avg_train_loss:.5f} | Val MAE: {avg_val_loss:.5f} {saved_msg}")

    print(f"Training finished. Best Val MAE: {best_loss:.5f}")
    if best_weights is not None:
        model.load_state_dict(best_weights)

    return model

In [None]:
cat_counts = [len(enc.classes_) for enc in cat_encoders.values()]

net_m = MultiFusionNet(num_feat_dim=X_train_num.shape[1], cat_counts=cat_counts, text_input_dim=312, img_input_dim=96).to(device)

net_m = train_nn(net_m, train_loader_m, val_loader_m, epochs=30, lr=1e-3)

net_m.eval()
nn_m_preds = []
with torch.no_grad():
    for inputs, targets in val_loader_m:
        inputs = {k: v.to(device) for k, v in inputs.items()}
        targets = targets.to(device)

        nn_m_preds.append(net_m(inputs).cpu().numpy())
nn_m_preds = np.vstack(nn_m_preds)

scores_multi_nn = {}
for i, target in enumerate(log_target_columns):
    p = nn_m_preds[:, i]

    score = mean_absolute_error(y_val[target], p)
    scores_multi_nn[target] = score
    print(f"  > Score for {target}: {score:.5f}")

Start training on cuda...


  3%|▎         | 1/30 [00:08<04:17,  8.87s/it]

Epoch 1/30 | Train MAE: 0.39553 | Val MAE: 0.35576 -> Saved Best!


  7%|▋         | 2/30 [00:17<04:10,  8.95s/it]

Epoch 2/30 | Train MAE: 0.35983 | Val MAE: 0.34730 -> Saved Best!


 10%|█         | 3/30 [00:26<03:56,  8.74s/it]

Epoch 3/30 | Train MAE: 0.34991 | Val MAE: 0.34349 -> Saved Best!


 13%|█▎        | 4/30 [00:34<03:45,  8.66s/it]

Epoch 4/30 | Train MAE: 0.34400 | Val MAE: 0.33843 -> Saved Best!


 17%|█▋        | 5/30 [00:43<03:35,  8.62s/it]

Epoch 5/30 | Train MAE: 0.33843 | Val MAE: 0.33662 -> Saved Best!


 20%|██        | 6/30 [00:52<03:27,  8.65s/it]

Epoch 6/30 | Train MAE: 0.33497 | Val MAE: 0.33278 -> Saved Best!


 23%|██▎       | 7/30 [01:00<03:18,  8.62s/it]

Epoch 7/30 | Train MAE: 0.33149 | Val MAE: 0.33337 


 27%|██▋       | 8/30 [01:09<03:09,  8.61s/it]

Epoch 8/30 | Train MAE: 0.32919 | Val MAE: 0.33156 -> Saved Best!


 30%|███       | 9/30 [01:17<03:00,  8.60s/it]

Epoch 9/30 | Train MAE: 0.32614 | Val MAE: 0.32990 -> Saved Best!


 33%|███▎      | 10/30 [01:26<02:51,  8.59s/it]

Epoch 10/30 | Train MAE: 0.32404 | Val MAE: 0.33219 


 37%|███▋      | 11/30 [01:34<02:42,  8.57s/it]

Epoch 11/30 | Train MAE: 0.32167 | Val MAE: 0.32921 -> Saved Best!


 40%|████      | 12/30 [01:43<02:34,  8.57s/it]

Epoch 12/30 | Train MAE: 0.31986 | Val MAE: 0.32997 


 43%|████▎     | 13/30 [01:52<02:25,  8.54s/it]

Epoch 13/30 | Train MAE: 0.31797 | Val MAE: 0.32947 


 47%|████▋     | 14/30 [02:00<02:16,  8.53s/it]

Epoch 14/30 | Train MAE: 0.31649 | Val MAE: 0.32792 -> Saved Best!


 50%|█████     | 15/30 [02:09<02:08,  8.57s/it]

Epoch 15/30 | Train MAE: 0.31491 | Val MAE: 0.33132 


 53%|█████▎    | 16/30 [02:17<01:59,  8.55s/it]

Epoch 16/30 | Train MAE: 0.31357 | Val MAE: 0.32823 


 57%|█████▋    | 17/30 [02:26<01:50,  8.53s/it]

Epoch 17/30 | Train MAE: 0.31211 | Val MAE: 0.32842 


 60%|██████    | 18/30 [02:34<01:42,  8.52s/it]

Epoch 18/30 | Train MAE: 0.31056 | Val MAE: 0.32854 


 63%|██████▎   | 19/30 [02:43<01:33,  8.53s/it]

Epoch 19/30 | Train MAE: 0.30943 | Val MAE: 0.32837 


 67%|██████▋   | 20/30 [02:51<01:25,  8.56s/it]

Epoch 20/30 | Train MAE: 0.30828 | Val MAE: 0.32808 


 70%|███████   | 21/30 [03:00<01:16,  8.55s/it]

Epoch 21/30 | Train MAE: 0.30690 | Val MAE: 0.32819 


 73%|███████▎  | 22/30 [03:08<01:08,  8.56s/it]

Epoch 22/30 | Train MAE: 0.30588 | Val MAE: 0.33052 


 77%|███████▋  | 23/30 [03:17<00:59,  8.56s/it]

Epoch 23/30 | Train MAE: 0.30502 | Val MAE: 0.33128 


 80%|████████  | 24/30 [03:26<00:51,  8.54s/it]

Epoch 24/30 | Train MAE: 0.30387 | Val MAE: 0.32927 


 83%|████████▎ | 25/30 [03:34<00:42,  8.55s/it]

Epoch 25/30 | Train MAE: 0.30283 | Val MAE: 0.32952 


 87%|████████▋ | 26/30 [03:43<00:34,  8.54s/it]

Epoch 26/30 | Train MAE: 0.30173 | Val MAE: 0.32901 


 90%|█████████ | 27/30 [03:51<00:25,  8.55s/it]

Epoch 27/30 | Train MAE: 0.30067 | Val MAE: 0.33119 


 93%|█████████▎| 28/30 [04:00<00:17,  8.55s/it]

Epoch 28/30 | Train MAE: 0.30018 | Val MAE: 0.32968 


 97%|█████████▋| 29/30 [04:08<00:08,  8.54s/it]

Epoch 29/30 | Train MAE: 0.29922 | Val MAE: 0.32921 


100%|██████████| 30/30 [04:17<00:00,  8.58s/it]

Epoch 30/30 | Train MAE: 0.29864 | Val MAE: 0.32907 
Training finished. Best Val MAE: 0.32792





  > Score for log_real_weight: 0.24412
  > Score for log_real_length: 0.28372
  > Score for log_real_width: 0.29146
  > Score for log_real_height: 0.49220


In [None]:
results_targets["Multi NN"] = [scores_multi_nn[t] for t in log_target_columns]

results_targets

Unnamed: 0_level_0,Baseline (mean_microcat),Independent CatBoost (w/o Image embeddings),Independent CatBoost,Multi CatBoost,Multi NN
Target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
real_weight,0.350348,0.265358,0.258457,0.268604,0.244118
real_length,0.319958,0.295289,0.287794,0.290994,0.283718
real_width,0.325912,0.299565,0.294882,0.296282,0.291456
real_height,0.552994,0.503075,0.498499,0.513128,0.492199


Нейросеть превзошла CatBoost

Стоит попробовать обучать модель под каждый таргет независимо, раз это дало значительное улучшение в случае с CatBoost

In [None]:
class IndependentFusionNet(nn.Module):
    def __init__(self, num_feat_dim, cat_counts, text_input_dim=312, img_input_dim=96):
        super().__init__()

        self.cat_embs = nn.ModuleList([nn.Embedding(c, min(50, (c + 1) // 2)) for c in cat_counts])
        self.text_fc = nn.Sequential(nn.Linear(text_input_dim, 128), nn.BatchNorm1d(128), nn.ReLU(), nn.Dropout(0.1))
        self.img_fc = nn.Sequential(nn.Linear(img_input_dim, 128), nn.ReLU(), nn.Dropout(0.3))
        self.num_fc = nn.Sequential(nn.Linear(num_feat_dim, 64), nn.ReLU())

        fusion_dim = sum(e.embedding_dim for e in self.cat_embs) + 128 + 128 + 64
        self.head = nn.Sequential(nn.Linear(fusion_dim, 256), nn.BatchNorm1d(256), nn.ReLU(), nn.Dropout(0.4), nn.Linear(256, 64), nn.ReLU(), nn.Linear(64, 1))

    def forward(self, x):
        cats = [emb(x["cat"][:, i]) for i, emb in enumerate(self.cat_embs)]
        x_cat = torch.cat(cats, 1)
        x_txt = self.text_fc(x["text"])
        x_img = self.img_fc(x["img"])
        x_num = self.num_fc(x["num"])
        return self.head(torch.cat([x_cat, x_txt, x_img, x_num], 1))

In [None]:
nn_models = {}
nn_scores = {}

print("Starting Independent NN Training...")

for target, log_target in zip(target_columns, log_target_columns):
    print(f"\n>>> Training NN for {target} ({log_target})...")

    train_mask = get_valid_mask(temp_train, target).values
    val_mask = get_valid_mask(temp_val, target).values

    print(f"    Clean Train: {train_mask.sum()} / {len(train_mask)}")

    tr_num = X_train_num[train_mask]
    tr_cat = X_train_cat[train_mask]
    tr_txt = X_train_text_embs[train_mask]
    tr_img = X_train_img[train_mask]
    tr_y = y_train[log_target].values.astype(np.float32)[train_mask]

    curr_train_ds = WaDDataset(tr_num, tr_cat, tr_txt, tr_img, tr_y)
    curr_val_ds = WaDDataset(X_val_num, X_val_cat, X_val_text_embs, X_val_img, y_val[log_target].values.astype(np.float32))

    curr_train_loader = DataLoader(curr_train_ds, batch_size=256, shuffle=True, num_workers=0)
    curr_val_loader = DataLoader(curr_val_ds, batch_size=256, shuffle=False, num_workers=0)

    cat_counts = [len(enc.classes_) for enc in cat_encoders.values()]
    model = IndependentFusionNet(num_feat_dim=X_train_num.shape[1], cat_counts=cat_counts, text_input_dim=312, img_input_dim=96).to(device)

    model = train_nn(model, curr_train_loader, curr_val_loader, lr=1e-3, epochs=20)
    nn_models[target] = model

    model.eval()
    preds = []
    with torch.no_grad():
        for inputs, _ in curr_val_loader:
            inputs = {k: v.to(device) for k, v in inputs.items()}
            out = model(inputs)
            preds.append(out.cpu().numpy())

    full_preds = np.vstack(preds).flatten()
    real_score = mean_absolute_error(y_val[log_target], full_preds)
    nn_scores[target] = real_score
    print(f"Final NN Independent {target} Score: {real_score}")


avg_nn_score = np.mean(list(nn_scores.values()))
print(f"\n=== FINAL NN Independent Score: {avg_nn_score:.5f} ===")

Starting Independent NN Training...

>>> Training NN for real_weight (log_real_weight)...
    Clean Train: 249661 / 250326
Start training on cuda...


  5%|▌         | 1/20 [00:07<02:22,  7.52s/it]

Epoch 1/20 | Train MAE: 0.27882 | Val MAE: 0.26084 -> Saved Best!


 10%|█         | 2/20 [00:15<02:18,  7.69s/it]

Epoch 2/20 | Train MAE: 0.26108 | Val MAE: 0.25402 -> Saved Best!


 15%|█▌        | 3/20 [00:22<02:06,  7.42s/it]

Epoch 3/20 | Train MAE: 0.25516 | Val MAE: 0.24983 -> Saved Best!


 20%|██        | 4/20 [00:30<02:01,  7.58s/it]

Epoch 4/20 | Train MAE: 0.25109 | Val MAE: 0.24942 -> Saved Best!


 25%|██▌       | 5/20 [00:37<01:53,  7.58s/it]

Epoch 5/20 | Train MAE: 0.24816 | Val MAE: 0.25088 


 30%|███       | 6/20 [00:45<01:45,  7.55s/it]

Epoch 6/20 | Train MAE: 0.24515 | Val MAE: 0.24731 -> Saved Best!


 35%|███▌      | 7/20 [00:53<01:39,  7.69s/it]

Epoch 7/20 | Train MAE: 0.24291 | Val MAE: 0.24485 -> Saved Best!


 40%|████      | 8/20 [01:00<01:31,  7.64s/it]

Epoch 8/20 | Train MAE: 0.24055 | Val MAE: 0.24419 -> Saved Best!


 45%|████▌     | 9/20 [01:08<01:23,  7.60s/it]

Epoch 9/20 | Train MAE: 0.23928 | Val MAE: 0.24506 


 50%|█████     | 10/20 [01:15<01:15,  7.56s/it]

Epoch 10/20 | Train MAE: 0.23719 | Val MAE: 0.24400 -> Saved Best!


 55%|█████▌    | 11/20 [01:23<01:07,  7.55s/it]

Epoch 11/20 | Train MAE: 0.23543 | Val MAE: 0.24463 


 60%|██████    | 12/20 [01:31<01:01,  7.65s/it]

Epoch 12/20 | Train MAE: 0.23429 | Val MAE: 0.24262 -> Saved Best!


 65%|██████▌   | 13/20 [01:38<00:53,  7.59s/it]

Epoch 13/20 | Train MAE: 0.23325 | Val MAE: 0.24293 


 70%|███████   | 14/20 [01:46<00:45,  7.55s/it]

Epoch 14/20 | Train MAE: 0.23124 | Val MAE: 0.24300 


 75%|███████▌  | 15/20 [01:54<00:38,  7.67s/it]

Epoch 15/20 | Train MAE: 0.23069 | Val MAE: 0.24239 -> Saved Best!


 80%|████████  | 16/20 [02:01<00:30,  7.52s/it]

Epoch 16/20 | Train MAE: 0.22956 | Val MAE: 0.24179 -> Saved Best!


 85%|████████▌ | 17/20 [02:09<00:22,  7.63s/it]

Epoch 17/20 | Train MAE: 0.22846 | Val MAE: 0.24248 


 90%|█████████ | 18/20 [02:16<00:15,  7.59s/it]

Epoch 18/20 | Train MAE: 0.22738 | Val MAE: 0.24222 


 95%|█████████▌| 19/20 [02:24<00:07,  7.59s/it]

Epoch 19/20 | Train MAE: 0.22646 | Val MAE: 0.24178 


100%|██████████| 20/20 [02:32<00:00,  7.60s/it]

Epoch 20/20 | Train MAE: 0.22569 | Val MAE: 0.24141 -> Saved Best!
Training finished. Best Val MAE: 0.24141





Final NN Independent real_weight Score: 0.24131014759039968

>>> Training NN for real_length (log_real_length)...
    Clean Train: 249598 / 250326
Start training on cuda...


  5%|▌         | 1/20 [00:07<02:23,  7.55s/it]

Epoch 1/20 | Train MAE: 0.42329 | Val MAE: 0.33477 -> Saved Best!


 10%|█         | 2/20 [00:15<02:20,  7.79s/it]

Epoch 2/20 | Train MAE: 0.33316 | Val MAE: 0.31198 -> Saved Best!


 15%|█▌        | 3/20 [00:22<02:09,  7.59s/it]

Epoch 3/20 | Train MAE: 0.30450 | Val MAE: 0.29173 -> Saved Best!


 20%|██        | 4/20 [00:30<02:03,  7.72s/it]

Epoch 4/20 | Train MAE: 0.29693 | Val MAE: 0.29177 


 25%|██▌       | 5/20 [00:38<01:54,  7.66s/it]

Epoch 5/20 | Train MAE: 0.29243 | Val MAE: 0.28865 -> Saved Best!


 30%|███       | 6/20 [00:45<01:46,  7.62s/it]

Epoch 6/20 | Train MAE: 0.28902 | Val MAE: 0.29014 


 35%|███▌      | 7/20 [00:53<01:40,  7.72s/it]

Epoch 7/20 | Train MAE: 0.28547 | Val MAE: 0.28576 -> Saved Best!


 40%|████      | 8/20 [01:01<01:32,  7.71s/it]

Epoch 8/20 | Train MAE: 0.28288 | Val MAE: 0.28462 -> Saved Best!


 45%|████▌     | 9/20 [01:09<01:24,  7.67s/it]

Epoch 9/20 | Train MAE: 0.28055 | Val MAE: 0.28420 -> Saved Best!


 50%|█████     | 10/20 [01:16<01:16,  7.64s/it]

Epoch 10/20 | Train MAE: 0.27817 | Val MAE: 0.28358 -> Saved Best!


 55%|█████▌    | 11/20 [01:24<01:08,  7.62s/it]

Epoch 11/20 | Train MAE: 0.27645 | Val MAE: 0.28427 


 60%|██████    | 12/20 [01:32<01:01,  7.71s/it]

Epoch 12/20 | Train MAE: 0.27483 | Val MAE: 0.28284 -> Saved Best!


 65%|██████▌   | 13/20 [01:39<00:53,  7.67s/it]

Epoch 13/20 | Train MAE: 0.27365 | Val MAE: 0.28407 


 70%|███████   | 14/20 [01:47<00:45,  7.66s/it]

Epoch 14/20 | Train MAE: 0.27201 | Val MAE: 0.28239 -> Saved Best!


 75%|███████▌  | 15/20 [01:55<00:38,  7.74s/it]

Epoch 15/20 | Train MAE: 0.27137 | Val MAE: 0.28241 


 80%|████████  | 16/20 [02:02<00:30,  7.61s/it]

Epoch 16/20 | Train MAE: 0.27002 | Val MAE: 0.28265 


 85%|████████▌ | 17/20 [02:10<00:23,  7.70s/it]

Epoch 17/20 | Train MAE: 0.26896 | Val MAE: 0.28279 


 90%|█████████ | 18/20 [02:18<00:15,  7.69s/it]

Epoch 18/20 | Train MAE: 0.26779 | Val MAE: 0.28294 


 95%|█████████▌| 19/20 [02:25<00:07,  7.67s/it]

Epoch 19/20 | Train MAE: 0.26725 | Val MAE: 0.28202 -> Saved Best!


100%|██████████| 20/20 [02:33<00:00,  7.69s/it]

Epoch 20/20 | Train MAE: 0.26601 | Val MAE: 0.28343 
Training finished. Best Val MAE: 0.28202





Final NN Independent real_length Score: 0.2819979989535552

>>> Training NN for real_width (log_real_width)...
    Clean Train: 249706 / 250326
Start training on cuda...


  5%|▌         | 1/20 [00:07<02:25,  7.66s/it]

Epoch 1/20 | Train MAE: 0.41242 | Val MAE: 0.31619 -> Saved Best!


 10%|█         | 2/20 [00:15<02:20,  7.80s/it]

Epoch 2/20 | Train MAE: 0.33663 | Val MAE: 0.31095 -> Saved Best!


 15%|█▌        | 3/20 [00:22<02:08,  7.54s/it]

Epoch 3/20 | Train MAE: 0.31254 | Val MAE: 0.30948 -> Saved Best!


 20%|██        | 4/20 [00:30<02:02,  7.68s/it]

Epoch 4/20 | Train MAE: 0.30513 | Val MAE: 0.30590 -> Saved Best!


 25%|██▌       | 5/20 [00:38<01:54,  7.64s/it]

Epoch 5/20 | Train MAE: 0.30053 | Val MAE: 0.29951 -> Saved Best!


 30%|███       | 6/20 [00:45<01:46,  7.64s/it]

Epoch 6/20 | Train MAE: 0.29824 | Val MAE: 0.29443 -> Saved Best!


 35%|███▌      | 7/20 [00:53<01:41,  7.77s/it]

Epoch 7/20 | Train MAE: 0.29530 | Val MAE: 0.29341 -> Saved Best!


 40%|████      | 8/20 [01:01<01:32,  7.72s/it]

Epoch 8/20 | Train MAE: 0.29363 | Val MAE: 0.29934 


 45%|████▌     | 9/20 [01:09<01:24,  7.66s/it]

Epoch 9/20 | Train MAE: 0.29152 | Val MAE: 0.29399 


 50%|█████     | 10/20 [01:16<01:16,  7.62s/it]

Epoch 10/20 | Train MAE: 0.28935 | Val MAE: 0.29524 


 55%|█████▌    | 11/20 [01:24<01:08,  7.60s/it]

Epoch 11/20 | Train MAE: 0.28752 | Val MAE: 0.29293 -> Saved Best!


 60%|██████    | 12/20 [01:32<01:01,  7.71s/it]

Epoch 12/20 | Train MAE: 0.28657 | Val MAE: 0.29226 -> Saved Best!


 65%|██████▌   | 13/20 [01:39<00:53,  7.68s/it]

Epoch 13/20 | Train MAE: 0.28474 | Val MAE: 0.29159 -> Saved Best!


 70%|███████   | 14/20 [01:47<00:46,  7.67s/it]

Epoch 14/20 | Train MAE: 0.28333 | Val MAE: 0.29231 


 75%|███████▌  | 15/20 [01:54<00:38,  7.65s/it]

Epoch 15/20 | Train MAE: 0.28218 | Val MAE: 0.29367 


 80%|████████  | 16/20 [02:02<00:30,  7.61s/it]

Epoch 16/20 | Train MAE: 0.28128 | Val MAE: 0.29152 


 85%|████████▌ | 17/20 [02:10<00:23,  7.73s/it]

Epoch 17/20 | Train MAE: 0.27950 | Val MAE: 0.29249 


 90%|█████████ | 18/20 [02:18<00:15,  7.68s/it]

Epoch 18/20 | Train MAE: 0.27824 | Val MAE: 0.29288 


 95%|█████████▌| 19/20 [02:25<00:07,  7.66s/it]

Epoch 19/20 | Train MAE: 0.27763 | Val MAE: 0.29147 -> Saved Best!


100%|██████████| 20/20 [02:33<00:00,  7.68s/it]

Epoch 20/20 | Train MAE: 0.27691 | Val MAE: 0.29231 
Training finished. Best Val MAE: 0.29147





Final NN Independent real_width Score: 0.29143795147255663

>>> Training NN for real_height (log_real_height)...
    Clean Train: 249889 / 250326
Start training on cuda...


  5%|▌         | 1/20 [00:07<02:25,  7.65s/it]

Epoch 1/20 | Train MAE: 0.56829 | Val MAE: 0.50851 -> Saved Best!


 10%|█         | 2/20 [00:15<02:17,  7.65s/it]

Epoch 2/20 | Train MAE: 0.52356 | Val MAE: 0.51969 


 15%|█▌        | 3/20 [00:22<02:09,  7.63s/it]

Epoch 3/20 | Train MAE: 0.51228 | Val MAE: 0.50507 -> Saved Best!


 20%|██        | 4/20 [00:30<02:04,  7.78s/it]

Epoch 4/20 | Train MAE: 0.50532 | Val MAE: 0.51810 


 25%|██▌       | 5/20 [00:38<01:55,  7.70s/it]

Epoch 5/20 | Train MAE: 0.50195 | Val MAE: 0.50675 


 30%|███       | 6/20 [00:46<01:47,  7.68s/it]

Epoch 6/20 | Train MAE: 0.49843 | Val MAE: 0.50357 -> Saved Best!


 35%|███▌      | 7/20 [00:54<01:41,  7.78s/it]

Epoch 7/20 | Train MAE: 0.49540 | Val MAE: 0.49865 -> Saved Best!


 40%|████      | 8/20 [01:01<01:32,  7.72s/it]

Epoch 8/20 | Train MAE: 0.49254 | Val MAE: 0.50989 


 45%|████▌     | 9/20 [01:09<01:24,  7.71s/it]

Epoch 9/20 | Train MAE: 0.48967 | Val MAE: 0.49843 -> Saved Best!


 50%|█████     | 10/20 [01:17<01:16,  7.69s/it]

Epoch 10/20 | Train MAE: 0.48719 | Val MAE: 0.49908 


 55%|█████▌    | 11/20 [01:24<01:09,  7.68s/it]

Epoch 11/20 | Train MAE: 0.48492 | Val MAE: 0.49634 -> Saved Best!


 60%|██████    | 12/20 [01:32<01:02,  7.80s/it]

Epoch 12/20 | Train MAE: 0.48228 | Val MAE: 0.49558 -> Saved Best!


 65%|██████▌   | 13/20 [01:40<00:54,  7.72s/it]

Epoch 13/20 | Train MAE: 0.47984 | Val MAE: 0.49633 


 70%|███████   | 14/20 [01:47<00:46,  7.68s/it]

Epoch 14/20 | Train MAE: 0.47816 | Val MAE: 0.50017 


 75%|███████▌  | 15/20 [01:55<00:38,  7.65s/it]

Epoch 15/20 | Train MAE: 0.47609 | Val MAE: 0.49686 


 80%|████████  | 16/20 [02:03<00:30,  7.63s/it]

Epoch 16/20 | Train MAE: 0.47349 | Val MAE: 0.49796 


 85%|████████▌ | 17/20 [02:10<00:23,  7.70s/it]

Epoch 17/20 | Train MAE: 0.47125 | Val MAE: 0.49684 


 90%|█████████ | 18/20 [02:18<00:15,  7.65s/it]

Epoch 18/20 | Train MAE: 0.46992 | Val MAE: 0.49948 


 95%|█████████▌| 19/20 [02:26<00:07,  7.63s/it]

Epoch 19/20 | Train MAE: 0.46828 | Val MAE: 0.50331 


100%|██████████| 20/20 [02:33<00:00,  7.70s/it]

Epoch 20/20 | Train MAE: 0.46670 | Val MAE: 0.49828 
Training finished. Best Val MAE: 0.49558





Final NN Independent real_height Score: 0.4955475874968849

=== FINAL NN Independent Score: 0.32757 ===


In [None]:
results_targets["Independent NN"] = [nn_scores[t] for t in target_columns]

results_targets

Unnamed: 0_level_0,Baseline (mean_microcat),Independent CatBoost (w/o Image embeddings),Independent CatBoost,Multi CatBoost,Multi NN,Independent NN
Target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
real_weight,0.350348,0.265358,0.258457,0.268604,0.244118,0.24131
real_length,0.319958,0.295289,0.287794,0.290994,0.283718,0.281998
real_width,0.325912,0.299565,0.294882,0.296282,0.291456,0.291438
real_height,0.552994,0.503075,0.498499,0.513128,0.492199,0.495548


Независимый подход не дал значительного улучшения

In [50]:
result_scores = results_targets.mean().sort_values()

result_scores

Independent NN                                 0.327573
Multi NN                                       0.327873
Independent CatBoost                           0.334908
Independent CatBoost (w/o Image embeddings)    0.340822
Multi CatBoost                                 0.342252
Baseline (mean_microcat)                       0.387303
dtype: float64

In [None]:
def get_submission_nn(model, test_loader):
    submission = pd.DataFrame()
    submission["item_id"] = test.index
    submission_preds = []

    with torch.no_grad():
        if isinstance(model, dict):
            target_results = {target: [] for target in target_columns}
            for inputs in test_loader:
                inputs = {k: v.to(device) for k, v in inputs.items()}
                for target in target_columns:
                    outputs = model[target](inputs)
                    target_results[target].append(outputs.cpu().numpy())

            cols = [np.vstack(target_results[t]) for t in target_columns]
            submission_preds = np.hstack(cols)
        else:
            for inputs in test_loader:
                inputs = {k: v.to(device) for k, v in inputs.items()}
                outputs = model(inputs)
                submission_preds.append(outputs.cpu().numpy())
            submission_preds = np.vstack(submission_preds)

    submission_preds = np.maximum(0, np.expm1(submission_preds))
    submission["weight"] = submission_preds[:, 0]
    submission["height"] = submission_preds[:, 3]
    submission["length"] = submission_preds[:, 1]
    submission["width"] = submission_preds[:, 2]

    submission.to_csv("submission.csv", index=False)
    print("submission.csv saved")
    return submission

In [53]:
sub = get_submission_nn(nn_models, test_loader)

sub

submission.csv saved


Unnamed: 0,item_id,weight,height,length,width
0,163755,0.332757,7.021416,23.686356,22.572178
1,1339648,2.011257,18.881567,37.210915,30.044701
2,21095,1.088169,12.535995,34.194679,25.167265
3,925424,0.391760,3.183912,32.440109,23.186459
4,780125,3.386568,17.440935,35.311657,26.748854
...,...,...,...,...,...
70269,1207676,0.347877,3.783942,32.915623,23.594051
70270,1614448,0.725141,13.584761,25.660780,20.045477
70271,1787906,0.326247,7.646830,18.148447,13.503795
70272,897587,0.661844,8.373682,34.022106,25.754971


### Лучший тестовый скор на Stepik показала Multi NN. 0.318857
