In [1]:
import os
import re
from collections import Counter
from collections.abc import Iterable
from typing import Any, Dict, List
from IPython.display import display_html

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from catboost import CatBoostRegressor, Pool
from PIL import Image
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms as T
from torchvision.models import efficientnet_v2_s
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
train = pd.read_parquet("/kaggle/input/aaa-exam-data/train.parquet")
train = train.set_index("item_id")
test = pd.read_parquet("/kaggle/input/aaa-exam-data/test.parquet")
test = test.set_index("item_id")

target_columns = ["real_weight", "real_length", "real_width", "real_height"]

У меня появилась гипотеза о том, что длина >= ширина >= высота

In [4]:
dim_data = train[["real_height", "real_length", "real_width"]].values

temp_analyze = train[["microcat_name"]].copy()
temp_analyze["real_min"] = np.min(dim_data, axis=1)
temp_analyze["real_mid"] = np.median(dim_data, axis=1)
temp_analyze["real_max"] = np.max(dim_data, axis=1)


In [5]:
temp_analyze.groupby(by="microcat_name")[["real_min", "real_mid", "real_max"]].mean()

Unnamed: 0_level_0,real_min,real_mid,real_max
microcat_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"CD, DVD и Blu-ray приводы",10.171717,17.707071,22.575758
GPS-навигаторы,10.755102,16.474490,21.852041
MP3-плееры,10.103321,15.442804,20.693727
Автоакустика,17.773181,26.845934,36.281027
Автосвет,20.250000,29.076923,42.625000
...,...,...,...
Электронные книги,10.033654,19.173077,24.961538
Электрооборудование,11.977264,18.324899,27.255226
Эпиляторы,11.071429,17.914286,22.871429
"Этикетки, бутылки, пробки",14.445876,21.007732,30.201031


In [6]:
train.groupby(by="microcat_name")[["real_height", "real_width", "real_length"]].mean()

Unnamed: 0_level_0,real_height,real_width,real_length
microcat_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"CD, DVD и Blu-ray приводы",10.171717,17.707071,22.575758
GPS-навигаторы,10.755102,16.474490,21.852041
MP3-плееры,10.103321,15.442804,20.693727
Автоакустика,17.773181,26.845934,36.281027
Автосвет,20.250000,29.076923,42.625000
...,...,...,...
Электронные книги,10.033654,19.173077,24.961538
Электрооборудование,11.977264,18.324899,27.255226
Эпиляторы,11.071429,17.914286,22.871429
"Этикетки, бутылки, пробки",14.445876,21.007732,30.201031


In [7]:
np.all(
    temp_analyze.groupby(by="microcat_name")[["real_min", "real_mid", "real_max"]].mean().values
    == train.groupby(by="microcat_name")[["real_height", "real_width", "real_length"]].mean().values
)

np.True_

Средние по всем микрокатегориям совпадают, так что скорее всего гипотеза верна

In [4]:
def remove_outliers(df: pd.DataFrame, cat_col: str = "microcat_name") -> pd.DataFrame:
    """
    Удаления экстремальных выбросов из датафрейма.
    Группирует и считает верхние и нижние границы выбросов по колонке cat_col.

    Args:
        df (pd.DataFrame): Исходный датафрейм
        cat_col (str, optional): Колонка для группировки. Defaults to "microcat_name".

    Returns:
        pd.DataFrame: Датафрейм, очищенный от выбросов
    """
    df_clean = df.copy()

    target_cols = ["real_weight", "real_length", "real_width", "real_height"]

    for col in target_cols:
        upper_limit = df_clean.groupby(cat_col)[col].transform(lambda x: x.quantile(0.99))
        lower_limit = df_clean.groupby(cat_col)[col].transform(lambda x: x.quantile(0.01))

        mask = (df_clean[col] < upper_limit) & (df_clean[col] > lower_limit)
        mask = mask | (upper_limit == lower_limit)
        df_clean = df_clean[mask]

        print(mask.sum())

    # return df_clean # закомментируем на время

In [9]:
remove_outliers(train)

305560
287105
273759
241035


При полном последовательном очищении таргетов мы теряем большое число наблюдений.

Для каждого таргета будем использовать свою маску валидного датасета, тк для обучения предсказания длины, высота в принципе может являться выбросом

In [5]:
def get_valid_mask(df: pd.DataFrame, target: str, cat_col: str = "microcat_name") -> List:
    """
    Поиск выбросов по конкретному таргету в датафрейме.
    1 - валидная строка, 0 - выброс по target

    Args:
        df (pd.DataFrame): Исходный датафрейм
        target (str): Таргет
        cat_col (str, optional): Колонка для группировки. Defaults to "microcat_name".

    Returns:
        List: Бинарную маску валидных строк датафрейм
    """
    gb = df.groupby(cat_col)[target]
    upper = gb.transform(lambda x: x.quantile(0.999))
    lower = gb.transform(lambda x: x.quantile(0.001))

    quantile_mask = (df[target] <= upper) & (df[target] >= lower)

    mask = quantile_mask | (upper == lower)

    return mask

In [6]:
def text_clean(text: str) -> str:
    """
    Форматирование и очистка текста при помощи регулярного выражения

    Args:
        text (str): Исходный текст

    Returns:
        str: Очищенная и отформатированная строка
    """
    if pd.isna(text):
        return ""

    text = str(text).lower()
    text = text.replace("\n", " ").replace("\r", " ")
    text = re.sub(r"[^а-яёa-z0-9\s\.\,\-\*]", " ", text)

    text = " ".join(text.split())

    return text


def extract_weight(text: str) -> float:
    """Извлечение веса из текста при помощи регулярного выражения

    Args:
        text (str): Исходный текст

    Returns:
        float: Извлеченной значение веса в кг
    """
    if pd.isna(text):
        return None

    text = text.lower()
    pattern = r"(\d+[.,]?\d*)\s*(кг|kg|г|гр|g|ml|мл)"

    match = re.search(pattern, text)
    if match:
        value_str = match.group(1).replace(",", ".")
        unit = match.group(2)

        try:
            value = float(value_str)
        except ValueError:
            return None

        if unit in ["г", "гр", "g", "ml", "мл"]:
            return value / 1000.0
        return value

    return None


def extract_dimensions(text: str) -> tuple[float, float, float]:
    """Извлечение Длины, Ширины и Высоты из текста при помощи регулярного выражения

    Args:
        text (str): Исходный текст

    Returns:
        tuple[float, float, float]: Извлеченные Длина, Ширина и Высота
    """
    if pd.isna(text) or text == "":
        return -1, -1, -1

    text = str(text).lower()
    text = text.replace(",", ".")

    units_map = {"мм": 0.1, "mm": 0.1, "см": 1.0, "cm": 1.0, "м": 100.0, "m": 100.0}

    pattern_3d = r"(\d+(?:\.\d+)?)\s*[xх*]\s*(\d+(?:\.\d+)?)\s*[xх*]\s*(\d+(?:\.\d+)?)\s*(мм|mm|см|cm|м|m)?"
    match = re.search(pattern_3d, text)
    dims = []

    if match:
        try:
            d1 = float(match.group(1))
            d2 = float(match.group(2))
            d3 = float(match.group(3))
            unit = match.group(4)

            dims = [d1, d2, d3]

            if unit in units_map:
                factor = units_map[unit]
                dims = [d * factor for d in dims]

        except ValueError:
            return -1, -1, -1
    else:
        return -1, -1, -1

    dims.sort()  # [Min, Mid, Max]
    return dims[0], dims[1], dims[2]


In [7]:
def data_prepare(df: pd.DataFrame) -> pd.DataFrame:
    """Подготовка датафрейма к работе.
    Заполняет пропуски, форматирует и чистит текст, извлекает из текста фичи, дропает ненужные колонки

    Args:
        df (pd.DataFrame): Исходный датафрейм

    Returns:
        pd.DataFrame: Подготовленный датафрейм
    """
    df["item_condition"] = df["item_condition"].fillna("Unknown")

    df["text"] = df["title"].fillna("") + " " + df["description"].fillna("")
    df["text"] = df["text"].apply(text_clean)

    df["extracted_weight"] = df["text"].apply(extract_weight)

    dims_list = df["text"].apply(extract_dimensions).tolist()
    dims_df = pd.DataFrame(dims_list, columns=["extracted_length", "extracted_width", "extracted_height"], index=df.index)
    df = pd.concat([df, dims_df], axis=1)

    for col in ["extracted_weight", "extracted_length", "extracted_width", "extracted_height"]:
        df[col] = df[col].fillna(-1)

    df = df.drop(columns=["order_date", "seller_id", "buyer_id", "description", "title"])

    return df


train = data_prepare(train)
test = data_prepare(test)


Добавим логарифмированные таргеты. Будем обучать модель именно по ним, для соответствия с пространством метрики

In [11]:
for target in target_columns:
    train["log_" + target] = np.log(1 + train[target])

log_target_columns = ["log_" + target for target in target_columns]
all_targets = target_columns + log_target_columns

In [8]:
train.head()

Unnamed: 0_level_0,item_condition,item_price,category_name,subcategory_name,microcat_name,image_name,real_weight,real_height,real_length,real_width,text,extracted_weight,extracted_length,extracted_width,extracted_height
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
185689,Б/у,3000.0,Транспорт,Запчасти и аксессуары,Салон,185689.jpg,0.37,10.0,23.0,19.0,ручка акпп mercedes w203 avangarde ручка рычаг...,-1.0,-1.0,-1.0,-1.0
1914373,Новое с биркой,5990.0,Личные вещи,"Одежда, обувь, аксессуары",Зимние куртки и пуховики,1914373.jpg,2.486,14.0,37.0,24.0,пуховик moncler голубой 52 размер объявление д...,-1.0,-1.0,-1.0,-1.0
361626,Новое,1200.0,Транспорт,Запчасти и аксессуары,Двигатель,361626.jpg,0.64,7.0,23.0,18.0,"запчасти на ford фокус1 опора задняя,двигатель...",-1.0,-1.0,-1.0,-1.0
534927,Б/у,13000.0,Электроника,"Игры, приставки и программы",Игровые приставки и аксессуары,534927.jpg,7.1,20.0,35.0,20.0,ps3 cechc 08 скальпирована hen полностью испра...,-1.0,-1.0,-1.0,-1.0
199043,Отличное,300.0,Личные вещи,"Одежда, обувь, аксессуары","Джемперы, свитеры, кардиганы",199043.jpg,0.4,7.0,23.0,11.0,свитер трикотаж 44 р-р reserved свитер pull be...,-1.0,-1.0,-1.0,-1.0


In [12]:
X_train, X_val, y_train, y_val = train_test_split(train.drop(columns=all_targets), train[all_targets], test_size=0.2, random_state=42)

temp_train = pd.concat([X_train, y_train], axis=1)
temp_val = pd.concat([X_val, y_val], axis=1)

Добавим новые фичи - средние значения логарифма таргетов по их микрокатегории. Посчитаем средние значения на train, их же будем использовать в val и test, во избежание утечки

In [13]:
for log_target in log_target_columns:
    gb = temp_train.groupby(by="microcat_name")[log_target].mean()
    mapper = dict(gb.items())
    X_train["mean_microcat_" + log_target] = X_train["microcat_name"].apply(lambda x: mapper[x])
    X_val["mean_microcat_" + log_target] = X_val["microcat_name"].apply(lambda x: mapper[x])
    test["mean_microcat_" + log_target] = test["microcat_name"].apply(lambda x: mapper[x])

В качестве бейзлайна возьмем это же среднее логарифмов таргета по микрокатегориям

In [14]:
scores_baseline = {log_target: mean_absolute_error(y_train[log_target], X_train["mean_microcat_" + log_target]) for log_target in log_target_columns}
results_targets = pd.DataFrame(
    {
        "Target": target_columns,
        "Baseline (mean_microcat)": [scores_baseline["log_" + t] for t in target_columns],
    }
)
results_targets = results_targets.set_index("Target")

results_targets

Unnamed: 0_level_0,Baseline (mean_microcat)
Target,Unnamed: 1_level_1
real_weight,0.350348
real_length,0.319958
real_width,0.325912
real_height,0.552994


В качестве модели я решил использовать CatBoost, тк он из коробки отлично справляется с обработкой категориальных фичей (мне кажется категории имеют большое влияние на таргет) и текста, который также несет в себя большую часть информации по объекту, а так же умеет работать с эмбеддингами (для использования изображений товаров)

Попробуем обучать CatBoost независимо под каждый таргет, пока без эмбеддингов изображений

In [16]:
cat_feat = ["item_condition", "category_name", "subcategory_name", "microcat_name"]

In [20]:
preds_independent = {}
cb_independent = {}
scores_indep = {}

for target, log_target in zip(target_columns, log_target_columns):
    print(f"Training for {target}...")
    train_mask = get_valid_mask(temp_train, target)
    print(f"Clean Train size: {train_mask.sum()}")

    cb_train = Pool(
        X_train[train_mask].drop(columns=["image_name"]), label=y_train.loc[train_mask, log_target], cat_features=cat_feat, text_features=["text"]
    )
    cb_val = Pool(X_val.drop(columns=["image_name"]), label=y_val[log_target], cat_features=cat_feat, text_features=["text"])

    cb = CatBoostRegressor(
        learning_rate=0.05, iterations=10000, use_best_model=True, loss_function="MAE", eval_metric="MAE", task_type="GPU", devices="0:1"
    )

    cb.fit(cb_train, eval_set=cb_val, early_stopping_rounds=50, verbose=500)
    cb_independent[target] = cb

    full_val_pool = Pool(X_val.drop(columns=["image_name"]), cat_features=cat_feat, text_features=["text"])

    full_preds = cb.predict(full_val_pool)
    preds_independent[target] = full_preds

    # MAE на логарифмах (это и есть Log-MAE)
    real_score = mean_absolute_error(y_val[log_target], full_preds)
    scores_indep[target] = real_score

    print(f"\n  > REAL Score for {target}: {real_score:.5f}\n")

avg_score_indep = np.mean(list(scores_indep.values()))
print(f"=== ИТОГ Independent (w/o Image embeddings): {avg_score_indep:.5f} ===\n")

Training for real_weight...
Clean Train size: 249661


Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 0.4011205	test: 0.4019997	best: 0.4019997 (0)	total: 108ms	remaining: 18m
500:	learn: 0.2793553	test: 0.2833484	best: 0.2833484 (500)	total: 5.87s	remaining: 1m 51s
1000:	learn: 0.2709423	test: 0.2765048	best: 0.2765048 (1000)	total: 10.7s	remaining: 1m 35s
1500:	learn: 0.2659616	test: 0.2732406	best: 0.2732406 (1500)	total: 15.6s	remaining: 1m 28s
2000:	learn: 0.2621929	test: 0.2712421	best: 0.2712421 (2000)	total: 20.5s	remaining: 1m 21s
2500:	learn: 0.2591141	test: 0.2699129	best: 0.2699129 (2500)	total: 25.4s	remaining: 1m 16s
3000:	learn: 0.2563970	test: 0.2689241	best: 0.2689241 (3000)	total: 30.4s	remaining: 1m 10s
3500:	learn: 0.2540267	test: 0.2682523	best: 0.2682523 (3500)	total: 35.3s	remaining: 1m 5s
4000:	learn: 0.2517327	test: 0.2676015	best: 0.2676010 (3975)	total: 40.2s	remaining: 1m
4500:	learn: 0.2496143	test: 0.2670656	best: 0.2670656 (4500)	total: 45.3s	remaining: 55.3s
5000:	learn: 0.2476580	test: 0.2667139	best: 0.2667139 (5000)	total: 50.2s	remaining: 5

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 0.3493462	test: 0.3530040	best: 0.3530040 (0)	total: 27.1ms	remaining: 4m 31s
500:	learn: 0.2935348	test: 0.2994328	best: 0.2994328 (500)	total: 5.62s	remaining: 1m 46s
1000:	learn: 0.2893679	test: 0.2974066	best: 0.2974066 (999)	total: 10.6s	remaining: 1m 34s
1500:	learn: 0.2864638	test: 0.2964193	best: 0.2964193 (1500)	total: 15.4s	remaining: 1m 26s
2000:	learn: 0.2839125	test: 0.2958708	best: 0.2958703 (1999)	total: 20.2s	remaining: 1m 20s
2500:	learn: 0.2816804	test: 0.2954551	best: 0.2954545 (2499)	total: 25.1s	remaining: 1m 15s
bestTest = 0.2952892056
bestIteration = 2734
Shrink model to first 2735 iterations.

  > REAL Score for real_length: 0.29529

Training for real_width...
Clean Train size: 249706


Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 0.3566468	test: 0.3590276	best: 0.3590276 (0)	total: 29.3ms	remaining: 4m 52s
500:	learn: 0.3015442	test: 0.3045787	best: 0.3045787 (500)	total: 5.85s	remaining: 1m 50s
1000:	learn: 0.2975935	test: 0.3026059	best: 0.3026059 (1000)	total: 10.8s	remaining: 1m 37s
1500:	learn: 0.2946035	test: 0.3015562	best: 0.3015562 (1500)	total: 15.7s	remaining: 1m 28s
2000:	learn: 0.2921006	test: 0.3009815	best: 0.3009815 (2000)	total: 20.7s	remaining: 1m 22s
2500:	learn: 0.2898571	test: 0.3005375	best: 0.3005375 (2500)	total: 25.4s	remaining: 1m 16s
3000:	learn: 0.2877718	test: 0.3002338	best: 0.3002338 (3000)	total: 30.3s	remaining: 1m 10s
3500:	learn: 0.2857746	test: 0.2999712	best: 0.2999680 (3475)	total: 35.2s	remaining: 1m 5s
4000:	learn: 0.2839284	test: 0.2997675	best: 0.2997645 (3997)	total: 40s	remaining: 59.9s
4500:	learn: 0.2821689	test: 0.2996386	best: 0.2996335 (4487)	total: 44.8s	remaining: 54.7s
bestTest = 0.2995648459
bestIteration = 4790
Shrink model to first 4791 iterations

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 0.5844734	test: 0.5845014	best: 0.5845014 (0)	total: 33.1ms	remaining: 5m 30s
500:	learn: 0.5076215	test: 0.5122960	best: 0.5122960 (500)	total: 5.86s	remaining: 1m 51s
1000:	learn: 0.5021291	test: 0.5089391	best: 0.5089391 (1000)	total: 10.7s	remaining: 1m 36s
1500:	learn: 0.4984938	test: 0.5073207	best: 0.5073207 (1500)	total: 15.6s	remaining: 1m 28s
2000:	learn: 0.4954781	test: 0.5062817	best: 0.5062817 (2000)	total: 20.5s	remaining: 1m 22s
2500:	learn: 0.4927753	test: 0.5055125	best: 0.5055125 (2500)	total: 25.5s	remaining: 1m 16s
3000:	learn: 0.4902923	test: 0.5049479	best: 0.5049456 (2998)	total: 30.3s	remaining: 1m 10s
3500:	learn: 0.4880175	test: 0.5044865	best: 0.5044865 (3500)	total: 35.2s	remaining: 1m 5s
4000:	learn: 0.4858451	test: 0.5040747	best: 0.5040747 (4000)	total: 40.1s	remaining: 1m
4500:	learn: 0.4837408	test: 0.5037896	best: 0.5037882 (4492)	total: 45.1s	remaining: 55.1s
5000:	learn: 0.4817707	test: 0.5035877	best: 0.5035865 (4980)	total: 50.1s	remainin

In [21]:
results_targets["Independent CatBoost (w/o Image embeddings)"] = [scores_indep[t] for t in target_columns]

results_targets

Unnamed: 0_level_0,Baseline (mean_microcat),Independent CatBoost (w/o Image embeddings)
Target,Unnamed: 1_level_1,Unnamed: 2_level_1
real_weight,0.350348,0.265358
real_length,0.319958,0.295289
real_width,0.325912,0.299565
real_height,0.552994,0.503075


Далее идут блоки кода извлечения эмбеддингов из изображений при помощи EfficientNet_v2_S. Выполнял я их в колабе, прикрепив гугл диск с zip файлами изображений. Остальную же работу я выполнял на каггле, поэтому я выгрузил результаты в файлы и пользовался ими

In [None]:
# !unzip /content/drive/MyDrive/train.zip -d /content

In [None]:
# !unzip /content/drive/MyDrive/test.zip -d /content

In [27]:
# MEAN = np.array([0.485, 0.456, 0.406])
# STD = np.array([0.229, 0.224, 0.225])

# TRAIN_IMAGES = 'train'
# TEST_IMAGES = 'test'

In [28]:
# class ImageDataset(Dataset):
#     def __init__(self, images, images_folder):
#         self.images = images.values if hasattr(images, 'values') else images
#         self.images_folder = images_folder
#         self.transform = T.Compose([
#             T.Resize((384, 384), interpolation=T.InterpolationMode.BILINEAR),
#             T.CenterCrop(384),
#             T.ToTensor(),
#             T.Normalize(MEAN, STD)
#         ])

#     def __len__(self):
#         return len(self.images)

#     def __getitem__(self, index):
#         path = os.path.join(self.images_folder, self.images[index])
#         try:
#             img = Image.open(path).convert('RGB')
#             return self.transform(img)
#         except Exception as e:
#             print(f"Ошибка загрузки {path}: {e}")
#             return torch.zeros((3, 384, 384))

# train_ds = ImageDataset(train['image_name'], TRAIN_IMAGES)
# test_ds = ImageDataset(test['image_name'], TEST_IMAGES)


Необходимо заменить классификатор (последний блок) модели (тк она обучалась на ImageNet) на тождественное преобразование для получения фичей изображения

In [29]:
# vmodel = efficientnet_v2_s(weights='IMAGENET1K_V1')
# vmodel.classifier = nn.Identity()
# vmodel.to(device)
# optimized_vmodel = torch.compile(vmodel)

Downloading: "https://download.pytorch.org/models/efficientnet_v2_s-dd5fe13b.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_v2_s-dd5fe13b.pth


100%|██████████| 82.7M/82.7M [00:00<00:00, 197MB/s]


In [30]:
# def image_extract(dataset, model):
#     loader = DataLoader(dataset, batch_size=128, shuffle=False, num_workers=2, pin_memory=True)
#     all_features = []

#     with torch.inference_mode():
#         for batch in tqdm(loader):
#             batch = batch.to(device, non_blocking=True)

#             with torch.amp.autocast('cuda'):
#                 features = model(batch)

#             all_features.append(features.cpu())

#     return torch.cat(all_features).numpy()

# train_emb = image_extract(train_ds, optimized_vmodel)
# test_emb = image_extract(test_ds, optimized_vmodel)

  0%|          | 0/2445 [00:00<?, ?it/s]W0103 13:39:30.852000 160 torch/_inductor/utils.py:1558] [0/0] Not enough SMs to use max_autotune_gemm mode
100%|██████████| 2445/2445 [47:10<00:00,  1.16s/it]
100%|██████████| 550/550 [10:42<00:00,  1.17s/it]


In [None]:
# train_emb_df = pd.DataFrame(train_emb, index=train.index).add_prefix('emb_')
# test_emb_df = pd.DataFrame(test_emb, index=test.index).add_prefix('emb_')

# train_emb_df.to_parquet("contest_data/train_embeddings.parquet")
# test_emb_df.to_parquet("contest_data/test_embeddings.parquet")

Также я посчитал решил посчитать отношение ширины к высоте изображения, так как при извлечении эмбеддингов оно приводится к квадрату 384х384 и соотношение сторон теряется

In [None]:
# def get_image_aspect_ratio(image_names, folder_path):
#     """
#     Считывает размеры изображений и возвращает их отношение (Width / Height).
#     """
#     ratios = []

#     for name in image_names:
#         path = os.path.join(folder_path, name)
#         try:
#             with Image.open(path) as img:
#                 w, h = img.size
#                 ratios.append(w / h)
#         except Exception as e:
#             ratios.append(1.0)

#     return np.array(ratios, dtype=np.float32)


# train_aspect_ratios = get_image_aspect_ratio(train['image_name'], TRAIN_IMAGES)
# test_aspect_ratios = get_image_aspect_ratio(test['image_name'], TEST_IMAGES)

# pd.DataFrame(train_aspect_ratios).to_parquet("train_aspect_ratios.parquet")
# pd.DataFrame(test_aspect_ratios).to_parquet("test_aspect_ratios.parquet")

In [18]:
train_embs = pd.read_parquet("/kaggle/input/aaa-exam-data/train_embeddings.parquet")
test_embs = pd.read_parquet("/kaggle/input/aaa-exam-data/test_embeddings.parquet")

train_aspect_ratios = pd.read_parquet("/kaggle/input/aaa-exam-data/train_aspect_ratios.parquet").set_index(train.index).rename(columns={0: 'aspect_ratio'})
test_aspect_ratios = pd.read_parquet("/kaggle/input/aaa-exam-data/test_aspect_ratios.parquet").set_index(test.index).rename(columns={0: 'aspect_ratio'})

train = train.join(train_aspect_ratios).join(train_embs).copy()
test = test.join(test_aspect_ratios).join(test_embs).copy()

train = train.drop(columns=["image_name"])
test = test.drop(columns=["image_name"])

Для каждого изображения мы получили вектор из 1280 компонент. Это слишком много для сырой подачи в нашу модель, эмбеддинги могут быть слишком разрежены и скореллированы.

Применим к ним PCA, чтобы сократить использование памяти и время обучения, оставив только самые важные признаки.

In [19]:
emb_cols = [f"emb_{i}" for i in range(1280)]
pca = PCA(n_components=96, random_state=42)

train_pca = pca.fit_transform(train[emb_cols])
test_pca = pca.transform(test[emb_cols])

pca_cols = [f"img_pca_{i}" for i in range(96)]
train_pca_df = pd.DataFrame(train_pca, columns=pca_cols, index=train.index)
test_pca_df = pd.DataFrame(test_pca, columns=pca_cols, index=test.index)

train = train.join(train_pca_df).copy()
test = test.join(test_pca_df).copy()


train.drop(columns=emb_cols, inplace=True)
test.drop(columns=emb_cols, inplace=True)

Я решил добавить еще фичей, которые могут быть полезны

In [20]:
train["log_price"] = np.log1p(train["item_price"])
price_stats = train.groupby("microcat_name")["log_price"].transform("mean")
train["price_rel_to_category"] = train["log_price"] - price_stats

test["log_price"] = np.log1p(test["item_price"])
price_stats = test.groupby("microcat_name")["log_price"].transform("mean")
test["price_rel_to_category"] = test["log_price"] - price_stats

In [21]:
X_train, X_val, y_train, y_val = train_test_split(train.drop(columns=all_targets), train[all_targets], test_size=0.2, random_state=42)
temp_train = pd.concat([X_train, y_train], axis=1)
temp_val = pd.concat([X_val, y_val], axis=1)

X_test = test.copy()

In [22]:
for log_target in log_target_columns:
    gb = temp_train.groupby(by="microcat_name")[log_target].mean()
    mapper = {microcat: val for microcat, val in gb.items()}
    X_train["mean_microcat_" + log_target] = X_train["microcat_name"].apply(lambda x: mapper[x])
    X_val["mean_microcat_" + log_target] = X_val["microcat_name"].apply(lambda x: mapper[x])
    X_test["mean_microcat_" + log_target] = X_test["microcat_name"].apply(lambda x: mapper[x])

Снова обучим CatBoost независимо под каждый таргет с новыми добавленными фичами

In [23]:
preds_independent_w_img = {}
cb_independent_w_img = {}
scores_indep_w_img = {}

for target, log_target in zip(target_columns, log_target_columns):
    print(f"Training for {target}...")
    train_mask = get_valid_mask(temp_train, target)
    print(f"Clean Train size: {train_mask.sum()}")

    cb_train = Pool(
        X_train[train_mask], label=y_train.loc[train_mask, log_target],
        cat_features=cat_feat, text_features=["text"]
    )
    cb_val = Pool(
        X_val, label=y_val[log_target],
        cat_features=cat_feat, text_features=["text"]
    )

    cb = CatBoostRegressor(
        learning_rate=0.05, iterations=10000, use_best_model=True, loss_function="MAE", eval_metric="MAE", task_type="GPU", devices="0:1"
    )

    cb.fit(cb_train, eval_set=cb_val, early_stopping_rounds=50, verbose=500)
    cb_independent_w_img[target] = cb

    full_val_pool = Pool(X_val, cat_features=cat_feat, text_features=["text"])

    full_preds = cb.predict(full_val_pool)
    preds_independent_w_img[target] = full_preds

    # MAE на логарифмах (это и есть Log-MAE)
    real_score = mean_absolute_error(y_val[log_target], full_preds)
    scores_indep_w_img[target] = real_score

    print(f"\n  > REAL Score for {target}: {real_score:.5f}\n")


avg_score_indep_w_img = np.mean(list(scores_indep_w_img.values()))
print(f"=== ИТОГ Independent: {avg_score_indep_w_img:.5f} ===\n")

Training for real_weight...
Clean Train size: 249661


Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 0.4008669	test: 0.4017281	best: 0.4017281 (0)	total: 136ms	remaining: 22m 41s
500:	learn: 0.2675166	test: 0.2732652	best: 0.2732652 (500)	total: 7.18s	remaining: 2m 16s
1000:	learn: 0.2581424	test: 0.2667336	best: 0.2667336 (1000)	total: 13.1s	remaining: 1m 58s
1500:	learn: 0.2519958	test: 0.2637888	best: 0.2637888 (1500)	total: 19.2s	remaining: 1m 48s
2000:	learn: 0.2469598	test: 0.2621194	best: 0.2621194 (2000)	total: 25.2s	remaining: 1m 40s
2500:	learn: 0.2424642	test: 0.2609167	best: 0.2609167 (2500)	total: 31.1s	remaining: 1m 33s
3000:	learn: 0.2383436	test: 0.2600415	best: 0.2600415 (3000)	total: 37.2s	remaining: 1m 26s
3500:	learn: 0.2344491	test: 0.2593419	best: 0.2593397 (3499)	total: 43.3s	remaining: 1m 20s
4000:	learn: 0.2307766	test: 0.2588550	best: 0.2588550 (4000)	total: 49.6s	remaining: 1m 14s
4500:	learn: 0.2271883	test: 0.2584699	best: 0.2584575 (4473)	total: 55.8s	remaining: 1m 8s
bestTest = 0.2584575131
bestIteration = 4473
Shrink model to first 4474 iterat

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 0.3490948	test: 0.3527502	best: 0.3527502 (0)	total: 32.7ms	remaining: 5m 26s
500:	learn: 0.2850625	test: 0.2924172	best: 0.2924172 (500)	total: 7.21s	remaining: 2m 16s
1000:	learn: 0.2791696	test: 0.2901644	best: 0.2901644 (1000)	total: 13.3s	remaining: 1m 59s
1500:	learn: 0.2745098	test: 0.2891612	best: 0.2891612 (1500)	total: 19.5s	remaining: 1m 50s
2000:	learn: 0.2702640	test: 0.2885678	best: 0.2885678 (2000)	total: 25.6s	remaining: 1m 42s
2500:	learn: 0.2662923	test: 0.2881773	best: 0.2881706 (2493)	total: 31.9s	remaining: 1m 35s
3000:	learn: 0.2625229	test: 0.2879200	best: 0.2879200 (3000)	total: 38.2s	remaining: 1m 29s
bestTest = 0.2877942889
bestIteration = 3211
Shrink model to first 3212 iterations.

  > REAL Score for real_length: 0.28779

Training for real_width...
Clean Train size: 249706


Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 0.3562959	test: 0.3586633	best: 0.3586633 (0)	total: 31.5ms	remaining: 5m 14s
500:	learn: 0.2938633	test: 0.2987570	best: 0.2987570 (500)	total: 7.37s	remaining: 2m 19s
1000:	learn: 0.2881989	test: 0.2968472	best: 0.2968472 (1000)	total: 13.5s	remaining: 2m 1s
1500:	learn: 0.2836370	test: 0.2960053	best: 0.2960052 (1498)	total: 19.7s	remaining: 1m 51s
2000:	learn: 0.2794393	test: 0.2955019	best: 0.2955019 (2000)	total: 26s	remaining: 1m 43s
2500:	learn: 0.2754796	test: 0.2951328	best: 0.2951328 (2500)	total: 32.2s	remaining: 1m 36s
3000:	learn: 0.2716177	test: 0.2948908	best: 0.2948816 (2983)	total: 38.6s	remaining: 1m 30s
bestTest = 0.2948816465
bestIteration = 2983
Shrink model to first 2984 iterations.

  > REAL Score for real_width: 0.29488

Training for real_height...
Clean Train size: 249889


Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 0.5844154	test: 0.5844358	best: 0.5844358 (0)	total: 32.4ms	remaining: 5m 23s
500:	learn: 0.4997583	test: 0.5061510	best: 0.5061510 (500)	total: 7.27s	remaining: 2m 17s
1000:	learn: 0.4927963	test: 0.5028432	best: 0.5028432 (1000)	total: 13.4s	remaining: 2m
1500:	learn: 0.4874950	test: 0.5011687	best: 0.5011673 (1499)	total: 19.4s	remaining: 1m 50s
2000:	learn: 0.4828854	test: 0.5002560	best: 0.5002560 (2000)	total: 25.7s	remaining: 1m 42s
2500:	learn: 0.4786294	test: 0.4995497	best: 0.4995497 (2500)	total: 31.9s	remaining: 1m 35s
3000:	learn: 0.4745321	test: 0.4991697	best: 0.4991695 (2998)	total: 38.2s	remaining: 1m 29s
3500:	learn: 0.4705619	test: 0.4987373	best: 0.4987363 (3491)	total: 44.4s	remaining: 1m 22s
bestTest = 0.4984985636
bestIteration = 3810
Shrink model to first 3811 iterations.

  > REAL Score for real_height: 0.49850

=== ИТОГ Independent: 0.33491 ===



In [28]:
results_targets["Independent CatBoost"] = [scores_indep_w_img[t] for t in target_columns]

results_targets

Unnamed: 0_level_0,Baseline (mean_microcat),Independent CatBoost (w/o Image embeddings),Independent CatBoost
Target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
real_weight,0.350348,0.265358,0.258457
real_length,0.319958,0.295289,0.287794
real_width,0.325912,0.299565,0.294882
real_height,0.552994,0.503075,0.498499


Результат стал действительно лучше, особенно в предсказании длины и ширины

Посмотрим на влияние фичей на предсказания моделей

In [29]:
cb_models = [cb_independent_w_img[target] for target in target_columns]

all_importances = pd.DataFrame({"Feature": cb_models[0].feature_names_})

for target, model in zip(target_columns, cb_models):
    all_importances[target] = model.get_feature_importance()

all_importances = all_importances.set_index("Feature")

In [30]:
html_str = ""
for col in target_columns[:4]:
    df_slice = all_importances[[col]].sort_values(by=col, ascending=False).head(10)
    html_str += df_slice.style.set_table_attributes("style='display:inline; margin-right:20px;'")._repr_html_()

display_html(html_str, raw=True)

Unnamed: 0_level_0,real_weight
Feature,Unnamed: 1_level_1
mean_microcat_log_real_weight,19.256811
text,17.652679
price_rel_to_category,7.666386
img_pca_8,6.121401
item_price,3.220253
item_condition,2.956574
log_price,2.836974
subcategory_name,2.554452
mean_microcat_log_real_width,1.739531
category_name,1.383248

Unnamed: 0_level_0,real_length
Feature,Unnamed: 1_level_1
text,15.768886
mean_microcat_log_real_length,14.228564
img_pca_8,6.026024
price_rel_to_category,5.097316
mean_microcat_log_real_width,3.137174
item_price,1.979972
item_condition,1.845732
img_pca_1,1.825936
mean_microcat_log_real_weight,1.767614
log_price,1.621581

Unnamed: 0_level_0,real_width
Feature,Unnamed: 1_level_1
mean_microcat_log_real_width,15.510462
text,15.123537
price_rel_to_category,8.517669
img_pca_8,6.224542
mean_microcat_log_real_length,2.225331
img_pca_1,1.594938
item_condition,1.580517
subcategory_name,1.423684
mean_microcat_log_real_weight,1.402347
img_pca_6,1.353318

Unnamed: 0_level_0,real_height
Feature,Unnamed: 1_level_1
mean_microcat_log_real_height,18.861499
text,18.789685
price_rel_to_category,6.471033
img_pca_8,3.82739
item_condition,2.265083
img_pca_2,2.264941
item_price,2.10245
log_price,1.949451
mean_microcat_log_real_width,1.857025
subcategory_name,1.697377


In [31]:
html_str = ""
for col in target_columns[:4]:
    df_slice = all_importances[[col]].sort_values(by=col, ascending=True).head(10)
    html_str += df_slice.style.set_table_attributes("style='display:inline; margin-right:20px;'")._repr_html_()

display_html(html_str, raw=True)


Unnamed: 0_level_0,real_weight
Feature,Unnamed: 1_level_1
extracted_length,0.043929
extracted_height,0.05092
extracted_width,0.092096
img_pca_50,0.169123
img_pca_87,0.170142
img_pca_95,0.172075
img_pca_88,0.173572
img_pca_57,0.189569
img_pca_44,0.189848
img_pca_83,0.191843

Unnamed: 0_level_0,real_length
Feature,Unnamed: 1_level_1
extracted_length,0.031269
extracted_width,0.033076
microcat_name,0.072412
extracted_height,0.200472
img_pca_75,0.239882
img_pca_79,0.254828
img_pca_60,0.258272
img_pca_78,0.268321
img_pca_48,0.272452
img_pca_83,0.278653

Unnamed: 0_level_0,real_width
Feature,Unnamed: 1_level_1
extracted_length,0.038246
extracted_height,0.050875
microcat_name,0.056721
extracted_width,0.208459
img_pca_62,0.234589
img_pca_77,0.245924
img_pca_60,0.255676
img_pca_83,0.257881
img_pca_88,0.264585
img_pca_38,0.271464

Unnamed: 0_level_0,real_height
Feature,Unnamed: 1_level_1
extracted_width,0.008652
extracted_height,0.013771
extracted_length,0.10281
microcat_name,0.19732
img_pca_69,0.201089
img_pca_68,0.212482
img_pca_50,0.222909
img_pca_95,0.232772
img_pca_60,0.233427
img_pca_92,0.237455


CatBoost отлично справился с извлечением информации из текста и ему вообще не пригодились извлеченные мной размерности

Также можно отметить, что огромную роль сыграли средние значения таргета по микрокатегориям

Некоторые фичи изображений имеют неплохое влияние

Попробую обучить CatBoost на предсказание сразу всех 4 таргетов

In [32]:
train_mask_m = (
    get_valid_mask(temp_train, "real_weight")
    & get_valid_mask(temp_train, "real_height")
    & get_valid_mask(temp_train, "real_length")
    & get_valid_mask(temp_train, "real_width")
)

cb_train_m = Pool(
    X_train[train_mask_m],
    label=y_train.loc[train_mask_m, log_target_columns],
    cat_features=["item_condition", "category_name", "subcategory_name", "microcat_name"],
    text_features=["text"],
)
cb_val_m = Pool(
    X_val,
    label=y_val[log_target_columns],
    cat_features=["item_condition", "category_name", "subcategory_name", "microcat_name"],
    text_features=["text"],
)

cb_m = CatBoostRegressor(
    learning_rate=0.05, iterations=10000, use_best_model=True, loss_function="MultiRMSE", eval_metric="MultiRMSE", task_type="GPU", devices="0:1"
)
cb_m.fit(cb_train_m, eval_set=cb_val_m, early_stopping_rounds=50, verbose=500)

full_val_m = Pool(
    X_val,
    cat_features=cat_feat,
    text_features=["text"],
)

preds_m = cb_m.predict(full_val_m)

scores_multi = {}
for i, target in enumerate(log_target_columns):
    p = preds_m[:, i]

    score = mean_absolute_error(y_val[target], p)
    scores_multi[target] = score
    print(f"\n  > Score for {target}: {score:.5f}")

avg_score_multi = np.mean(list(scores_multi.values()))
print(f"\n=== ИТОГ MultiRMSE: {avg_score_multi:.5f} ===")

0:	learn: 1.1691725	test: 1.1822326	best: 1.1822326 (0)	total: 149ms	remaining: 24m 50s
500:	learn: 0.9800124	test: 1.0041901	best: 1.0041901 (500)	total: 11.8s	remaining: 3m 43s
1000:	learn: 0.9681333	test: 0.9976474	best: 0.9976474 (1000)	total: 22.8s	remaining: 3m 24s
1500:	learn: 0.9602025	test: 0.9945357	best: 0.9945357 (1500)	total: 33.1s	remaining: 3m 7s
2000:	learn: 0.9536521	test: 0.9925655	best: 0.9925655 (2000)	total: 43.2s	remaining: 2m 52s
2500:	learn: 0.9476920	test: 0.9912177	best: 0.9912177 (2500)	total: 53.3s	remaining: 2m 39s
3000:	learn: 0.9423214	test: 0.9902370	best: 0.9902370 (3000)	total: 1m 3s	remaining: 2m 27s
3500:	learn: 0.9372624	test: 0.9894870	best: 0.9894870 (3500)	total: 1m 13s	remaining: 2m 15s
4000:	learn: 0.9324028	test: 0.9888814	best: 0.9888759 (3998)	total: 1m 23s	remaining: 2m 5s
4500:	learn: 0.9276553	test: 0.9883356	best: 0.9883335 (4494)	total: 1m 34s	remaining: 1m 55s
5000:	learn: 0.9228782	test: 0.9878944	best: 0.9878908 (4987)	total: 1m 44s	

In [33]:
results_targets["Multi CatBoost"] = [scores_multi[t] for t in log_target_columns]

results_targets

Unnamed: 0_level_0,Baseline (mean_microcat),Independent CatBoost (w/o Image embeddings),Independent CatBoost,Multi CatBoost
Target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
real_weight,0.350348,0.265358,0.258457,0.268604
real_length,0.319958,0.295289,0.287794,0.290994
real_width,0.325912,0.299565,0.294882,0.296282
real_height,0.552994,0.503075,0.498499,0.513128


In [34]:
print(f"\nFinal Score Diff: {avg_score_indep_w_img} vs {avg_score_multi}")


Final Score Diff: 0.3349079916976393 vs 0.3422520639619967


Ожидаемо результат оказался хуже, так как для каждого таргета будет свой набор важных фичей

Далее я решил отойти от CatBoost и попробовать обучить нейросеть. Возможно она сможет лучше уловить зависимости

Для начала подготовим данные

In [24]:
num_feat = [
    'item_price',
    'extracted_weight',
    'extracted_length',
    'extracted_width',
    'extracted_height',
    'aspect_ratio',
    'log_price',
    'price_rel_to_category',
    'mean_microcat_log_real_weight',
    'mean_microcat_log_real_length',
    'mean_microcat_log_real_width',
    'mean_microcat_log_real_height'
]

X_train_num = X_train[num_feat].fillna(0).values.astype(np.float32)
X_val_num = X_val[num_feat].fillna(0).values.astype(np.float32)
X_test_num = X_test[num_feat].fillna(0).values.astype(np.float32)

# Важно нормировать все численные фичи
scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train_num)
X_val_num = scaler.transform(X_val_num)
X_test_num = scaler.transform(X_test_num)

# Также закодируем категориальные переменные LabelEncoding-ом. Я выбрал его, а не OHE, так как получим слишком много колонок
cat_feat = ["item_condition", "category_name", "subcategory_name", "microcat_name"]
cat_encoders = {}
X_train_cat = []
X_val_cat = []
X_test_cat = []

for target in cat_feat:
    le = LabelEncoder()
    all_values = pd.concat([X_train[target], X_val[target], X_test[target]]).astype(str).fillna("Unknown")
    le.fit(all_values)

    X_train_cat.append(le.transform(X_train[target].astype(str).fillna("Unknown")))
    X_val_cat.append(le.transform(X_val[target].astype(str).fillna("Unknown")))
    X_test_cat.append(le.transform(X_test[target].astype(str).fillna("Unknown")))
    cat_encoders[target] = le

X_train_cat = np.stack(X_train_cat, axis=1).astype(np.int64)
X_val_cat = np.stack(X_val_cat, axis=1).astype(np.int64)
X_test_cat = np.stack(X_test_cat, axis=1).astype(np.int64)

Я решил использовать предобученную модель RuBERT-tiny2 для более качественного извлечения эмбеддингов текста

In [25]:
MODEL_NAME = "cointegrated/rubert-tiny2"

def get_text_embeddings(texts: List, model_name: str = MODEL_NAME) -> np.ndarray:
    """
    Генерирует эмбеддинги текста с помощью предобученного трансформера.
    """
    print(f"Loading {model_name}...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()
    
    embeddings = []
    
    with torch.no_grad():
        for i in tqdm(range(0, len(texts), 256), desc="Extracting Text Embeddings"):
            batch_texts = texts[i : i + 256]
            
            encoded_input = tokenizer(
                batch_texts, 
                padding=True, 
                truncation=True, 
                max_length=64, 
                return_tensors='pt'
            )
            encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
            model_output = model(**encoded_input)
            cls_embeddings = model_output.last_hidden_state[:, 0, :]
            embeddings.append(cls_embeddings.cpu().numpy())
            
    return np.vstack(embeddings)

all_texts_list = pd.concat([X_train['text'], X_val['text'], X_test['text']]).tolist()

text_embs = get_text_embeddings(all_texts_list)

X_train_text_embs = text_embs[:len(X_train)]
X_val_text_embs = text_embs[len(X_train): len(X_train) + len(X_val)]
X_test_text_embs = text_embs[len(X_train) + len(X_val):]

Loading cointegrated/rubert-tiny2...


tokenizer_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

2026-01-19 17:34:18.430632: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1768844058.680954      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1768844058.743457      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1768844059.236030      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768844059.236059      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768844059.236062      55 computation_placer.cc:177] computation placer alr

model.safetensors:   0%|          | 0.00/118M [00:00<?, ?B/s]

Extracting Text Embeddings: 100%|██████████| 1497/1497 [01:45<00:00, 14.25it/s]


In [26]:
pca_cols = [f"img_pca_{i}" for i in range(96)]
X_train_img = X_train[pca_cols].values
X_val_img = X_val[pca_cols].values
X_test_img = X_test[pca_cols].values

In [27]:
# для обучения нейросети, которая будет выдавать все 4 таргета, придется убрать все выбросы сразу
train_mask = (
    get_valid_mask(temp_train, "real_weight")
    & get_valid_mask(temp_train, "real_height")
    & get_valid_mask(temp_train, "real_length")
    & get_valid_mask(temp_train, "real_width")
)

X_train_num_m = X_train_num[train_mask]
X_train_cat_m = X_train_cat[train_mask]
X_train_text_embs_m = X_train_text_embs[train_mask]
X_train_img_m = X_train_img[train_mask]
y_train_m = y_train[train_mask]

In [28]:
class WaDDataset(Dataset):
    def __init__(self, nums, cats, text, imgs, targets=None):
        self.nums = torch.FloatTensor(nums)
        self.cats = torch.LongTensor(cats)
        self.text = torch.FloatTensor(text)
        self.imgs = torch.FloatTensor(imgs)
        self.targets = targets

    def __len__(self):
        return len(self.nums)

    def __getitem__(self, idx):
        inputs = {"num": self.nums[idx], "cat": self.cats[idx], "text": self.text[idx], "img": self.imgs[idx]}

        if self.targets is not None:
            return inputs, self.targets[idx]

        return inputs


In [29]:
train_ds_m = WaDDataset(X_train_num_m, X_train_cat_m, X_train_text_embs_m, X_train_img_m, y_train_m[log_target_columns].values)
val_ds_m = WaDDataset(X_val_num, X_val_cat, X_val_text_embs, X_val_img, y_val[log_target_columns].values)
test_ds = WaDDataset(X_test_num, X_test_cat, X_test_text_embs, X_test_img, None)


train_loader_m = DataLoader(train_ds_m, batch_size=256, shuffle=True)
val_loader_m = DataLoader(val_ds_m, batch_size=256, shuffle=False)
test_loader = DataLoader(test_ds, batch_size=256, shuffle=False)

In [32]:
class MultiFusionNet(nn.Module):
    def __init__(
        self,
        num_feat_dim,
        cat_counts,  # Список: [кол-во уникальных для cat1, cat2...]
        text_input_dim=312,
        img_input_dim=1280,
    ):
        super().__init__()

        # --- 1. Ветка Категорий ---
        self.cat_embs = nn.ModuleList([nn.Embedding(c, min(50, (c + 1) // 2)) for c in cat_counts])

        # --- 2. Ветка Текста ---
        self.text_fc = nn.Sequential(
            nn.Linear(text_input_dim, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.1)
        )

        # --- 3. Ветка Картинок ---
        self.img_fc = nn.Sequential(
            nn.Linear(img_input_dim, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.2)
        )

        # --- 4. Ветка Чисел ---
        self.num_fc = nn.Sequential(
            nn.Linear(num_feat_dim, 64),
            nn.ReLU()
        )

        # --- 5. Fusion (Слияние) ---
        fusion_dim = sum(e.embedding_dim for e in self.cat_embs) + 128 + 256 + 64

        self.head = nn.Sequential(
            nn.Linear(fusion_dim, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 4)
        )

    def forward(self, x):
        # 1. Categories
        cat_out = [emb(x["cat"][:, i]) for i, emb in enumerate(self.cat_embs)]
        cat_out = torch.cat(cat_out, dim=1)

        # 2. Text
        text_out = self.text_fc(x['text']) 

        # 3. Image
        img_out = self.img_fc(x["img"])

        # 4. Numerical
        num_out = self.num_fc(x["num"])

        # Concat
        combined = torch.cat([cat_out, text_out, img_out, num_out], dim=1)

        return self.head(combined)

In [33]:
def train_nn(model: nn.Module, train_loader: DataLoader, val_loader: DataLoader, epochs: int = 15, lr: float = 1e-3, device: str = "cuda") -> nn.Module:
    """Функция тренировки нейросети, сохраняет веса с наименьшим лоссом и загружает их в конце обучения

    Args:
        model (nn.Module): Модель
        train_loader (DataLoader): Тренировочный даталоадер
        val_loader (DataLoader): Валидационный даталоадер
        epochs (int, optional): Кол-во эпох обучения. Defaults to 15.
        lr (float, optional): Learning Rate. Defaults to 1e-3.
        device (str, optional): device. Defaults to "cuda".

    Returns:
        nn.Module: Обученная модель
    """
    criterion = nn.L1Loss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs, eta_min=1e-6)

    best_loss = float("inf")
    best_weights = None

    print(f"Start training on {device}...")

    for epoch in tqdm(range(epochs)):
        model.train()
        train_loss = 0

        for inputs, targets in train_loader:
            inputs = {k: v.to(device) for k, v in inputs.items()}
            targets = targets.to(device)

            # ВАЖНО: Если таргет одномерный (Independent), делаем (Batch, 1)
            # Если многомерный (Multi), он уже (Batch, 4), ничего не меняется
            if targets.ndim == 1:
                targets = targets.view(-1, 1)

            optimizer.zero_grad()
            outputs = model(inputs)

            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        avg_train_loss = train_loss / len(train_loader)

        model.eval()
        val_loss = 0

        with torch.no_grad():
            for inputs, targets in val_loader:
                inputs = {k: v.to(device) for k, v in inputs.items()}
                targets = targets.to(device)

                if targets.ndim == 1:
                    targets = targets.view(-1, 1)

                outputs = model(inputs)
                loss = criterion(outputs, targets)
                val_loss += loss.item()

        avg_val_loss = val_loss / len(val_loader)

        scheduler.step(avg_val_loss)

        saved_msg = ""
        if best_loss - avg_val_loss >= 0.0001 :
            best_loss = avg_val_loss
            best_weights = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            saved_msg = "-> Saved Best!"

        print(f"Epoch {epoch + 1}/{epochs} | Train MAE: {avg_train_loss:.5f} | Val MAE: {avg_val_loss:.5f} {saved_msg}")

    print(f"Training finished. Best Val MAE: {best_loss:.5f}")
    if best_weights is not None:
        model.load_state_dict(best_weights)

    return model

In [34]:
cat_counts = [len(enc.classes_) for enc in cat_encoders.values()]

net_m = MultiFusionNet(num_feat_dim=X_train_num.shape[1], cat_counts=cat_counts, text_input_dim=312, img_input_dim=96).to(device)

net_m = train_nn(net_m, train_loader_m, val_loader_m, epochs=30, lr=1e-3)

net_m.eval()
nn_m_preds = []
with torch.no_grad():
    for inputs, targets in val_loader_m:
        inputs = {k: v.to(device) for k, v in inputs.items()}
        targets = targets.to(device)

        nn_m_preds.append(net_m(inputs).cpu().numpy())
nn_m_preds = np.vstack(nn_m_preds)

scores_multi_nn = {}
for i, target in enumerate(log_target_columns):
    p = nn_m_preds[:, i]

    score = mean_absolute_error(y_val[target], p)
    scores_multi_nn[target] = score
    print(f"  > Score for {target}: {score:.5f}")

Start training on cuda...


  3%|▎         | 1/30 [00:10<04:56, 10.24s/it]

Epoch 1/30 | Train MAE: 0.41019 | Val MAE: 0.34945 -> Saved Best!


  7%|▋         | 2/30 [00:20<04:40, 10.01s/it]

Epoch 2/30 | Train MAE: 0.35932 | Val MAE: 0.35151 


 10%|█         | 3/30 [00:30<04:31, 10.06s/it]

Epoch 3/30 | Train MAE: 0.34358 | Val MAE: 0.33844 -> Saved Best!


 13%|█▎        | 4/30 [00:39<04:18,  9.94s/it]

Epoch 4/30 | Train MAE: 0.33610 | Val MAE: 0.33255 -> Saved Best!


 17%|█▋        | 5/30 [00:49<04:06,  9.87s/it]

Epoch 5/30 | Train MAE: 0.33170 | Val MAE: 0.33148 -> Saved Best!


 20%|██        | 6/30 [00:59<03:56,  9.85s/it]

Epoch 6/30 | Train MAE: 0.32876 | Val MAE: 0.32946 -> Saved Best!


 23%|██▎       | 7/30 [01:09<03:47,  9.87s/it]

Epoch 7/30 | Train MAE: 0.32608 | Val MAE: 0.32833 -> Saved Best!


 27%|██▋       | 8/30 [01:19<03:36,  9.82s/it]

Epoch 8/30 | Train MAE: 0.32374 | Val MAE: 0.32776 -> Saved Best!


 30%|███       | 9/30 [01:28<03:26,  9.83s/it]

Epoch 9/30 | Train MAE: 0.32131 | Val MAE: 0.32908 


 33%|███▎      | 10/30 [01:38<03:16,  9.80s/it]

Epoch 10/30 | Train MAE: 0.31942 | Val MAE: 0.32684 -> Saved Best!


 37%|███▋      | 11/30 [01:48<03:06,  9.83s/it]

Epoch 11/30 | Train MAE: 0.31780 | Val MAE: 0.32640 -> Saved Best!


 40%|████      | 12/30 [01:58<02:57,  9.85s/it]

Epoch 12/30 | Train MAE: 0.31647 | Val MAE: 0.32859 


 43%|████▎     | 13/30 [02:08<02:47,  9.85s/it]

Epoch 13/30 | Train MAE: 0.31433 | Val MAE: 0.32654 


 47%|████▋     | 14/30 [02:18<02:36,  9.81s/it]

Epoch 14/30 | Train MAE: 0.31302 | Val MAE: 0.32596 -> Saved Best!


 50%|█████     | 15/30 [02:27<02:27,  9.81s/it]

Epoch 15/30 | Train MAE: 0.31179 | Val MAE: 0.32610 


 53%|█████▎    | 16/30 [02:37<02:17,  9.85s/it]

Epoch 16/30 | Train MAE: 0.31082 | Val MAE: 0.32856 


 57%|█████▋    | 17/30 [02:47<02:07,  9.82s/it]

Epoch 17/30 | Train MAE: 0.30956 | Val MAE: 0.32711 


 60%|██████    | 18/30 [02:57<01:57,  9.78s/it]

Epoch 18/30 | Train MAE: 0.30805 | Val MAE: 0.32680 


 63%|██████▎   | 19/30 [03:07<01:47,  9.82s/it]

Epoch 19/30 | Train MAE: 0.30734 | Val MAE: 0.32668 


 67%|██████▋   | 20/30 [03:17<01:38,  9.85s/it]

Epoch 20/30 | Train MAE: 0.30597 | Val MAE: 0.32795 


 70%|███████   | 21/30 [03:27<01:28,  9.88s/it]

Epoch 21/30 | Train MAE: 0.30524 | Val MAE: 0.32791 


 73%|███████▎  | 22/30 [03:36<01:19,  9.88s/it]

Epoch 22/30 | Train MAE: 0.30445 | Val MAE: 0.32789 


 77%|███████▋  | 23/30 [03:46<01:08,  9.86s/it]

Epoch 23/30 | Train MAE: 0.30336 | Val MAE: 0.32684 


 80%|████████  | 24/30 [03:56<00:59,  9.87s/it]

Epoch 24/30 | Train MAE: 0.30237 | Val MAE: 0.32804 


 83%|████████▎ | 25/30 [04:06<00:49,  9.87s/it]

Epoch 25/30 | Train MAE: 0.30165 | Val MAE: 0.32771 


 87%|████████▋ | 26/30 [04:16<00:39,  9.85s/it]

Epoch 26/30 | Train MAE: 0.30094 | Val MAE: 0.32747 


 90%|█████████ | 27/30 [04:26<00:29,  9.82s/it]

Epoch 27/30 | Train MAE: 0.29990 | Val MAE: 0.32833 


 93%|█████████▎| 28/30 [04:35<00:19,  9.82s/it]

Epoch 28/30 | Train MAE: 0.29934 | Val MAE: 0.32924 


 97%|█████████▋| 29/30 [04:45<00:09,  9.83s/it]

Epoch 29/30 | Train MAE: 0.29844 | Val MAE: 0.32860 


100%|██████████| 30/30 [04:55<00:00,  9.85s/it]

Epoch 30/30 | Train MAE: 0.29772 | Val MAE: 0.32862 
Training finished. Best Val MAE: 0.32596





  > Score for log_real_weight: 0.24242
  > Score for log_real_length: 0.28079
  > Score for log_real_width: 0.28899
  > Score for log_real_height: 0.49145


In [39]:
results_targets["Multi NN"] = [scores_multi_nn[t] for t in log_target_columns]

results_targets

Unnamed: 0_level_0,Baseline (mean_microcat),Multi NN
Target,Unnamed: 1_level_1,Unnamed: 2_level_1
real_weight,0.350348,0.242417
real_length,0.319958,0.280788
real_width,0.325912,0.288993
real_height,0.552994,0.491454


Нейросеть превзошла CatBoost

Стоит попробовать обучать модель под каждый таргет независимо, раз это дало значительное улучшение в случае с CatBoost

In [37]:
class IndependentFusionNet(nn.Module):
    def __init__(self, num_feat_dim, cat_counts, text_input_dim=312, img_input_dim=96):
        super().__init__()

        self.cat_embs = nn.ModuleList([nn.Embedding(c, min(50, (c + 1) // 2)) for c in cat_counts])
        
        self.text_fc = nn.Sequential(
            nn.Linear(text_input_dim, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.1)
        )
        
        self.img_fc = nn.Sequential(
            nn.Linear(img_input_dim, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        
        self.num_fc = nn.Sequential(
            nn.Linear(num_feat_dim, 64),
            nn.ReLU()
        )

        fusion_dim = sum(e.embedding_dim for e in self.cat_embs) + 128 + 256 + 64

        self.head = nn.Sequential(
            nn.Linear(fusion_dim, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )

    def forward(self, x):
        cats = [emb(x["cat"][:, i]) for i, emb in enumerate(self.cat_embs)]
        x_cat = torch.cat(cats, 1)
        x_txt = self.text_fc(x['text'])
        x_img = self.img_fc(x["img"])
        x_num = self.num_fc(x["num"])
        return self.head(torch.cat([x_cat, x_txt, x_img, x_num], 1))

In [38]:
nn_models = {}
nn_scores = {}

print("Starting Independent NN Training...")

for target, log_target in zip(target_columns, log_target_columns):
    print(f"\n>>> Training NN for {target} ({log_target})...")

    train_mask = get_valid_mask(temp_train, target).values
    val_mask = get_valid_mask(temp_val, target).values

    print(f"    Clean Train: {train_mask.sum()} / {len(train_mask)}")

    tr_num = X_train_num[train_mask]
    tr_cat = X_train_cat[train_mask]
    tr_txt = X_train_text_embs[train_mask]
    tr_img = X_train_img[train_mask]
    tr_y = y_train[log_target].values.astype(np.float32)[train_mask]

    curr_train_ds = WaDDataset(tr_num, tr_cat, tr_txt, tr_img, tr_y)
    curr_val_ds = WaDDataset(X_val_num, X_val_cat, X_val_text_embs, X_val_img, y_val[log_target].values.astype(np.float32))

    curr_train_loader = DataLoader(curr_train_ds, batch_size=256, shuffle=True, num_workers=0)
    curr_val_loader = DataLoader(curr_val_ds, batch_size=256, shuffle=False, num_workers=0)

    cat_counts = [len(enc.classes_) for enc in cat_encoders.values()]
    model = IndependentFusionNet(num_feat_dim=X_train_num.shape[1], cat_counts=cat_counts, text_input_dim=312, img_input_dim=96).to(device)

    model = train_nn(model, curr_train_loader, curr_val_loader, lr=1e-3, epochs=20)
    nn_models[target] = model

    model.eval()
    preds = []
    with torch.no_grad():
        for inputs, _ in curr_val_loader:
            inputs = {k: v.to(device) for k, v in inputs.items()}
            out = model(inputs)
            preds.append(out.cpu().numpy())

    full_preds = np.vstack(preds).flatten()
    real_score = mean_absolute_error(y_val[log_target], full_preds)
    nn_scores[target] = real_score
    print(f"Final NN Independent {target} Score: {real_score:.5f}")


avg_nn_score = np.mean(list(nn_scores.values()))
print(f"\n=== FINAL NN Independent Score: {avg_nn_score:.5f} ===")

Starting Independent NN Training...

>>> Training NN for real_weight (log_real_weight)...
    Clean Train: 249661 / 250326
Start training on cuda...


  5%|▌         | 1/20 [00:09<02:51,  9.02s/it]

Epoch 1/20 | Train MAE: 0.27748 | Val MAE: 0.25903 -> Saved Best!


 10%|█         | 2/20 [00:18<02:45,  9.17s/it]

Epoch 2/20 | Train MAE: 0.25909 | Val MAE: 0.25542 -> Saved Best!


 15%|█▌        | 3/20 [00:26<02:29,  8.82s/it]

Epoch 3/20 | Train MAE: 0.25223 | Val MAE: 0.25130 -> Saved Best!


 20%|██        | 4/20 [00:35<02:23,  9.00s/it]

Epoch 4/20 | Train MAE: 0.24759 | Val MAE: 0.25134 


 25%|██▌       | 5/20 [00:44<02:14,  8.94s/it]

Epoch 5/20 | Train MAE: 0.24356 | Val MAE: 0.24561 -> Saved Best!


 30%|███       | 6/20 [00:53<02:04,  8.92s/it]

Epoch 6/20 | Train MAE: 0.23980 | Val MAE: 0.24467 -> Saved Best!


 35%|███▌      | 7/20 [01:02<01:56,  9.00s/it]

Epoch 7/20 | Train MAE: 0.23696 | Val MAE: 0.24490 


 40%|████      | 8/20 [01:11<01:47,  8.95s/it]

Epoch 8/20 | Train MAE: 0.23457 | Val MAE: 0.24434 -> Saved Best!


 45%|████▌     | 9/20 [01:20<01:37,  8.90s/it]

Epoch 9/20 | Train MAE: 0.23149 | Val MAE: 0.24289 -> Saved Best!


 50%|█████     | 10/20 [01:29<01:29,  8.91s/it]

Epoch 10/20 | Train MAE: 0.22913 | Val MAE: 0.24180 -> Saved Best!


 55%|█████▌    | 11/20 [01:38<01:20,  8.89s/it]

Epoch 11/20 | Train MAE: 0.22714 | Val MAE: 0.24172 


 60%|██████    | 12/20 [01:47<01:11,  8.98s/it]

Epoch 12/20 | Train MAE: 0.22481 | Val MAE: 0.24335 


 65%|██████▌   | 13/20 [01:56<01:02,  8.97s/it]

Epoch 13/20 | Train MAE: 0.22306 | Val MAE: 0.24284 


 70%|███████   | 14/20 [02:05<00:53,  8.93s/it]

Epoch 14/20 | Train MAE: 0.22118 | Val MAE: 0.24213 


 75%|███████▌  | 15/20 [02:14<00:45,  9.04s/it]

Epoch 15/20 | Train MAE: 0.21965 | Val MAE: 0.24154 -> Saved Best!


 80%|████████  | 16/20 [02:22<00:35,  8.86s/it]

Epoch 16/20 | Train MAE: 0.21775 | Val MAE: 0.24092 -> Saved Best!


 85%|████████▌ | 17/20 [02:32<00:26,  8.94s/it]

Epoch 17/20 | Train MAE: 0.21635 | Val MAE: 0.24076 -> Saved Best!


 90%|█████████ | 18/20 [02:40<00:17,  8.90s/it]

Epoch 18/20 | Train MAE: 0.21495 | Val MAE: 0.24088 


 95%|█████████▌| 19/20 [02:49<00:08,  8.86s/it]

Epoch 19/20 | Train MAE: 0.21359 | Val MAE: 0.24057 -> Saved Best!


100%|██████████| 20/20 [02:58<00:00,  8.94s/it]

Epoch 20/20 | Train MAE: 0.21205 | Val MAE: 0.24109 
Training finished. Best Val MAE: 0.24057





Final NN Independent real_weight Score: 0.24050126655782328

>>> Training NN for real_length (log_real_length)...
    Clean Train: 249598 / 250326
Start training on cuda...


  5%|▌         | 1/20 [00:08<02:47,  8.81s/it]

Epoch 1/20 | Train MAE: 0.39426 | Val MAE: 0.32104 -> Saved Best!


 10%|█         | 2/20 [00:18<02:43,  9.07s/it]

Epoch 2/20 | Train MAE: 0.32638 | Val MAE: 0.33510 


 15%|█▌        | 3/20 [00:26<02:29,  8.81s/it]

Epoch 3/20 | Train MAE: 0.30522 | Val MAE: 0.29049 -> Saved Best!


 20%|██        | 4/20 [00:36<02:25,  9.07s/it]

Epoch 4/20 | Train MAE: 0.29534 | Val MAE: 0.28625 -> Saved Best!


 25%|██▌       | 5/20 [00:44<02:15,  9.02s/it]

Epoch 5/20 | Train MAE: 0.28878 | Val MAE: 0.29232 


 30%|███       | 6/20 [00:54<02:06,  9.06s/it]

Epoch 6/20 | Train MAE: 0.28468 | Val MAE: 0.28750 


 35%|███▌      | 7/20 [01:03<01:58,  9.12s/it]

Epoch 7/20 | Train MAE: 0.28179 | Val MAE: 0.28483 -> Saved Best!


 40%|████      | 8/20 [01:12<01:49,  9.08s/it]

Epoch 8/20 | Train MAE: 0.27903 | Val MAE: 0.28625 


 45%|████▌     | 9/20 [01:21<01:39,  9.03s/it]

Epoch 9/20 | Train MAE: 0.27573 | Val MAE: 0.28182 -> Saved Best!


 50%|█████     | 10/20 [01:30<01:29,  8.99s/it]

Epoch 10/20 | Train MAE: 0.27367 | Val MAE: 0.28129 -> Saved Best!


 55%|█████▌    | 11/20 [01:38<01:20,  8.94s/it]

Epoch 11/20 | Train MAE: 0.27156 | Val MAE: 0.28380 


 60%|██████    | 12/20 [01:48<01:12,  9.01s/it]

Epoch 12/20 | Train MAE: 0.26950 | Val MAE: 0.28228 


 65%|██████▌   | 13/20 [01:57<01:02,  8.97s/it]

Epoch 13/20 | Train MAE: 0.26745 | Val MAE: 0.28370 


 70%|███████   | 14/20 [02:06<00:53,  8.98s/it]

Epoch 14/20 | Train MAE: 0.26582 | Val MAE: 0.28164 


 75%|███████▌  | 15/20 [02:15<00:45,  9.02s/it]

Epoch 15/20 | Train MAE: 0.26457 | Val MAE: 0.28133 


 80%|████████  | 16/20 [02:24<00:35,  8.99s/it]

Epoch 16/20 | Train MAE: 0.26240 | Val MAE: 0.28180 


 85%|████████▌ | 17/20 [02:32<00:26,  8.94s/it]

Epoch 17/20 | Train MAE: 0.26166 | Val MAE: 0.28221 


 90%|█████████ | 18/20 [02:41<00:17,  8.93s/it]

Epoch 18/20 | Train MAE: 0.26028 | Val MAE: 0.28225 


 95%|█████████▌| 19/20 [02:50<00:08,  8.94s/it]

Epoch 19/20 | Train MAE: 0.25888 | Val MAE: 0.28236 


100%|██████████| 20/20 [03:00<00:00,  9.00s/it]

Epoch 20/20 | Train MAE: 0.25755 | Val MAE: 0.28361 
Training finished. Best Val MAE: 0.28129





Final NN Independent real_length Score: 0.2812450682016762

>>> Training NN for real_width (log_real_width)...
    Clean Train: 249706 / 250326
Start training on cuda...


  5%|▌         | 1/20 [00:08<02:47,  8.80s/it]

Epoch 1/20 | Train MAE: 0.38769 | Val MAE: 0.32626 -> Saved Best!


 10%|█         | 2/20 [00:18<02:44,  9.13s/it]

Epoch 2/20 | Train MAE: 0.33156 | Val MAE: 0.31732 -> Saved Best!


 15%|█▌        | 3/20 [00:26<02:30,  8.87s/it]

Epoch 3/20 | Train MAE: 0.31395 | Val MAE: 0.34726 


 20%|██        | 4/20 [00:36<02:25,  9.07s/it]

Epoch 4/20 | Train MAE: 0.30487 | Val MAE: 0.33462 


 25%|██▌       | 5/20 [00:45<02:15,  9.02s/it]

Epoch 5/20 | Train MAE: 0.29926 | Val MAE: 0.30772 -> Saved Best!


 30%|███       | 6/20 [00:53<02:05,  8.99s/it]

Epoch 6/20 | Train MAE: 0.29563 | Val MAE: 0.29533 -> Saved Best!


 35%|███▌      | 7/20 [01:03<02:00,  9.30s/it]

Epoch 7/20 | Train MAE: 0.29251 | Val MAE: 0.29554 


 40%|████      | 8/20 [01:13<01:51,  9.28s/it]

Epoch 8/20 | Train MAE: 0.28900 | Val MAE: 0.29809 


 45%|████▌     | 9/20 [01:22<01:42,  9.35s/it]

Epoch 9/20 | Train MAE: 0.28677 | Val MAE: 0.29288 -> Saved Best!


 50%|█████     | 10/20 [01:32<01:34,  9.49s/it]

Epoch 10/20 | Train MAE: 0.28415 | Val MAE: 0.29218 -> Saved Best!


 55%|█████▌    | 11/20 [01:41<01:25,  9.51s/it]

Epoch 11/20 | Train MAE: 0.28233 | Val MAE: 0.29357 


 60%|██████    | 12/20 [01:51<01:17,  9.63s/it]

Epoch 12/20 | Train MAE: 0.28060 | Val MAE: 0.29126 -> Saved Best!


 65%|██████▌   | 13/20 [02:01<01:07,  9.57s/it]

Epoch 13/20 | Train MAE: 0.27827 | Val MAE: 0.29309 


 70%|███████   | 14/20 [02:10<00:57,  9.51s/it]

Epoch 14/20 | Train MAE: 0.27697 | Val MAE: 0.29118 


 75%|███████▌  | 15/20 [02:20<00:47,  9.55s/it]

Epoch 15/20 | Train MAE: 0.27486 | Val MAE: 0.29274 


 80%|████████  | 16/20 [02:29<00:37,  9.38s/it]

Epoch 16/20 | Train MAE: 0.27336 | Val MAE: 0.29280 


 85%|████████▌ | 17/20 [02:38<00:28,  9.42s/it]

Epoch 17/20 | Train MAE: 0.27177 | Val MAE: 0.29249 


 90%|█████████ | 18/20 [02:47<00:18,  9.25s/it]

Epoch 18/20 | Train MAE: 0.27024 | Val MAE: 0.29270 


 95%|█████████▌| 19/20 [02:56<00:09,  9.12s/it]

Epoch 19/20 | Train MAE: 0.26858 | Val MAE: 0.29465 


100%|██████████| 20/20 [03:05<00:00,  9.28s/it]

Epoch 20/20 | Train MAE: 0.26743 | Val MAE: 0.29927 
Training finished. Best Val MAE: 0.29126





Final NN Independent real_width Score: 0.2912304826301817

>>> Training NN for real_height (log_real_height)...
    Clean Train: 249889 / 250326
Start training on cuda...


  5%|▌         | 1/20 [00:08<02:47,  8.82s/it]

Epoch 1/20 | Train MAE: 0.55383 | Val MAE: 0.53191 -> Saved Best!


 10%|█         | 2/20 [00:18<02:42,  9.04s/it]

Epoch 2/20 | Train MAE: 0.51707 | Val MAE: 0.51338 -> Saved Best!


 15%|█▌        | 3/20 [00:26<02:29,  8.76s/it]

Epoch 3/20 | Train MAE: 0.50710 | Val MAE: 0.50667 -> Saved Best!


 20%|██        | 4/20 [00:35<02:22,  8.88s/it]

Epoch 4/20 | Train MAE: 0.50056 | Val MAE: 0.49769 -> Saved Best!


 25%|██▌       | 5/20 [00:44<02:11,  8.80s/it]

Epoch 5/20 | Train MAE: 0.49606 | Val MAE: 0.50141 


 30%|███       | 6/20 [00:52<02:03,  8.80s/it]

Epoch 6/20 | Train MAE: 0.49235 | Val MAE: 0.49518 -> Saved Best!


 35%|███▌      | 7/20 [01:02<01:56,  8.93s/it]

Epoch 7/20 | Train MAE: 0.48957 | Val MAE: 0.50553 


 40%|████      | 8/20 [01:11<01:46,  8.91s/it]

Epoch 8/20 | Train MAE: 0.48570 | Val MAE: 0.49590 


 45%|████▌     | 9/20 [01:19<01:37,  8.88s/it]

Epoch 9/20 | Train MAE: 0.48235 | Val MAE: 0.49467 -> Saved Best!


 50%|█████     | 10/20 [01:28<01:28,  8.86s/it]

Epoch 10/20 | Train MAE: 0.47868 | Val MAE: 0.49525 


 55%|█████▌    | 11/20 [01:37<01:19,  8.80s/it]

Epoch 11/20 | Train MAE: 0.47521 | Val MAE: 0.49501 


 60%|██████    | 12/20 [01:46<01:10,  8.86s/it]

Epoch 12/20 | Train MAE: 0.47123 | Val MAE: 0.49449 -> Saved Best!


 65%|██████▌   | 13/20 [01:55<01:01,  8.83s/it]

Epoch 13/20 | Train MAE: 0.46815 | Val MAE: 0.49600 


 70%|███████   | 14/20 [02:03<00:52,  8.81s/it]

Epoch 14/20 | Train MAE: 0.46561 | Val MAE: 0.49716 


 75%|███████▌  | 15/20 [02:12<00:44,  8.91s/it]

Epoch 15/20 | Train MAE: 0.46275 | Val MAE: 0.49981 


 80%|████████  | 16/20 [02:21<00:35,  8.78s/it]

Epoch 16/20 | Train MAE: 0.45862 | Val MAE: 0.49895 


 85%|████████▌ | 17/20 [02:30<00:26,  8.91s/it]

Epoch 17/20 | Train MAE: 0.45667 | Val MAE: 0.49704 


 90%|█████████ | 18/20 [02:39<00:17,  8.89s/it]

Epoch 18/20 | Train MAE: 0.45319 | Val MAE: 0.50046 


 95%|█████████▌| 19/20 [02:48<00:08,  8.85s/it]

Epoch 19/20 | Train MAE: 0.44991 | Val MAE: 0.49895 


100%|██████████| 20/20 [02:57<00:00,  8.87s/it]

Epoch 20/20 | Train MAE: 0.44825 | Val MAE: 0.49950 
Training finished. Best Val MAE: 0.49449





Final NN Independent real_height Score: 0.49442871243509257

=== FINAL NN Independent Score: 0.32685 ===


In [40]:
results_targets["Independent NN"] = [nn_scores[t] for t in target_columns]

results_targets

Unnamed: 0_level_0,Baseline (mean_microcat),Multi NN,Independent NN
Target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
real_weight,0.350348,0.242417,0.240501
real_length,0.319958,0.280788,0.281245
real_width,0.325912,0.288993,0.29123
real_height,0.552994,0.491454,0.494429


Независимый подход не дал значительного улучшения

Посмотрим на метрику MPE наших моделей

In [50]:
result_scores = results_targets.mean().sort_values()

result_scores

Independent NN                                 0.327573
Multi NN                                       0.327873
Independent CatBoost                           0.334908
Independent CatBoost (w/o Image embeddings)    0.340822
Multi CatBoost                                 0.342252
Baseline (mean_microcat)                       0.387303
dtype: float64

In [55]:
def get_submission_nn(model, test_loader):
    submission = pd.DataFrame()
    submission["item_id"] = test.index
    submission_preds = []
    
    with torch.no_grad():
        if isinstance(model, dict):
            target_results = {target: [] for target in target_columns}
            for inputs in test_loader:
                inputs = {k: v.to(device) for k, v in inputs.items()}
                for target in target_columns:
                    outputs = model[target](inputs)
                    target_results[target].append(outputs.cpu().numpy())
            
            cols = [np.vstack(target_results[t]) for t in target_columns]
            submission_preds = np.hstack(cols)
        else:
            for inputs in test_loader:
                inputs = {k: v.to(device) for k, v in inputs.items()}
                outputs = model(inputs)
                submission_preds.append(outputs.cpu().numpy())
            submission_preds = np.vstack(submission_preds)

    submission_preds = np.maximum(0, np.expm1(submission_preds))
    submission["weight"] = submission_preds[:, 0]
    submission["height"] = submission_preds[:, 3]
    submission["length"] = submission_preds[:, 1]
    submission["width"] = submission_preds[:, 2]
    
    submission.to_csv("submission.csv", index=False)
    print("submission.csv saved")
    return submission

In [57]:
sub = get_submission_nn(net_m, test_loader)

sub

submission.csv saved


Unnamed: 0,item_id,weight,height,length,width
0,163755,0.350400,7.280837,23.580997,17.703005
1,1339648,1.546855,15.905599,34.268322,26.216501
2,21095,1.083223,12.428748,33.301346,24.478306
3,925424,0.421423,2.438063,35.650543,26.956423
4,780125,2.465785,19.249857,38.023773,27.960434
...,...,...,...,...,...
70269,1207676,0.386202,2.638420,35.933983,26.698406
70270,1614448,0.521674,10.901972,20.987286,16.231644
70271,1787906,0.250951,8.327166,18.975750,12.885837
70272,897587,0.645760,7.358114,34.200413,26.290810


### Лучший тестовый скор на Stepik показала Multi NN. 0.318857
