In [2]:
import pandas as pd
import numpy as np
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
import random

import glob
from tqdm import tqdm
from pathlib import Path
import json

import subprocess

from collections import defaultdict
from typing import Optional

from scipy.sparse import coo_matrix, save_npz, load_npz

In [3]:
def read_df_from_parquets(path_to_parquets: str, n=None):
    """Eagerly load the ``*.parquet`` files of a directory via ``pl.read_parquet``.

    Args:
        path_to_parquets: Directory (``str`` or ``Path``) that directly
            contains the parquet files.
        n: Optional cap on the number of files to read. ``None`` reads all of
            them; a value larger than the number of files also reads all.

    Returns:
        Whatever ``pl.read_parquet`` returns for the selected files.

    Raises:
        FileNotFoundError: If the directory contains no ``*.parquet`` files.
    """
    # glob.glob gives no ordering guarantee, so sort: this makes both the row
    # order and the subset selected by ``n`` reproducible between runs.
    files = sorted(glob.glob(str(Path(path_to_parquets) / '*.parquet')))
    if not files:
        raise FileNotFoundError(f"No parquet files found in {str(path_to_parquets)!r}")
    # Slicing with None keeps everything; slicing past the end is harmless.
    return pl.read_parquet(files[:n])

def scan_df_from_parquets(path_to_parquets: str, n=None):
    """Lazily scan the ``*.parquet`` files of a directory via ``pl.scan_parquet``.

    Args:
        path_to_parquets: Directory (``str`` or ``Path``) that directly
            contains the parquet files.
        n: Optional cap on the number of files to scan. ``None`` scans all of
            them; a value larger than the number of files also scans all.

    Returns:
        Whatever ``pl.scan_parquet`` returns for the selected files.

    Raises:
        FileNotFoundError: If the directory contains no ``*.parquet`` files.
    """
    # glob.glob gives no ordering guarantee, so sort: this makes both the row
    # order and the subset selected by ``n`` reproducible between runs.
    files = sorted(glob.glob(str(Path(path_to_parquets) / '*.parquet')))
    if not files:
        raise FileNotFoundError(f"No parquet files found in {str(path_to_parquets)!r}")
    # Slicing with None keeps everything; slicing past the end is harmless.
    return pl.scan_parquet(files[:n])

def write_parquet_in_chunks(df: "pl.DataFrame", path: str, chunk_size=100000):
    """Write ``df`` into ``path`` as ``part_NNNN.parquet`` files of ``chunk_size`` rows.

    The directory is created if needed. Any ``part_*.parquet`` file already in
    it is removed first, so the directory ends up holding exactly the parts of
    this call.

    Args:
        df: Frame to persist; only ``height``, ``slice`` and ``write_parquet``
            are used.
        path: Target directory.
        chunk_size: Maximum number of rows per part file; must be positive.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        # A negative step would otherwise produce an empty range and silently
        # write nothing.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    out = Path(path)
    out.mkdir(parents=True, exist_ok=True)

    # The readers in this notebook glob '*.parquet', so parts left over from an
    # earlier write with more chunks would be mixed into the next read.
    for stale_part in list(out.glob("part_*.parquet")):
        if stale_part.is_file():
            stale_part.unlink()

    # max(..., 1): an empty frame still produces one (empty) part, so the
    # directory can be read back afterwards.
    for part_no, offset in enumerate(range(0, max(df.height, 1), chunk_size)):
        df.slice(offset, chunk_size).write_parquet(out / f"part_{part_no:04d}.parquet")

In [3]:
class CategoryTree:
    """Category hierarchy with memoised subtree and leaf lookups.

    Node ids are catalog ids; ``-1`` is the "no parent" marker.

    Attributes:
        parent: node id -> parent id (``-1`` for nodes without a parent).
        children: node id -> set of direct child ids. A node is known to the
            tree once it has an entry here (an empty set for leaves).
        subtree: memo used by ``get_subtree``.
        subtree_children: memo used by ``get_subtree_children``.

    The memos are never invalidated, so the tree should be fully built before
    the first query.
    """

    def __init__(self):
        self.parent = defaultdict(int)
        self.children = defaultdict(set)
        self.subtree = defaultdict(set)
        self.subtree_children = defaultdict(set)

    @classmethod
    def from_df(cls, category_tree_df: "pl.DataFrame"):
        """Build a tree from the ``'ids'`` column of ``category_tree_df``.

        Every value of that column is read as a path in which element ``i + 1``
        is the parent of element ``i``. The first parent seen for a node wins.
        """
        self = cls()
        for path in category_tree_df['ids']:
            for i in range(len(path) - 1):
                cur_id = int(path[i])
                # NOTE(review): unlike cur_id the parent is not cast to int;
                # this assumes the column already holds plain integers.
                parent_id = path[i + 1]

                if cur_id not in self.parent:
                    self.parent[cur_id] = parent_id

                if parent_id != -1:
                    self.children[parent_id].add(cur_id)

                # Register the node even if it never gets children (a leaf).
                self.children.setdefault(cur_id, set())

        # Parents that never appeared as a child themselves become roots.
        for p in {v for v in self.parent.values() if v != -1}:
            self.children.setdefault(p, set())
            self.parent.setdefault(p, -1)

        return self

    def is_leaf(self, catalog_id: int) -> bool:
        """Return True when ``catalog_id`` has no children.

        Raises:
            RuntimeError: If ``catalog_id`` is not a node of the tree.
        """
        if catalog_id not in self.children:
            raise RuntimeError("There is no catalog_id in category tree")
        return len(self.children[catalog_id]) == 0

    def get_subtree(self, catalog_id: int) -> set:
        """Return every node id in the subtree rooted at ``catalog_id`` (memoised).

        The returned set is the cached object itself; do not mutate it.
        """
        if catalog_id in self.subtree:
            return self.subtree[catalog_id]

        subtree = {catalog_id}
        for son in self.children[catalog_id]:
            subtree |= self.get_subtree(son)

        self.subtree[catalog_id] = subtree
        return subtree

    def get_subtree_children(self, catalog_id: int) -> set:
        """Return the leaf ids of the subtree rooted at ``catalog_id`` (memoised).

        A leaf is its own only result. The returned set is the cached object
        itself; do not mutate it.

        Raises:
            RuntimeError: If ``catalog_id`` is not a node of the tree.
        """
        if catalog_id in self.subtree_children:
            return self.subtree_children[catalog_id]

        subtree_children = set()
        if self.is_leaf(catalog_id):
            subtree_children.add(catalog_id)

        for son in self.children[catalog_id]:
            subtree_children |= self.get_subtree_children(son)

        self.subtree_children[catalog_id] = subtree_children
        return subtree_children

    def find_relevant_categories(self, users_interesting_categories: list, k_nearest=5, n=30):
        """Sample leaf categories related to the given categories.

        Candidates are:
          * every leaf below each given category (a leaf contributes itself);
          * for a given category that is itself a leaf with a parent, up to
            ``k_nearest`` random other leaves of the parent's subtree.

        Args:
            users_interesting_categories: Iterable of catalog ids; ``None``
                entries are skipped, and ``None`` as a whole yields ``[]``.
            k_nearest: Maximum number of neighbouring leaves added per leaf.
            n: Maximum length of the returned list.

        Returns:
            A list of at most ``n`` leaf ids drawn at random (via the global
            ``random`` module) from the candidates.

        Raises:
            RuntimeError: If a given id is not a node of the tree.
        """
        if users_interesting_categories is None:
            return []

        relevant_categories = set()
        for catalog_id in set(users_interesting_categories):
            if catalog_id is None:
                continue

            relevant_categories |= self.get_subtree_children(catalog_id)

            if self.is_leaf(catalog_id):
                parent_id = self.parent.get(catalog_id, -1)
                if parent_id != -1:
                    # Other leaves under the same parent, excluding the category itself.
                    siblings = list(self.get_subtree_children(parent_id) - {catalog_id})
                    relevant_categories.update(random.sample(siblings, min(k_nearest, len(siblings))))

        relevant_categories = list(relevant_categories)
        return random.sample(relevant_categories, min(n, len(relevant_categories)))

    def get_users_relevant_categories(self, users_interacted_categories: "pl.DataFrame",
                                      interacted_categories_column='interacted_categories',
                                      n=30,
                                      k_nearest=5):
        """Add a ``relevant_categories`` list column computed per row.

        Args:
            users_interacted_categories: Frame with a list column of catalog ids.
            interacted_categories_column: Name of that list column.
            n: Maximum number of relevant categories per row.
            k_nearest: Maximum number of neighbouring leaves added per leaf.

        Returns:
            The input frame with an extra ``relevant_categories`` column of
            dtype ``pl.List(pl.Int64)``.
        """
        return (
            users_interacted_categories
            .with_columns(
                pl.col(interacted_categories_column)
                # Forward the caller's limits instead of always using the defaults.
                .map_elements(
                    lambda categories: self.find_relevant_categories(categories, k_nearest=k_nearest, n=n),
                    return_dtype=pl.List(pl.Int64),
                )
                .alias('relevant_categories')
            )
        )

Создаём дерево категорий и инициализируем его из приложенного датасета

In [4]:
# Load the categories-tree table eagerly and turn it into a CategoryTree.
category_tree = CategoryTree.from_df(
    category_tree_df=read_df_from_parquets(
        path_to_parquets='./data/ml_ozon_recsys_train_final_categories_tree'
    )
)

**ПРОПУСКАЕМ ЭТОТ ШАГ, ТОЛЬКО ЕСЛИ НЕ ХОТИМ СГЕНЕРИРОВАТЬ НОВУЮ ТАБЛИЦУ**

Делаем сводную таблицу с релевантными категориями и брендами, с которыми взаимодействовал тот или иной пользователь (считается долго: у меня заняло час).

In [14]:
# Lazily scan the raw interactions and the apparel item catalogue.
interactions = scan_df_from_parquets('./data/interactions')
items = scan_df_from_parquets('data/ml_ozon_recsys_train_final_apparel_items_data')


def _brand_or_none(attributes):
    """Return the value of the first 'Brand' attribute.

    Yields None when there is no such attribute or when its value is the
    'Нет бренда' ("no brand") placeholder.
    """
    for attribute in attributes:
        if attribute['attribute_name'] == 'Brand':
            value = attribute['attribute_value']
            return None if value == 'Нет бренда' else value
    return None


# Add a nullable string column 'brand' extracted from the attributes list.
items_with_brands = items.with_columns(
    pl.col('attributes')
    .map_elements(_brand_or_none, return_dtype=pl.Utf8)
    .alias('brand')
)

In [None]:
# One row per user: the distinct non-null categories and brands of the items
# the user interacted with. The left join keeps interactions whose item is
# missing from the catalogue; those contribute nulls, which are dropped.
users_interacted_categories_and_brands = (
    interactions
    .join(items_with_brands.select('item_id', 'catalogid', 'brand'), how='left', on='item_id')
    .group_by('user_id')
    .agg([
        pl.col('catalogid').drop_nulls().unique().alias('interacted_categories'),
        pl.col('brand').drop_nulls().unique().alias('interacted_brands'),
    ])
    .collect(engine='streaming')
)

print(users_interacted_categories_and_brands.head())

In [6]:
# Persist the per-user table so the expensive aggregation need not be rerun.
write_parquet_in_chunks(
    df=users_interacted_categories_and_brands,
    path='./data/users_interacted_categories_and_brands',
)

In [5]:
def get_users_relevant_categories_and_brands(
    path_to_interactions: str,
    path_to_items: str,
    weights: dict,
    category_tree: CategoryTree,
    *,
    percent_top_items=0.40,
    n_top_categories=50,
    n_interactions_files=None,
    n_items_files=None
):
    """Build per-user relevant categories and brands from rated interactions.

    Steps (everything stays lazy; the caller collects the result):
      1. ``rating`` = weighted sum of the interaction columns named in
         ``weights`` (nulls count as 0).
      2. Per user, keep the best-rated ``percent_top_items`` share of the
         interaction rows, but at least one row.
      3. Sum the ratings per (user, category) and keep up to
         ``n_top_categories`` best non-null categories per user.
      4. Expand those categories with
         ``category_tree.get_users_relevant_categories``.
      5. Collect the distinct brands of the kept rows, ignoring missing brands
         and the 'Нет бренда' ("no brand") placeholder.

    Args:
        path_to_interactions: Directory with the interactions parquet files.
        path_to_items: Directory with the items parquet files.
        weights: Mapping of interaction column name -> coefficient.
        category_tree: Tree used to expand categories to relevant leaves.
        percent_top_items: Share of a user's rows kept in step 2.
        n_top_categories: Per-user cap on categories passed to the tree.
        n_interactions_files: Optional cap on interactions files scanned.
        n_items_files: Optional cap on items files scanned.

    Returns:
        A lazy frame with columns ``user_id``, ``relevant_categories`` and
        ``relevant_brands`` (null for users without any usable brand).
    """
    interactions = scan_df_from_parquets(path_to_interactions, n_interactions_files)
    items = scan_df_from_parquets(path_to_items, n_items_files)

    # Brand = value of the first attribute named "Brand", or null if absent.
    items_with_brands = items.with_columns(
        pl.col("attributes").map_elements(
            lambda lst: next(
                (x["attribute_value"] for x in lst if x["attribute_name"] == "Brand"),
                None
            ),
            return_dtype=pl.Utf8,
        ).alias("brand")
    )

    interactions_rated_with_brand_and_category = (
        interactions
        .with_columns(
            pl.sum_horizontal([
                pl.col(c).cast(pl.Float64).fill_null(0.0) * float(w)
                for c, w in weights.items()
            ]).alias("rating")
        )
        .join(
            items_with_brands
            .select([
                "item_id",
                pl.col("catalogid").alias("category"),
                "brand"
            ]),
            on="item_id",
            how="left"
        )
        .select("user_id", "item_id", "category", "brand", "rating")
    )

    # For every user keep the most interesting rows: the top share by rating.
    top_items_for_users = (
        interactions_rated_with_brand_and_category
        .sort(["user_id", "rating"], descending=[False, True])
        .with_columns([
            pl.len().over("user_id").alias("n"),
            # Position within the user's rows after the sort above (0 = best).
            pl.arange(0, pl.len()).over("user_id").alias("rank"),
        ])
        .with_columns(
            # k = ceil(n * share), but never fewer than one row per user.
            pl.max_horizontal(pl.lit(1), (pl.col("n") * percent_top_items).ceil().cast(pl.Int64)).alias("k")
        )
        .filter(pl.col("rank") < pl.col("k"))
        .drop(['rank', 'n', 'k'])
    )

    # For every user take the N most interesting categories. A category's
    # rating for a user is the summed rating of that user's kept rows in it.
    top_n_categories = (
        top_items_for_users
        .select(['user_id', 'category', 'rating'])
        .group_by(['user_id', 'category'])
        .agg(pl.col('rating').sum().alias("summary_rating"))
        .group_by('user_id')
        .agg(
            pl.col('category')
                .sort_by(pl.col('summary_rating'), descending=True)
                # Items missing from the catalogue have a null category after
                # the left join; drop it so it cannot occupy a top-N slot.
                # Dropping inside the aggregation keeps such users in the
                # output with an empty list.
                .drop_nulls()
                .head(n_top_categories)
                .alias('interacted_categories')
        )
    )

    # Expand to neighbouring leaves in the category tree.
    relevant_top_n_categories = category_tree.get_users_relevant_categories(top_n_categories)

    # Brands are not truncated: take every distinct brand among the kept rows.
    top_brands = (
        top_items_for_users
        .filter(pl.col("brand").is_not_null() & (pl.col("brand") != "Нет бренда"))
        .group_by("user_id")
        .agg(pl.col("brand").unique().alias("relevant_brands"))
    )

    # Combine both results into one table keyed by user.
    users_relevant_categories_and_brands = (
        relevant_top_n_categories
        .join(top_brands, on='user_id', how='left')
        .select(['user_id', 'relevant_categories', 'relevant_brands'])
    )

    return users_relevant_categories_and_brands

In [None]:
# Coefficient per interaction column; the pipeline multiplies each column by
# its coefficient and sums them into one rating.
# NOTE(review): where these values come from is not shown in this notebook.
weights = {
    "action_type_to_cart": 2.10161402,
    "action_type_page_view": 0.36151107,
    "action_type_unfavorite": 0.07727582,
    "action_type_review_view": 0.02292266,
    "action_type_view_description": -0.00414356,
    "action_type_favorite": -0.0242845,
    "action_type_remove": -1.4484584,
}

# Build the lazy pipeline and materialise it with the streaming engine.
users_relevant_categories_and_brands = get_users_relevant_categories_and_brands(
    path_to_interactions='./data/interactions',
    path_to_items='./data/ml_ozon_recsys_train_final_apparel_items_data',
    weights=weights,
    category_tree=category_tree,
).collect(engine='streaming')

print(users_relevant_categories_and_brands.head())

In [None]:
# Persist the rating-based relevant categories and brands.
write_parquet_in_chunks(
    df=users_relevant_categories_and_brands,
    path='./data/users_relevant_categories_and_brands',
)

Загружаем сохранённую таблицу с диска. Пытаемся найти соседей в дереве категорий.

In [19]:
# Reload the saved per-user interacted categories/brands table.
users_interacted_categories_and_brands = read_df_from_parquets(
    path_to_parquets='./data/users_interacted_categories_and_brands'
)

# Add the 'relevant_categories' column by expanding each user's categories
# through the category tree.
users_relevant_categories_and_brands = category_tree.get_users_relevant_categories(
    users_interacted_categories=users_interacted_categories_and_brands
)

users_relevant_categories_and_brands.head()

user_id,interacted_categories,interacted_brands,relevant_categories
i64,list[i64],list[str],list[i64]
4932450,"[7502, 7503, … 7557000]","[""Sevatex"", ""Mphumee"", … ""DDU""]","[33794, 7683, … 7677]"
1320010,"[7504, 7506, … 76003000]","[""Travel"", ""SENTI-MENTI"", … ""AllaMo""]","[33794, 33796, … 39925]"
2928111,"[7502, 7506, … 7557000]","[""CREANOVA"", ""Митра"", … ""ZinBee""]","[33794, 33824, … 39925]"
3910511,"[7502, 7503, … 7557000]","[""HYPEMARKET"", ""T.TACCARDI"", … ""Ohana market""]","[33794, 33796, … 39925]"
4216070,"[7503, 7506, … 7512000]","[""Newlife"", ""Pierre Cardin"", … ""Селтекс""]","[33794, 36941, … 7671]"


In [21]:
# Persist the tree-expanded table.
# NOTE(review): this is the same target directory that the rating-based
# pipeline cell writes to, although the two tables have different columns.
write_parquet_in_chunks(
    df=users_relevant_categories_and_brands,
    path='./data/users_relevant_categories_and_brands',
)

Таблица всех брендов (для Арины)

In [13]:
# Lazily scan the apparel item catalogue.
items = scan_df_from_parquets('./data/ml_ozon_recsys_train_final_apparel_items_data')

# Distinct non-null brand names: the value of the first 'Brand' attribute of
# every item.
brands = (
    items
    .select(
        pl.col('attributes')
        .map_elements(
            lambda attrs: next(
                (attr['attribute_value'] for attr in attrs if attr['attribute_name'] == 'Brand'),
                None,
            ),
            return_dtype=pl.Utf8,
        )
        .alias('brand')
    )
    .drop_nulls()
    .unique()
    .collect(engine='streaming')
)

print(brands)

shape: (48_900, 1)
┌────────────┐
│ brand      │
│ ---        │
│ str        │
╞════════════╡
│ Лукоморье  │
│ FESS       │
│ GMSM       │
│ Biker      │
│ JUST PLAY  │
│ …          │
│ LIFII1     │
│ Оригинал   │
│ Miss grant │
│ DDA        │
│ LG GIFT    │
└────────────┘


In [12]:
# Persist the list of all brands.
write_parquet_in_chunks(
    df=brands,
    path='./data/brands_list',
)

In [4]:
# Load the per-user recommended brands table and preview its first rows.
user_recommendated_brands = read_df_from_parquets(
    path_to_parquets='./data/user_recommendated_brands'
)
print(user_recommendated_brands.head())

shape: (5, 2)
┌─────────┬─────────────────────────────────┐
│ user_id ┆ recommended_brands              │
│ ---     ┆ ---                             │
│ i64     ┆ list[str]                       │
╞═════════╪═════════════════════════════════╡
│ 163751  ┆ ["ESFASHION", "My Shape", … "F… │
│ 575221  ┆ ["Turkey Insider", "Kawaii Fac… │
│ 2814191 ┆ ["MAICHUYIKE", "Le Cabaret", …… │
│ 1436770 ┆ ["ИвИванушка", "RusFashion", …… │
│ 4027081 ┆ ["Tishka", "Pasadena", … "phum… │
└─────────┴─────────────────────────────────┘
