In [1]:
import os
import json
import pickle

from bisect import bisect_left, bisect_right
from datetime import datetime, timedelta
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
import seaborn as sns

# GPU hack if you need
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [2]:
model_type = 'baseline'

In [3]:
def count_plot(df, x, title='', figsize=(18, 6), top=None, **kwargs):
    plt.figure(figsize=(18, 6))

    val_cnt = df[x].value_counts()
    val_cnt = val_cnt.iloc[:top] if top else val_cnt

    sns.countplot(data=df, x=x, order=val_cnt.index, **kwargs)
    plt.xticks(rotation=90, fontsize=12)
    if title:
        plt.title(title, fontsize=16)
    else:
        plt.title(f'{x.capitalize()} count', fontsize=16)
    plt.plot()
    

def calculate_product_popularity():
    product_popularity = dict()

    for df in prod_df_generator():
        df = df[df.product_id != 0]
        for product_id, cnt in df.product_id.value_counts().items():
            if product_id in product_popularity:
                product_popularity[product_id] += cnt
            else:
                product_popularity[product_id] = cnt
                
    total_cnt = sum(v for v in product_popularity.values())
    product_popularity = {k: v/total_cnt for k, v in product_popularity.items()}
    
    return product_popularity


def calcualte_user_orderes():
    user_orderes = dict()

    for df in prod_df_generator():
        for order_id, user_id in df.groupby(['order_id', 'user_id']).size().index:
            if user_id in user_orderes:
                user_orderes[user_id] += 1
            else:
                user_orderes[user_id] = 1
                
    return user_orderes


def create_user_ids():

    user_ids = set()

    for prod_df in prod_df_generator(usecols=['user_id', 'product_id']):
        prod_df = prod_df[prod_df.product_id != 0]
        prod_df.user_id = prod_df.user_id
        user_ids = user_ids.union(prod_df.user_id.unique())

    return [int(v) for v in user_ids]

In [4]:
PATH_TO_DATA = 'data'

PATH_TO_PRODUCTS = f'{PATH_TO_DATA}/sbermarket_tab_2_1/'
PATH_TO_ORDERS = f'{PATH_TO_DATA}/kaggle_tab_1345/tab_1_orders.csv'
PATH_TO_CATS = f'{PATH_TO_DATA}/kaggle_tab_1345/tab_3_categories.csv'
PATH_TO_PROD_PROP = f'{PATH_TO_DATA}/kaggle_tab_1345/tab_5_product_properties.csv'
PATH_TO_USERS = f'{PATH_TO_DATA}/kaggle_tab_1345/tab_4_user_profiles.csv'
PATH_TO_CITIES = f'{PATH_TO_DATA}/tab_6_city.csv'
PATH_TO_SMPL_SUBM = f'{PATH_TO_DATA}/sample_submission.csv'

PRODUCT_TABS = os.listdir(PATH_TO_PRODUCTS)

PATH_TO_PREPROC_DATA = 'preprocessed_data'
os.makedirs(PATH_TO_PREPROC_DATA, exist_ok=True)

In [5]:
UNK_TOKEN = '<UNK>'
UNK_DATE = '0000-00-00 <UNK>'
RANDOM_STATE = 42

DO_DUMP_FEATURES = False

TOP_PRODUCTS = 10000

## Read data

### Orders data

In [6]:
retailer_threshold = 1e-1

orders_df = pd.read_csv(PATH_TO_ORDERS, index_col='order_id')
retailer_popularity = dict((orders_df.retailer.value_counts()/len(orders_df)))

most_popular_retailers = {k: v for k, v in retailer_popularity.items() if v > retailer_threshold}
orders_df.retailer = orders_df.retailer.apply(lambda x: x if x in most_popular_retailers else UNK_TOKEN)

most_popular_platforms = ['app', 'web']
orders_df.platform = orders_df.platform.apply(lambda x: x if x in most_popular_platforms else UNK_TOKEN)
orders_df.head()

  mask |= (ar1 == a)


Unnamed: 0_level_0,user_id,order_created_time,retailer,store_id,platform
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
17431000,72,2020-09-26 10:48:57,METRO,21,app
9718154,83,2020-05-08 09:46:18,METRO,87,web
10056850,142,2020-05-14 15:06:03,METRO,320,app
15952443,187,2020-09-01 17:34:00,<UNK>,533,app
10409918,224,2020-05-20 06:32:50,Ашан,183,web


## Mapping

In [7]:
def create_mapping(values):
    mapping = {UNK_TOKEN: 0}
    return update_mapping(mapping, values)


def update_mapping(mapping, values):
    for v in values:
        v = str(v)
        if v != UNK_TOKEN and v not in mapping:
            mapping[v] = len(mapping)

    return mapping


def prod_df_generator(usecols=None):
    for tab in tqdm(PRODUCT_TABS):
        if usecols:
            yield pd.read_csv(os.path.join(PATH_TO_PRODUCTS, tab), usecols=usecols)
        else:
            yield pd.read_csv(os.path.join(PATH_TO_PRODUCTS, tab))
            

int_or_unk = lambda x: UNK_TOKEN if pd.isna(x) else int(x)


def calculate_top_products():
    prod_cnt = Counter()

    for prod_df in prod_df_generator(['product_id']):

        prod_df = prod_df[prod_df.product_id != 0]
        prod_cnt.update(prod_df.product_id.values)

    prod_cnt_df = pd.DataFrame.from_dict(prod_cnt, orient='index', columns=['product_cnt'])

    prod_cnt_df = prod_cnt_df.sort_values(by='product_cnt', ascending=False)

    return prod_cnt_df


def select_top_product(mappings, top_products):
    prod_mapping = {}
    i = 1
    for k in sorted(mappings['product_id'].keys()):
        if k == UNK_TOKEN:
            prod_mapping[k] = 0
        elif int(k) in top_products:
            prod_mapping[k] = i
            i += 1
        else:
            prod_mapping[k] = 0

    return prod_mapping

In [8]:
if DO_DUMP_FEATURES:
    prod_cnt_df = calculate_top_products()
    top_products = prod_cnt_df.index[:TOP_PRODUCTS].values

In [9]:
feature_cols = [('retailer', orders_df, 'orders'),
                ('platform', orders_df, 'orders'),
                ('store_id', orders_df, 'orders')
               ]

feature_cols_prod = ['brand_name', 'product_id', 'master_category_id', 'parent_category_id']


if DO_DUMP_FEATURES:
    mappings = defaultdict(dict)

    for col, df, pref in tqdm(feature_cols):
        col_values = orders_df[col].astype(str)

        mappings[f'{pref}_{col}'] = create_mapping(col_values.unique())


    for col in feature_cols_prod:
        mappings[col] = create_mapping([])

    for prod_df in prod_df_generator(usecols=feature_cols_prod):
        prod_df = prod_df[prod_df.product_id != 0]

        brand_values = prod_df[feature_cols_prod[0]].fillna(UNK_TOKEN)
        product_values = prod_df[feature_cols_prod[1]].fillna(UNK_TOKEN)
        master_values = prod_df[feature_cols_prod[2]].apply(int_or_unk)
        parent_values = prod_df[feature_cols_prod[3]].apply(int_or_unk)

        mappings[feature_cols_prod[0]] = update_mapping(mappings[feature_cols_prod[0]], brand_values.unique())
        mappings[feature_cols_prod[1]] = update_mapping(mappings[feature_cols_prod[1]], product_values.unique())
        mappings[feature_cols_prod[2]] = update_mapping(mappings[feature_cols_prod[2]], master_values.unique())
        mappings[feature_cols_prod[3]] = update_mapping(mappings[feature_cols_prod[3]], parent_values.unique())

    mappings['product_id'] = select_top_product(mappings, top_products)    
    
    with open(f'{PATH_TO_PREPROC_DATA}/mappings.json', 'w') as f:
        json.dump(mappings, f)

In [10]:
# load mappings
with open(f'{PATH_TO_PREPROC_DATA}/mappings.json', 'r') as f:
     mappings = json.load(f)

## Data preparing

In [11]:
if DO_DUMP_FEATURES:
    order_dates = {int(k):v for k, v in orders_df.order_created_time.items()}
    
    with open(f'{PATH_TO_PREPROC_DATA}/order_dates.json', 'w') as f:
        json.dump(order_dates, f)

with open(f'{PATH_TO_PREPROC_DATA}/order_dates.json', 'r') as f:
     order_dates = json.load(f)

In [12]:
feature_cols = ['user_id',
                'order_id',
                'product_id',
                'price',
                'quantity',
                'discount',
                'brand_name',
                'master_category_id',
                'parent_category_id'
               ]

if DO_DUMP_FEATURES:
    user_dates = defaultdict(list)
    user_prices = defaultdict(list)
    user_quantities = defaultdict(list)
    user_discounts = defaultdict(list)
    user_brand_name = defaultdict(list)
    user_master_category_ids = defaultdict(list)
    user_parent_category_ids = defaultdict(list)
    user_order_ids = defaultdict(list)
    user_product_ids = defaultdict(list)

    for prod_df in prod_df_generator(feature_cols):
        prod_df = prod_df[prod_df.product_id != 0]

        prod_df['price'] = prod_df['price'].fillna(0)
        prod_df['quantity'] = prod_df['quantity'].fillna(1)
        prod_df['discount'] = prod_df['discount'].fillna(0)
        prod_df['brand_name'] = prod_df['brand_name'].fillna(UNK_TOKEN)
        prod_df['master_category_id'] =  prod_df['master_category_id'].apply(int_or_unk).astype(str)
        prod_df['parent_category_id'] = prod_df['parent_category_id'].apply(int_or_unk).astype(str)
        prod_df['product_id'] = prod_df['product_id'].astype(str)

        for i, row in prod_df.iterrows():
            user_dates[row.user_id].append(order_dates.get(str(row.order_id), UNK_DATE + f' {row.order_id}'))
            
            user_prices[row.user_id].append(row.price)
            user_quantities[row.user_id].append(row.quantity)
            user_discounts[row.user_id].append(max(0, row.discount))
            user_brand_name[row.user_id].append(mappings['brand_name'][row.brand_name])

            user_master_category_ids[row.user_id].append(mappings['master_category_id'][row.master_category_id])
            user_parent_category_ids[row.user_id].append(mappings['parent_category_id'][row.parent_category_id])
            user_order_ids[row.user_id].append(row.order_id)
            user_product_ids[row.user_id].append(mappings['product_id'][row.product_id])


    pickle.dump(user_dates, open(f'{PATH_TO_PREPROC_DATA}/user_dates.pkl', 'wb'))
    pickle.dump(user_prices, open(f'{PATH_TO_PREPROC_DATA}/user_prices.pkl', 'wb'))
    pickle.dump(user_discounts, open(f'{PATH_TO_PREPROC_DATA}/user_discounts.pkl', 'wb'))
    pickle.dump(user_quantities, open(f'{PATH_TO_PREPROC_DATA}/user_quantities.pkl', 'wb'))
    pickle.dump(user_brand_name, open(f'{PATH_TO_PREPROC_DATA}/user_brand_name.pkl', 'wb'))
    pickle.dump(user_master_category_ids, open(f'{PATH_TO_PREPROC_DATA}/user_master_category_ids.pkl', 'wb'))
    pickle.dump(user_parent_category_ids, open(f'{PATH_TO_PREPROC_DATA}/user_parent_category_ids.pkl', 'wb'))
    pickle.dump(user_order_ids, open(f'{PATH_TO_PREPROC_DATA}/user_order_ids.pkl', 'wb'))
    pickle.dump(user_product_ids, open(f'{PATH_TO_PREPROC_DATA}/user_product_ids.pkl', 'wb'))

In [13]:
%%time

# load client data
user_dates = pickle.load(open(f'{PATH_TO_PREPROC_DATA}/user_dates.pkl', 'rb'))
user_prices = pickle.load(open(f'{PATH_TO_PREPROC_DATA}/user_prices.pkl', 'rb'))
user_discounts = pickle.load(open(f'{PATH_TO_PREPROC_DATA}/user_discounts.pkl', 'rb'))
user_quantities = pickle.load(open(f'{PATH_TO_PREPROC_DATA}/user_quantities.pkl', 'rb'))
user_brand_name = pickle.load(open(f'{PATH_TO_PREPROC_DATA}/user_brand_name.pkl', 'rb'))
user_master_category_ids = pickle.load(open(f'{PATH_TO_PREPROC_DATA}/user_master_category_ids.pkl', 'rb'))
user_parent_category_ids = pickle.load(open(f'{PATH_TO_PREPROC_DATA}/user_parent_category_ids.pkl', 'rb'))
user_order_ids = pickle.load(open(f'{PATH_TO_PREPROC_DATA}/user_order_ids.pkl', 'rb'))
user_product_ids = pickle.load(open(f'{PATH_TO_PREPROC_DATA}/user_product_ids.pkl', 'rb'))

user_ids = json.load(open(f'{PATH_TO_PREPROC_DATA}/user_ids.json'))

CPU times: user 27.9 s, sys: 2.65 s, total: 30.6 s
Wall time: 30.5 s


In [14]:
for user_id in tqdm(user_ids):
    args = np.argsort(user_dates[user_id])

    user_prices[user_id] = np.array(user_prices[user_id])[args]
    user_discounts[user_id] = np.array(user_discounts[user_id])[args]
    user_quantities[user_id] = np.array(user_quantities[user_id])[args]
    user_brand_name[user_id] = np.array(user_brand_name[user_id])[args]
    user_master_category_ids[user_id] = np.array(user_master_category_ids[user_id])[args]
    user_parent_category_ids[user_id] = np.array(user_parent_category_ids[user_id])[args]
    user_order_ids[user_id] = np.array(user_order_ids[user_id])[args]
    user_product_ids[user_id] = np.array(user_product_ids[user_id])[args]

    user_dates[user_id] = np.array(user_dates[user_id])[args]

HBox(children=(FloatProgress(value=0.0, max=657431.0), HTML(value='')))




## Train val split

In [15]:
from sklearn.model_selection import train_test_split

train_user_id, valid_user_id = train_test_split(user_ids, train_size=0.8, random_state=RANDOM_STATE)

print(f'Train: {len(train_user_id)} Val: {len(valid_user_id)}')

Train: 525944 Val: 131487


In [16]:
def prepare_data(user_ids, is_submission=False):
    data_prices = []
    data_discounts = []
    data_quantity = []
    data_brand_name = []
    data_master_category = []
    data_parent_category = []
    data_product = []

    data_labels = []

    for user_id in tqdm(user_ids):
        date_series = user_dates[user_id]

        price_series = user_prices[user_id]
        discount_series = user_discounts[user_id]
        quantity_series = user_quantities[user_id]
        brand_name_series = user_brand_name[user_id]
        master_category_series = user_master_category_ids[user_id]
        parent_category_series = user_parent_category_ids[user_id]

        order_id_series = user_order_ids[user_id]
        product_series = user_product_ids[user_id]

        if is_submission:
            date_range = [None]
        else:
            date_range = np.sort(np.unique(date_series))

        for date_end in date_range:

            if is_submission:
                l, r = len(date_series), len(date_series)
            else:
                l, r = (bisect_left(date_series, date_end),
                        bisect_right(date_series, date_end))

            history_price = price_series[:l]
            history_discount = discount_series[:l]
            history_quantity = quantity_series[:l]
            history_brand_name = brand_name_series[:l]
            history_master_category = master_category_series[:l]
            history_parent_category = parent_category_series[:l]
            history_product = product_series[:l]

            predict_product = product_series[l:r]

            if (len(predict_product) > 0) and l or is_submission:
                data_prices.append(history_price)
                data_discounts.append(history_discount)
                data_quantity.append(history_quantity)
                data_brand_name.append(history_brand_name)
                data_master_category.append(history_master_category)
                data_parent_category.append(history_parent_category)
                data_product.append(history_product)

                data_labels.append(predict_product)


    return (data_prices, data_discounts, data_quantity, data_brand_name, 
            data_master_category, data_parent_category, data_product, data_labels)

In [23]:
(train_prices, train_discounts, train_quantity, train_brand, 
 train_master_category, train_parent_category, train_product, train_labels) = prepare_data(user_ids)

(valid_prices, valid_discounts, valid_quantity, valid_brand, 
 valid_master_category, valid_parent_category, valid_product, valid_labels) = prepare_data(valid_user_id)

HBox(children=(FloatProgress(value=0.0, max=657431.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=131487.0), HTML(value='')))




## PyTorch loaders

In [24]:
import torch
from torch.utils.data import Dataset, DataLoader

In [25]:
PRODUCT_NCLASSES = len(np.unique(list(mappings['product_id'].values())))
MASTER_CAT_NCLASSES = len(mappings['master_category_id'])
PARENT_CAT_NCLASSES = len(mappings['parent_category_id'])
BRAND_NCLASSES = len(mappings['brand_name'])

PADDING_LEN = 300

In [26]:
class ProductHistoryDataset(Dataset):
    def __init__(self, data_prices, data_discounts, data_quantity, data_brand_name, 
                 data_master_category, data_parent_category, data_product, labels=None, is_submission=False):
        super().__init__()

        self.data_prices = data_prices
        self.data_discounts = data_discounts
        self.data_quantity = data_quantity

        self.data_brand_name = data_brand_name
        self.data_master_category = data_master_category
        self.data_parent_category = data_parent_category
        self.data_product = data_product

        self.labels = labels
        self.is_submission = is_submission
        
        self.cat_feature_names = ['brand', 'master_category', 'parent_category', 'product']
        self.cat_features = [self.data_brand_name, self.data_master_category,
                             self.data_parent_category, self.data_product]

    def __len__(self):
        return len(self.data_product)

    def __getitem__(self, idx):
        targets = np.zeros(PRODUCT_NCLASSES - 1, dtype=np.float32)
        if not self.is_submission:
            targets[self.labels[idx] - 1] = 1.

        item = {'features': {}, 'targets': targets}

        data_prices = np.array(self.data_prices[idx][-PADDING_LEN:])
        data_prices = np.vectorize(lambda s: np.log(1 + s))(data_prices)

        data_discounts = np.array(self.data_discounts[idx][-PADDING_LEN:])
        data_discounts = np.vectorize(lambda s: np.log(1 + max(0., s)))(data_discounts) 
        data_quantity = np.array(self.data_quantity[idx][-PADDING_LEN:])
        
    
        values_len = data_prices.shape[0]
        pad = np.zeros(PADDING_LEN - values_len, dtype=np.float32)
        
        data_prices = np.append(data_prices, pad)
        data_discounts = np.append(data_discounts, pad)
        data_quantity = np.append(data_quantity, pad)

        item['features']['price'] = torch.from_numpy(data_prices).float()
        item['features']['discounts'] = torch.from_numpy(data_discounts).float()
        item['features']['quantity'] = torch.from_numpy(data_quantity).float()

        for feature_name, feature_values in zip(self.cat_feature_names, [f[idx] for f in self.cat_features]):

            feature_values = np.append(np.array(feature_values[-PADDING_LEN:]), pad).astype(np.int64)
            mask = np.append(np.ones(values_len, dtype=np.float32), pad)

            item['features'][feature_name] = torch.from_numpy(feature_values).long()
            item['features'][f'{feature_name}_mask'] = torch.from_numpy(mask).float()

        return item

In [27]:
train_dataset = ProductHistoryDataset(train_prices, train_discounts, train_quantity, 
                                      train_brand, train_master_category, train_parent_category, 
                                      train_product, train_labels)
valid_dataset = ProductHistoryDataset(valid_prices, valid_discounts, valid_quantity, 
                                      valid_brand, valid_master_category, valid_parent_category,
                                      valid_product, valid_labels)

In [28]:
train_loader = DataLoader(
    train_dataset, batch_size=64, shuffle=True, num_workers=4
)
valid_loader = DataLoader(
    valid_dataset, batch_size=64, shuffle=False, num_workers=4
)

In [29]:
batch = next(iter(train_loader))
torch.cat([batch['features']['price'][:2].unsqueeze(-1), 
           batch['features']['brand'][:2].unsqueeze(-1)], dim=-1).shape

torch.Size([2, 300, 2])

## Model

In [30]:
import torch.nn as nn

from collections import OrderedDict

In [31]:
PRODUCT_NCLASSES, MASTER_CAT_NCLASSES, PARENT_CAT_NCLASSES, BRAND_NCLASSES

(10001, 612, 120, 7612)

In [32]:
params = {'product_emb_dim': 300,
          'master_cat_emb_dim': 32,
          'parent_cat_emb_dim': 15,
          'brand_emb_dim': 32,

          'transformer_nhead': 2,
          'transformer_dim_feedforward': 300,

          'transformer_dropout': 0.1,
          'dense_unit': 256,
          'num_layers': 4,
         }

In [33]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()

        self.product_embedding = nn.Embedding(PRODUCT_NCLASSES, params['product_emb_dim'])
        self.master_cat_embedding = nn.Embedding(MASTER_CAT_NCLASSES, params['master_cat_emb_dim'])
        self.parent_cat_embedding = nn.Embedding(PARENT_CAT_NCLASSES, params['parent_cat_emb_dim'])
        self.brend_embedding = nn.Embedding(BRAND_NCLASSES, params['brand_emb_dim'])

        embedding_size = (params['product_emb_dim'] + params['master_cat_emb_dim'] +
                          params['parent_cat_emb_dim'] + params['brand_emb_dim'] +
                          3
                          )

        transformer_blocks = [(f'transformer_block_{i}', 
                               nn.TransformerEncoderLayer(d_model=embedding_size,
                                                          nhead=params['transformer_nhead'],
                                                          dim_feedforward=params['transformer_dim_feedforward'],
                                                          dropout=params['transformer_dropout']
                                                         )
                              ) for i in range(params['num_layers'])]
 
        self.transformer_encoder = nn.Sequential(OrderedDict(transformer_blocks))

        self.body = nn.Sequential(
            nn.Linear(in_features=embedding_size, out_features=384),
                                 )

        self.scorer = nn.Linear(in_features=384, out_features=PRODUCT_NCLASSES - 1)

    def forward(self, features):

        product_emb = self.product_embedding(features['product'])
        master_cat_emb = self.master_cat_embedding(features['master_category'])
        parent_cat_emb = self.parent_cat_embedding(features['parent_category'])
        brand_emb = self.brend_embedding(features['brand'])

        product_emb = product_emb * features['product_mask'].unsqueeze(-1)
        master_cat_emb = master_cat_emb * features['master_category_mask'].unsqueeze(-1)
        parent_cat_emb = parent_cat_emb * features['parent_category_mask'].unsqueeze(-1)
        brand_emb = brand_emb * features['brand_mask'].unsqueeze(-1)

        embeddings = torch.cat((product_emb, master_cat_emb, parent_cat_emb, brand_emb,
                                features['price'].unsqueeze(-1), features['discounts'].unsqueeze(-1),
                                features['quantity'].unsqueeze(-1), 
                               ), dim=-1)

        transformer_output = self.transformer_encoder(embeddings)

        pooling = torch.mean(transformer_output, dim=1)
        body = torch.tanh(self.body(pooling))
        merch_logits = self.scorer(body)

        return merch_logits

### One-batch-check

In [34]:
model = Model()
criterion = nn.BCEWithLogitsLoss()
batch = next(iter(train_loader))
output = model(batch['features'])
loss = criterion(output, batch['targets'])
print(loss)

print('Model parameters:', sum(p.numel() for p in model.parameters() if p.requires_grad))

tensor(0.6962, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
Model parameters: 10528876


In [35]:
from catalyst import dl, utils
from catalyst.utils import metrics

In [36]:
from typing import List, Optional, Sequence, Tuple, Union

import numpy as np
import torch
from catalyst.utils.metrics.functional import preprocess_multi_label_metrics
from catalyst.utils.torch import get_activation_fn


def multi_label_metrics(
    outputs: torch.Tensor,
    targets: torch.Tensor,
    threshold: Union[float, torch.Tensor],
    activation: Optional[str] = None,
    eps: float = 1e-7,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Computes multi-label precision for the specified activation and threshold.

    Args:
        outputs (torch.Tensor): NxK tensor that for each of the N examples
            indicates the probability of the example belonging to each of
            the K classes, according to the model.
        targets (torch.Tensor): binary NxK tensort that encodes which of the K
            classes are associated with the N-th input
            (eg: a row [0, 1, 0, 1] indicates that the example is
            associated with classes 2 and 4)
        threshold (float): threshold for for model output
        activation (str): activation to use for model output
        eps (float): epsilon to avoid zero division
    
    Extended version of 
        https://github.com/catalyst-team/catalyst/blob/master/catalyst/utils/metrics/accuracy.py#L58

    Returns:
        computed multi-label metrics
    """
    outputs, targets, _ = preprocess_multi_label_metrics(
        outputs=outputs, targets=targets
    )
    activation_fn = get_activation_fn(activation)
    outputs = activation_fn(outputs)

    outputs = (outputs > threshold).long()

    accuracy = (targets.long() == outputs.long()).sum().float() / np.prod(
        targets.shape
    )

    intersection = (outputs.long() * targets.long()).sum(axis=1).float()
    num_predicted = outputs.long().sum(axis=1).float()
    num_relevant = targets.long().sum(axis=1).float()
    union = num_predicted + num_relevant

    # Precision = ({predicted items} && {relevant items}) / {predicted items}
    precision = intersection / (num_predicted + eps * (num_predicted == 0))
    # Recall = ({predicted items} && {relevant items}) / {relevant items}
    recall = intersection / (num_relevant + eps * (num_relevant == 0))
    # IoU = ({predicted items} && {relevant items}) / ({predicted items} || {relevant items})
    iou = (intersection + eps * (union == 0)) / (union - intersection + eps)

    return accuracy, precision.mean(), recall.mean(), iou.mean()


def precision_at_k(
    actual: torch.Tensor, 
    predicted: torch.Tensor, 
    k: int,
):
    """
    Computes precision at cutoff k for one sample

    Args:
       actual: (torch.Tensor): tensor of length K with predicted item_ids sorted by relevance
       predicted (torch.Tensor): binary tensor that encodes which of the K
           classes are associated with the N-th input
           (eg: a row [0, 1, 0, 1] indicates that the example is
           associated with classes 2 and 4)
       k (int): parameter k of precison@k

    Returns:
       Computed value of precision@k for given sample
    """
    p_at_k = 0.0
    for item in predicted[:k]:
        if actual[item]:
            p_at_k += 1
    p_at_k /= k

    return p_at_k


def average_precision_at_k(
    actual: torch.Tensor, 
    predicted: torch.Tensor, 
    k: int,
) -> float:
    """
    Computes average precision at cutoff k for one sample

    Args:
      actual: (torch.Tensor): tensor of length K with predicted item_ids sorted by relevance
      predicted (torch.Tensor): binary tensor that encodes which of the K
          classes are associated with the N-th input
          (eg: a row [0, 1, 0, 1] indicates that the example is
          associated with classes 2 and 4)
      k (int): parameter k of AP@k

    Returns:
        Computed value of AP@k for given sample
    """
    ap_at_k = 0.0
    for idx, item in enumerate(predicted[:k]):
        if actual[item]:
            ap_at_k += precision_at_k(actual, predicted, k=idx + 1)
    ap_at_k /= min(k, actual.sum().cpu().numpy())
    

    return ap_at_k


def mean_average_precision_at_k(
    output: torch.Tensor, target: torch.Tensor, top_k: Tuple[int, ...] = (1,)
) -> List[float]:
    """
    Computes mean_average_precision_at_k at set of cutoff parameters K

    Args:
       outputs (torch.Tensor): NxK tensor that for each of the N examples
           indicates the probability of the example belonging to each of
           the K classes, according to the model.
       targets (torch.Tensor): binary NxK tensort that encodes which of the K
           classes are associated with the N-th input
           (eg: a row [0, 1, 0, 1] indicates that the example is
           associated with classes 2 and 4)
       top_k (tuple): list of parameters k at which map@k will be computed


    Returns:
       List of computed values of map@k at each cutoff k from topk
    """
    max_k = max(top_k)
    batch_size = target.size(0)

    _, top_indices = output.topk(k=max_k, dim=1, largest=True, sorted=True)

    result = []
    for k in top_k:  # loop over k
        map_at_k = 0.0
        for actual_target, predicted_items in zip(
            target, top_indices
        ):  # loop over samples
            map_at_k += average_precision_at_k(
                actual_target, predicted_items, k
            )
        map_at_k = map_at_k / batch_size
        result.append(map_at_k)

    return result

In [51]:
class CustomRunner(dl.Runner):

    def _handle_batch(self, batch):
        features, targets = batch['features'], batch['targets']
        logits = self.model(features)
        scores = torch.sigmoid(logits)

        loss = self.criterion(logits, targets)
        accuracy, precision, recall, iou = multi_label_metrics(logits, targets, 
                                                               threshold=0.5, activation='Sigmoid'
                                                              )
        (map50, ) = mean_average_precision_at_k(scores, targets, top_k=(50,))
        batch_metrics = {'loss': loss,
                         'precision': precision,
                         'recall': recall,
                         'map50': map50,   
                       }

        self.input = {'features': features, 'targets': targets}
        self.output = {'logits': logits, 'scores': scores}
        self.batch_metrics.update(batch_metrics)

        if self.is_train_loader:
            loss.backward()
            self.optimizer.step()
            self.optimizer.zero_grad()
    
    def predict_batch(self, batch):
        # model inference step
        batch = utils.maybe_recursive_call(batch, 'to', device=self.device)
        logits = self.model(batch['features'])
        scores = torch.sigmoid(logits)
        return scores

In [52]:
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
print(device)

model = Model()
#model.to(device)

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

loaders = {"train": train_loader, "valid": valid_loader}

cuda:0


In [53]:
runner = CustomRunner()

runner.train(model=model,
             criterion=criterion,
             optimizer=optimizer,
             scheduler=None,
             loaders=loaders,
             logdir=f'./logs/{model_type}',
             num_epochs=30,
             verbose=True,
             load_best_on_end=True,
             overfit=False,  #  <<<--- DO NOT FORGET TO MAKE IT ``False`` 
                             #  (``True`` uses only one batch to check pipeline correctness)
             callbacks=[
                 # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.average_precision_score.html
                 # dl.AveragePrecisionCallback(input_key="targets", output_key="scores", prefix="ap"),
                 # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html
                 # dl.AUCCallback(input_key="targets", output_key="scores", prefix="auc"),
             ],

             main_metric='ap/mean', 
             minimize_metric=False,
            )




1/30 * Epoch (train):   0% 0/17448 [00:00<?, ?it/s][A[A[A


1/30 * Epoch (train):   0% 0/17448 [00:01<?, ?it/s, loss=0.698, map50=3.938e-04, precision=0.002, recall=0.487][A[A[A


1/30 * Epoch (train):   0% 1/17448 [00:01<6:24:30,  1.32s/it, loss=0.698, map50=3.938e-04, precision=0.002, recall=0.487][A[A[A


1/30 * Epoch (train):   0% 1/17448 [00:01<6:24:30,  1.32s/it, loss=0.690, map50=2.117e-04, precision=0.002, recall=0.451][A[A[A


1/30 * Epoch (train):   0% 2/17448 [00:01<4:46:10,  1.02it/s, loss=0.690, map50=2.117e-04, precision=0.002, recall=0.451][A[A[A


1/30 * Epoch (train):   0% 2/17448 [00:01<4:46:10,  1.02it/s, loss=0.577, map50=0.007, precision=0.003, recall=0.289]    [A[A[A


1/30 * Epoch (train):   0% 3/17448 [00:01<3:39:09,  1.33it/s, loss=0.577, map50=0.007, precision=0.003, recall=0.289][A[A[A


1/30 * Epoch (train):   0% 3/17448 [00:01<3:39:09,  1.33it/s, loss=0.479, map50=0.082, precision=0.003, recall=0.135][A[A[A


1/30 * Epoch (train):

1/30 * Epoch (train):   0% 31/17448 [00:07<1:03:10,  4.60it/s, loss=0.014, map50=0.107, precision=1.000, recall=0.081][A[A[A


1/30 * Epoch (train):   0% 32/17448 [00:07<1:04:15,  4.52it/s, loss=0.014, map50=0.107, precision=1.000, recall=0.081][A[A[A


1/30 * Epoch (train):   0% 32/17448 [00:07<1:04:15,  4.52it/s, loss=0.013, map50=0.138, precision=0.969, recall=0.111][A[A[A


1/30 * Epoch (train):   0% 33/17448 [00:07<1:04:32,  4.50it/s, loss=0.013, map50=0.138, precision=0.969, recall=0.111][A[A[A


1/30 * Epoch (train):   0% 33/17448 [00:08<1:04:32,  4.50it/s, loss=0.015, map50=0.082, precision=0.938, recall=0.055][A[A[A


1/30 * Epoch (train):   0% 34/17448 [00:08<1:04:15,  4.52it/s, loss=0.015, map50=0.082, precision=0.938, recall=0.055][A[A[A


1/30 * Epoch (train):   0% 34/17448 [00:08<1:04:15,  4.52it/s, loss=0.015, map50=0.094, precision=0.953, recall=0.064][A[A[A


1/30 * Epoch (train):   0% 35/17448 [00:08<1:04:51,  4.47it/s, loss=0.015, map50=0.094, pr

1/30 * Epoch (train):   0% 63/17448 [00:14<1:05:02,  4.45it/s, loss=0.013, map50=0.105, precision=0.953, recall=0.084][A[A[A


1/30 * Epoch (train):   0% 63/17448 [00:14<1:05:02,  4.45it/s, loss=0.014, map50=0.101, precision=0.922, recall=0.084][A[A[A


1/30 * Epoch (train):   0% 64/17448 [00:14<1:04:36,  4.48it/s, loss=0.014, map50=0.101, precision=0.922, recall=0.084][A[A[A


1/30 * Epoch (train):   0% 64/17448 [00:15<1:04:36,  4.48it/s, loss=0.014, map50=0.083, precision=0.938, recall=0.069][A[A[A


1/30 * Epoch (train):   0% 65/17448 [00:15<1:04:47,  4.47it/s, loss=0.014, map50=0.083, precision=0.938, recall=0.069][A[A[A


1/30 * Epoch (train):   0% 65/17448 [00:15<1:04:47,  4.47it/s, loss=0.014, map50=0.088, precision=0.938, recall=0.072][A[A[A


1/30 * Epoch (train):   0% 66/17448 [00:15<1:05:14,  4.44it/s, loss=0.014, map50=0.088, precision=0.938, recall=0.072][A[A[A


1/30 * Epoch (train):   0% 66/17448 [00:15<1:05:14,  4.44it/s, loss=0.014, map50=0.080, pr

1/30 * Epoch (train):   1% 94/17448 [00:21<1:03:56,  4.52it/s, loss=0.012, map50=0.081, precision=0.953, recall=0.066][A[A[A


1/30 * Epoch (train):   1% 95/17448 [00:21<1:03:34,  4.55it/s, loss=0.012, map50=0.081, precision=0.953, recall=0.066][A[A[A


1/30 * Epoch (train):   1% 95/17448 [00:22<1:03:34,  4.55it/s, loss=0.013, map50=0.130, precision=0.969, recall=0.110][A[A[A


1/30 * Epoch (train):   1% 96/17448 [00:22<1:03:23,  4.56it/s, loss=0.013, map50=0.130, precision=0.969, recall=0.110][A[A[A


1/30 * Epoch (train):   1% 96/17448 [00:22<1:03:23,  4.56it/s, loss=0.014, map50=0.114, precision=0.969, recall=0.091][A[A[A


1/30 * Epoch (train):   1% 97/17448 [00:22<1:04:06,  4.51it/s, loss=0.014, map50=0.114, precision=0.969, recall=0.091][A[A[A


1/30 * Epoch (train):   1% 97/17448 [00:22<1:04:06,  4.51it/s, loss=0.013, map50=0.102, precision=0.938, recall=0.076][A[A[A


1/30 * Epoch (train):   1% 98/17448 [00:22<1:04:50,  4.46it/s, loss=0.013, map50=0.102, pr

1/30 * Epoch (train):   1% 125/17448 [00:28<1:05:32,  4.41it/s, loss=0.012, map50=0.100, precision=0.953, recall=0.084][A[A[A


1/30 * Epoch (train):   1% 126/17448 [00:28<1:04:42,  4.46it/s, loss=0.012, map50=0.100, precision=0.953, recall=0.084][A[A[A


1/30 * Epoch (train):   1% 126/17448 [00:28<1:04:42,  4.46it/s, loss=0.013, map50=0.129, precision=0.984, recall=0.105][A[A[A


1/30 * Epoch (train):   1% 127/17448 [00:28<1:04:43,  4.46it/s, loss=0.013, map50=0.129, precision=0.984, recall=0.105][A[A[A


1/30 * Epoch (train):   1% 127/17448 [00:29<1:04:43,  4.46it/s, loss=0.013, map50=0.112, precision=0.938, recall=0.083][A[A[A


1/30 * Epoch (train):   1% 128/17448 [00:29<1:05:21,  4.42it/s, loss=0.013, map50=0.112, precision=0.938, recall=0.083][A[A[A


1/30 * Epoch (train):   1% 128/17448 [00:29<1:05:21,  4.42it/s, loss=0.013, map50=0.100, precision=0.969, recall=0.077][A[A[A


1/30 * Epoch (train):   1% 129/17448 [00:29<1:05:00,  4.44it/s, loss=0.013, map50=0

1/30 * Epoch (train):   1% 156/17448 [00:35<1:04:11,  4.49it/s, loss=0.011, map50=0.122, precision=0.953, recall=0.103][A[A[A


1/30 * Epoch (train):   1% 157/17448 [00:35<1:03:38,  4.53it/s, loss=0.011, map50=0.122, precision=0.953, recall=0.103][A[A[A


1/30 * Epoch (train):   1% 157/17448 [00:35<1:03:38,  4.53it/s, loss=0.012, map50=0.124, precision=0.922, recall=0.110][A[A[A


1/30 * Epoch (train):   1% 158/17448 [00:35<1:02:52,  4.58it/s, loss=0.012, map50=0.124, precision=0.922, recall=0.110][A[A[A


1/30 * Epoch (train):   1% 158/17448 [00:36<1:02:52,  4.58it/s, loss=0.013, map50=0.092, precision=0.906, recall=0.069][A[A[A


1/30 * Epoch (train):   1% 159/17448 [00:36<1:03:21,  4.55it/s, loss=0.013, map50=0.092, precision=0.906, recall=0.069][A[A[A


1/30 * Epoch (train):   1% 159/17448 [00:36<1:03:21,  4.55it/s, loss=0.014, map50=0.094, precision=0.922, recall=0.078][A[A[A


1/30 * Epoch (train):   1% 160/17448 [00:36<1:03:08,  4.56it/s, loss=0.014, map50=0

1/30 * Epoch (train):   1% 187/17448 [00:42<1:03:51,  4.51it/s, loss=0.012, map50=0.122, precision=0.953, recall=0.098][A[A[A


1/30 * Epoch (train):   1% 188/17448 [00:42<1:03:10,  4.55it/s, loss=0.012, map50=0.122, precision=0.953, recall=0.098][A[A[A


1/30 * Epoch (train):   1% 188/17448 [00:42<1:03:10,  4.55it/s, loss=0.014, map50=0.114, precision=0.969, recall=0.091][A[A[A


1/30 * Epoch (train):   1% 189/17448 [00:42<1:02:55,  4.57it/s, loss=0.014, map50=0.114, precision=0.969, recall=0.091][A[A[A


1/30 * Epoch (train):   1% 189/17448 [00:42<1:02:55,  4.57it/s, loss=0.013, map50=0.099, precision=0.984, recall=0.076][A[A[A


1/30 * Epoch (train):   1% 190/17448 [00:42<1:02:55,  4.57it/s, loss=0.013, map50=0.099, precision=0.984, recall=0.076][A[A[A


1/30 * Epoch (train):   1% 190/17448 [00:43<1:02:55,  4.57it/s, loss=0.012, map50=0.096, precision=0.891, recall=0.080][A[A[A


1/30 * Epoch (train):   1% 191/17448 [00:43<1:03:14,  4.55it/s, loss=0.012, map50=0

1/30 * Epoch (train):   1% 218/17448 [00:49<1:04:28,  4.45it/s, loss=0.015, map50=0.102, precision=0.984, recall=0.081][A[A[A


1/30 * Epoch (train):   1% 219/17448 [00:49<1:04:10,  4.47it/s, loss=0.015, map50=0.102, precision=0.984, recall=0.081][A[A[A


1/30 * Epoch (train):   1% 219/17448 [00:49<1:04:10,  4.47it/s, loss=0.011, map50=0.127, precision=0.922, recall=0.110][A[A[A


1/30 * Epoch (train):   1% 220/17448 [00:49<1:03:22,  4.53it/s, loss=0.011, map50=0.127, precision=0.922, recall=0.110][A[A[A


1/30 * Epoch (train):   1% 220/17448 [00:49<1:03:22,  4.53it/s, loss=0.014, map50=0.094, precision=0.984, recall=0.084][A[A[A


1/30 * Epoch (train):   1% 221/17448 [00:49<1:03:38,  4.51it/s, loss=0.014, map50=0.094, precision=0.984, recall=0.084][A[A[A


1/30 * Epoch (train):   1% 221/17448 [00:50<1:03:38,  4.51it/s, loss=0.013, map50=0.097, precision=0.953, recall=0.070][A[A[A


1/30 * Epoch (train):   1% 222/17448 [00:50<1:03:42,  4.51it/s, loss=0.013, map50=0

1/30 * Epoch (train):   1% 249/17448 [00:56<1:04:01,  4.48it/s, loss=0.013, map50=0.096, precision=0.953, recall=0.080][A[A[A


1/30 * Epoch (train):   1% 250/17448 [00:56<1:03:28,  4.52it/s, loss=0.013, map50=0.096, precision=0.953, recall=0.080][A[A[A


1/30 * Epoch (train):   1% 250/17448 [00:56<1:03:28,  4.52it/s, loss=0.014, map50=0.103, precision=0.969, recall=0.084][A[A[A


1/30 * Epoch (train):   1% 251/17448 [00:56<1:03:20,  4.52it/s, loss=0.014, map50=0.103, precision=0.969, recall=0.084][A[A[A


1/30 * Epoch (train):   1% 251/17448 [00:56<1:03:20,  4.52it/s, loss=0.015, map50=0.083, precision=0.984, recall=0.064][A[A[A


1/30 * Epoch (train):   1% 252/17448 [00:56<1:03:49,  4.49it/s, loss=0.015, map50=0.083, precision=0.984, recall=0.064][A[A[A


1/30 * Epoch (train):   1% 252/17448 [00:56<1:03:49,  4.49it/s, loss=0.013, map50=0.116, precision=0.969, recall=0.097][A[A[A


1/30 * Epoch (train):   1% 253/17448 [00:56<1:03:32,  4.51it/s, loss=0.013, map50=0

1/30 * Epoch (train):   2% 280/17448 [01:03<1:02:03,  4.61it/s, loss=0.013, map50=0.104, precision=0.906, recall=0.084][A[A[A


1/30 * Epoch (train):   2% 281/17448 [01:03<1:01:47,  4.63it/s, loss=0.013, map50=0.104, precision=0.906, recall=0.084][A[A[A


1/30 * Epoch (train):   2% 281/17448 [01:03<1:01:47,  4.63it/s, loss=0.015, map50=0.097, precision=0.984, recall=0.081][A[A[A


1/30 * Epoch (train):   2% 282/17448 [01:03<1:01:48,  4.63it/s, loss=0.015, map50=0.097, precision=0.984, recall=0.081][A[A[A


1/30 * Epoch (train):   2% 282/17448 [01:03<1:01:48,  4.63it/s, loss=0.013, map50=0.119, precision=0.938, recall=0.100][A[A[A


1/30 * Epoch (train):   2% 283/17448 [01:03<1:01:31,  4.65it/s, loss=0.013, map50=0.119, precision=0.938, recall=0.100][A[A[A


1/30 * Epoch (train):   2% 283/17448 [01:03<1:01:31,  4.65it/s, loss=0.013, map50=0.083, precision=0.922, recall=0.066][A[A[A


1/30 * Epoch (train):   2% 284/17448 [01:03<1:01:34,  4.65it/s, loss=0.013, map50=0

1/30 * Epoch (train):   2% 311/17448 [01:09<1:03:30,  4.50it/s, loss=0.013, map50=0.122, precision=0.984, recall=0.100][A[A[A


1/30 * Epoch (train):   2% 312/17448 [01:09<1:03:13,  4.52it/s, loss=0.013, map50=0.122, precision=0.984, recall=0.100][A[A[A


1/30 * Epoch (train):   2% 312/17448 [01:10<1:03:13,  4.52it/s, loss=0.014, map50=0.093, precision=0.969, recall=0.068][A[A[A


1/30 * Epoch (train):   2% 313/17448 [01:10<1:03:47,  4.48it/s, loss=0.014, map50=0.093, precision=0.969, recall=0.068][A[A[A


1/30 * Epoch (train):   2% 313/17448 [01:10<1:03:47,  4.48it/s, loss=0.013, map50=0.089, precision=0.984, recall=0.064][A[A[A


1/30 * Epoch (train):   2% 314/17448 [01:10<1:04:11,  4.45it/s, loss=0.013, map50=0.089, precision=0.984, recall=0.064][A[A[A


1/30 * Epoch (train):   2% 314/17448 [01:10<1:04:11,  4.45it/s, loss=0.012, map50=0.124, precision=0.938, recall=0.110][A[A[A


1/30 * Epoch (train):   2% 315/17448 [01:10<1:03:07,  4.52it/s, loss=0.012, map50=0

## Submission

In [54]:
pd_sbm = pd.read_csv(PATH_TO_SMPL_SUBM, index_col='Id')
submit_ids = pd_sbm.index.values
pd_sbm.head()

Unnamed: 0_level_0,Predicted
Id,Unnamed: 1_level_1
51,3239211 10720024 8493844 1965540 5009002 95218...
65,11693356 13392267 2464515 2245175 4748570 1134...
766,12773054 3692749 12726751 10041624 11893515 23...
1132,8613934 3599065 575614 4821474 11609320 699047...
1578,289404 4328240 3129788 9390886 11166655 173077...


In [55]:
test_prices, *_ = prepare_data(submit_ids, is_submission=True)
submit_ids_with_data_idx = [i for i, tp in enumerate(test_prices) if len(tp)]
submit_ids_with_data = submit_ids[submit_ids_with_data_idx]

(test_prices, test_discounts, test_quantity, test_brand, 
 test_master_category, test_parent_category, test_product, test_labels) = prepare_data(submit_ids_with_data, 
                                                                                       is_submission=True)

full_dataset = ProductHistoryDataset(test_prices, test_discounts, test_quantity, 
                                     test_brand, test_master_category, test_parent_category,
                                     test_product, is_submission=True)
full_loader = DataLoader(full_dataset, batch_size=64, shuffle=False, num_workers=8, drop_last=False)

HBox(children=(FloatProgress(value=0.0, max=107068.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=97327.0), HTML(value='')))




In [56]:
predictions = []
for scores in tqdm(runner.predict_loader(loader=full_loader), total = len(full_loader)):
    _, top_indices = scores.topk(k=50, dim=1, largest=True, sorted=True)
    top_indices += 1
    predictions += top_indices.detach().cpu().tolist()

HBox(children=(FloatProgress(value=0.0, max=1521.0), HTML(value='')))




In [57]:
# inverse mapping for merchant_type in predictions
product_inverse_mapping = {k: int(v) for v, k in mappings['product_id'].items() if v != UNK_TOKEN}

def inverse_mapping(x):
    return list(map(product_inverse_mapping.get, x))

predictions_prod_id = list(map(inverse_mapping, predictions))

In [59]:
submission = pd_sbm.copy()

for i, p in tqdm(zip(submit_ids_with_data, predictions_prod_id)):
    submission.loc[i].Predicted = ' '.join(map(str, p))

sumb_path = f'submission_Hugs_for_Bugs_{model_type}.csv'
submission.to_csv(sumb_path, sep=',')

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [60]:
!kaggle competitions submit test-recsys -f $sumb_path -m $model_type

100%|██████████████████████████████████████| 31.7M/31.7M [00:03<00:00, 9.34MB/s]
Successfully submitted to Predict products in the next order 