In [None]:
!pip install pandas numpy torch tqdm

In [None]:
import pandas as pd
import numpy as np
import random
import os
import torch

In [None]:
# download data from https://disk.yandex.ru/d/0ya1tUYrin_tEg 
# or https://www.kaggle.com/chiranjivdas09/ta-feng-grocery-dataset

In [None]:
def seed_everything(seed=1234):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU and
    all CUDA devices), and puts cuDNN into deterministic, non-autotuning mode.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    # Exported for child processes only: the running interpreter has already
    # read PYTHONHASHSEED at startup.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every GPU, not just the current one; silently ignored without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The autotuner may select different kernels between runs, so switch it off.
    torch.backends.cudnn.benchmark = False


seed_everything()

In [None]:
# Load the merged Ta Feng transaction log; the path is relative to the
# notebook's working directory (see the download links in the cell above).
df = pd.read_csv('data/ta_feng_all_months_merged.csv')
# Preview the first rows of the raw frame.
df.head()

In [None]:
# Keep the three columns used downstream, give them short generic names,
# parse the transaction date and order every user's purchases by time.
column_names = {'CUSTOMER_ID': 'user_id', 'PRODUCT_ID': 'item_id', 'TRANSACTION_DT': 'timestamp'}
df = (
    df[list(column_names)]
    .rename(columns=column_names)
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
    .sort_values(['user_id', 'timestamp'])
)
df.head()

#### Удалим непопулярные айтемы и юзеров, у которых мало транзакций

* Оставим товары, которые купили больше 100 раз

* Оставим юзеров, у которых больше 20 покупок

In [None]:
# YOUR CODE HERE

In [None]:
# Reference sizes of the frame after the popularity / activity filtering step.
assert len(df) == 296132, 'wrong length of dataframe'
assert df['user_id'].nunique() == 7070, 'incorrect number of users'
assert df['item_id'].nunique() == 1774, 'incorrect number of items'

* закодируем товары числами от 1 до n
* закодируем юзеров числами от 1 до m

In [None]:
# YOUR CODE HERE

In [None]:
# Ids must be 1-based: item id 0 has to stay free because the model's
# embedding layer reserves index 0 for padding.
assert df['item_id'].min() == 1, 'item encoding should start from 1'
assert df['user_id'].min() == 1, 'user encoding should start from 1'
assert df['item_id'].max() == 1774, 'item encoding should end at 1774'
assert df['user_id'].max() == 7070, 'user encoding should end at 7070'

* сгруппируем корзины пользователя (все айтемы, купленные в один день)
* оставим только тех пользователей, у которых есть хотя бы 5 корзин
* выделим последнюю корзину как таргет

пример:



| user_id | item_id | timestamp  |
|---------|---------|------------|
| 1       | 1       | 2000-12-03 |
| 1       | 2       | 2000-12-03 |
| 1       | 3       | 2000-12-04 |
| 1       | 5       | 2000-12-04 |
| 1       | 7       | 2000-12-04 |
| 1       | 4       | 2000-12-06 |
| 1       | 3       | 2000-12-09 |
| 1       | 4       | 2000-12-14 |
| 1       | 5       | 2000-12-14 |

| user_id 	| baskets 	| target  	|
|---------	|---------	|------------	|
| 1       	| [[1, 2], [3, 5, 7], [4], [3]]       	| [4, 5]	|


In [None]:
def group_baskets(df):
    """Collapse the transaction log into one row of baskets per user.

    Exercise stub. According to the task description and the worked example
    that precede this cell, the finished function should:

    * treat all items a user bought on the same day as one basket;
    * keep only users that have at least 5 baskets;
    * split off each user's last basket as the prediction target.

    Args:
        df: Transactions with ``user_id``, ``item_id`` and ``timestamp`` columns.

    Returns:
        A DataFrame with the columns ``user_id``, ``baskets`` (the user's
        earlier baskets, each a list of item ids) and ``target`` (the item ids
        of the user's last basket).

    Raises:
        NotImplementedError: Always, until the exercise is solved.
    """
    # YOUR CODE HERE
    # A comment alone is not a valid function body; raising keeps the cell
    # parseable and matches the other exercise stubs in this notebook.
    raise NotImplementedError
    
# Per-user basket table; the dataset, the baselines and the metric cells below all read it.
df_grouped = group_baskets(df)

In [None]:
# Reference totals for the frame returned by group_baskets.
assert df_grouped['baskets'].apply(len).sum() == 45075, 'wrong number of baskets'
assert df_grouped['target'].apply(len).sum() == 23212, 'wrong target len'
assert len(df_grouped.iloc[0]['target']) == 5, 'wrong target for first user'

### Model

Обучим супер-простую модель, которая состоит из

1) Эмбеддинга айтемов

2) Эмбеддинг корзины как среднее эмбеддингов всех айтемов

3) RNN через все корзины

4) Скор для пары юзер–айтем как скалярное произведение скрытого состояния RNN и эмбеддинга айтема

5) Нормируем софтмаксом

6) BCE-loss

In [None]:
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

In [None]:
from typing import List

def pad_3d_sequence(tokens: List[List[List[int]]]) -> torch.Tensor:
    '''
    Zero-pad a ragged batch of basket sequences into one LongTensor.

    Parameters:
    -----------
        tokens: one entry per sample; each sample is a list of baskets and
            each basket is a list of integer ids.

    Returns:
    --------
        LongTensor of shape [len(tokens), max baskets per sample, max ids per basket].
        Missing baskets and missing ids are filled with 0. Empty input (no
        samples, or only empty baskets) yields a tensor with zero-sized
        dimensions instead of raising.

    Examples:
    ---------
        pad_3d_sequence(
            [[[1, 2, 3], [4, 5]], [[3, 4], [7, 8, 9, 6], [1, 2, 3]]]
        )
        tensor([[[1, 2, 3, 0],
                 [4, 5, 0, 0],
                 [0, 0, 0, 0]],
                [[3, 4, 0, 0],
                 [7, 8, 9, 6],
                 [1, 2, 3, 0]]])
    '''
    # default=0 keeps max() from raising on an empty batch.
    n_baskets = max((len(sample) for sample in tokens), default=0)
    basket_size = max((len(basket) for sample in tokens for basket in sample), default=0)

    # Start from an all-padding tensor and copy the real ids into place.
    padded = torch.zeros((len(tokens), n_baskets, basket_size), dtype=torch.long)
    for sample_idx, sample in enumerate(tokens):
        for basket_idx, basket in enumerate(sample):
            if basket:
                padded[sample_idx, basket_idx, :len(basket)] = torch.as_tensor(basket, dtype=torch.long)

    return padded

print(pad_3d_sequence([
    [[1, 2, 3], [4, 5]], 
    [[3, 4], [7, 8, 9, 6], [1, 2, 3]]
]))

In [None]:
class DatasetInstances(Dataset):
    """Dataset over a frame that holds one row per user with 'baskets' and 'target' columns."""

    def __init__(self, df, n_items) -> None:
        super().__init__()
        self._df = df
        self._n_items = n_items

    def __len__(self):
        """Number of rows (users) in the wrapped frame."""
        return len(self._df)

    def __getitem__(self, idx):
        """Return (baskets, number of baskets, multi-hot target) for the row at position idx."""
        row = self._df.iloc[idx]
        history, next_basket = row['baskets'], row['target']
        return history, len(history), self._to_binary(next_basket)

    def _to_binary(self, target):
        """Encode item ids as a vector of length n_items with 1.0 at the target positions."""
        multi_hot = np.zeros(self._n_items)
        multi_hot[target] = 1.
        return multi_hot

def collate_func_train_dataloader(batch):
    """Merge dataset samples into batch tensors.

    Args:
        batch: Sequence of (baskets, number of baskets, multi-hot target)
            triples as returned by DatasetInstances.__getitem__.

    Returns:
        Tuple of (zero-padded basket ids from pad_3d_sequence, tensor with the
        number of baskets per sample, tensor with the stacked target vectors).
    """
    baskets, lengths, targets = zip(*batch)
    return (
        pad_3d_sequence(list(baskets)),
        torch.tensor(lengths),
        # Stack in NumPy first: torch.tensor() on a list of ndarrays converts
        # element by element and emits a "extremely slow" UserWarning.
        torch.from_numpy(np.stack(targets)),
    )

In [None]:
import torch
import torch.nn.functional as F


class Model(torch.nn.Module):
    """Next-basket model skeleton: an item embedding table followed by a single-layer GRU.

    ``forward`` is an exercise stub; the steps it should perform are listed in
    the comments inside it.
    """

    def __init__(self, n_items, emb_dim):
        """Create the layers.

        Args:
            n_items: Number of rows in the embedding table (index 0 is the padding row).
            emb_dim: Size of the item embeddings; also used as the GRU hidden size.
        """
        super().__init__()
        
        # Index 0 is the padding id (pad_3d_sequence fills with zeros).
        self.embeding = torch.nn.Embedding(
            num_embeddings=n_items, 
            embedding_dim=emb_dim,
            padding_idx=0,
        )
        # hidden_size equals emb_dim so the hidden state can be dotted with item embeddings.
        self.rnn = torch.nn.GRU(
            input_size=emb_dim, 
            hidden_size=emb_dim,
            num_layers=1,
            batch_first=True,
            bidirectional=False,
        )
    
    def forward(self, 
                batch, # ~ [batch_size, n_baskets, n_items_in_basket]
                lengths, # ~ [batch_size]
               ):
        """Score every item for every user in the batch (exercise stub).

        Args:
            batch: Zero-padded item ids, see the shape comment in the signature.
            lengths: Number of real (non-padding) baskets per user.

        Returns:
            Once implemented: softmax-normalised scores of shape [batch_size, n_items].

        Raises:
            NotImplementedError: Always, until the exercise is solved.
        """
        # First you should get the embedding for each item in each basket,
        # after that get the basket embedding as the average of all items in the basket,
        # run the RNN through all baskets,
        # get the correct last hidden state for each user (don't forget about padding),
        # compute logits as the dot product between the hidden state and all item embeddings,
        # normalize the logits with softmax and return them from the model.
        # output ~ [batch_size, n_items]
        # GOOD LUCK!
        
        # YOUR CODE HERE
        raise NotImplementedError
        

### Train Loop

In [None]:
def train_one_epoch(model, optimizer, dataloader, loss_func):
    """Run one optimisation pass over the dataloader.

    Args:
        model: Module called as ``model(baskets, lengths)``.
        optimizer: Optimizer stepped once per batch.
        dataloader: Iterable of (baskets, lengths, target) batches.
        loss_func: Callable applied to the float-cast scores and targets.

    Returns:
        Sum of the detached batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0
    for baskets, n_baskets, labels in tqdm(dataloader):
        # Clear gradients left over from the previous step.
        model.zero_grad()
        predictions = model(baskets, n_baskets).float()
        step_loss = loss_func(predictions, labels.float())
        step_loss.backward()
        optimizer.step()

        # Accumulate the value only, without the autograd graph.
        running_loss += step_loss.detach()

    return running_loss / len(dataloader)

In [None]:
# +1 because item ids start at 1 and index 0 is the padding row of the embedding table.
n_items = df['item_id'].max() + 1
model = Model(n_items=n_items, emb_dim=64)

train_dataloader = DataLoader(
    dataset=DatasetInstances(df_grouped, n_items),
    batch_size=4,
    # Keep the row order of df_grouped: this loader is reused below to produce per-user predictions.
    shuffle=False,
    drop_last=False,
    collate_fn=collate_func_train_dataloader,
)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
# BCELoss expects probabilities, which is why the model is asked to return normalised scores.
loss_func = torch.nn.BCELoss()


n_epochs = 10
for epoch in range(n_epochs):
    print(f'Training. Epoch {epoch}')
    train_loss = train_one_epoch(model, optimizer, train_dataloader, loss_func)
    print(f'Train loss: {train_loss}')
    print(f'Epoch {epoch} is finished')

In [None]:
# Checks the mean loss of the last epoch against the threshold set by the exercise.
assert train_loss < 0.018, "i'm sorry for your loss"

In [None]:
def get_recommendations(model, dataloader, top_k=20):
    """Recommend the top_k highest-scored items for every user (exercise stub).

    Args:
        model: Trained model that scores items for a batch of users.
        dataloader: Loader yielding the users' batches; the check cell expects
            one result per row of df_grouped, so user order must be preserved.
        top_k: Number of items to return per user.

    Returns:
        Once implemented: a sequence with one entry of top_k item ids per user.

    Raises:
        NotImplementedError: Always, until the exercise is solved.
    """
    # return top_k items with highest score for each user from model
    
    # YOUR CODE HERE
    raise NotImplementedError

In [None]:
# The training loader is not shuffled, so predictions come out in df_grouped row order.
model_predict = get_recommendations(model, train_dataloader)

In [None]:
# One list of recommendations per user, each of the default length top_k=20.
assert len(model_predict) == len(df_grouped), 'wrong number of users'
assert len(model_predict[0]) == 20, 'wrong number of recommendations'

### Метрики качества

HR@k - доля пользователей, у которых было хотя бы одно верное предсказание среди первых k

$ HR = \dfrac{hits}{hits + misses} $

$precision = \dfrac{TP}{TP + FP}$

$recall = \dfrac{TP}{TP + FN}$

$NDCG@k = \dfrac{DCG@k}{IDCG@k}$, где

$DCG@k = \sum_{i=1}^{k} \dfrac{rel_i}{\log(i + 1)}$

$IDCG@k = DCG@k$ при идеальном ранжировании

In [None]:
def HR(predict, target, at_k=10):
    """Hit rate at k (exercise stub).

    Per the metric definitions in this notebook: the share of users with at
    least one correct item among their first ``at_k`` predictions.

    Args:
        predict: One sequence of recommended item ids per user, in rank order.
        target: One sequence of ground-truth item ids per user.
        at_k: Number of leading predictions to evaluate.

    Returns:
        Once implemented: a single number (the check cells compare it with ``np.allclose``).
    """
    # YOUR CODE HERE    
    raise NotImplementedError

def recall(predict, target, at_k=10):
    """Recall at k, TP / (TP + FN) (exercise stub).

    The expected values in the check cells correspond to computing recall for
    each user separately and averaging over users.

    Args:
        predict: One sequence of recommended item ids per user, in rank order.
        target: One sequence of ground-truth item ids per user.
        at_k: Number of leading predictions to evaluate.

    Returns:
        Once implemented: a single number (the check cells compare it with ``np.allclose``).
    """
    # YOUR CODE HERE
    raise NotImplementedError
    
def precision(predict, target, at_k=10):
    """Precision at k, TP / (TP + FP) (exercise stub).

    Args:
        predict: One sequence of recommended item ids per user, in rank order.
        target: One sequence of ground-truth item ids per user.
        at_k: Number of leading predictions to evaluate.

    Returns:
        Once implemented: a single number (the check cells compare it with ``np.allclose``).
    """
    # YOUR CODE HERE
    raise NotImplementedError
    
def NDCG(predict, target, at_k=10):
    """Normalised discounted cumulative gain at k, DCG@k / IDCG@k (exercise stub).

    DCG@k sums rel_i / log(i + 1) over the first ``at_k`` positions; IDCG@k is
    the DCG of an ideal ranking. The expected values in the check cells are
    consistent with binary relevance, an ideal ranking of
    min(at_k, len(target)) hits, and a mean over users.

    Args:
        predict: One sequence of recommended item ids per user, in rank order.
        target: One sequence of ground-truth item ids per user.
        at_k: Number of leading predictions to evaluate.

    Returns:
        Once implemented: a single number (the check cells compare it with ``np.allclose``).
    """
    # YOUR CODE HERE
    raise NotImplementedError

In [None]:
# Hand-checkable fixture: two users with five ranked predictions each and their true items.
tmp_pred = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]
tmp_target = [[3, 4, 8, 9], [6, 10]]

In [None]:
# Expected metric values on the two-user fixture.
assert np.allclose(HR(tmp_pred, tmp_target, 1), 0.5), 'wrong HR@1'
assert np.allclose(HR(tmp_pred, tmp_target, 2), 0.5), 'wrong HR@2'
assert np.allclose(HR(tmp_pred, tmp_target, 3), 1.), 'wrong HR@3'

assert np.allclose(recall(tmp_pred, tmp_target, 1), 0.25), 'wrong recall@1'
assert np.allclose(recall(tmp_pred, tmp_target, 3), 0.375), 'wrong recall@3'
assert np.allclose(recall(tmp_pred, tmp_target, 5), 0.75), 'wrong recall@5'

assert np.allclose(precision(tmp_pred, tmp_target, 1), 0.5), 'wrong precision@1'
assert np.allclose(precision(tmp_pred, tmp_target, 2), 0.25), 'wrong precision@2'
assert np.allclose(precision(tmp_pred, tmp_target, 3), 0.3333333), 'wrong precision@3'

assert np.allclose(NDCG(tmp_pred, tmp_target, 1), 0.5), 'wrong ndcg@1'
assert np.allclose(NDCG(tmp_pred, tmp_target, 3), 0.423893), 'wrong ndcg@3'
assert np.allclose(NDCG(tmp_pred, tmp_target, 5), 0.606831), 'wrong ndcg@5'

In [None]:
# Re-seed so the random fixture (and the expected values checked in the next cell) is reproducible.
seed_everything()

# Dedicated names: assigning to `n_items` here would overwrite the catalogue
# size that the training cell defined for the model and the dataset.
n_sample_users = 10000
n_sample_items = 100

# The order of the random calls is unchanged, so the generated fixture is the same as before.
sample_pred = [random.sample(range(1, n_sample_items + 1), 20) for _ in range(n_sample_users)]
sample_target = [
    random.sample(range(1, n_sample_items + 1), random.randint(3, 25))
    for _ in range(n_sample_users)
]

In [None]:
# Expected metric values on the seeded random fixture.
assert np.allclose(HR(sample_pred, sample_target, 1), 0.143000), 'wrong HR@1'
assert np.allclose(HR(sample_pred, sample_target, 3), 0.356400), 'wrong HR@3'
assert np.allclose(HR(sample_pred, sample_target, 10), 0.731400), 'wrong HR@10'

assert np.allclose(recall(sample_pred, sample_target, 1), 0.010282), 'wrong recall@1'
assert np.allclose(recall(sample_pred, sample_target, 3), 0.030184), 'wrong recall@3'
assert np.allclose(recall(sample_pred, sample_target, 10), 0.100828), 'wrong recall@10'

assert np.allclose(precision(sample_pred, sample_target, 1), 0.143000), 'wrong precision@1'
assert np.allclose(precision(sample_pred, sample_target, 3), 0.140633), 'wrong precision@3'
assert np.allclose(precision(sample_pred, sample_target, 10), 0.141110), 'wrong precision@10'

assert np.allclose(NDCG(sample_pred, sample_target, 1), 0.143000), 'wrong ndcg@1'
assert np.allclose(NDCG(sample_pred, sample_target, 3), 0.141008), 'wrong ndcg@3'
assert np.allclose(NDCG(sample_pred, sample_target, 10), 0.147899), 'wrong ndcg@10'

### Опять бейзлайны ಠ‿ಠ

In [None]:
# Restrict the transaction log to the users that are present in df_grouped.
# NOTE(review): this overwrites `df`, so re-running earlier cells afterwards sees the reduced frame.
good_users = set(df_grouped['user_id'])
df = df[df['user_id'].isin(good_users)]

In [None]:
def global_top_popular(df, top_k=20) -> np.ndarray:
    """Popularity baseline shared by all users (exercise stub).

    Args:
        df: Transaction frame already restricted to the users of df_grouped.
        top_k: Number of items to recommend per user.

    Returns:
        Once implemented: an array of item ids with shape [n_users, top_k];
        the check cell expects one row per row of df_grouped.
        Presumably every row holds the same globally most purchased items — confirm
        against the expected first row in the check cell.
    """
    # output shape ~ [n_users, top_k]
    # YOUR CODE HERE
    raise NotImplementedError

def user_top_popular(df, top_k=20) -> np.ndarray:
    """Per-user popularity baseline (exercise stub).

    Args:
        df: Transaction frame already restricted to the users of df_grouped.
        top_k: Number of items to recommend per user.

    Returns:
        Once implemented: an array of item ids with shape [n_users, top_k];
        the check cell expects one row per row of df_grouped.
        Presumably each row holds the user's own most purchased items; how to fill
        rows for users with fewer than top_k distinct items is not specified here — TODO confirm.
    """
    # output shape ~ [n_users, top_k]
    # YOUR CODE HERE
    raise NotImplementedError

In [None]:
# Baseline recommendations computed from the filtered transaction log.
global_popular = global_top_popular(df)
user_popular = user_top_popular(df)

In [None]:
# Both baselines must return one row per user of df_grouped; the global one is also checked item by item.
assert global_popular.shape[0] == len(df_grouped), 'wrong global popular shape'
assert user_popular.shape[0] == len(df_grouped), 'wrong user popular shape'
assert global_popular.shape[1] == 20, 'wrong global popular shape'
assert global_popular[0].tolist() == [1504, 1277, 1242, 1574, 1470, 1068,  168,  654,  895,  459,  572, 466, 1200, 1412,  213, 1573,  901,  285,   27,  996]

In [None]:
list_k = [1, 3, 5, 10]
target = df_grouped['target'].values

# Collect one record per (metric, k) pair and build the frame once:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
metric_rows = []
for func in [HR, precision, recall, NDCG]:
    for k in list_k:
        metrics = {
            'metric': f'{func.__name__}@{k}', 
            'model': func(model_predict, target, k), 
            'top_popular': func(global_popular, target, k), 
            'user_popular': func(user_popular, target, k),
        }
        metric_rows.append(metrics)

# Explicit columns keep the column order (and the header of an empty table) as before.
final_metrics = pd.DataFrame(metric_rows, columns=['metric', 'model', 'top_popular', 'user_popular'])

final_metrics

### Next step?

* Взять датасет [побольше](https://www.dunnhumby.com/source-files/#)

* Реализовать другую архитектуру([DREAM](https://cseweb.ucsd.edu/classes/fa17/cse291-b/reading/A%20Dynamic%20Recurrent%20Model%20for%20Next%20Basket%20Recommendation.pdf), [BERT4Rec](https://arxiv.org/pdf/1904.06690.pdf), [RepeatNet](https://arxiv.org/pdf/1812.02646.pdf), etc)

* Увеличить размерность векторов, добавить lr_scheduler, потюнить другие параметры

* Улучшить Dataloader, генерировать батчи из корзин "похожего размера"

* Попробовать более интересный лосс([BPR](https://arxiv.org/ftp/arxiv/papers/1205/1205.2618.pdf), [WARP](http://www.thespermwhale.com/jaseweston/papers/wsabie-ijcai.pdf))

### BPR

$$L = \sum_{i=1}^{N} \ln(1 + e ^ {-(s_+ - s_-)})$$

In [None]:
class BRPLoss(torch.nn.Module):
    """Pairwise ranking loss (exercise stub).

    Intended to implement the formula given in the markdown before this cell:
    L = sum_i ln(1 + exp(-(s_plus - s_minus))).
    """

    def __init__(self, *args, **kwags):
        """Initialise the module; extra arguments are accepted and currently unused."""
        super().__init__()
        #YOUR CODE HERE
        
    def forward(self, positives, negatives, *args, **kwags):
        """Compute the loss (not implemented yet).

        Args:
            positives: Presumably the scores s_plus of positive items — TODO confirm shape.
            negatives: Presumably the scores s_minus of negative items — TODO confirm shape.

        Raises:
            NotImplementedError: Always, until the exercise is solved.
        """
        #YOUR CODE HERE
        raise NotImplementedError