In [1]:
import torch
import random
import datetime
import pandas as pd
import numpy as np
import os

from torch.utils.data import Dataset
from src.datasets import RL4RS, ContentWise, DummyData, OpenCDP
from src.utils import train, get_dummy_data, get_train_val_test_tmatrix_tnumitems
from src.embeddings import RecsysEmbedding, IndexItemEmbeddings, CategoricalItemEmbeddings, SVDItemEmbeddings, MixedEmbeddings

experiment_name = 'LogRegCE'
device = 'cuda:0'
seed = 7331
pkl_path = '../pkl/'


random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7f53b453ab30>

# Модель

In [2]:
class LogisticRegression(torch.nn.Module):
    def __init__(self, embedding, output_dim=1):
        super().__init__()
        self.embedding = embedding
        self.linear = torch.nn.Linear(2 * embedding.embedding_dim, output_dim)
    
    def forward(self, batch):
        item_embs, user_embs = self.embedding(batch)

        features = torch.cat(
            [
                item_embs,
                user_embs[:, :, None, :].repeat(1, 1, item_embs.size(-2), 1)
            ],
            dim = -1
        )
        return self.linear(features).squeeze(-1)

# Проверка категориальных фичей

### Dummy data

In [3]:
!ls ../pkl

cosmetics_10_1.pkl   cosmetics_20_8.pkl  multi_10_1.pkl   multi_20_8.pkl
cosmetics_10_2.pkl   cosmetics_5_1.pkl	 multi_10_2.pkl   multi_5_1.pkl
cosmetics_10_24.pkl  cosmetics_5_2.pkl	 multi_10_24.pkl  multi_5_2.pkl
cosmetics_10_4.pkl   cosmetics_5_24.pkl  multi_10_4.pkl   multi_5_24.pkl
cosmetics_10_8.pkl   cosmetics_5_4.pkl	 multi_10_8.pkl   multi_5_4.pkl
cosmetics_20_1.pkl   cosmetics_5_8.pkl	 multi_20_1.pkl   multi_5_8.pkl
cosmetics_20_2.pkl   cosmetics_8_24.pkl  multi_20_2.pkl   rl4rs.pkl
cosmetics_20_24.pkl  cw.pkl		 multi_20_24.pkl
cosmetics_20_4.pkl   ilya_pkl		 multi_20_4.pkl


In [4]:
# dataset = #OpenCDP.load(os.path.join(pkl_path, 'cosmetics_10_24.pkl'))
dataset=DummyData()
train_loader, train_user_item_matrix = get_dummy_data(dataset)
# (
#     train_loader, 
#     val_loader,
#     test_loader, 
#     train_user_item_matrix, 
#     train_num_items
# ) = get_train_val_test_tmatrix_tnumitems(dataset, batch_size=800)
print(f"{len(dataset)} data points among {len(train_loader)} batches")
index_embeddings = IndexItemEmbeddings(dataset.n_items, embedding_dim = 32)
category_embeddings = CategoricalItemEmbeddings(dataset.item_categorical)
svd_embeddings = SVDItemEmbeddings(train_user_item_matrix, embedding_dim=2)

me = MixedEmbeddings(
    index_embeddings,
    svd_embeddings,
    category_embeddings
)

model = LogisticRegression(me, output_dim=1)
train(
    model, 
    train_loader, train_loader, train_loader, 
    device=device, lr=1e-3, num_epochs=5000, early_stopping=7,
    silent=True, 
    )

biulding affinity matrix...


3it [00:00, 4310.69it/s]

2 data points among 1 batches





Test before learning: {'f1': 0.0, 'roc-auc': 0.3333333134651184, 'accuracy': 0.5}


train... loss:0.7032778263092041:   0%|                                                                                                    | 1/5000 [00:00<25:30,  3.27it/s]

Val update: epoch: 0 |accuracy: 0.5 | f1: 0.5 | auc: 0.6666666269302368 | treshold: 0.38
Test: accuracy: 0.5 | f1: 0.5 | auc: 0.6666666269302368 | 


train... loss:0.6855418086051941:   0%|                                                                                                    | 3/5000 [00:00<23:00,  3.62it/s]

Val update: epoch: 2 |accuracy: 0.75 | f1: 0.6666666865348816 | auc: 0.6666666269302368 | treshold: 0.42000000000000004
Test: accuracy: 0.75 | f1: 0.6666666865348816 | auc: 0.6666666269302368 | 


train... loss:0.4575468599796295:   1%|▋                                                                                                  | 34/5000 [00:07<20:03,  4.13it/s]

Val update: epoch: 33 |accuracy: 0.75 | f1: 0.6666666865348816 | auc: 1.0 | treshold: 0.29000000000000004
Test: accuracy: 0.75 | f1: 0.6666666865348816 | auc: 1.0 | 


train... loss:0.451371967792511:   1%|▋                                                                                                   | 35/5000 [00:08<19:02,  4.35it/s]

Val update: epoch: 35 |accuracy: 1.0 | f1: 1.0 | auc: 1.0 | treshold: 0.51
Test: accuracy: 1.0 | f1: 1.0 | auc: 1.0 | 





(LogisticRegression(
   (embedding): MixedEmbeddings(
     (embeddings): ModuleList(
       (0): IndexItemEmbeddings(
         (embeddings): Embedding(6, 32)
       )
       (1): SVDItemEmbeddings()
       (2): CategoricalItemEmbeddings(
         (embeddings): ModuleList(
           (0): Embedding(5, 8)
           (1): Embedding(5, 8)
         )
       )
     )
   )
   (linear): Linear(in_features=100, out_features=1, bias=True)
 ),
 {'f1': 1.0, 'roc-auc': 1.0, 'accuracy': 1.0})

# OpenCDP

In [5]:
dataset = OpenCDP.load(os.path.join(pkl_path, 'cosmetics_8_24.pkl'))
(
    train_loader, 
    val_loader,
    test_loader, 
    train_user_item_matrix, 
    train_num_items
) = get_train_val_test_tmatrix_tnumitems(dataset, batch_size=800)

In [6]:
dataset[0]['slates_item_categorical'].shape

AttributeError: 'NoneType' object has no attribute 'shape'

In [None]:
print(f"{len(dataset)} data points among {len(train_loader)} batches")

index_embeddings = IndexItemEmbeddings(dataset.n_items, embedding_dim = 32)
category_embeddings = CategoricalItemEmbeddings(dataset.item_categorical)
svd_embeddings = SVDItemEmbeddings(train_user_item_matrix, embedding_dim=32)

me = MixedEmbeddings(
    index_embeddings,
    svd_embeddings,
    category_embeddings
)

model = LogisticRegression(me, output_dim=1)
train(
    model, 
    train_loader, val_loader, test_loader, 
    device=device, lr=1e-3, num_epochs=5000, early_stopping=7,
    silent=True, 
)