In [1]:
import torch
import random
import datetime
import pandas as pd
import numpy as np
import os

from torch.utils.data import Dataset
from src.datasets import RL4RS, ContentWise, DummyData, OpenCDP
from src.utils import evaluate_model, get_dummy_data, get_train_val_test_tmatrix_tnumitems
from src.embeddings import RecsysEmbedding

# --- Experiment configuration ---
experiment_name = 'MatrixFactorization'
# Prefer the first GPU, but fall back to the CPU so that anything consuming
# `device` still works on machines without CUDA.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
seed = 7331
pkl_path = '../pkl/'  # directory the pickled datasets are loaded from

# Seed every RNG used by this notebook (stdlib, NumPy, PyTorch) so that
# repeated runs are reproducible.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7fe86de02b30>

# Модель

In [2]:
class MF(torch.nn.Module):
    """Matrix-factorization scorer: dot product of user and item embeddings.

    Args:
        embedding: module called as ``embedding(batch)`` that returns the
            pair ``(item_embs, user_embs)``.
    """

    def __init__(self, embedding):
        super().__init__()
        self.embedding = embedding

    def forward(self, batch):
        """Score every item of ``batch`` against its user embedding.

        ``user_embs`` is indexed as a 3-D tensor and gets a new axis inserted
        before its last one, so it lines up with the second-to-last axis of
        ``item_embs``.
        Presumably the layouts are ``(batch, session, dim)`` for users and
        ``(batch, session, item, dim)`` for items -- TODO confirm against
        ``RecsysEmbedding``.

        Args:
            batch: passed unchanged to ``self.embedding``.

        Returns:
            Tensor of element-wise products ``item_embs * user_embs`` summed
            over the last (embedding) axis.
        """
        item_embs, user_embs = self.embedding(batch)
        # Broadcasting over the inserted axis yields the same products as
        # explicitly repeating the user vectors once per item, but without
        # allocating those copies.
        return (item_embs * user_embs[:, :, None, :]).sum(-1)

# ContentWise

In [3]:
# Load the pickled ContentWise dataset and split it into loaders plus the
# train-side user-item matrix / item count.
dataset = ContentWise.load(os.path.join(pkl_path, 'cw.pkl'))
(train_loader, val_loader, test_loader,
 train_user_item_matrix, train_num_items) = get_train_val_test_tmatrix_tnumitems(
    dataset, batch_size=150
)

print(f"{len(dataset)} data points among {len(train_loader)} batches")

# Embeddings are built with the 'svd' option from the train user-item
# matrix; no training call is made in this cell, the model is evaluated
# directly on the test loader.
svd_embedding = RecsysEmbedding(dataset.n_items, train_user_item_matrix, embeddings='svd')
model = MF(svd_embedding).to('cpu')

test_scores = evaluate_model(model, test_loader, device='cpu', silent=True, debug=False)
test_scores.update(embeddings='svd')  # tag the metrics with the embedding kind
test_scores

20216 data points among 108 batches


{'f1': 0.19776448607444763,
 'roc-auc': 0.6526899337768555,
 'accuracy': 0.2600274384021759,
 'embeddings': 'svd'}

In [4]:
# Persist the metrics as a one-row CSV. The output directory is created
# first because `to_csv` does not create missing parent directories.
os.makedirs('results', exist_ok=True)
pd.DataFrame([test_scores]).to_csv('results/cw_MatrixFactorization.csv')
# Drop the references to this dataset's objects before the next dataset is loaded.
del dataset, train_loader, val_loader, test_loader, train_user_item_matrix, train_num_items

# RL4RS

In [5]:
# Load the pickled RL4RS dataset and split it into loaders plus the
# train-side user-item matrix / item count.
dataset = RL4RS.load(os.path.join(pkl_path, 'rl4rs.pkl'))
(train_loader, val_loader, test_loader,
 train_user_item_matrix, train_num_items) = get_train_val_test_tmatrix_tnumitems(
    dataset, batch_size=350
)

print(f"{len(dataset)} data points among {len(train_loader)} batches")

# Embeddings are built with the 'svd' option from the train user-item
# matrix; no training call is made in this cell, the model is evaluated
# directly on the test loader.
svd_embedding = RecsysEmbedding(dataset.n_items, train_user_item_matrix, embeddings='svd')
model = MF(svd_embedding).to('cpu')

test_scores = evaluate_model(model, test_loader, device='cpu', silent=True, debug=False)
test_scores.update(embeddings='svd')  # tag the metrics with the embedding kind
test_scores

45942 data points among 106 batches


{'f1': 0.761080265045166,
 'roc-auc': 0.7169578075408936,
 'accuracy': 0.659001350402832,
 'embeddings': 'svd'}

In [6]:
# Persist the metrics as a one-row CSV. The output directory is created
# first because `to_csv` does not create missing parent directories.
os.makedirs('results', exist_ok=True)
pd.DataFrame([test_scores]).to_csv('results/rl4rs_MatrixFactorization.csv')
# Drop the references to this dataset's objects before the next dataset is loaded.
del dataset, train_loader, val_loader, test_loader, train_user_item_matrix, train_num_items

# OpenCDP

In [7]:
# cosmetics
# Evaluate the SVD-embedding MF model on every pickled dataset whose file
# name starts with 'cosmetics', writing one metrics CSV per file.
#
# NOTE(review): the recorded run of this cell stopped with
# "AttributeError: 'OpenCDP' object has no attribute 'item_categorical'".
# The failing call is not visible here -- possibly the pickles were written
# by a different version of the OpenCDP class; confirm and regenerate them.
os.makedirs('results', exist_ok=True)  # `to_csv` does not create missing directories

# `os.listdir` returns entries in arbitrary order; sort so that files are
# processed in the same order on every run.
for filename in sorted(os.listdir(pkl_path)):
    if not filename.startswith('cosmetics'):
        continue
    print(f"\n == {filename} ==")
    dataset = OpenCDP.load(os.path.join(pkl_path, filename))
    (
        train_loader,
        val_loader,
        test_loader,
        train_user_item_matrix,
        train_num_items
    ) = get_train_val_test_tmatrix_tnumitems(dataset, batch_size=200)

    print(f"{len(dataset)} data points among {len(train_loader)} batches")

    model = MF(
        RecsysEmbedding(dataset.n_items, train_user_item_matrix, embeddings='svd'),
    ).to('cpu')

    test_scores = evaluate_model(model, test_loader, device='cpu', silent=True, debug=False)
    test_scores['embeddings'] = 'svd'  # tag the metrics with the embedding kind
    print(test_scores)
    # NOTE(review): `filename` keeps its extension, so the output is named
    # e.g. 'cosmetics_10_8.pkl_MatrixFactorization.csv'. Left unchanged in
    # case downstream code reads these exact names.
    pd.DataFrame([test_scores]).to_csv(f'results/{filename}_MatrixFactorization.csv')


 == cosmetics_10_8.pkl ==


AttributeError: 'OpenCDP' object has no attribute 'item_categorical'