In [3]:
import torch
import random
import datetime
import pandas as pd
import numpy as np
import os

from torch.utils.data import Dataset
from src.datasets import RL4RS, ContentWise, DummyData, OpenCDP
from src.utils import evaluate_model, get_dummy_data, get_train_val_test_tmatrix_tnumitems
from src.embeddings import RecsysEmbedding

# --- Experiment configuration -------------------------------------------
experiment_name = 'MatrixFactorization'
pkl_path = '../pkl/'  # directory the pickled datasets are loaded from
# NOTE(review): the evaluation cells shown in this notebook pass 'cpu'
# explicitly and do not read `device` -- confirm whether it is still needed.
device = 'cuda:0'

# --- Reproducibility ------------------------------------------------------
# Seed every RNG used here (NumPy, Python's `random`, PyTorch) with one value.
seed = 7331
np.random.seed(seed)
random.seed(seed)
# Kept as the last expression so the cell still displays the returned Generator.
torch.manual_seed(seed)

<torch._C.Generator at 0x7fd4f051b4f0>

# Модель

In [5]:
class MF(torch.nn.Module):
    """Matrix-factorization scorer: dot product of item and user embeddings.

    The module only wraps ``embedding``; it defines no further layers itself.
    """

    def __init__(self, embedding):
        """Store the embedding module.

        Args:
            embedding: callable module mapping a batch to the pair
                ``(item_embs, user_embs)``.
        """
        super().__init__()
        self.embedding = embedding

    def forward(self, batch):
        """Score every item of ``batch`` against its user embedding.

        Args:
            batch: whatever ``self.embedding`` accepts; it is passed through
                unchanged.

        Returns:
            Tensor of dot products over the last (embedding) axis. With
            ``user_embs`` of shape ``(A, B, D)`` and ``item_embs`` of shape
            ``(A, B, K, D)`` the result has shape ``(A, B, K)``.
            (Presumably A = batch, B = session position, K = items shown --
            TODO confirm against ``RecsysEmbedding``.)
        """
        item_embs, user_embs = self.embedding(batch)
        # Insert a singleton item axis and let broadcasting expand it against
        # `item_embs`. This yields the same values as an explicit
        # `.repeat(1, 1, K, 1)` without materializing K copies of the user
        # embeddings.
        scores = (item_embs * user_embs[:, :, None, :]).sum(-1)
        return scores

# ContentWise

In [7]:
# Load the pickled ContentWise dataset and build the loaders plus the
# training user-item matrix that the SVD embeddings are constructed from.
dataset = ContentWise.load(os.path.join(pkl_path, 'cw.pkl'))
train_loader, val_loader, test_loader, train_user_item_matrix, train_num_items = (
    get_train_val_test_tmatrix_tnumitems(dataset, batch_size=150)
)
print(f"{len(dataset)} data points among {len(train_loader)} batches")

# No training step happens in this cell: the model is built on top of
# 'svd' embeddings and sent straight to evaluation on the test loader.
model = MF(
    RecsysEmbedding(dataset.n_items, train_user_item_matrix, embeddings='svd')
).to('cpu')
test_scores = evaluate_model(
    model,
    test_loader,
    device='cpu',
    silent=True,
    debug=False,
)
test_scores['embeddings'] = 'svd'  # tag the metrics with the embedding type
test_scores

20216 data points among 108 batches


{'f1': 0.2006726711988449,
 'roc-auc': 0.6502835750579834,
 'accuracy': 0.26173314452171326,
 'embeddings': 'svd'}

In [8]:
# Persist the ContentWise metrics as a one-row CSV (one column per entry of
# `test_scores`). Create the output directory first: `to_csv` does not create
# missing parent directories and would fail on a fresh checkout.
os.makedirs('results', exist_ok=True)
pd.DataFrame([test_scores]).to_csv(f'results/cw_{experiment_name}.csv')
# Drop the dataset-specific objects so the next dataset starts from a clean slate.
del dataset, train_loader, val_loader, test_loader, train_user_item_matrix, train_num_items

# RL4RS

In [14]:
# Load the pickled RL4RS dataset and build the loaders plus the training
# user-item matrix that the SVD embeddings are constructed from.
dataset = RL4RS.load(os.path.join(pkl_path, 'rl4rs.pkl'))
train_loader, val_loader, test_loader, train_user_item_matrix, train_num_items = (
    get_train_val_test_tmatrix_tnumitems(dataset, batch_size=350)
)
print(f"{len(dataset)} data points among {len(train_loader)} batches")

# No training step happens in this cell: the model is built on top of
# 'svd' embeddings and sent straight to evaluation on the test loader.
model = MF(
    RecsysEmbedding(dataset.n_items, train_user_item_matrix, embeddings='svd')
).to('cpu')
test_scores = evaluate_model(
    model,
    test_loader,
    device='cpu',
    silent=True,
    debug=False,
)
test_scores['embeddings'] = 'svd'  # tag the metrics with the embedding type
test_scores

45942 data points among 106 batches


{'f1': 0.7611088752746582,
 'roc-auc': 0.7190536260604858,
 'accuracy': 0.6578406691551208,
 'embeddings': 'svd'}

In [15]:
# Persist the RL4RS metrics as a one-row CSV (one column per entry of
# `test_scores`). Create the output directory first: `to_csv` does not create
# missing parent directories and would fail on a fresh checkout.
os.makedirs('results', exist_ok=True)
pd.DataFrame([test_scores]).to_csv(f'results/rl4rs_{experiment_name}.csv')
# Drop the dataset-specific objects so the next dataset starts from a clean slate.
del dataset, train_loader, val_loader, test_loader, train_user_item_matrix, train_num_items

# OpenCDP

In [6]:
# cosmetics
# Evaluate the SVD-embedding MF baseline on every file in `pkl_path` whose
# name starts with 'cosmetics', writing one metrics CSV per file.
os.makedirs('results', exist_ok=True)  # `to_csv` does not create missing directories
# `os.listdir` returns entries in arbitrary order; sort them so the processing
# order (and the printed log) is the same on every run and machine.
for filename in sorted(os.listdir(pkl_path)):
    if not filename.startswith('cosmetics'):
        continue
    print(f"\n == {filename} ==")
    dataset = OpenCDP.load(os.path.join(pkl_path, filename))
    (
        train_loader,
        val_loader,
        test_loader,
        train_user_item_matrix,
        train_num_items
    ) = get_train_val_test_tmatrix_tnumitems(dataset, batch_size=200)

    print(f"{len(dataset)} data points among {len(train_loader)} batches")

    model = MF(
        RecsysEmbedding(dataset.n_items, train_user_item_matrix, embeddings='svd'),
    ).to('cpu')

    test_scores = evaluate_model(model, test_loader, device='cpu', silent=True, debug=False)
    test_scores['embeddings'] = 'svd'
    print(test_scores)
    # NOTE(review): `filename` still carries its extension, so the output is
    # named e.g. 'results/cosmetics_10_8.pkl_MatrixFactorization.csv'. Kept
    # unchanged in case other code reads these paths -- confirm before renaming.
    pd.DataFrame([test_scores]).to_csv(f'results/{filename}_{experiment_name}.csv')


 == cosmetics_10_8.pkl ==
115256 data points among 460 batches
{'f1': 0.6125137805938721, 'roc-auc': 0.526435911655426, 'accuracy': 0.5191967487335205, 'embeddings': 'svd'}

 == cosmetics_10_24.pkl ==
115190 data points among 461 batches
{'f1': 0.6235707998275757, 'roc-auc': 0.5364035367965698, 'accuracy': 0.5278186798095703, 'embeddings': 'svd'}

 == cosmetics_5_8.pkl ==
121483 data points among 487 batches
{'f1': 0.6717410087585449, 'roc-auc': 0.5695322155952454, 'accuracy': 0.5446938872337341, 'embeddings': 'svd'}

 == cosmetics_20_8.pkl ==
89073 data points among 357 batches
{'f1': 0.41108468174934387, 'roc-auc': 0.5153679847717285, 'accuracy': 0.4976668357849121, 'embeddings': 'svd'}

 == cosmetics_10_4.pkl ==
115287 data points among 462 batches
{'f1': 0.612734854221344, 'roc-auc': 0.5334126949310303, 'accuracy': 0.5211506485939026, 'embeddings': 'svd'}

 == cosmetics_5_24.pkl ==
121463 data points among 484 batches
{'f1': 0.6769800186157227, 'roc-auc': 0.5859543681144714, 'accu