In [1]:
import torch
import random
import datetime
import pandas as pd
import numpy as np
import os

from torch.utils.data import Dataset
from src.datasets import RL4RS, ContentWise, DummyData, OpenCDP
from src.utils import train, get_dummy_data, get_train_val_test_tmatrix_tnumitems
from src.embeddings import RecsysEmbedding

# Experiment-wide configuration shared by every cell below.
experiment_name = 'LogReg'  # appears in log messages and in result file names
# Use the first GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs end-to-end on a machine without CUDA.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
seed = 7331
pkl_path = '../pkl/'  # directory the pickled datasets are loaded from


# Seed the Python, NumPy and PyTorch random number generators.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7f2a2ee1eb50>

# Модель

In [2]:
class LogisticRegression(torch.nn.Module):
    """Single linear layer applied to concatenated item and user embeddings.

    Args:
        embedding: callable that maps a batch to ``(item_embs, user_embs)`` and
            exposes an ``embedding_dim`` attribute.
        output_dim: number of outputs the linear layer produces per item.
    """

    def __init__(self, embedding, output_dim=1):
        super().__init__()
        self.embedding = embedding
        # The linear layer sees one item embedding and one user embedding side by side.
        self.linear = torch.nn.Linear(2 * embedding.embedding_dim, output_dim)

    def forward(self, batch):
        """Return the raw linear outputs for every item in ``batch``.

        ``self.embedding(batch)`` must yield a 4-D ``item_embs`` and a 3-D
        ``user_embs`` whose leading two axes and last axis match; the user
        embedding is broadcast along the second-to-last axis of ``item_embs``.
        No sigmoid is applied here. The trailing axis is dropped only when
        ``output_dim == 1``.
        """
        item_embs, user_embs = self.embedding(batch)
        n_items = item_embs.size(-2)

        # expand() yields a broadcast view, so no tiled copy of the user
        # embeddings is allocated before torch.cat builds the feature tensor.
        user_tiled = user_embs[:, :, None, :].expand(-1, -1, n_items, -1)

        features = torch.cat([item_embs, user_tiled], dim=-1)
        return self.linear(features).squeeze(-1)

# Игрушечный датасет: проверим, что сходится к идеальным метрикам

In [3]:
# Sanity check on the toy dataset: the model is expected to reach perfect metrics.
d = DummyData()
dummy_loader, dummy_matrix = get_dummy_data(d)

# Two-dimensional SVD embeddings, kept on the CPU.
dummy_embedding = RecsysEmbedding(d.n_items, dummy_matrix, embeddings='svd', embedding_dim=2)
model = LogisticRegression(dummy_embedding.to('cpu'), output_dim=1).to('cpu')

# The same loader is passed as the train, validation and test split.
train(
    model,
    dummy_loader,
    dummy_loader,
    dummy_loader,
    device='cpu',
    lr=1e-2,
    num_epochs=5000,
    silent=True,
)

biulding affinity matrix...


3it [00:00, 3084.05it/s]

Test before learning: {'f1': 0.0, 'roc-auc': 0.0, 'accuracy': 0.75}





train:   0%|          | 0/5000 [00:00<?, ?it/s]

Val update: epoch: 0 |accuracy: 0.25 | f1: 0.4000000059604645 | auc: 0.0 | treshold: 0.01
Test: accuracy: 0.25 | f1: 0.4000000059604645 | auc: 0.0 | 
Val update: epoch: 10 |accuracy: 0.25 | f1: 0.4000000059604645 | auc: 0.6666666269302368 | treshold: 0.01
Test: accuracy: 0.25 | f1: 0.4000000059604645 | auc: 0.6666666269302368 | 
Val update: epoch: 12 |accuracy: 0.75 | f1: 0.6666666865348816 | auc: 0.6666666269302368 | treshold: 0.44
Test: accuracy: 0.75 | f1: 0.6666666865348816 | auc: 0.6666666269302368 | 
Val update: epoch: 21 |accuracy: 0.75 | f1: 0.6666666865348816 | auc: 1.0 | treshold: 0.4
Test: accuracy: 0.75 | f1: 0.6666666865348816 | auc: 1.0 | 
Val update: epoch: 25 |accuracy: 1.0 | f1: 1.0 | auc: 1.0 | treshold: 0.47000000000000003
Test: accuracy: 1.0 | f1: 1.0 | auc: 1.0 | 


(LogisticRegression(
   (embedding): RecsysEmbedding()
   (linear): Linear(in_features=4, out_features=1, bias=True)
 ),
 {'f1': 1.0, 'roc-auc': 1.0, 'accuracy': 1.0})

# ContentWise

In [4]:
# Load the ContentWise dataset and build its loaders.
cw_pickle = os.path.join(pkl_path, 'cw.pkl')
dataset = ContentWise.load(cw_pickle)
content_wise_results = []  # one metrics dict per embedding type is appended later

# Unpacked directly so no extra name keeps the loaders alive after the later `del`.
train_loader, val_loader, test_loader, train_user_item_matrix, train_num_items = \
    get_train_val_test_tmatrix_tnumitems(dataset, batch_size=150)

print(f"{len(dataset)} data points among {len(train_loader)} batches")

20216 data points among 108 batches


In [5]:
# Train one model per embedding type on ContentWise and collect the metrics.
# Start from an empty list so that re-running this cell does not append
# duplicate rows to the results that are later written to CSV.
content_wise_results = []

for embeddings in ['svd', 'neural']:
    print(f"\nEvaluating {experiment_name} with {embeddings} embeddings")

    model = LogisticRegression(
        RecsysEmbedding(train_num_items, train_user_item_matrix, embeddings=embeddings),
        output_dim=1
    ).to(device)

    # Only the metrics dict is kept; the first element of the returned pair is discarded.
    _, metrics = train(
        model,
        train_loader, val_loader, test_loader,
        device=device, lr=1e-3, num_epochs=5000, early_stopping=7,
        silent=True,
    )

    # Tag the row with the embedding type it was obtained with.
    metrics['embeddings'] = embeddings
    content_wise_results.append(metrics)


Evaluating LogReg with svd embeddings
Test before learning: {'f1': 0.15713582932949066, 'roc-auc': 0.5370557904243469, 'accuracy': 0.8466743230819702}


train:   0%|          | 0/5000 [00:00<?, ?it/s]

Val update: epoch: 14 |accuracy: 0.3955571949481964 | f1: 0.1861746609210968 | auc: 0.5461916923522949 | treshold: 0.11
Test: accuracy: 0.39641425013542175 | f1: 0.18501228094100952 | auc: 0.5385094881057739 | 
Val update: epoch: 15 |accuracy: 0.46515989303588867 | f1: 0.1906258761882782 | auc: 0.5612063407897949 | treshold: 0.11
Test: accuracy: 0.46350914239883423 | f1: 0.18828357756137848 | auc: 0.5512282252311707 | 
Val update: epoch: 16 |accuracy: 0.5251049995422363 | f1: 0.19633254408836365 | auc: 0.5784597396850586 | treshold: 0.11
Test: accuracy: 0.5319602489471436 | f1: 0.19356495141983032 | auc: 0.5666322708129883 | 
Val update: epoch: 17 |accuracy: 0.8673363327980042 | f1: 0.2142857164144516 | auc: 0.5931597948074341 | treshold: 0.13
Test: accuracy: 0.8617118000984192 | f1: 0.207261860370636 | auc: 0.5813994407653809 | 
Val update: epoch: 18 |accuracy: 0.8557875752449036 | f1: 0.23852010071277618 | auc: 0.6028863191604614 | treshold: 0.12
Test: accuracy: 0.8481795787811279 | 

train:   0%|          | 0/5000 [00:00<?, ?it/s]

Val update: epoch: 0 |accuracy: 0.10476057976484299 | f1: 0.18142904341220856 | auc: 0.4712187349796295 | treshold: 0.04
Test: accuracy: 0.10611186176538467 | f1: 0.18468020856380463 | auc: 0.47093233466148376 | 
Val update: epoch: 1 |accuracy: 0.2082035094499588 | f1: 0.18665605783462524 | auc: 0.5641592741012573 | treshold: 0.04
Test: accuracy: 0.20617297291755676 | f1: 0.18809542059898376 | auc: 0.5634668469429016 | 
Val update: epoch: 2 |accuracy: 0.7537552714347839 | f1: 0.24882961809635162 | auc: 0.6608912348747253 | treshold: 0.12
Test: accuracy: 0.7466430068016052 | f1: 0.2493155598640442 | auc: 0.6521070599555969 | 
Val update: epoch: 3 |accuracy: 0.7342851161956787 | f1: 0.2638608515262604 | auc: 0.6731790900230408 | treshold: 0.13
Test: accuracy: 0.7243773937225342 | f1: 0.26177549362182617 | auc: 0.6614862084388733 | 
Val update: epoch: 4 |accuracy: 0.604489266872406 | f1: 0.25104209780693054 | auc: 0.675378680229187 | treshold: 0.12
Test: accuracy: 0.593898594379425 | f1: 

In [6]:
# Write the ContentWise metrics to CSV. The output directory is created first
# because to_csv does not create missing parent directories.
os.makedirs('results', exist_ok=True)
pd.DataFrame(content_wise_results).to_csv(f'results/cw_{experiment_name}.csv')
# Drop the references to the ContentWise dataset, loaders and matrix.
del dataset, train_loader, val_loader, test_loader, train_user_item_matrix, train_num_items

# RL4RS

In [7]:
# Load the RL4RS dataset and build its loaders.
rl4rs_pickle = os.path.join(pkl_path, 'rl4rs.pkl')
dataset = RL4RS.load(rl4rs_pickle)
rl4rs_results = []  # one metrics dict per embedding type is appended later

# Unpacked directly so no extra name keeps the loaders alive after the later `del`.
train_loader, val_loader, test_loader, train_user_item_matrix, train_num_items = \
    get_train_val_test_tmatrix_tnumitems(dataset, batch_size=350)

print(f"{len(dataset)} data points among {len(train_loader)} batches")

45942 data points among 106 batches


In [8]:
# Train one model per embedding type on RL4RS, then save the metrics.
# Start from an empty list so that re-running this cell does not append
# duplicate rows to the CSV written below.
rl4rs_results = []

for embeddings in ['explicit', 'neural', 'svd']:
    print(f"\nEvaluating {experiment_name} with {embeddings} embeddings")

    model = LogisticRegression(
        RecsysEmbedding(
            train_num_items,
            train_user_item_matrix,
            embeddings=embeddings,
            embedding_dim=40
        ),
        output_dim=1
    ).to(device)

    best_model, metrics = train(
        model,
        train_loader, val_loader, test_loader,
        device=device, lr=1e-3, num_epochs=5000, early_stopping=7,
        silent=True
    )

    # Tag the row with the embedding type it was obtained with.
    metrics['embeddings'] = embeddings
    rl4rs_results.append(metrics)

# to_csv does not create missing parent directories, so make sure it exists.
os.makedirs('results', exist_ok=True)
pd.DataFrame(rl4rs_results).to_csv(f'results/rl4rs_{experiment_name}.csv')
# Drop the references to the RL4RS dataset, loaders and matrix.
del dataset, train_loader, val_loader, test_loader, train_user_item_matrix, train_num_items


Evaluating LogReg with explicit embeddings
Test before learning: {'f1': 0.588447630405426, 'roc-auc': 0.5688874125480652, 'accuracy': 0.5313746929168701}


train:   0%|          | 0/5000 [00:00<?, ?it/s]

Val update: epoch: 0 |accuracy: 0.7282687425613403 | f1: 0.8226267099380493 | auc: 0.8072115182876587 | treshold: 0.43
Test: accuracy: 0.7341071367263794 | f1: 0.8261061906814575 | auc: 0.8071073889732361 | 
Val update: epoch: 1 |accuracy: 0.7938373684883118 | f1: 0.8516533374786377 | auc: 0.8437247276306152 | treshold: 0.51
Test: accuracy: 0.7947285771369934 | f1: 0.8524652719497681 | auc: 0.8442938923835754 | 
Val update: epoch: 2 |accuracy: 0.7999323010444641 | f1: 0.8559386730194092 | auc: 0.8604885339736938 | treshold: 0.5
Test: accuracy: 0.8031193614006042 | f1: 0.8586261868476868 | auc: 0.8609044551849365 | 
Val update: epoch: 3 |accuracy: 0.8070430159568787 | f1: 0.8585962653160095 | auc: 0.8689801692962646 | treshold: 0.51
Test: accuracy: 0.808003842830658 | f1: 0.8596478700637817 | auc: 0.868678867816925 | 
Val update: epoch: 4 |accuracy: 0.8091713786125183 | f1: 0.8589509725570679 | auc: 0.8732187151908875 | treshold: 0.5
Test: accuracy: 0.8104944825172424 | f1: 0.8601559400



Val update: epoch: 30 |accuracy: 0.8269240260124207 | f1: 0.8689521551132202 | auc: 0.8980323076248169 | treshold: 0.54
Test: accuracy: 0.828896164894104 | f1: 0.8707107901573181 | auc: 0.8978813886642456 | 
Val update: epoch: 32 |accuracy: 0.824481189250946 | f1: 0.8721210956573486 | auc: 0.8992319107055664 | treshold: 0.51
Test: accuracy: 0.824350118637085 | f1: 0.8721621632575989 | auc: 0.898556113243103 | 
Val update: epoch: 33 |accuracy: 0.820708155632019 | f1: 0.8707072734832764 | auc: 0.8998627662658691 | treshold: 0.5
Test: accuracy: 0.8216902613639832 | f1: 0.871671736240387 | auc: 0.899205207824707 | 
Val update: epoch: 34 |accuracy: 0.8157016634941101 | f1: 0.8685300350189209 | auc: 0.9012598991394043 | treshold: 0.5
Test: accuracy: 0.8168298602104187 | f1: 0.8695787191390991 | auc: 0.9004025459289551 | 
Val update: epoch: 43 |accuracy: 0.8292942643165588 | f1: 0.8734581470489502 | auc: 0.9015151262283325 | treshold: 0.55
Test: accuracy: 0.8308306336402893 | f1: 0.8748076558

train:   0%|          | 0/5000 [00:00<?, ?it/s]

Val update: epoch: 0 |accuracy: 0.6970444321632385 | f1: 0.8070905208587646 | auc: 0.793698787689209 | treshold: 0.35000000000000003
Test: accuracy: 0.703711748123169 | f1: 0.8111989498138428 | auc: 0.8031184673309326 | 
Val update: epoch: 1 |accuracy: 0.8126541972160339 | f1: 0.8641195893287659 | auc: 0.8828929662704468 | treshold: 0.52
Test: accuracy: 0.8162737488746643 | f1: 0.8667765259742737 | auc: 0.8840545415878296 | 
Val update: epoch: 2 |accuracy: 0.8323900699615479 | f1: 0.8738669753074646 | auc: 0.9059879779815674 | treshold: 0.53
Test: accuracy: 0.8342642784118652 | f1: 0.8752502799034119 | auc: 0.9057952761650085 | 
Val update: epoch: 3 |accuracy: 0.832704484462738 | f1: 0.8752232193946838 | auc: 0.9097120761871338 | treshold: 0.51
Test: accuracy: 0.8361504077911377 | f1: 0.8779231309890747 | auc: 0.9095450639724731 | 
Val update: epoch: 4 |accuracy: 0.8351956605911255 | f1: 0.8766607642173767 | auc: 0.9111686944961548 | treshold: 0.54
Test: accuracy: 0.8376979827880859 | 

train:   0%|          | 0/5000 [00:00<?, ?it/s]

Val update: epoch: 0 |accuracy: 0.6653364300727844 | f1: 0.7936839461326599 | auc: 0.6785668134689331 | treshold: 0.49
Test: accuracy: 0.6706081628799438 | f1: 0.7967411875724792 | auc: 0.6895123720169067 | 
Val update: epoch: 1 |accuracy: 0.6717941164970398 | f1: 0.7956632971763611 | auc: 0.703956127166748 | treshold: 0.49
Test: accuracy: 0.6776931285858154 | f1: 0.7991682887077332 | auc: 0.7143428921699524 | 
Val update: epoch: 2 |accuracy: 0.6733420491218567 | f1: 0.7957412004470825 | auc: 0.7146733403205872 | treshold: 0.49
Test: accuracy: 0.679144024848938 | f1: 0.7990306615829468 | auc: 0.7260066866874695 | 
Val update: epoch: 3 |accuracy: 0.6783244013786316 | f1: 0.7979337573051453 | auc: 0.7221701741218567 | treshold: 0.49
Test: accuracy: 0.6836416125297546 | f1: 0.8008099794387817 | auc: 0.7336980104446411 | 
Val update: epoch: 4 |accuracy: 0.6792192459106445 | f1: 0.7983028292655945 | auc: 0.7301320433616638 | treshold: 0.48000000000000004
Test: accuracy: 0.6846330761909485 |

# OpenCDP

In [9]:
# Train and evaluate on every file in pkl_path whose name starts with one of the
# OpenCDP group prefixes; one CSV of metrics is written per file.
# NOTE(review): the last recorded run of this cell stopped right after the first
# file banner with "AttributeError: 'OpenCDP' object has no attribute
# 'item_categorical'" - presumably the pickle was produced by an older version of
# the OpenCDP class; confirm and regenerate the pickles if so.

# to_csv does not create missing parent directories, so make sure it exists.
os.makedirs('results', exist_ok=True)

for group in ['cosmetics', 'multi']:
    # os.listdir returns entries in arbitrary order; sorting keeps the order in
    # which files are processed (and logged) the same from run to run.
    for filename in sorted(os.listdir(pkl_path)):
        if not filename.startswith(group):
            continue
        result = []
        print(f"\n == {filename} ==")
        dataset = OpenCDP.load(os.path.join(pkl_path, filename))
        (
            train_loader,
            val_loader,
            test_loader,
            train_user_item_matrix,
            train_num_items
        ) = get_train_val_test_tmatrix_tnumitems(dataset, batch_size=800)

        print(f"{len(dataset)} data points among {len(train_loader)} batches")
        for embeddings in ['neural', 'svd']:
            print(f"\nEvaluating {experiment_name} with {embeddings} embeddings")

            model = LogisticRegression(
                RecsysEmbedding(train_num_items, train_user_item_matrix, embeddings=embeddings),
                output_dim=1
            ).to(device)

            best_model, metrics = train(
                model,
                train_loader, val_loader, test_loader,
                device=device, lr=1e-3, num_epochs=5000, early_stopping=7,
                silent=True
            )

            print(metrics)
            # Tag the row with the embedding type it was obtained with.
            metrics['embeddings'] = embeddings
            result.append(metrics)
        # The file name (including its extension) is kept in the CSV name as-is.
        pd.DataFrame(result).to_csv(f'results/{filename}_{experiment_name}.csv')
        # Drop the references to this file's data before moving on to the next one.
        del dataset, train_loader, val_loader, test_loader, train_user_item_matrix, train_num_items


 == cosmetics_10_8.pkl ==


AttributeError: 'OpenCDP' object has no attribute 'item_categorical'