# Evaluation metrix modification

In [3]:
from collections import defaultdict

def data_partition(fname):
    usernum = 0
    itemnum = 0
    User = defaultdict(list)
    user_train = {}
    user_valid = {}
    user_test = {}
    # assume user/item index starting from 1
    f = open('%s.txt' % fname, 'r')
    for line in f:
        u, i = line.rstrip().split(' ')
        u = int(u)
        i = int(i)
        usernum = max(u, usernum)
        itemnum = max(i, itemnum)
        User[u].append(i)

    for user in User:
        nfeedback = len(User[user])
        if nfeedback < 3:
            user_train[user] = User[user]
            user_valid[user] = []
            user_test[user] = []
        else:
            user_train[user] = User[user][:-2]
            user_valid[user] = []
            user_valid[user].append(User[user][-2])
            user_test[user] = []
            user_test[user].append(User[user][-1])
    return [user_train, user_valid, user_test, usernum, itemnum]

dataset = data_partition('data/processed/ml-1m')

In [5]:
def data_partition_window_P(fname, valid_percent, test_percent, train_percent):
    if valid_percent + test_percent > 0.6:
        print('the percent you select for val/test are too high')
        return None
    valid_start = 1 - valid_percent - test_percent
    test_start = 1 - test_percent
    train_start = 1 - train_percent
    usernum = 0
    itemnum = 0
    User = defaultdict(list)
    user_train_seq = {}
    user_train = {}
    user_valid = {}
    user_test = {}
    # assume user/item index starting from 1
    f = open('%s.txt' % fname, 'r')
    # read from each line
    for line in f:
        u, i = line.rstrip().split(' ')
        u = int(u)
        i = int(i)
        usernum = max(u, usernum)
        itemnum = max(i, itemnum)
        User[u].append(i)
        # count user and items
    # read from each user
    count = 0
    for user in User:
        nfeedback = len(User[user])
        if nfeedback < 3:
            user_train[user] = User[user]
            user_valid[user] = []
            user_test[user] = []
        else:
            # select the whole training seq
            # user_train[user] = User[user][:-2]
            seq_len = len(User[user])
            valid_index = int(seq_len * valid_start)
            test_index = int(seq_len * test_start)
            if valid_index == test_index:
                user_train[user] = User[user]
                user_valid[user] = []
                user_test[user] = []
            else:
                train_seq = User[user][: valid_index]
                valid_seq = User[user][valid_index: test_index]
                test_seq = User[user][test_index:]
                train_seq_length = len(train_seq)
                split_index = int(train_seq_length * train_start)
                input_seq = train_seq[:split_index]
                target_seq = train_seq[split_index:]
                for target in target_seq:
                    count += 1
                    user_train[count] = input_seq + [target]
                user_train_seq[user] = []
                user_train_seq[user] += train_seq
                user_valid[user] = []
                user_valid[user] += valid_seq
                user_test[user] = []
                user_test[user] += test_seq
    return [user_train, user_train_seq, user_valid, user_test, usernum, itemnum]

dataset_window = data_partition_window_P('data/processed/ml-1m', 0.1, 0.1, 0.2)

In [11]:
import random
import numpy as np
import copy
import torch
from collections import Counter
import sys

def evaluate_window_valid(model, dataset, dataset_window, args):
    [train, valid, test, usernum, itemnum] = copy.deepcopy(dataset)
    [_, train, valid, test, _, itemnum] = copy.deepcopy(dataset_window)
    Recall = 0.0
    Recall_U = 0.0
    coverage_list = []
    # P90 coverage means the smallest item sets that appear in the top 10 lists of at least 90% of the users.
    valid_user = 0.0
    sample_nums = 500
    random_items = random.sample(range(1, itemnum + 1), sample_nums)
    sample_idx = random_items
    sample_idx_tensor = torch.tensor(sample_idx).to(args.device)
    users = range(1, usernum + 1)
    for u in users:
        if len(train[u]) < 1 or len(valid[u]) < 1: continue
        seq = np.zeros([args.maxlen], dtype=np.int32)
        idx = args.maxlen - 1
        for i in reversed(train[u]):
            seq[idx] = i
            # fill the sequence from end to beginning
            idx -= 1
            if idx == -1: break
            # select the max len or all of the training data in the sequence
            # limit the length, seq contains the actual training sequence
        # interacted items
        rated = set(train[u])
        rated.add(0)
        # ground truth item
        ground_truth_idx = valid[u]
        valid_num = len(valid[u])
        # collect all indexes, which needs to process on
        process_idx = ground_truth_idx + sample_idx
        predictions = -model.predict(*[np.array(l) for l in [[u], [seq], process_idx]])[0]
        # target distance
        target_ds = predictions[:valid_num]
        # sampled results
        sample_d = predictions[valid_num:]
        # print(len(sample_d))
        for target_d in target_ds:
            bool_tensor = target_d >= sample_d
            count = torch.sum(bool_tensor).item()
            if count < 10:
                Recall_U += 1
        Recall_U = Recall_U / valid_num
        Recall += Recall_U
        Recall_U = 0
        sorted_indices = torch.argsort(sample_d)
        sorted_sample_idx = sample_idx_tensor[sorted_indices]
        # take the coverage@10 for all users
        coverage_list += list(sorted_sample_idx[:10])
        valid_user += 1
        if valid_user % 100 == 0:
            print('.', end="")
            sys.stdout.flush()
    p90_list = [i.item() for i in coverage_list]
    p90_dict = Counter(p90_list)
    p90_sort = sorted(p90_dict.items(), key=lambda x: x[1], reverse=True)
    total_rec = 0
    item_count = 0
    for _, num in p90_sort:
        total_rec += num
        item_count += 1
        if total_rec >= 0.9 * 10 * usernum:
            break
    return Recall / valid_user, item_count / sample_nums

In [8]:
import argparse

def str2bool(s):
    if s not in {'false', 'true'}:
        raise ValueError('Not a valid boolean string')
    return s == 'true'

def create_args(args):
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', required=True)
    parser.add_argument('--train_dir', required=True)
    parser.add_argument('--batch_size', default=128, type=int)
    parser.add_argument('--lr', default=0.001, type=float)
    parser.add_argument('--maxlen', default=50, type=int)
    parser.add_argument('--hidden_units', default=50, type=int)
    parser.add_argument('--num_blocks', default=2, type=int)
    parser.add_argument('--num_epochs', default=201, type=int)
    parser.add_argument('--num_heads', default=1, type=int)
    parser.add_argument('--dropout_rate', default=0.5, type=float)
    parser.add_argument('--l2_emb', default=0.0, type=float)
    parser.add_argument('--device', default='cpu', type=str)
    parser.add_argument('--inference_only', default=False, type=str2bool)
    parser.add_argument('--state_dict_path', default=None, type=str)
    parser.add_argument('--window_predictor', default=False, type=str2bool)
    parser.add_argument('--window_eval', default=False, type=str2bool)
    parser.add_argument('--eval_epoch', default=20, type=int)
    args = parser.parse_args(args)
    return args

In [9]:
from models.SASRec.model import SASRec
[train, valid, test, usernum, itemnum] = copy.deepcopy(dataset)
model_path = 'processed/ml-1m_repro2/SASRec.epoch=201.lr=0.001.layer=2.head=1.hidden=50.maxlen=200.pth'
# args.device = 'ml-1m'
# args.train_dir = 'test'
# args.state_dict_path = model_path
# args.inference
args = create_args(['--dataset','ml-1m',
                    '--train_dir', 'test',
                    '--device', 'cuda',
                    '--state_dict_path', model_path,
                    '--inference_only', 'true',
                    '--maxlen', '200'])
# print(args.dataset)
model = SASRec(usernum, itemnum, args).to(args.device) # no ReLU activation in original SASRec implementation?
model.load_state_dict(torch.load(args.state_dict_path, map_location=torch.device(args.device)))

<All keys matched successfully>

In [12]:
r_10, p90_10 = evaluate_window_valid(model, dataset, dataset_window, args)
r_10, p90_10

............................................................

(0.35887693968618883, 0.338)

In [14]:
[train, valid, test, usernum, itemnum] = copy.deepcopy(dataset)
[_, train, valid, test, _, itemnum] = copy.deepcopy(dataset_window)

In [57]:
Recall = 0.0
Recall_U = 0.0
coverage_list = []
# P90 coverage means the smallest item sets that appear in the top 10 lists of at least 90% of the users.
valid_user = 0.0
sample_nums = 100

In [21]:
train[1], valid[1], test[1]

([1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42],
 [43, 44, 45, 46, 47],
 [48, 49, 50, 51, 52, 53])

In [24]:

random_items = random.sample(range(1, itemnum + 1), sample_nums)
sample_idx = random_items
sample_idx_tensor = torch.tensor(sample_idx).to(args.device)

In [81]:
Recall,valid_user,p90_list, coverage_list = 0, 0, [], []
for u in [1,2]:
    seq = np.zeros([args.maxlen], dtype=np.int32)
    idx = args.maxlen - 1
    for i in reversed(train[u]):
        seq[idx] = i
        # fill the sequence from end to beginning
        idx -= 1
        if idx == -1: break
        # select the max len or all of the training data in the sequence
        # limit the length, seq contains the actual training sequence
    # interacted items
    rated = set(train[u])
    rated.add(0)
    # ground truth item
    ground_truth_idx = valid[u]
    valid_num = len(valid[u])
    # collect all indexes, which needs to process on
    process_idx = ground_truth_idx + sample_idx
    predictions = -model.predict(*[np.array(l) for l in [[u], [seq], process_idx]])[0]
    target_ds = predictions[:valid_num]
        # sampled results
    sample_d = predictions[valid_num:]
        # print(len(sample_d))
    for target_d in target_ds:
        bool_tensor = target_d >= sample_d
        count = torch.sum(bool_tensor).item()
        if count < 10:
            Recall_U += 1
    print(Recall_U)
    Recall_U = Recall_U / valid_num
    Recall += Recall_U
    print(Recall)
    Recall_U = 0
    sorted_indices = torch.argsort(sample_d)
    sorted_sample_idx = sample_idx_tensor[sorted_indices]
    # take the coverage@10 for all users
    coverage_list += list(sorted_sample_idx[:10])
    valid_user += 1
    if valid_user % 100 == 0:
        print('.', end="")
        sys.stdout.flush()
    p90_list = [i.item() for i in coverage_list]
    p90_dict = Counter(p90_list)
    p90_sort = sorted(p90_dict.items(), key=lambda x: x[1], reverse=True)
    total_rec = 0
    item_count = 0
    for _, num in p90_sort:
        total_rec += num
        item_count += 1
        if total_rec >= 0.9 * 10 * usernum:
                break

5
1.0
10
1.7692307692307692


In [28]:
print(*[np.array(l) for l in [[u], [seq], process_idx]])

[1] [[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  2  3  4  5  6  7  8  9 10
  11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
  35 36 37 38 39 40 41 42]] [  43   44   45   46   47 1872 2909  104 1447 2452 1112 1419 2951 1965
 1730 2632 1233  832 3387  383 3196 1528 1224  457 3107 3164 3108 2847
  542  515 2793 2035 1199 3263 2806 2423  672 1345 1492  687 1436  549
 2474 1790 3025  826  406 2437 2817  580 1731 2770   65 1865 2824 3272
 1524  198 1611 2980 1494  252 2778 1548 1047 2118 1406  178   51 2695
 2123  444   63 2331 

In [32]:
predictions.size(), len(process_idx)

(torch.Size([105]), 105)

In [70]:
train[u], valid[u], test[u]

([54,
  55,
  56,
  57,
  58,
  59,
  60,
  11,
  61,
  62,
  63,
  64,
  65,
  8,
  66,
  67,
  68,
  69,
  70,
  71,
  72,
  73,
  74,
  75,
  76,
  77,
  18,
  78,
  79,
  80,
  81,
  82,
  83,
  20,
  84,
  85,
  86,
  87,
  88,
  89,
  90,
  91,
  92,
  93,
  94,
  95,
  96,
  97,
  98,
  99,
  100,
  101,
  102,
  103,
  104,
  105,
  106,
  107,
  108,
  109,
  110,
  111,
  112,
  113,
  114,
  115,
  116,
  117,
  118,
  119,
  30,
  120,
  121,
  122,
  123,
  124,
  37,
  125,
  126,
  127,
  128,
  129,
  130,
  131,
  17,
  132,
  133,
  134,
  135,
  136,
  137,
  138,
  139,
  140,
  141,
  142,
  143,
  144,
  145,
  146,
  147,
  148,
  149],
 [150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162],
 [163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175])

In [71]:
# predictions = -model.predict(*[np.array(l) for l in [[u], [seq], process_idx]])[0]


9
0.6923076923076923


In [82]:
Recall / valid_user, item_count / sample_nums

(0.8846153846153846, 0.16)