# Evaluation metrix modification

In [11]:
from collections import defaultdict

def data_partition(fname):
    usernum = 0
    itemnum = 0
    User = defaultdict(list)
    user_train = {}
    user_valid = {}
    user_test = {}
    # assume user/item index starting from 1
    f = open('%s.txt' % fname, 'r')
    for line in f:
        u, i = line.rstrip().split(' ')
        u = int(u)
        i = int(i)
        usernum = max(u, usernum)
        itemnum = max(i, itemnum)
        User[u].append(i)

    for user in User:
        nfeedback = len(User[user])
        if nfeedback < 3:
            user_train[user] = User[user]
            user_valid[user] = []
            user_test[user] = []
        else:
            user_train[user] = User[user][:-2]
            user_valid[user] = []
            user_valid[user].append(User[user][-2])
            user_test[user] = []
            user_test[user].append(User[user][-1])
    return [user_train, user_valid, user_test, usernum, itemnum]

dataset = data_partition('data/processed/ml-1m')

In [12]:
import random
import numpy as np
import copy
import torch
import sys

def evaluate_window_valid(model, dataset, args):
    [train, valid, test, usernum, itemnum] = copy.deepcopy(dataset)
    Recall = 0.0
    P90 = 0.0
    # P90 coverage means the smallest item sets that appear in the top 10 lists of at least 90% of the users.
    valid_user = 0.0
    sample_nums = 500
    random_items = random.sample(range(1, itemnum + 1), sample_nums)
    # if usernum > 10000:
    #     # avoid too many training users
    #     # keep at most 10000 users
    #     users = random.sample(range(1, usernum + 1), 10000)
    # else:
    #     # else keep all the users
    #     users = range(1, usernum + 1)
    users = range(1, usernum+1)
    for u in users:
        # make sure the sequence can be validated
        if len(train[u]) < 1 or len(valid[u]) < 1: continue
        seq = np.zeros([args.maxlen], dtype=np.int32)
        idx = args.maxlen - 1
        for i in reversed(train[u]):
            seq[idx] = i
            # fill the sequence from end to beginning
            idx -= 1
            if idx == -1: break
            # select the max len or all of the training data in the sequence
            # limit the length, seq contains the actual training sequence
        rated = set(train[u])
        rated.add(0)
        # all items interacted by the current user
        item_idx = [valid[u][0]]
        # get the index of validated item
        for _ in range(100):
            # negative sampling
            t = np.random.randint(1, itemnum + 1)
            # randomly sample 100 items
            while t in rated: t = np.random.randint(1, itemnum + 1)
            item_idx.append(t)
        predictions = -model.predict(*[np.array(l) for l in [[u], [seq], item_idx]])
        # predicting the recommendation list
        predictions = predictions[0]
        rank = predictions.argsort().argsort()[0].item()
        # the rank of the expected next single item
        valid_user += 1
        if rank < 10:
            Recall += 1
            # P90 coverage
        if valid_user % 100 == 0:
            print('.', end="")
            sys.stdout.flush()
    return Recall / valid_user, P90 / valid_user

In [13]:
import argparse

def str2bool(s):
    if s not in {'false', 'true'}:
        raise ValueError('Not a valid boolean string')
    return s == 'true'

def create_args(args):
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', required=True)
    parser.add_argument('--train_dir', required=True)
    parser.add_argument('--batch_size', default=128, type=int)
    parser.add_argument('--lr', default=0.001, type=float)
    parser.add_argument('--maxlen', default=50, type=int)
    parser.add_argument('--hidden_units', default=50, type=int)
    parser.add_argument('--num_blocks', default=2, type=int)
    parser.add_argument('--num_epochs', default=201, type=int)
    parser.add_argument('--num_heads', default=1, type=int)
    parser.add_argument('--dropout_rate', default=0.5, type=float)
    parser.add_argument('--l2_emb', default=0.0, type=float)
    parser.add_argument('--device', default='cpu', type=str)
    parser.add_argument('--inference_only', default=False, type=str2bool)
    parser.add_argument('--state_dict_path', default=None, type=str)
    args = parser.parse_args(args)
    return args

In [14]:
from models.SASRec.model import SASRec
[train, valid, test, usernum, itemnum] = copy.deepcopy(dataset)
model_path = 'processed/ml-1m_repro2/SASRec.epoch=201.lr=0.001.layer=2.head=1.hidden=50.maxlen=200.pth'
# args.device = 'ml-1m'
# args.train_dir = 'test'
# args.state_dict_path = model_path
# args.inference
args = create_args(['--dataset','ml-1m',
                    '--train_dir', 'test',
                    '--device', 'cuda',
                    '--state_dict_path', model_path,
                    '--inference_only', 'true',
                    '--maxlen', '200'])
# print(args.dataset)
model = SASRec(usernum, itemnum, args).to(args.device) # no ReLU activation in original SASRec implementation?
model.load_state_dict(torch.load(args.state_dict_path, map_location=torch.device(args.device)))

<All keys matched successfully>

In [15]:
# seq = np.zeros([200], dtype=np.int32)
from collections import Counter

def window_eval(model, dataset, args):
    [train, valid, test, usernum, itemnum] = copy.deepcopy(dataset)
    Recall = 0.0
    P90 = 0.0
    coverage_list = []
    # P90 coverage means the smallest item sets that appear in the top 10 lists of at least 90% of the users.
    valid_user = 0.0
    sample_nums = 100
    random_items = random.sample(range(1, itemnum + 1), sample_nums)
    sample_idx = random_items
    sample_idx_tensor = torch.tensor(sample_idx).to(args.device)
    users = range(1, usernum+1)
    for u in users:
        if len(train[u]) < 1 or len(valid[u]) < 1: continue
        seq = np.zeros([args.maxlen], dtype=np.int32)
        idx = args.maxlen - 1
        for i in reversed(train[u]):
            seq[idx] = i
            # fill the sequence from end to beginning
            idx -= 1
            if idx == -1: break
            # select the max len or all of the training data in the sequence
            # limit the length, seq contains the actual training sequence
        # interacted items
        rated = set(train[u])
        rated.add(0)
        # ground truth item
        ground_truth_idx = [valid[u][0]]
        # collect all indexes, which needs to process on
        process_idx = ground_truth_idx + sample_idx
        predictions = -model.predict(*[np.array(l) for l in [[u], [seq], process_idx]])[0]
        # target distance
        target_d = predictions[0]
        # sampled results
        sample_d = predictions[1:]
        # print(len(sample_d))
        bool_tensor = target_d >= sample_d
        count = torch.sum(bool_tensor).item()
        if count < 10:
            Recall += 1
        sorted_indices = torch.argsort(sample_d)
        sorted_sample_idx = sample_idx_tensor[sorted_indices]
        # take the coverage@10 for all users
        coverage_list+=list(sorted_sample_idx[:10])
        valid_user+=1
    p90_list = [i.item() for i in coverage_list]
    p90_dict = Counter(p90_list)
    p90_sort = sorted(p90_dict.items(), key=lambda x: x[1], reverse=True)
    total_rec = 0
    item_count = 0
    for _, num in p90_sort:
        total_rec+= num
        item_count+= 1
        if total_rec>=0.9*10*usernum:
            break
    return Recall/ valid_user, item_count/sample_nums

In [16]:
r_10, p90_10 = window_eval(model, dataset, args)
r_10, p90_10

(0.7357615894039735, 0.43)

In [7]:
from models.SASRec.utils import evaluate
model.eval()
t_test = evaluate(model, dataset, args)

...

KeyboardInterrupt: 