In [1]:
from collections import defaultdict
from datetime import datetime
import torch
import pandas as pd

In [2]:
ml_20m_path = '../../../../datasets/ml-20m'

In [3]:
file_path = ml_20m_path + '/ratings.csv'
data = pd.read_csv(file_path, encoding='latin-1', sep=',', engine='python', index_col='userId')

In [4]:
data

Unnamed: 0_level_0,movieId,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2,3.5,1112486027
1,29,3.5,1112484676
1,32,3.5,1112484819
1,47,3.5,1112484727
1,50,3.5,1112484580
...,...,...,...
138493,68954,4.5,1258126920
138493,69526,4.5,1259865108
138493,69644,3.0,1260209457
138493,70286,5.0,1258126944


In [5]:
actions = data
actions = actions.groupby('movieId').filter(lambda  x: len(x) >= 5)
actions = actions.groupby('userId').filter(lambda  x: len(x) >= 5)

In [6]:
actions = actions.groupby('userId', group_keys=False).apply(lambda  x: x.sort_values('timestamp'))

In [7]:
actions

Unnamed: 0_level_0,movieId,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,924,3.5,1094785598
1,919,3.5,1094785621
1,2683,3.5,1094785650
1,1584,3.5,1094785656
1,1079,4.0,1094785665
...,...,...,...
138493,6534,3.0,1260209908
138493,53464,4.0,1260209920
138493,1275,3.0,1262378552
138493,6996,3.0,1262378555


In [8]:
# data = data.groupby('userId')
min = 100000
max = 0
total = 0
for i in range(1, actions.index.max()):
    total += len(actions.loc[i])
    if len(data.loc[i]) < min:
        min = len(actions.loc[i])
    if len(data.loc[i]) > max:
        max = len(actions.loc[i])
min, max, total/actions.index.max()

(20, 8540, 144.2935816250641)

In [9]:
usermap = dict()
usernum = 0
itemmap = dict()
itemnum = 0
# reorder the userid and itemid (keep the same step with original SASRec code)
for _id, row in actions.iterrows():
    if _id in usermap:
        userid = usermap[_id]
    else:
        usernum += 1
        userid = usernum
        usermap[_id] = userid

    if row.movieId in itemmap:
        itemid = itemmap[row.movieId]
    else:
        itemnum += 1
        itemid = itemnum
        itemmap[row.movieId] = itemid

In [10]:
usermap.__len__(), itemmap.__len__()

(138493, 18345)

In [11]:
actions['movieId'] = actions['movieId'].map(itemmap)
actions.index = actions.index.map(usermap)

In [12]:
actions

Unnamed: 0_level_0,movieId,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,3.5,1094785598
1,2,3.5,1094785621
1,3,3.5,1094785650
1,4,3.5,1094785656
1,5,4.0,1094785665
...,...,...,...
138493,872,3.0,1260209908
138493,797,4.0,1260209920
138493,2336,3.0,1262378552
138493,946,3.0,1262378555


In [13]:
sas_data = actions.drop('rating', axis=1, inplace=False).drop('timestamp', axis=1, inplace=False)

In [14]:
with open('ml-20m.txt', 'w') as f:
    for _id in set(sas_data.index):
        for movie_id in sas_data.loc[_id].movieId:
            f.write('%d %d\n' % (_id, movie_id))

In [15]:
sas_data

Unnamed: 0_level_0,movieId
userId,Unnamed: 1_level_1
1,1
1,2
1,3
1,4
1,5
...,...
138493,872
138493,797
138493,2336
138493,946


In [16]:
userMaxTime = actions.groupby('userId').timestamp.max()

In [17]:
userMaxTime

userId
1         1112486201
2          974821014
3          945176099
4          840879654
5          851617728
             ...    
138489    1352990179
138490     975545576
138491    1247183347
138492    1115351450
138493    1262378572
Name: timestamp, Length: 138493, dtype: int64

In [18]:
day = 86400
num_day = 7
userSplitTime = userMaxTime - num_day  * day

In [19]:
userSplitTime

userId
1         1111881401
2          974216214
3          944571299
4          840274854
5          851012928
             ...    
138489    1352385379
138490     974940776
138491    1246578547
138492    1114746650
138493    1261773772
Name: timestamp, Length: 138493, dtype: int64

In [20]:
train_seq = None
for _id in actions.index:
    split_time = userSplitTime.loc[_id]
    user_seq = actions[(actions.index == _id) & (actions.timestamp < split_time)]
    if _id == 1:
        train_seq = user_seq
    else:
        train_seq = pd.concat([train_seq, user_seq], ignore_index=False)

KeyboardInterrupt: 

In [None]:
train_seq

## SASRec handling
Only use implicit feedback in the sequence of items

In [None]:
day = 86400
usernum = actions.index.max()
split_percent = 0.9
k = 25
split_index = int(len(actions.loc[k]) * split_percent)
train_seq = actions.loc[k].iloc[:split_index]
target_seq = actions.loc[k].iloc[split_index:]
target_seq.timestamp.max() - train_seq.timestamp.max()
# actions.loc[1].timestamp.max() -

In [None]:
userMaxTime = actions.groupby('userId').timestamp.max()
userActLength = actions.groupby('userId').size()
split_index = (userActLength * split_percent).astype(int)
userMaxTime, split_index

In [None]:
window_size = []
for i in range(1 ,actions.index.max() + 1):
    cur_seq = actions.loc[i]
    split = split_index.loc[i]
    train_seq = cur_seq.iloc[:split]
    target_seq = cur_seq.iloc[split:]
    window_size.append(target_seq.iloc[-1].timestamp - train_seq.iloc[-1].timestamp)

seq_avg_length= userActLength.mean()
seq_avg_length # , window_size

In [None]:
def data_partition_window(fname, valid_percent, test_percent, train_percent):
    if valid_percent + test_percent > 0.6:
        print('the percent you select for val/test are too high')
        return None
    valid_start = 1 - valid_percent - test_percent
    test_start = 1 - test_percent
    train_start = 1 - train_percent
    usernum = 0
    itemnum = 0
    User = defaultdict(list)
    user_train = {}
    user_valid = {}
    user_test = {}
    # assume user/item index starting from 1
    f = open('%s.txt' % fname, 'r')
    # read from each line
    for line in f:
        u, i = line.rstrip().split(' ')
        u = int(u)
        i = int(i)
        usernum = max(u, usernum)
        itemnum = max(i, itemnum)
        User[u].append(i)
        # count user and items
    # read from each user
    count = 0
    for user in User:
        nfeedback = len(User[user])
        if nfeedback < 3:
            user_train[user] = User[user]
            user_valid[user] = []
            user_test[user] = []
        else:
            # select the whole training seq
            # user_train[user] = User[user][:-2]
            seq_len = len(User[user])
            valid_index = int(seq_len * valid_start)
            test_index = int(seq_len * test_start)
            if valid_index == test_index:
                user_train[user] = User[user]
                user_valid[user] = []
                user_test[user] = []
            else:
                train_seq = User[user][: valid_index]
                valid_seq = User[user][valid_index: test_index]
                test_seq = User[user][test_index:]
                train_seq_length =len(train_seq)
                split_index = int(train_seq_length * train_start)
                input_seq = train_seq[:split_index]
                target_seq = train_seq[split_index:]
                for target in target_seq:
                    count+=1
                    user_train[count] = input_seq + [target]
                user_valid[user] = []
                user_valid[user]+= valid_seq
                user_test[user] = []
                user_test[user]+= test_seq
    return [user_train, user_valid, user_test, usernum, itemnum]

In [None]:
dataset = data_partition_window('ml-1m', 0.1, 0.1, 0.2)

In [None]:
[user_train, user_valid, user_test, usernum, itemnum] = dataset
len(user_train), user_train[1]

In [None]:
[user_train, user_valid, user_test, usernum, itemnum] = dataset

In [None]:
def data_partition_window_1(fname):
    usernum = 0
    itemnum = 0
    User = defaultdict(list)
    split_percent = 0.8

    user_train = {}
    user_valid = {}
    user_test = {}
    # assume user/item index starting from 1
    f = open('%s.txt' % fname, 'r')
    # read from each line
    for line in f:
        u, i = line.rstrip().split(' ')
        u = int(u)
        i = int(i)
        usernum = max(u, usernum)
        itemnum = max(i, itemnum)
        User[u].append(i)
    # read from each user
    count = 0
    for user in User:
        nfeedback = len(User[user])
        if nfeedback < 3:
            user_train[user] = User[user]
            user_valid[user] = []
            user_test[user] = []
        else:
            # select the whole training seq
            # user_train[user] = User[user][:-2]
            train_seq = User[user][:-2]
            train_seq_length = len(train_seq)
            split_index = int(train_seq_length * split_percent)
            input_seq = train_seq[:split_index]
            target_seq = train_seq[split_index:]
            for target in target_seq:
                count += 1
                user_train[count] = input_seq + [target]
            user_valid[user] = []
            user_valid[user].append(User[user][-2])
            user_test[user] = []
            user_test[user].append(User[user][-1])
    usernum = count
    return [user_train, user_valid, user_test, usernum, itemnum]

In [None]:
dataset = data_partition_window_1('ml-1m')
[user_train, user_valid, user_test, usernum, itemnum] = dataset

In [None]:
len(user_train), user_train[101442]

In [None]:
import numpy as np
user = np.random.randint(1, usernum + 1)
user

# Evaluation metrix modification

In [None]:
from collections import defaultdict

def data_partition(fname):
    usernum = 0
    itemnum = 0
    User = defaultdict(list)
    user_train = {}
    user_valid = {}
    user_test = {}
    # assume user/item index starting from 1
    f = open('%s.txt' % fname, 'r')
    for line in f:
        u, i = line.rstrip().split(' ')
        u = int(u)
        i = int(i)
        usernum = max(u, usernum)
        itemnum = max(i, itemnum)
        User[u].append(i)

    for user in User:
        nfeedback = len(User[user])
        if nfeedback < 3:
            user_train[user] = User[user]
            user_valid[user] = []
            user_test[user] = []
        else:
            user_train[user] = User[user][:-2]
            user_valid[user] = []
            user_valid[user].append(User[user][-2])
            user_test[user] = []
            user_test[user].append(User[user][-1])
    return [user_train, user_valid, user_test, usernum, itemnum]

In [None]:
dataset = data_partition('ml-1m')
dataset[0]

In [None]:
import random
import numpy as np
import copy
import torch
import sys

def evaluate_window_valid(model, dataset, args):
    [train, valid, test, usernum, itemnum] = copy.deepcopy(dataset)
    Recall = 0.0
    P90 = 0.0
    # P90 coverage means the smallest item sets that appear in the top 10 lists of at least 90% of the users.
    valid_user = 0.0
    sample_nums = 500
    random_items = random.sample(range(1, itemnum + 1), sample_nums)
    # if usernum > 10000:
    #     # avoid too many training users
    #     # keep at most 10000 users
    #     users = random.sample(range(1, usernum + 1), 10000)
    # else:
    #     # else keep all the users
    #     users = range(1, usernum + 1)
    users = range(1, usernum+1)
    for u in users:
        # make sure the sequence can be validated
        if len(train[u]) < 1 or len(valid[u]) < 1: continue
        seq = np.zeros([args.maxlen], dtype=np.int32)
        idx = args.maxlen - 1
        for i in reversed(train[u]):
            seq[idx] = i
            # fill the sequence from end to beginning
            idx -= 1
            if idx == -1: break
            # select the max len or all of the training data in the sequence
            # limit the length, seq contains the actual training sequence
        rated = set(train[u])
        rated.add(0)
        # all items interacted by the current user
        item_idx = [valid[u][0]]
        # get the index of validated item
        for _ in range(100):
            # negative sampling
            t = np.random.randint(1, itemnum + 1)
            # randomly sample 100 items
            while t in rated: t = np.random.randint(1, itemnum + 1)
            item_idx.append(t)
        predictions = -model.predict(*[np.array(l) for l in [[u], [seq], item_idx]])
        # predicting the recommendation list
        predictions = predictions[0]
        rank = predictions.argsort().argsort()[0].item()
        # the rank of the expected next single item
        valid_user += 1
        if rank < 10:
            Recall += 1
            # P90 coverage
        if valid_user % 100 == 0:
            print('.', end="")
            sys.stdout.flush()
    return Recall / valid_user, P90 / valid_user

In [None]:
import argparse

def str2bool(s):
    if s not in {'false', 'true'}:
        raise ValueError('Not a valid boolean string')
    return s == 'true'

def create_args(args):
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', required=True)
    parser.add_argument('--train_dir', required=True)
    parser.add_argument('--batch_size', default=128, type=int)
    parser.add_argument('--lr', default=0.001, type=float)
    parser.add_argument('--maxlen', default=50, type=int)
    parser.add_argument('--hidden_units', default=50, type=int)
    parser.add_argument('--num_blocks', default=2, type=int)
    parser.add_argument('--num_epochs', default=201, type=int)
    parser.add_argument('--num_heads', default=1, type=int)
    parser.add_argument('--dropout_rate', default=0.5, type=float)
    parser.add_argument('--l2_emb', default=0.0, type=float)
    parser.add_argument('--device', default='cpu', type=str)
    parser.add_argument('--inference_only', default=False, type=str2bool)
    parser.add_argument('--state_dict_path', default=None, type=str)
    args = parser.parse_args(args)
    return args

In [None]:
from models.SASRec.model import SASRec
[train, valid, test, usernum, itemnum] = copy.deepcopy(dataset)
model_path = '../../processed/ml-1m_repro2/SASRec.epoch=201.lr=0.001.layer=2.head=1.hidden=50.maxlen=200.pth'
# args.device = 'ml-1m'
# args.train_dir = 'test'
# args.state_dict_path = model_path
# args.inference
args = create_args(['--dataset','ml-1m',
                    '--train_dir', 'test',
                    '--device', 'cuda',
                    '--state_dict_path', model_path,
                    '--inference_only', 'true',
                    '--maxlen', '200'])
# print(args.dataset)
model = SASRec(usernum, itemnum, args).to(args.device) # no ReLU activation in original SASRec implementation?
model.load_state_dict(torch.load(args.state_dict_path, map_location=torch.device(args.device)))


In [None]:
# seq = np.zeros([200], dtype=np.int32)
from collections import Counter

def window_eval(model, dataset, args):
    [train, valid, test, usernum, itemnum] = copy.deepcopy(dataset)
    Recall = 0.0
    P90 = 0.0
    coverage_list = []
    # P90 coverage means the smallest item sets that appear in the top 10 lists of at least 90% of the users.
    valid_user = 0.0
    sample_nums = 100
    random_items = random.sample(range(1, itemnum + 1), sample_nums)
    sample_idx = random_items
    sample_idx_tensor = torch.tensor(sample_idx).to(args.device)
    users = range(1, usernum+1)
    for u in users:
        if len(train[u]) < 1 or len(valid[u]) < 1: continue
        seq = np.zeros([args.maxlen], dtype=np.int32)
        idx = args.maxlen - 1
        for i in reversed(train[u]):
            seq[idx] = i
            # fill the sequence from end to beginning
            idx -= 1
            if idx == -1: break
            # select the max len or all of the training data in the sequence
            # limit the length, seq contains the actual training sequence
        # interacted items
        rated = set(train[u])
        rated.add(0)
        # ground truth item
        ground_truth_idx = [valid[u][0]]
        # collect all indexes, which needs to process on
        process_idx = ground_truth_idx + sample_idx
        predictions = -model.predict(*[np.array(l) for l in [[u], [seq], process_idx]])[0]
        # target distance
        target_d = predictions[0]
        # sampled results
        sample_d = predictions[1:]
        # print(len(sample_d))
        bool_tensor = target_d >= sample_d
        count = torch.sum(bool_tensor).item()
        if count < 10:
            Recall += 1
        sorted_indices = torch.argsort(sample_d)
        sorted_sample_idx = sample_idx_tensor[sorted_indices]
        # take the coverage@10 for all users
        coverage_list+=list(sorted_sample_idx[:10])
        valid_user+=1
    p90_list = [i.item() for i in coverage_list]
    p90_dict = Counter(p90_list)
    p90_sort = sorted(p90_dict.items(), key=lambda x: x[1], reverse=True)
    total_rec = 0
    item_count = 0
    for _, num in p90_sort:
        total_rec+= num
        item_count+= 1
        if total_rec>=0.9*10*usernum:
            break
    return Recall/ valid_user, item_count/sample_nums

In [None]:
r_10, p90_10 = window_eval(model, dataset, args)
r_10, p90_10

In [None]:
r_10

In [None]:
p90_10[0]

In [None]:
train_user = [7,3,2,8,6,1]
train_user[:-1]