In [1]:
import os 
import pandas as pd 
import numpy as np 
import torch 

from recbole.model.general_recommender.bpr import BPR
from recbole.model.general_recommender.ease import EASE
from recbole.model.context_aware_recommender.ffm import FFM
from recbole.model.context_aware_recommender.fm import FM

from recbole.config import Config
from recbole.data import create_dataset, data_preparation, Interaction
from recbole.utils import init_logger, get_trainer, get_model, init_seed, set_color
from recbole.quick_start import load_data_and_model, run_recbole, objective_function
from recbole.utils.case_study import full_sort_topk, full_sort_scores
from recbole.trainer import HyperTuning


from logging import getLogger

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dpath = '../../../data/train/'
df = pd.read_csv(os.path.join(dpath, 'train_ratings.csv'))

In [3]:
user_grp = dict(df.groupby('user').item.apply(list))
user_grp

{11: [4643,
  170,
  531,
  616,
  2140,
  2722,
  2313,
  2688,
  2428,
  3113,
  1591,
  2600,
  8169,
  2572,
  58293,
  7541,
  1367,
  32,
  4792,
  7444,
  53953,
  56949,
  6502,
  53000,
  51662,
  5151,
  35836,
  7293,
  33585,
  8810,
  56801,
  5377,
  344,
  19,
  410,
  2124,
  828,
  1274,
  8977,
  1032,
  1214,
  1200,
  1320,
  3897,
  7173,
  1225,
  2858,
  59418,
  45361,
  2706,
  1321,
  2793,
  33085,
  4235,
  3892,
  4340,
  27660,
  43556,
  47124,
  2294,
  48304,
  150,
  31184,
  34338,
  1917,
  50162,
  2827,
  27368,
  4366,
  2153,
  30812,
  3525,
  1270,
  2011,
  2012,
  8973,
  1255,
  2018,
  541,
  4878,
  7361,
  31658,
  2571,
  7099,
  260,
  1196,
  60069,
  160,
  1882,
  60037,
  880,
  36509,
  405,
  3826,
  4133,
  673,
  6541,
  611,
  172,
  4638,
  5171,
  208,
  4887,
  5459,
  60760,
  8361,
  60514,
  1544,
  1876,
  442,
  32213,
  5219,
  1690,
  2717,
  27608,
  52722,
  780,
  6934,
  52287,
  3745,
  45499,
  37830,
  60040,
 

In [4]:
df

Unnamed: 0,user,item,time
0,11,4643,1230782529
1,11,170,1230782534
2,11,531,1230782539
3,11,616,1230782542
4,11,2140,1230782563
...,...,...,...
5154466,138493,44022,1260209449
5154467,138493,4958,1260209482
5154468,138493,68319,1260209720
5154469,138493,40819,1260209726


In [7]:
df = df.rename(columns={'user':'user_id:token'})
df.to_csv('recbole_train.user',index=False,sep='\t')

In [23]:
train = pd.read_csv(os.path.join(dpath, "train_ratings.csv"))
directors = pd.read_csv(os.path.join(dpath, "directors.tsv"), sep="\t")
genres = pd.read_csv(os.path.join(dpath, "genres.tsv"), sep="\t")
titles = pd.read_csv(os.path.join(dpath, "titles.tsv"), sep="\t")
writers = pd.read_csv(os.path.join(dpath, "writers.tsv"), sep="\t")
years = pd.read_csv(os.path.join(dpath, "years.tsv"), sep="\t")

In [24]:
def make_feature_sequence(x):
    x = list(set(x))
    y = ""
    for item in x:
        y += str(item + " ")
    return y.rstrip()


In [25]:
writers_seq = writers.groupby("item")['writer'].apply(make_feature_sequence)
genres_seq = genres.groupby("item")['genre'].apply(make_feature_sequence)
director_seq = directors.groupby('item')['director'].apply(make_feature_sequence)

train_df = pd.merge(train, writers_seq, on="item",how='left')
train_df = pd.merge(train_df, genres_seq, on="item",how='left')
train_df = pd.merge(train_df, director_seq, on="item",how='left')
train_df = pd.merge(train_df, years, on="item",how='left')
train_df

Unnamed: 0,user,item,time,writer,genre,director,year
0,11,4643,1230782529,nm0465199 nm0099541 nm0742797 nm0115310,Adventure Drama Action Sci-Fi,nm0000318,2001.0
1,11,170,1230782534,,Crime Adventure Action Thriller,nm0812200,1995.0
2,11,531,1230782539,nm0003031 nm0122364,Drama Children,nm0002140,1993.0
3,11,616,1230782542,nm0858826 nm0166551 nm0841035 nm0027011 nm0942...,Animation Children,nm0718627,1970.0
4,11,2140,1230782563,nm0643973 nm0001345,Fantasy Adventure,nm0001345 nm0000568,1982.0
...,...,...,...,...,...,...,...
5154466,138493,44022,1260209449,nm0310087 nm5022110 nm0841532,Comedy Adventure Animation Children,nm0757858,2006.0
5154467,138493,4958,1260209482,nm0672015 nm0892705 nm0859029 nm0859049,Drama Action War,nm0601382,2001.0
5154468,138493,68319,1260209720,nm0940790 nm1125275,Action Thriller Sci-Fi,nm0004303,2009.0
5154469,138493,40819,1260209726,nm0219456 nm0003506,Drama Musical Romance,nm0003506,2005.0


In [26]:
train_df = train_df.sort_values('user')
train_df.head()

Unnamed: 0,user,item,time,writer,genre,director,year
0,11,4643,1230782529,nm0465199 nm0099541 nm0742797 nm0115310,Adventure Drama Action Sci-Fi,nm0000318,2001.0
255,11,8640,1230856739,nm0291905,Adventure Drama Action War,nm0298807,2004.0
254,11,8907,1230856729,nm1019583 nm0931185 nm5022110 nm0073688 nm0769...,Comedy Animation Children,nm0421776 nm0074426 nm1224299,2004.0
253,11,8965,1230856675,nm0000709 nm0885575 nm0115310,Fantasy Adventure Animation Children,nm0000709,2004.0
252,11,36401,1230856670,nm0472567,Comedy Fantasy Thriller Horror,nm0000416,2005.0


In [27]:
train_data = train_df[['user', 'item', 'time']].reset_index(drop=True)
user_data = train_df[['user']].reset_index(drop=True)
item_data = train_df[['item', 'year', 'writer', 'genre', 'director']].drop_duplicates(subset=['item']).reset_index(drop=True)

In [38]:
train_data.columns=['user_id:token', 'item_id:token', 'timestamp:float']
user_data.columns=['user_id:token']
item_data.columns=['item_id:token', 'year:token', 'writer:token_seq', 'genre:token_seq', 'director:token_seq']

In [28]:
outpath = f"dataset/recbole_train"

os.makedirs(outpath, exist_ok=True)

print("Dump Start")

# 데이터 파일 저장
train_data.to_csv(os.path.join(outpath,"recbole_train.inter"),sep='\t',index=False)
user_data.to_csv(os.path.join(outpath,"recbole_train.user"),sep='\t',index=False)
item_data.to_csv(os.path.join(outpath,"recbole_train.item"),sep='\t',index=False)
# with open(outfile, "w") as f:
#     # write header
#     f.write("user_id:token\titem_id:token\ttimestamp:float\n")
#     for row in inter_table:
#         f.write("\t".join([str(x) for x in row])+"\n")

print("Dump Complete")

Dump Start
Dump Complete


데이터 생성 완료

In [7]:
genre_df = pd.merge(train, genres, on=["item"],how='left')

In [19]:
# genre_df = genre_df.drop(["user", "time"], axis=1)
# genre_df

In [8]:
new_genres = genre_df.groupby("item")["genre"].apply(make_feature_sequence)
new_genres

item
1         Fantasy Comedy Adventure Children Animation
2                          Fantasy Adventure Children
3                                      Comedy Romance
4                                Comedy Drama Romance
5                                              Comedy
                             ...                     
118700                                          Drama
118900                                          Drama
118997                Comedy Fantasy Musical Children
119141                                  Comedy Action
119145                  Comedy Adventure Action Crime
Name: genre, Length: 6807, dtype: object

In [10]:
writer_df = pd.merge(train, writers, on=["item"])
# writer_df = writer_df.drop(["user", "time"], axis=1)
new_writers = writer_df.groupby("item")["writer"].apply(make_feature_sequence)

In [10]:
director_df = pd.merge(train, directors, on=["item"],)
# director_df = director_df.drop(["user", "time"], axis=1)
new_directors = director_df.groupby("item")["director"].apply(make_feature_sequence)

In [11]:
df = pd.merge(years, new_genres, on=["item"])
df = pd.merge(df, new_writers, on=["item"])
df = pd.merge(df, new_directors, on=["item"])

In [12]:
df

Unnamed: 0,item,year,genre,writer,director
0,1348,1922,Horror,nm0831290,nm0003638
1,4768,1922,Mystery Thriller Crime,nm0415167 nm0000485 nm0902376,nm0000485
2,8235,1923,Action Comedy Romance,nm0516001 nm0853130 nm0369841 nm0924065,nm0628345 nm0853130
3,8609,1923,Comedy,nm0593477 nm0115669 nm0369841,nm0000036
4,25750,1924,Fantasy Comedy Romance,nm0593477 nm0115669 nm0369841,nm0000036
...,...,...,...,...,...
4962,113225,2014,Drama Comedy Romance,nm0000095,nm0000095
4963,111743,2014,Western Comedy,nm1273397 nm1273148 nm0532235,nm0532235
4964,112804,2014,Sci-Fi Drama,nm2648685,nm2648685
4965,113378,2014,Sci-Fi Drama,nm0004332,nm0637518


In [13]:
df=pd.merge(train,df, on=['item'])

In [15]:
df = df.sort_values('user')

In [20]:
df = df.rename(columns={"user":"user_id:token","item": "item_id:token","year":"year:token","genre":"genre:token_seq","writer":"writer:token_seq","director":"director:token_seq"})
df

Unnamed: 0,user_id:token,item_id:token,time,year:token,genre:token_seq,writer:token_seq,director:token_seq
0,11,4643,1230782529,2001,Adventure Action Drama Sci-Fi,nm0099541 nm0742797 nm0465199 nm0115310,nm0000318
1026932,11,5952,1230859039,2002,Adventure Fantasy,nm0001392 nm0801728 nm0101991 nm0909638 nm0866058,nm0001392
49913,11,7293,1230783500,2004,Comedy Romance,nm1286500,nm0781842
372760,11,2716,1230788672,1984,Sci-Fi Action Comedy,nm0000601 nm0000101 nm0001548,nm0718645
372053,11,27728,1230788663,2004,Thriller Sci-Fi Animation Action Drama,nm0794385 nm0651900 nm0556856 nm0258268,nm0651900
...,...,...,...,...,...,...,...
4410388,138493,47465,1256680620,2005,Fantasy Drama Thriller,nm0342617 nm0000416,nm0000416
263413,138493,1196,1258390284,1980,Adventure Action Sci-Fi,nm0102824 nm0000184 nm0001410,nm0449984
4338348,138493,7577,1258134201,1975,Fantasy Comedy Musical Romance,nm0000005,nm0000005
2754465,138493,2065,1255805272,1985,Fantasy Drama Comedy Romance,nm0000095,nm0000095


In [18]:
user_df = df[['user_id:token']]
user_df.to_csv('recbole_train.user',index=False,sep='\t')

In [19]:
item_df = df[['item_id:token','year:token','genre:token_seq','writer:token_seq','director:token_seq']]
item_df.to_csv('recbole_train.item',index=False,sep='\t')

In [46]:
df.to_csv('recbole_train.item',index=False,sep='\t')

In [3]:
yaml_data = """
field_separator: '\t'
USER_ID_FIELD: user_id
ITEM_ID_FIELD: item_id
TIME_FIELD: timestamp

load_col:
    inter: [user_id, item_id, timestamp]
    user: [user_id]
    item: [item_id, year, genre, writer, director]

train_neg_sample_args:
    uniform: 1
    
eval_args:
    split: {'RS': [8, 1, 1]}
    group_by: user
    order: RO
    mode: full
metrics: ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision', 'MAP']
topk: 10
valid_metric: Recall@10
"""

with open("fm.yaml","w") as f:
    f.write(yaml_data)

In [40]:
logger = getLogger()


In [41]:
config = Config(model='FFM', dataset="recbole_train", config_file_list=[f'ffm.yaml'])
config['epochs'] = 6
config['show_progress'] = False
config['device'] = torch.device("cuda" if torch.cuda.is_available() else "cpu")
init_seed(config['seed'], config['reproducibility'])
# logger initialization
init_logger(config)

logger.info(config)

13 Jun 11:07    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = dataset/recbole_train
checkpoint_dir = saved
show_progress = False
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 6
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'uniform': 1, 'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [8, 1, 1]}, 'group_by': 'user', 'order': 'RO', 'mode': 'full'}
repeatable = False
metrics = ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision', 'MAP']
topk = [10]
valid_metric = Recall@10
valid_metric_bigger = True
eval_batch_size = 4096
metric_decimal_place = 4

Dataset Hyper Parameters:
field

In [42]:
# dataset filtering
dataset = create_dataset(config)
logger.info(dataset)

# dataset splitting
train_data, valid_data, test_data = data_preparation(config, dataset)

13 Jun 11:08    INFO  recbole_train
The number of users: 31361
Average actions of users: 164.36450892857144
The number of items: 6808
Average actions of items: 757.2309387395328
The number of inters: 5154471
The sparsity of the dataset: 97.58579218741939%
Remain Fields: ['user_id', 'item_id', 'timestamp', 'year', 'writer', 'genre', 'director']
13 Jun 11:08    INFO  [Training]: train_batch_size = [2048] train_neg_sample_args: [{'uniform': 1, 'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}]
13 Jun 11:08    INFO  [Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [8, 1, 1]}, 'group_by': 'user', 'order': 'RO', 'mode': 'full'}]


In [43]:
train_data.dataset
valid_data.dataset
test_data.dataset


[1;35mrecbole_train[0m
[1;34mThe number of users[0m: 31361
[1;34mAverage actions of users[0m: 15.981855867346939
[1;34mThe number of items[0m: 6808
[1;34mAverage actions of items[0m: 73.69372151154242
[1;34mThe number of inters[0m: 501191
[1;34mThe sparsity of the dataset[0m: 99.76525637106212%
[1;34mRemain Fields[0m: ['user_id', 'item_id', 'timestamp', 'year', 'writer', 'genre', 'director', 'label']

In [44]:
# model loading and initialization
init_seed(config['seed'], config['reproducibility'])
model = FFM(config, train_data.dataset).to(config['device'])
logger.info(model)

13 Jun 11:09    INFO  FFM(
  (token_embedding_table): FMEmbedding(
    (embedding): Embedding(38263, 10)
  )
  (token_seq_embedding_table): ModuleList(
    (0): Embedding(2990, 10)
    (1): Embedding(19, 10)
    (2): Embedding(1341, 10)
  )
  (first_order_linear): FMFirstOrderLinear(
    (token_embedding_table): FMEmbedding(
      (embedding): Embedding(38263, 1)
    )
    (token_seq_embedding_table): ModuleList(
      (0): Embedding(2990, 1)
      (1): Embedding(19, 1)
      (2): Embedding(1341, 1)
    )
  )
  (sigmoid): Sigmoid()
  (ffm): FieldAwareFactorizationMachine(
    (token_embeddings): ModuleList(
      (0): Embedding(38263, 10)
      (1): Embedding(38263, 10)
      (2): Embedding(38263, 10)
      (3): Embedding(38263, 10)
      (4): Embedding(38263, 10)
      (5): Embedding(38263, 10)
    )
    (token_seq_embeddings): ModuleList(
      (0): ModuleList(
        (0): Embedding(2990, 10)
        (1): Embedding(19, 10)
        (2): Embedding(1341, 10)
        (3): Embedding(2990

In [None]:
# trainer loading and initialization
trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

# model training
best_valid_score, best_valid_result = trainer.fit(
    train_data, valid_data, saved=True, show_progress=config['show_progress']
)

In [46]:
config, model, dataset, train_data, valid_data, test_data = load_data_and_model(model_file=os.path.join('saved','FFM-Jun-13-2023_11-09-38.pth'))

13 Jun 12:01    INFO  recbole_train
The number of users: 31361
Average actions of users: 164.36450892857144
The number of items: 6808
Average actions of items: 757.2309387395328
The number of inters: 5154471
The sparsity of the dataset: 97.58579218741939%
Remain Fields: ['user_id', 'item_id', 'timestamp', 'year', 'writer', 'genre', 'director']
13 Jun 12:02    INFO  [Training]: train_batch_size = [2048] train_neg_sample_args: [{'uniform': 1, 'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}]
13 Jun 12:02    INFO  [Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [8, 1, 1]}, 'group_by': 'user', 'order': 'RO', 'mode': 'full'}]


In [55]:
test_data

<recbole.data.dataloader.general_dataloader.FullSortEvalDataLoader at 0x7f4170a2ccd0>

In [47]:
submission = pd.read_csv('../../../data/eval/sample_submission.csv')

In [48]:
from tqdm import tqdm

In [None]:
full_sort_topk(np.array([]))

In [59]:
test_data.dataset

[1;35mrecbole_train[0m
[1;34mThe number of users[0m: 31361
[1;34mAverage actions of users[0m: 15.981855867346939
[1;34mThe number of items[0m: 6808
[1;34mAverage actions of items[0m: 73.69372151154242
[1;34mThe number of inters[0m: 501191
[1;34mThe sparsity of the dataset[0m: 99.76525637106212%
[1;34mRemain Fields[0m: ['user_id', 'item_id', 'timestamp', 'year', 'writer', 'genre', 'director', 'label']

In [49]:

sub_user_idx = submission['user'].unique()
sub_user_idx = np.array(sub_user_idx,dtype=str)
uid_series = dataset.token2id(dataset.uid_field, sub_user_idx)
total_topk_score, total_topk_iid_list = torch.zeros_like(torch.Tensor(31360, 10)), torch.zeros_like(torch.Tensor(31360, 10))

for idx in tqdm(range(0,len(uid_series))):
    topk_score, topk_iid_list = full_sort_topk(np.array([uid_series[idx]]),model,test_data,10,config['device'])
    total_topk_score[idx] = topk_score
    total_topk_iid_list[idx] = topk_iid_list
    
int_iid = total_topk_iid_list.to(torch.int64)
external_item_list = dataset.id2token(dataset.iid_field, int_iid.cpu())
external_item_list = external_item_list.flatten()
df = pd.DataFrame({'user': np.repeat(sub_user_idx, 10), 'item': external_item_list})
df.to_csv("submission.csv",index=False)

100%|██████████| 31360/31360 [04:03<00:00, 129.04it/s]


In [None]:
sub_user_idx = submission['user'].unique()
sub_user_idx = np.array(sub_user_idx,dtype=str)
uid_series = dataset.token2id(dataset.uid_field, sub_user_idx)
total_topk_score, total_topk_iid_list = torch.zeros_like(torch.Tensor(31360, 10)), torch.zeros_like(torch.Tensor(31360, 10))

for idx in tqdm(range(0,len(uid_series))):
    topk_score, topk_iid_list = full_sort_topk(np.array([uid_series[idx]]),model,test_data,10,config['device'])
    total_topk_score[idx] = topk_score
    total_topk_iid_list[idx] = topk_iid_list
    
int_iid = total_topk_iid_list.to(torch.int64)
external_item_list = dataset.id2token(dataset.iid_field, int_iid.cpu())
external_item_list = external_item_list.flatten()
df = pd.DataFrame({'user': np.repeat(sub_user_idx, 10), 'item': external_item_list})
df.to_csv("submission.csv",index=False)

In [135]:
sorted(dataset.token2id(dataset.iid_field,np.array(user_grp[int(sub_user_idx[0])],dtype=str)))


[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185

In [121]:
full_sort_scores(np.array([uid_series[0]]),model,test_data,config['device']).size()

torch.Size([1, 6808])

In [74]:
scores = full_sort_scores(np.array([uid_series[0]]),model,test_data,config['device'])

In [131]:
scores.masked_fill()

tensor([  -inf,   -inf,   -inf,  ..., 0.0102, 0.0019, 0.5001], device='cuda:0')

In [203]:
sub_user_idx = submission['user'].unique()
sub_user_idx = np.array(sub_user_idx,dtype=str)
uid_series = dataset.token2id(dataset.uid_field, sub_user_idx)
total_topk_score, total_topk_iid_list = torch.zeros_like(torch.Tensor(31360, 10)), torch.zeros_like(torch.Tensor(31360, 10))

for idx in tqdm(range(0,len(uid_series))):
    need_inf = dataset.token2id(dataset.iid_field, np.array(user_grp[int(sub_user_idx[idx])],dtype=str))
    mask = [True if i in need_inf else False for i in range(0,6808)]
    scores = full_sort_scores(np.array([uid_series[idx]]),model,test_data,config['device'])
    new_scores=scores.cpu().masked_fill(torch.from_numpy(np.array(mask)),float('-inf'))    
    total_topk_score[idx] = torch.topk(new_scores,10)[0]
    total_topk_iid_list[idx] = torch.topk(new_scores,10)[1]
    
int_iid = total_topk_iid_list.to(torch.int64)
external_item_list = dataset.id2token(dataset.iid_field, int_iid.cpu())
external_item_list = external_item_list.flatten()
df = pd.DataFrame({'user': np.repeat(sub_user_idx, 10), 'item': external_item_list})
df.to_csv("submission.csv",index=False)

100%|██████████| 31360/31360 [15:48<00:00, 33.06it/s]


In [206]:
sub_user_idx = submission['user'].unique()
sub_user_idx = np.array(sub_user_idx,dtype=str)
uid_series = dataset.token2id(dataset.uid_field, sub_user_idx)
total_topk_score, total_topk_iid_list = torch.zeros_like(torch.Tensor(31360, 10)), torch.zeros_like(torch.Tensor(31360, 10))

for idx in tqdm(range(0,len(uid_series))):
    need_inf = dataset.token2id(dataset.iid_field, np.array(user_grp[int(sub_user_idx[idx])],dtype=str))
    mask = [True if i in need_inf else False for i in range(0,6808)]
    scores = full_sort_scores(np.array([uid_series[idx]]),model,test_data,config['device'])
    new_scores=scores.cpu().masked_fill(torch.from_numpy(np.array(mask)),float('-inf'))    
    total_topk_score[idx] = torch.topk(new_scores,10)[0]
    total_topk_iid_list[idx] = torch.topk(new_scores,10)[1]
    
int_iid = total_topk_iid_list.to(torch.int64)
external_item_list = dataset.id2token(dataset.iid_field, int_iid.cpu())
external_item_list = external_item_list.flatten()
df = pd.DataFrame({'user': np.repeat(sub_user_idx, 10), 'item': external_item_list})
df.to_csv("submission.csv",index=False)

  1%|          | 231/31360 [00:06<15:40, 33.10it/s]


KeyboardInterrupt: 

In [201]:
sub_user_idx

array(['11', '14', '18', ..., '138486', '138492', '138493'], dtype='<U21')

In [None]:
sub_user_idx = np.array(sub_user_idx,dtype=str)
uid_series = dataset.token2id(dataset.uid_field, sub_user_idx)

In [153]:
scores

tensor([[  -inf,   -inf,   -inf,  ..., 0.0102, 0.0019, 0.5001]],
       device='cuda:0')

In [161]:
need_inf = dataset.token2id(dataset.iid_field, np.array(user_grp[11],dtype=str))
need_inf

array([  1, 255, 254, 253, 252, 251, 250, 249, 248, 247, 246, 245, 244,
       243, 242, 241, 240, 239, 238, 257, 258, 259, 260, 280, 279, 278,
       277, 276, 275, 274, 273, 272, 281, 271, 269, 268, 267, 266, 265,
       264, 263, 262, 261, 237, 270, 236, 223, 209, 283, 208, 207, 206,
       205, 204, 203, 202, 201, 200, 199, 198, 197, 196, 195, 194, 193,
       192, 191, 210, 211, 212, 213, 233, 232, 231, 230, 229, 228, 227,
       226, 225, 234, 224, 222, 221, 220, 219, 218, 217, 216, 215, 214,
       235, 282, 311, 284, 352, 350, 355, 349, 348, 347, 346, 345, 344,
       353, 343, 341, 340, 339, 338, 337, 336, 335, 334, 333, 342, 354,
       358, 356, 371, 375, 374, 373, 372, 370, 369, 368, 367, 376, 365,
       364, 363, 362, 361, 360, 359, 366, 357, 332, 351, 330, 306, 304,
       303, 302, 301, 300, 299, 298, 297, 296, 305, 295, 293, 292, 291,
       290, 289, 288, 287, 286, 285, 294, 307, 318, 308, 328, 327, 326,
       325, 324, 323, 322, 321, 320, 329, 319, 317, 316, 315, 31

In [166]:
mask = [True if i in need_inf else False for i in range(0,6808)]


In [169]:
torch.from_numpy(np.array(mask))

tensor([False,  True,  True,  ..., False, False, False])

In [172]:
new_scores=scores.cpu().masked_fill(torch.from_numpy(np.array(mask)),float('-inf'))

In [198]:
total_topk_iid_list[0] = torch.topk(new_scores,10).indices

In [200]:
total_topk_iid_list[1]

tensor([ 412.,  149.,   70.,  594.,  154.,  646., 1428.,   83.,  288.,   68.])

In [183]:
torch.topk(new_scores,10).indices

tensor([[ 659,  482,  780,  727,  720,  646,  750, 2474, 1390,  884]])

In [140]:
need_inf=torch.from_numpy(dataset.token2id(dataset.iid_field, np.array(user_grp[11],dtype=str))).cpu()

In [117]:
int_iid.size()

torch.Size([31360, 10])

In [119]:
dataset.token2id(dataset.iid_field, torch.LongTensor(np.array(user_grp[11])))

TypeError: The type of tokens [tensor([ 4643,   170,   531,   616,  2140,  2722,  2313,  2688,  2428,  3113,
         1591,  2600,  8169,  2572, 58293,  7541,  1367,    32,  4792,  7444,
        53953, 56949,  6502, 53000, 51662,  5151, 35836,  7293, 33585,  8810,
        56801,  5377,   344,    19,   410,  2124,   828,  1274,  8977,  1032,
         1214,  1200,  1320,  3897,  7173,  1225,  2858, 59418, 45361,  2706,
         1321,  2793, 33085,  4235,  3892,  4340, 27660, 43556, 47124,  2294,
        48304,   150, 31184, 34338,  1917, 50162,  2827, 27368,  4366,  2153,
        30812,  3525,  1270,  2011,  2012,  8973,  1255,  2018,   541,  4878,
         7361, 31658,  2571,  7099,   260,  1196, 60069,   160,  1882, 60037,
          880, 36509,   405,  3826,  4133,   673,  6541,   611,   172,  4638,
         5171,   208,  4887,  5459, 60760,  8361, 60514,  1544,  1876,   442,
        32213,  5219,  1690,  2717, 27608, 52722,   780,  6934, 52287,  3745,
        45499, 37830, 60040, 34319,  8644,  6365, 34048,   316,  3300,  7022,
         7254, 57368,  1584,  2232,  1748,  1253, 49278, 42718,  1097,  7481,
         5903,  1653,   741, 27728,  2716, 52281, 57640,  2761, 59315,   480,
         3702,  1580,  5445, 52885,  3527, 44849, 27904,  5349,  8636,   589,
         1240,  2916,  3793,  6333,  3054,   546,   256,   173,  2642,  7846,
         8870, 54771,  7845, 34150, 51412,  3697,  6534,  7163,  5463,  3438,
         3269,  8371,  5046,  6537,  4446,  8865,  5378, 32031,  3704,  4533,
         6659,  2134,  1676,  2105,  5502,  3033,   592, 56174,   318, 60832,
         4993,  3114,  1136,  1073,   648,   110,  8622,  6874,  6863,  6539,
         6377,  5816,  5669,  5618,  5299,  3863,  3752,  2355,  2291,  1339,
         1029,  1028,   784,   610,   551,   158, 54001, 50872, 48394, 45722,
        33679, 31878, 30793,  6953,  6503,  6281,  5782,  5400,  5266,  4388,
         4299,  4054,  3988,  3980,  3969,  3247,  2840,  2720,  2004,   761,
        56757, 41571, 36401,  8965,  8907,  8640,  7439,  6996,  5283,  4974,
         2959,  2240,  2042,  2028,   296,   356,   593,     1,  1210,   608,
          377,  1198,   165,   595,   153,   364,   597,   231,   500,   367,
         1721,   587,  1197,  2628,   253,  1291,  3578,  1036,   185,  4306,
          586,    39,  5952,  2396,  1961,  1527,  1387,  8368,  6378,  6016,
         4896,  3000,  2810,  3159,  7458,  5152, 33493, 41566,  4973, 33794,
         4995, 33660, 33166, 58559, 62265, 27441, 67295,   653, 55999, 63859,
         3753,  8972, 53125,  3083,  5225,  1046, 64034, 53996, 43932, 26887,
        66171, 68319, 51698,  5444,  6731,  2455, 56145,   968, 54995, 52328,
         3994, 44191,  6979, 60684,  6250, 44731, 64497,  3864,  6294, 64997,
        67197, 69526,  5882, 31660,  1391,  4571,  4492,  2985,   380,   527,
          588,   736,  2762, 41569, 72641, 48774, 48516, 71282, 64614, 34542,
        65514, 48738,  6291, 46578,  7153,  4226])] is not supported

In [None]:
device = config['device']
uid_series = torch.tensor(np.array([uid_series[idx]]))
uid_field = test_data.dataset.uid_field
dataset = test_data.dataset
model.eval() 


In [87]:
input_interaction = dataset.join(Interaction({uid_field: uid_series}))

In [88]:
history_item = test_data.uid2history_item[list(uid_series)]

In [105]:
valid_data.uid2history_item[list(uid_series)]

array([None], dtype=object)

In [110]:
type(valid_data)

recbole.data.dataloader.general_dataloader.FullSortEvalDataLoader

In [112]:
test_data.uid2history_item[list(uid_series)]

array([None], dtype=object)