In [1]:
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
import pickle
from recbole.data import create_dataset, data_preparation, Interaction
from recbole.utils import init_logger, get_trainer, get_model, init_seed, set_color
from recbole.utils.case_study import full_sort_topk
import os

In [44]:
# Load the two pickled tables used for the sanity checks in this notebook:
# cdf <- character/movie merge table, mdf <- popular-movie catalogue
# (descriptions taken from the file names).
# NOTE: read_pickle executes pickled code; only use files from a trusted source.
cdf, mdf = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        '../Utils/Pickle/230203_character_movie_merge.pickle',
        '../Utils/Pickle/230130_Popular_movie_1192_cwj.pickle',
    )
)

In [46]:
# Show the (rows, columns) of both tables as a quick sanity check.
cdf.shape, mdf.shape

((2869, 11), (1192, 13))

In [6]:
# Inference settings.
# model_name : substring used to look up a checkpoint when model_path is None.
# model_path : explicit checkpoint file to load.
# topk       : number of recommendations wanted per user.
model_name = "EASE"
model_path = "./saved/EASE-Feb-05-2023_05-23-53.pth"
topk = 100

In [8]:
print('inference start!')
if model_path is None:
    # No checkpoint was given: fall back to the most recently modified file in
    # ./saved whose name contains model_name. The choice is made by mtime, not
    # by sorted file name, because checkpoint names embed a month abbreviation
    # (e.g. "EASE-Feb-05-2023_05-23-53.pth") and therefore do not sort
    # chronologically.
    os.makedirs('saved', exist_ok=True)
    candidates = ['./saved/' + file for file in os.listdir('./saved') if model_name in file]
    if not candidates:
        raise FileNotFoundError(
            f"no checkpoint whose name contains '{model_name}' was found in ./saved"
        )
    model_path = max(candidates, key=os.path.getmtime)

K = topk

# Restore the config stored inside the checkpoint.
# NOTE: torch.load unpickles the file; only load checkpoints you trust.
checkpoint = torch.load(model_path)
config = checkpoint['config']

init_seed(config['seed'], config['reproducibility'])
config['dataset'] = 'train_data'
# Override the ratio split used for evaluation.
# NOTE(review): presumably this pushes (almost) every interaction into the
# training part so inference sees the full history — confirm. The two branches
# use different magnitudes (99999 vs 999999); left untouched.
if model_name=="S3Rec":
    config['eval_args']['split']={'RS':[99999,0,1]}
else:
    config['eval_args']['split']['RS']=[999999,0,1]
print("create dataset start!")
dataset = create_dataset(config)
train_data, valid_data, test_data = data_preparation(config, dataset)
print("create dataset done!")

# Device the model runs on (also passed to full_sort_topk later).
device = config['device']

# Rebuild the model and restore its learned state.
model = get_model(config['model'])(config, train_data.dataset).to(device)
model.load_state_dict(checkpoint['state_dict'])
model.load_other_parameter(checkpoint.get('other_parameter'))

# Lookup arrays: internal user / item index -> original token
user_id = config['USER_ID_FIELD']
item_id = config['ITEM_ID_FIELD']
user_id2token = dataset.field2id_token[user_id]
item_id2token = dataset.field2id_token[item_id]

inference start!
create dataset start!


In inter_feat, line [6348477, 6348671], item_id do not exist, so they will be removed.


{'RS': [999999, 0, 1]}
create dataset done!


In [56]:
# Inspect the internal-index -> user-token lookup array.
user_id2token

array(['[PAD]', '1', '2', ..., '300010', '300011', '300013'], dtype='<U6')

In [21]:
# Collect the internal indices of every user whose original id is >= 300000,
# printing each (index, token) pair along the way.
user_list = []
for idx, token in enumerate(user_id2token):
    # Skip the padding entry and all users below the id threshold.
    if token == '[PAD]' or int(token) < 300_000:
        continue
    print(idx, token)
    user_list.append(idx)

161860 300001
161861 300002
161862 300003
161863 300004
161864 300005
161865 300006
161866 300007
161867 300008
161868 300009
161869 300010
161870 300011
161871 300013


In [22]:
# Internal indices of the selected users.
user_list

[161860,
 161861,
 161862,
 161863,
 161864,
 161865,
 161866,
 161867,
 161868,
 161869,
 161870,
 161871]

In [36]:
# Turn the selected internal user indices into a tensor and display it.
user_tensor_list = torch.tensor(user_list)
user_tensor_list

tensor([161860, 161861, 161862, 161863, 161864, 161865, 161866, 161867, 161868,
        161869, 161870, 161871])

In [47]:
# Rank all items for the first selected user and keep the K+50 best item
# indices (element [1] of full_sort_topk's result).
# The user ids are passed as a NumPy array: full_sort_topk wraps its input in
# torch.tensor(), which emits a copy-construct UserWarning when handed a tensor.
pred_list = full_sort_topk(user_tensor_list[:1].numpy(), model, test_data, K+50, device=device)[1]
pred_list

  uid_series = torch.tensor(uid_series)


tensor([[ 56, 203, 276, 293, 114, 473, 223, 597, 208, 736, 295, 755, 288, 669,
         588, 401, 395, 882, 229, 657, 396, 636, 270, 269, 129, 643, 705, 325,
         393, 806, 572, 645, 649, 474, 741, 535, 465, 155, 206, 429,  37, 505,
         477, 804, 527, 807, 326, 866, 641,  49, 204, 690, 523, 452, 503, 264,
         332,  18, 383, 701, 590, 544, 529, 850,  20, 651, 258, 292, 632, 220,
         631, 268, 329, 502, 596, 128, 792, 545, 439, 105, 711,  51,   1, 249,
         454, 714, 762, 746, 616, 163, 818, 216, 172,  31,  38, 650, 445,  46,
          99, 246, 154,  36, 629, 558, 791, 619, 245, 308,  34, 381, 528,  71,
         262,  32, 412, 397, 422, 552, 151, 355, 595, 303,  57, 582, 449, 648,
         672, 531,  82, 769, 344, 743,  22, 121, 277, 698, 692, 375, 143, 499,
         832,  90, 817, 676, 654, 140, 189, 235, 408, 103]])

In [48]:
# user별 item 추천 결과 하나로 합쳐주기
result = []
for user, pred in zip(user_list, pred_list):
    for item in pred:
        result.append((int(user_id2token[user]), int(item_id2token[item])))
sub = pd.DataFrame(result, columns=["user", "item"])

In [53]:
# Inspect the movieId column of the movie table.
mdf.movieId

0        73141
1       122470
2       152081
3         4447
4         1246
         ...  
1187    300997
1188    300998
1189    300999
1190    301010
1191    301014
Name: movieId, Length: 1192, dtype: int64

In [55]:
# Attach movie metadata to each recommendation; the (default) inner join keeps
# only recommended items whose id appears in mdf.movieId.
sub.merge(
    right=mdf,
    how='inner',
    left_on='item',
    right_on='movieId',
)

Unnamed: 0,user,item,Contents,ko_title,contents_year,country,ko_genre,ko_plot,rating,npop,genres,plot,directors,actors,movieId
0,300001,35836,The 40-Year-Old Virgin (2005),40살까지 못해본 남자,2005,미국,로맨스/코미디,40세의 노총각 앤디는 지금까지 한번도 섹스를 못해본 그야말로 숫총각 이다. 대형 ...,3.1,20000,"'Comedy', 'Romance'","Goaded by his buddies, a nerdy guy who\'s neve...",'Judd Apatow',"'Steve Carell', 'Catherine Keener', 'Paul Rudd...",35836
1,300001,80463,The Social Network (2010),소셜 네트워크,2010,미국,전기/드라마,2004년 첫 투자금 단돈 1천 달러,3.5,300000,"'Biography', 'Drama'",As Harvard student Mark Zuckerberg creates the...,'David Fincher',"'Jesse Eisenberg', 'Rooney Mara', 'Bryan Barte...",80463
2,300001,7451,Mean Girls (2004),퀸카로 살아남는 법,2004,"캐나다,미국",로맨틱 코미디/코미디,동물학자인 아버지를 따라 아프리카에서 성장한 케이디 (린제이 로한 분)는 일리노이즈...,3.7,420000,'Comedy',"Cady Heron is a hit with The Plastics, the A-l...",'Mark Waters',"'Lindsay Lohan', 'Rachel McAdams', 'Tina Fey',...",7451
3,300001,182715,Annihilation (2018),서던 리치: 소멸의 땅,2018,"영국,미국",판타지/미스터리/공포/드라마/SF/스릴러/액션/모험,불가사의한 이유로 사람들의 출입이 금지된 채 방치된 미국 해안지대에 펼쳐져 있는 미...,3.3,30000,"'Adventure', 'Drama', 'Horror', 'Mystery', 'Sc...","A biologist signs up for a dangerous, secret e...",'Alex Garland',"'Natalie Portman', 'Benedict Wong', 'Sonoya Mi...",182715
4,300001,7293,50 First Dates,첫 키스만 50번째,2004,미국,코미디/드라마/로맨스,"그녀에겐 매일이 첫 데이트, 첫 키스, 첫 사랑!? 로맨틱 아일랜드 하와이에서 펼쳐...",3.7,380000,"'Comedy', 'Drama', 'Romance'",Henry Roth is a man afraid of commitment until...,'Peter Segal',"'Adam Sandler', 'Drew Barrymore', 'Rob Schneid...",7293
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,300001,45666,Nacho Libre (2006),나쵸 리브레,2006,"독일,미국",코미디/가족/스포츠,수도사 이그나쵸는 요리와 환자 심방 외엔 주어지는 일거리가 없는 수도원의 미운 오리...,3.4,30000,"'Comedy', 'Family', 'Sport'","Berated all his life by those around him, a mo...",'Jared Hess',"'Jack Black', 'Ana de la Reguera', 'Héctor Jim...",45666
146,300001,70286,District 9 (2009),디스트릭트 9,2009,"남아프리카 공화국,미국,뉴질랜드,캐나다",액션/SF/스릴러,남아공 상공에 불시착한 외계인들은 요하네스버그 인근 지역 외계인 수용구역 ‘디스트릭...,3.8,410000,"'Action', 'Sci-Fi', 'Thriller'",Violence ensues after an extraterrestrial race...,'Neill Blomkamp',"'Sharlto Copley', 'Jason Cope', 'Nathalie Bolt...",70286
147,300001,105213,Don Jon (2013),돈 존,2013,미국,코미디/드라마/로맨스,섹시녀들의 사랑을 한 몸에 받고 있는 클럽의 지존. 그러나 그 어떤 섹시녀와의 하룻...,2.9,90000,"'Comedy', 'Drama', 'Romance'","A New Jersey guy dedicated to his family, frie...",'Joseph Gordon-Levitt',"'Joseph Gordon-Levitt', 'Scarlett Johansson', ...",105213
148,300001,1954,Rocky,록키 발보아,2006,미국,액션/드라마/스포츠,최고의 헤비급 챔피언에서 성공한 사업가로 변신한 록키. 록키의 즐거움은 레스토랑을 ...,3.6,30000,"'Drama', 'Sport'",A small-time Philadelphia boxer gets a supreme...,'John G. Avildsen',"'Sylvester Stallone', 'Talia Shire', 'Burt You...",1954


In [51]:
# Recommendations whose item id is present in the movie table.
sub.loc[sub['item'].isin(mdf['movieId'])]

Unnamed: 0,user,item
0,300001,35836
1,300001,80463
2,300001,7451
3,300001,182715
4,300001,7293
...,...,...
145,300001,45666
146,300001,70286
147,300001,105213
148,300001,1954


In [None]:
user별 인터랙션 확인
user별로, 1200개 영화 안에 포함되는 추천을 안정적으로 받을 수 있는 K 찾기 -> 아주 넉넉하게 주기 (거의 전체를 sort한다는 느낌으로)
user별로 1200개 영화 중 100개 이상을 추천받도록 하기


In [None]:
def uniquify(path):
    """Return a file path that does not collide with an existing file.

    `path` is returned unchanged when nothing exists at that location.
    Otherwise " (1)", " (2)", ... is inserted before the extension until an
    unused name is found. (This helper is called below but was not defined or
    imported anywhere in the notebook.)
    """
    stem, ext = os.path.splitext(path)
    counter = 1
    while os.path.exists(path):
        path = f"{stem} ({counter}){ext}"
        counter += 1
    return path


BATCH_SIZE = 128       # users scored per full_sort_topk call
N_CANDIDATES = K + 50  # over-fetch beyond K; trimmed back to K per user below

# Number of users / items (both counts include the [PAD] entry)
user_len = len(user_id2token)
item_len = len(item_id2token)

# user-item sparse interaction matrix (CSR)
matrix = dataset.inter_matrix(form='csr')

# Every internal user index except 0 ([PAD]), cut into batches. split() lets
# the last batch be shorter; the former .view(-1, 128) raised whenever the
# number of users was not an exact multiple of 128.
all_user_list = torch.arange(1, user_len).split(BATCH_SIZE)

# Accumulate per-batch results in plain lists (appending NumPy arrays inside
# the loop re-copied everything on each iteration).
user_list = []
pred_list = []
for batch in tqdm(all_user_list, desc=set_color(f"Inference", 'pink')):
    # Element [1] of the result holds the top item indices for each user.
    # The ids are passed as a NumPy array because full_sort_topk wraps its
    # input in torch.tensor(), which warns when it is handed a tensor.
    batch_pred = full_sort_topk(batch.numpy(), model, test_data, N_CANDIDATES, device=device)[1]
    user_list.extend(batch.tolist())
    pred_list.extend(batch_pred.detach().cpu().tolist())

# Flatten the per-user recommendations into (user token, item token) pairs
result = [
    (int(user_id2token[uid]), int(item_id2token[iid]))
    for uid, preds in zip(user_list, pred_list)
    for iid in preds
]

# Load the index -> original user / item id dictionaries.
# NOTE: pickle.load executes arbitrary code; only open trusted files.
with open('./index/uidx2user.pickle','rb') as f:
    uidx2user = pickle.load(f)
with open('./index/iidx2item.pickle','rb') as f:
    iidx2item = pickle.load(f)

# Build the submission frame and map both columns through the dictionaries
sub = pd.DataFrame(result, columns=["user", "item"])
sub["user"] = sub["user"].map(uidx2user)
sub["item"] = sub["item"].map(iidx2item)

# Keep the first K rows of every user (rows are already in ranked order)
sub = sub.groupby('user').head(K).reset_index(drop=True)

print(f"submission length: {sub.shape[0]}")

os.makedirs('submission',exist_ok=True)
# Name the CSV after the checkpoint file, without its directory and ".pth"
run_name = os.path.splitext(os.path.basename(model_path))[0]
submission = uniquify(f"./submission/{run_name}.csv")
sub[['user','item']].to_csv(submission, index=False)
print(f"model path: {model_path}")
print(f"submission path: {os.path.relpath(submission)}")
print('inference done!')