In [40]:
import os
import pickle
from pathlib import Path
from urllib.parse import quote

import numpy as np
import pandas as pd
import torch
from environ import Env
from recbole.data import create_dataset, data_preparation, Interaction
from recbole.utils import init_logger, get_trainer, get_model, init_seed, set_color
from recbole.utils.case_study import full_sort_topk
from sqlalchemy import create_engine
from tqdm import tqdm

In [43]:
# Project root = parent of the current working directory.
# Build paths inside the project like this: BASE_DIR / 'subdir'.
BASE_DIR = Path(os.curdir).resolve().parent

# Read the Django .env file when it exists (passed with overwrite=True).
env = Env()
env_path = BASE_DIR / "django/.env"
if env_path.exists():
    with env_path.open("rt", encoding="utf8") as f:
        env.read_env(f, overwrite=True)

# Database connection settings; all four are looked up through `env`.
dbname = env.get_value('GCPDB_NAME')
user = env.get_value('GCPDB_USER')
pw = env.get_value('GCPDB_PASSWORD')
host = env.get_value('GCPDB_HOST')

# Create the engine. The user name and password are percent-encoded so that
# characters such as '@', ':', '/' or '+' cannot corrupt the connection URL;
# purely alphanumeric credentials are left unchanged by the encoding.
engine = create_engine(
    f'mysql+mysqldb://{quote(user, safe="")}:{quote(pw, safe="")}@{host}:3306/{dbname}?charset=utf8'
)

In [68]:
# Inference settings: number of recommendations per user, the model to load,
# and the checkpoint location (relative, plus its absolute form).
topk, model_name = 500, "EASE"
model_path = "./saved/EASE-Feb-05-2023_05-23-53.pth"
model_path_absolute = Path.cwd() / model_path

In [70]:
# Show the absolute checkpoint path as a plain string.
os.fspath(model_path_absolute)

'/opt/ml/project2/BaseLine/saved/EASE-Feb-05-2023_05-23-53.pth'

In [64]:
print('inference start!')
if model_path is None:
    # No checkpoint was given: use the most recently modified .pth file in
    # ./saved whose name contains model_name. Selection is by modification
    # time, because the timestamp embedded in the file names (e.g.
    # "Feb-05-2023") does not sort chronologically as text.
    os.makedirs('saved',exist_ok=True)
    save_path = os.listdir('./saved')
    candidates = [
        './saved/' + file
        for file in save_path
        if model_name in file and file.endswith('.pth')
    ]
    if not candidates:
        raise FileNotFoundError(
            f"no .pth checkpoint containing '{model_name}' found in ./saved"
        )
    model_path = max(candidates, key=os.path.getmtime)

K = topk

# Load the config stored in the checkpoint
checkpoint = torch.load(model_path)
config = checkpoint['config']

init_seed(config['seed'], config['reproducibility'])
config['dataset'] = 'train_data'
# Override the evaluation split ratio.
# NOTE(review): presumably this puts (almost) every interaction into the
# training split so inference uses the full history - confirm. The two
# branches use different magnitudes (99999 vs 999999); verify this is intended.
if model_name=="S3Rec":
    config['eval_args']['split']={'RS':[99999,0,1]}
else:
    config['eval_args']['split']['RS']=[999999,0,1]
print("create dataset start!")
dataset = create_dataset(config)
train_data, valid_data, test_data = data_preparation(config, dataset)
print("create dataset done!")

# Rebuild the model from the config and restore the saved weights/parameters
model = get_model(config['model'])(config, train_data.dataset).to(config['device'])
model.load_state_dict(checkpoint['state_dict'])
model.load_other_parameter(checkpoint.get('other_parameter'))

# Device used for inference
device = config.final_config_dict['device']

# Arrays mapping internal user / item ids to their original tokens
user_id = config['USER_ID_FIELD']
item_id = config['ITEM_ID_FIELD']
user_id2token = dataset.field2id_token[user_id]
item_id2token = dataset.field2id_token[item_id]

inference start!
create dataset start!


In inter_feat, line [6348477, 6348671], item_id do not exist, so they will be removed.


{'RS': [999999, 0, 1]}
create dataset done!


In [65]:
# Internal indices of every user whose original id token is >= 300_000;
# the '[PAD]' placeholder entry is skipped before the integer conversion.
user_list = [
    idx
    for idx, token in enumerate(user_id2token)
    if token != '[PAD]' and int(token) >= 300_000
]

In [66]:
# Tensor of the selected internal user indices. torch.tensor always copies its
# input, so no separate copy of user_list is needed (the previous `copy(...)`
# call referenced a name that is never imported). The explicit integer dtype
# keeps an empty user_list from producing a float tensor.
user_tensor_list = torch.tensor(user_list, dtype=torch.long)
user_tensor_list

tensor([161860, 161861, 161862, 161863, 161864, 161865, 161866, 161867, 161868,
        161869, 161870, 161871])

In [67]:
# Top-k prediction for the selected users; only the second element of the
# returned pair (the predicted item indices) is used downstream.
topk_scores, pred_list = full_sort_topk(
    user_tensor_list, model, test_data, topk, device=device
)

  uid_series = torch.tensor(uid_series)


In [22]:
# Flatten the per-user predictions into (user token, item token) pairs,
# one row per recommended item, and wrap them in a DataFrame.
result = [
    (int(user_id2token[user_idx]), int(item_id2token[item_idx]))
    for user_idx, item_indices in zip(user_list, pred_list)
    for item_idx in item_indices
]
sub = pd.DataFrame(result, columns=["user", "item"])

In [24]:
# One row per user, with that user's recommended items collected into a list.
items_per_user = sub.groupby('user')['item'].apply(list)
result = items_per_user.reset_index()

In [49]:
# Attach constant metadata columns describing this batch run.
batch_metadata = {
    'model_name': model_name,
    'model_path': model_path_absolute,
    'create_time': str(pd.Timestamp.now()),
    'id': 0,
}
for column_name, column_value in batch_metadata.items():
    result[column_name] = column_value

In [30]:
# Shift user ids down by 300_000. Running this cell again shifts them again.
result['user'] = result['user'] - 300_000

In [35]:
# Rename the columns to the names expected by the target table.
result = result.rename(
    columns={
        'user': 'LoginUser_id',
        'item': 'recommended_movie_list',
    }
)

In [45]:
# Put the columns into the order used for the upload.
cols = [
    'id',
    'LoginUser_id',
    'model_name',
    'model_path',
    'recommended_movie_list',
    'create_time',
]
result = result.loc[:, cols]

In [55]:
# Append the rows to the 'common_batchtrain' table, with every column
# converted to text first.
result_as_text = result.astype(str)
result_as_text.to_sql(
    name='common_batchtrain',
    con=engine,
    index=False,
    if_exists='append',
)

12

In [51]:
# Keep only the recommendations whose item appears in mdf's movieId column.
# NOTE(review): `mdf` is not defined anywhere in the visible notebook - confirm
# where it is loaded before relying on this cell.
is_known_movie = sub['item'].isin(mdf['movieId'])
sub.loc[is_known_movie]

Unnamed: 0,user,item
0,300001,35836
1,300001,80463
2,300001,7451
3,300001,182715
4,300001,7293
...,...,...
145,300001,45666
146,300001,70286
147,300001,105213
148,300001,1954


In [None]:
# TODO: user별 인터랙션 확인
# TODO: user별로 1,200개 영화에 포함되는 추천을 안정적으로 받을 수 있는 K 찾기 -> K를 아주 넉넉하게 주기 (거의 전체를 sort한다는 느낌)
# TODO: user별로 1,200개 영화 중 100개 이상을 추천받도록 하기


In [None]:
# Number of users / items (both counts include the [PAD] entry)
user_len = len(user_id2token)
item_len = len(item_id2token)

# user-item sparse interaction matrix
matrix = dataset.inter_matrix(form='csr')

# Every real user index (index 0 is skipped), cut into batches of at most 128.
# split() allows a shorter final batch, so the user count no longer has to be
# an exact multiple of the batch size.
batch_size = 128
all_user_ids = torch.arange(1, user_len)
all_user_list = all_user_ids.split(batch_size) if len(all_user_ids) else ()

tbar = tqdm(all_user_list, desc=set_color("Inference", 'pink'))

# Collect per-batch arrays and join them once after the loop, instead of
# re-copying the accumulated array on every iteration.
batch_preds = []
batch_users = []
for data in tbar:
    # K+50 candidates are requested per user; element [1] holds the item indices.
    batch_pred_list = full_sort_topk(data, model, test_data, K+50, device=device)[1]
    batch_preds.append(batch_pred_list.detach().cpu().numpy())
    batch_users.append(data.numpy())
tbar.close()

if batch_preds:
    pred_list = np.concatenate(batch_preds, axis=0)
    user_list = np.concatenate(batch_users, axis=0)
else:
    # No users to score: keep the same empty placeholders as before.
    pred_list = None
    user_list = []

# Flatten the per-user predictions into (user token, item token) pairs
result = [
    (int(user_id2token[user_idx]), int(item_id2token[item_idx]))
    for user_idx, item_indices in zip(user_list, pred_list)
    for item_idx in item_indices
]

# Load the index -> original user / item number dictionaries.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open('./index/uidx2user.pickle','rb') as f:
    uidx2user = pickle.load(f)
with open('./index/iidx2item.pickle','rb') as f:
    iidx2item = pickle.load(f)

# Build the submission frame once and map both columns through the dictionaries
sub = pd.DataFrame(result, columns=["user", "item"])
sub['user'] = sub['user'].map(uidx2user)
sub['item'] = sub['item'].map(iidx2item)

# Keep the first K rows of every user (original row order is preserved)
sub = sub.groupby('user').head(K).reset_index(drop=True)[['user', 'item']]

print(f"submission length: {sub.shape[0]}")

os.makedirs('submission',exist_ok=True)
# Name the CSV after the checkpoint file, without its directory and ".pth"
# suffix. Path.stem works for any checkpoint location, not only "./saved/".
submission=f"./submission/{Path(model_path).stem}.csv"
# NOTE(review): `uniquify` is not defined in the visible notebook; presumably it
# returns a path that does not collide with an existing file - confirm.
submission = uniquify(submission)
sub[['user','item']].to_csv(submission, index=False)
print(f"model path: {model_path}")
print(f"submission path: {os.path.relpath(submission)}")
print('inference done!')