In [1]:
import os
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, SequentialSampler

from datasets import SASRecDataset
from models import S3RecModel
from trainers import FinetuneTrainer

In [2]:
from utils import (
    check_path,
    generate_submission_file,
    get_item2attribute_json,
    get_user_seqs,
    set_seed,
    __save_labels
)

In [3]:
from collections import namedtuple
args = namedtuple("Config", [])

In [4]:
args.data_dir = "../../../data/train/"
args.output_dir = "output/"
args.data_name  = "rb"
args.model_name = "Finetune_full"
args.hidden_size = 64 # hidden size of transformer model
args.num_hidden_layers = 2 # number of layers
args.num_attention_heads = 2 # num_attention_heads
args.hidden_act = "gelu"
args.attention_probs_dropout_prob = 0.5 # attention dropout p
args.hidden_dropout_prob = 0.5 # hidden dropout p
args.initializer_range=0.02
args.max_seq_length=50

# train args
args.lr=0.001 # "learning rate of adam"
args.batch_size=256 # number of batch_size"
args.epochs=200#"number of epochs")
args.log_freq=1 #per epoch print res")
args.seed=42
args.weight_decay=0.0 #, help="weight_decay of adam"
args.adam_beta1=0.9 #"adam first beta value"
args.adam_beta2=0.999 #"adam second beta value"
args.gpu_id="0"#"gpu_id"

In [5]:
set_seed(args.seed)
check_path(args.output_dir) # output_dir 로 설정한 경로가 없으면 생성

In [6]:
args.no_cuda = False # gpu 있음
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id
args.cuda_condition = torch.cuda.is_available() and not args.no_cuda

In [7]:
args.data_file = args.data_dir + "train_ratings.csv"
item2attribute_file = args.data_dir + args.data_name + "_item2attributes.json"

In [8]:
# args.data_fil 여기경로를 지나쳐서 index에 관한 정보를 한 번 업데이트하고 지나가자.
# def get_user_seqs(data_file):
# 이거는 training 한정
rating_df = pd.read_csv(args.data_file)



# user_seq, max_item, _, _, submission_rating_matrix = get_user_seqs(args.data_file)
# item2attribute, attribute_size = get_item2attribute_json(item2attribute_file)

# args.item_size = max_item + 2
# args.mask_id = max_item + 1
# args.attribute_size = attribute_size + 1

In [9]:
# le = LabelEncoder()
# if is_train:
#     # For UNKNOWN class
#     a = df[col].unique().tolist() + ["unknown"]
#     le.fit(a)
#     self.__save_labels(le, col)
# else:
#     label_path = os.path.join(self.args.asset_dir, col + "_classes.npy")
#     le.classes_ = np.load(label_path)

#     df[col] = np.where(df[col].isin(le.classes_), df[col], "unknown")


In [10]:
# TODO 이 부분은 학습때 자동 저장되도록 설정해야한다. 
le = LabelEncoder()
raw_item_list = rating_df["item"].unique().tolist()+ [-99999] # "unknown" -> -99999
le.fit(raw_item_list)
__save_labels(args.output_dir, le, "item")

In [11]:
# TODO inference 할 때는, 이런 것들을 불러오도록 해야한다. 

label_path = os.path.join(args.output_dir, "item" + "_classes.npy") # args.asset_dir -> args.output_dir
le.classes_ = np.load(label_path)

In [12]:
rating_df["item"] = le.transform(rating_df["item"])
# le.inverse_transform([1663, 1241, 1446, 5619, 4295, 9335])

In [13]:
# save model args
args_str = f"{args.model_name}-{args.data_name}"

# print(args)

# args.item2attribute = item2attribute

# args.train_matrix = submission_rating_matrix

checkpoint = args_str + ".pt"
args.checkpoint_path = os.path.join(args.output_dir, checkpoint)

# submission_dataset = SASRecDataset(args, user_seq, data_type="submission")
# submission_sampler = SequentialSampler(submission_dataset)
# submission_dataloader = DataLoader(
#     submission_dataset, sampler=submission_sampler, batch_size=args.batch_size
# )

In [14]:
# 따로 저장해야하는 것들
# max_item # 
user_seq, rating_seq, max_item, valid_rating_matrix, test_rating_matrix, _ = get_user_seqs(
    args, # args.data_file, args.output_dir
    is_train = True
)
item2attribute, attribute_size = get_item2attribute_json(item2attribute_file)



In [15]:
args.item_size = max_item + 2
args.mask_id = max_item + 1
args.attribute_size = attribute_size + 1


In [16]:
model = S3RecModel(args=args)

In [17]:
file_name = args.checkpoint_path
model.load_state_dict(torch.load(file_name))

<All keys matched successfully>

In [18]:
model = model.to(device="cuda:0")

In [224]:
beer_pick = [5588, 2228, 473, 37, 268, 390, 729, 86387, 1478, 730] # 민철님
beer_pick = [730, 268685, 35424, 2228, 86387, 37] #칭따오, 타이거, 블루문, 블랑, # 아현님 동생분
beer_pick = [251, 2228, 473, 4007, 1267, 86387, 730] #경태
beer_pick = [614833, 5588, 709, 814, 1267, 37, 315443, 717] #동석님
input_ids = le.transform(beer_pick).tolist()
input_ratings = [1, 1, 5, 5 ,5, 5, 5, 5, 5, 5]
input_ratings = [5, 5, 1, 1, 1, 5]
input_ratings = [5, 5, 5, 5, 5, 5, 5]
input_ratings = [1, 1, 5, 1, 5, 5, 5, 5]
# input_ratings = [5,5,1,1,1,1,1,1,1,1]

pad_len = args.max_seq_length - len(input_ids)
input_ids = [0] * pad_len + input_ids
input_ratings = [0] * pad_len + input_ratings

input_ids = input_ids[-args.max_seq_length :]
input_ratings = input_ratings[-args.max_seq_length :]


input_ids = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0).to("cuda:0")
input_ratings = torch.tensor(input_ratings, dtype=torch.float32).unsqueeze(0).to("cuda:0")

In [225]:
# input_ids
print("input_ids.size ===> ", input_ids.size()) # [batch * max_seq]

# attention_mask
attention_mask = (input_ids > 0).long()
weighted_mask = input_ratings

print("attention_mask.size ===> ", attention_mask.size()) # [batch * max_seq]

# extended_attention_mask
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)  # torch.int64
extended_weighted_mask = weighted_mask.unsqueeze(1).unsqueeze(2)
print("extended_attention_mask.size ===> ", extended_attention_mask.size()) # [batch * 1 * 1 * max_seq]
print("extended_weighted_mask.size ===> ", extended_weighted_mask.size()) # [batch * 1 * 1 * max_seq]


max_len = attention_mask.size(-1)
print("max_len ==> ", max_len)
attn_shape = (1, max_len, max_len) 
print("attn_shape ==> ", attn_shape)

# subsequent_mask
subsequent_mask = torch.triu(torch.ones(attn_shape), diagonal=1)  # torch.uint8
print(subsequent_mask)
print("======== subsequent_mask =========")
subsequent_mask = (subsequent_mask == 0).unsqueeze(1)
subsequent_mask = subsequent_mask.long().to("cuda:0")
print("subsequent_mask.size ===> ", subsequent_mask.size()) # [batch * 1 * 1 * max_seq] 하위 삼각형이 1이다.
# print(subsequent_mask)


extended_attention_mask = extended_attention_mask * subsequent_mask
weighted_mask_by_score = extended_weighted_mask * subsequent_mask  # 이거는 곱해지는거.
# extended_attention_mask = extended_attention_mask
# print(extended_attention_mask)
print("weighted_mask_by_score")
print(weighted_mask_by_score)

extended_attention_mask
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
print("extended_attention_mask")
print(extended_attention_mask)



input_ids.size ===>  torch.Size([1, 50])
attention_mask.size ===>  torch.Size([1, 50])
extended_attention_mask.size ===>  torch.Size([1, 1, 1, 50])
extended_weighted_mask.size ===>  torch.Size([1, 1, 1, 50])
max_len ==>  50
attn_shape ==>  (1, 50, 50)
tensor([[[0., 1., 1.,  ..., 1., 1., 1.],
         [0., 0., 1.,  ..., 1., 1., 1.],
         [0., 0., 0.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 1., 1.],
         [0., 0., 0.,  ..., 0., 0., 1.],
         [0., 0., 0.,  ..., 0., 0., 0.]]])
subsequent_mask.size ===>  torch.Size([1, 1, 50, 50])
weighted_mask_by_score
tensor([[[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 5., 0., 0.],
          [0., 0., 0.,  ..., 5., 5., 0.],
          [0., 0., 0.,  ..., 5., 5., 5.]]]], device='cuda:0')
extended_attention_mask
tensor([[[[-10000., -10000., -10000.,  ..., -10000., -10000., -10000.],
          [-10000., -10000

In [226]:
recommend_output = model.finetune(input_ids, input_ratings)
recommend_output = recommend_output[:, -1, :]

In [227]:
# recommend_output [1(batch) hidden_dim]
# model.item_embeddings.weight: [max_item hidden_dim]


In [228]:
result_scores = torch.matmul(model.item_embeddings.weight, recommend_output.transpose(0, 1)).squeeze(1)
result_scores = result_scores.cpu().data.numpy().copy()

In [229]:
result_scores = result_scores[1:] # remove "-99999 value('unseen')"

In [230]:
K=8
sorted_items = result_scores.argsort()

# 방금 체크 했던거는 제거
checked_right_before = input_ids[input_ids > 0]
sorted_items = sorted_items[~pd.Series(sorted_items).isin(checked_right_before)]
# print(sorted_items)
# 원래의 id로 되돌아오기
sorted_items = le.inverse_transform(sorted_items.argsort()) # 뒤로갈수록 추천해주고 싶은 맥주

In [231]:
# 이것도 DB에서 불러오는 것으로...
filter_df = pd.read_csv("../../../data/ratebeer_list_inKorea.csv")
filter_df.columns = ["beer_name", "beer_id", "mik"]
target_items = filter_df[filter_df["mik"].isna()]["beer_id"].tolist()

In [232]:
# target_items = [12769, 71477, 43176, 116553, 7686, 82006] # 우리가 취급할 맥주만 선택: 한국에서 파는 맥주만
sorted_target_items = sorted_items[pd.Series(sorted_items).isin(target_items)]
# sorted_target_items = sorted_items

sorted_target_items_topk = sorted_target_items[-K:]
sorted_target_items_topk

array([452525,  30913, 149060, 136785,   5588,  15747, 146725, 315443])

In [233]:
id2name = {int(beer_id):str(beer_name[0]) for beer_id, beer_name in filter_df[["beer_id", "beer_name"]].set_index("beer_id").iterrows()}

In [234]:
# last result
# filter_df[filter_df["beer_id"].isin(sorted_target_items_topk)]["beer_name"]#.tolist()
[id2name[beer_id] for beer_id in sorted_target_items_topk]


['Volfas Engelman Grünberger',
 'Goose Island 312 Urban Wheat Ale',
 'Stephans Brau Philsner',
 'Patagonia Weisse',
 'Barvaria 8.6',
 "Suntory The Premium Malt's",
 'Stephans Brau Larger',
 'Hop House 13 Lager']

In [194]:
for i in filter_df[["beer_id", "beer_name"]].set_index("beer_id").iterrows():
    print(i[0], i[1][0])

8666 Ambar Especial
75661 Apostel Brau
614833 Apple Fox
169964 Asahi Super dray Black
251 Asahi super dry
169964 Asahi Super Dry Black
10726 Bali Hai Premium Larger
5588 Barvaria 8.6
5571 Barvaria Premium
703 beck's
6668 Berliner Kindl Pilsener
7656 BINTANG pilsner
2228 Blue Moon Belgian White
135361 BrewDog Punk IPA
473 Budweiser
36249 BURGE MEESTER
709 Carlsberg
12775 Cass Fresh
143730 Cass Light
742 Corona Extra
4007 Desperados
72025 Edelweiss Weissbier Snowfresh
58275 Egger Marzenbier
221535 Egger Radler Grapefruit
291276 Egger Zwickl
2469 Erdinger Dunkel
2468 Erdinger Weissbier
553454 Filite
1088 Franzisaner Hefe-Weissbier
721 Gambrinus Original
30913 Goose Island 312 Urban Wheat Ale
814 Goose Island Goose IPA
75071 Grimbergen Cuvée Blanche
715 Grolsch Premium Larger
1267 Guinness Draught
55 Guinness original
17334 Harbin Beer
37 Heineken
493355 Heineken 0.0
64253 Heineken 3.5%
34662 Heineken Dark Lager
34662 Heineken Dark Lager
136165 Hite D (Dry Finish)
500620 Hite Extra Cold
13

In [130]:
for i in filter_df.iterrows():
    print(i)

(0, beer_name    Ambar Especial
beer_id                8666
mik                     NaN
Name: 0, dtype: object)
(1, beer_name    Apostel Brau
beer_id             75661
mik                   NaN
Name: 1, dtype: object)
(2, beer_name    Apple Fox
beer_id         614833
mik                NaN
Name: 2, dtype: object)
(3, beer_name    Asahi Super dray Black
beer_id                      169964
mik                             NaN
Name: 3, dtype: object)
(4, beer_name    Asahi super dry
beer_id                  251
mik                      NaN
Name: 4, dtype: object)
(5, beer_name    Asahi Super Dry Black
beer_id                     169964
mik                            NaN
Name: 5, dtype: object)
(6, beer_name    Bali Hai Premium Larger
beer_id                        10726
mik                              NaN
Name: 6, dtype: object)
(7, beer_name    Barvaria 8.6
beer_id              5588
mik                   NaN
Name: 7, dtype: object)
(8, beer_name    Barvaria Premium
beer_id               