In [1]:
import os
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, SequentialSampler

from datasets import SASRecDataset
from models import S3RecModel
from trainers import FinetuneTrainer

In [2]:
from utils import (
    check_path,
    generate_submission_file,
    get_item2attribute_json,
    get_user_seqs,
    set_seed,
    __save_labels
)

In [3]:
from collections import namedtuple
args = namedtuple("Config", [])

In [4]:
args.data_dir = "../../../data/train/"
args.output_dir = "output-6.pt/"
args.data_name  = "rb"
args.model_name = "Finetune_full"
args.hidden_size = 64 # hidden size of transformer model
args.num_hidden_layers = 3 #2 # number of layers
args.num_attention_heads = 2 # num_attention_heads
args.hidden_act = "gelu"
args.attention_probs_dropout_prob = 0.5 # attention dropout p
args.hidden_dropout_prob = 0.5 # hidden dropout p
args.initializer_range=0.02
args.max_seq_length=300 #50

# train args
args.lr=0.001 # "learning rate of adam"
args.batch_size= 512 #256 # number of batch_size"
args.epochs=200#"number of epochs")
args.log_freq=1 #per epoch print res")
args.seed=42
args.weight_decay=0.0 #, help="weight_decay of adam"
args.adam_beta1=0.9 #"adam first beta value"
args.adam_beta2=0.999 #"adam second beta value"
args.gpu_id="0"#"gpu_id"

In [5]:
set_seed(args.seed)
check_path(args.output_dir) # output_dir 로 설정한 경로가 없으면 생성

In [6]:
args.no_cuda = False # gpu 있음
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id
args.cuda_condition = torch.cuda.is_available() and not args.no_cuda

In [7]:
args.data_file = args.data_dir + "train_ratings.csv"
item2attribute_file = args.data_dir + args.data_name + "_item2attributes.json"

In [8]:
le = LabelEncoder()
label_path = os.path.join(args.output_dir, "item" + "_classes.npy") # args.asset_dir -> args.output_dir
le.classes_ = np.load(label_path)

In [9]:
# save model args
args_str = f"{args.model_name}-{args.data_name}"

# print(args)

# args.item2attribute = item2attribute

# args.train_matrix = submission_rating_matrix

checkpoint = args_str + ".pt"
args.checkpoint_path = os.path.join(args.output_dir, checkpoint)
print(args.checkpoint_path)

output-6.pt/Finetune_full-rb.pt


In [10]:
# 따로 저장해야하는 것들
# max_item # 
user_seq, rating_seq, max_item, valid_rating_matrix, test_rating_matrix, _ = get_user_seqs(
    args, # args.data_file, args.output_dir
    is_train = True
)
item2attribute, attribute_size = get_item2attribute_json(item2attribute_file)


In [11]:
args.item_size = max_item + 2
args.mask_id = max_item + 1
args.attribute_size = attribute_size + 1

In [12]:
model = S3RecModel(args=args)

In [13]:
file_name = args.checkpoint_path
model.load_state_dict(torch.load(file_name))

<All keys matched successfully>

In [14]:
model = model.to(device="cuda:0")

In [15]:
args.data_file

'../../../data/train/train_ratings.csv'

In [16]:
# ### get user seq from test dataset
# test_input_file = "../../../data/test/test_input_ratebeer_without_text.csv"
# rating_df = pd.read_csv(test_input_file)

# test_output_file = "../../../data/test/test_output_ratebeer_without_text.csv"
# out_rating_df = pd.read_csv(test_output_file)

In [17]:
# # label decoding
# rating_df = rating_df[["user_id", "beer_id", "review_score", "review_time"]].sort_values(["user_id", "review_time"])
# rating_df.columns = ["user", "item", "rating", "time"]
# rating_df.index = range(len(rating_df))

In [18]:
# rating_df["item"] = le.transform(rating_df["item"])

In [19]:
# lines_item = rating_df.groupby("user")["item"].apply(list)
# lines_rating = rating_df.groupby("user")["rating"].apply(list)

# user_seq = []
# rating_seq = []

# for line_item, line_rating in zip(lines_item, lines_rating):
#     items = line_item
#     user_seq.append(items)
#     ratings = line_rating
#     rating_seq.append(ratings)

In [20]:
from torch.utils.data import Dataset

class TestDataset(Dataset):
    def __init__(self, args, test_dir = "../../../data/test"):

        self.max_len = args.max_seq_length

        # load data set
        self.test_input_path = test_dir + "/test_input_ratebeer_without_text.csv"
        self.test_output_path = test_dir + "/test_output_ratebeer_without_text.csv"

        le = LabelEncoder()
        label_path = os.path.join(args.output_dir, "item" + "_classes.npy") # args.asset_dir -> args.output_dir
        le.classes_ = np.load(label_path)

        input_df = pd.read_csv(self.test_input_path)[["user_id", "beer_id", "review_score", "review_time"]].sort_values(["user_id", "review_time"])
        input_df.columns = ["user", "item", "rating", "time"]
        input_df["item"] = le.transform(input_df["item"])
        input_df.index = range(len(input_df))

        output_df = pd.read_csv(self.test_output_path)[["user_id", "beer_id", "review_score", "review_time"]].sort_values(["user_id", "review_time"])
        output_df.columns = ["user", "item", "rating", "time"]
        output_df["item"] = le.transform(output_df["item"])
        output_df.index = range(len(output_df))

        input_lines_item = input_df.groupby("user")["item"].apply(list)
        input_lines_rating = input_df.groupby("user")["rating"].apply(list)

        output_lines_item = output_df.groupby("user")["item"].apply(list)
        output_lines_rating = output_df.groupby("user")["rating"].apply(list)

        self.input_user_seq = []
        self.input_rating_seq = []
        self.output_user_seq = []
        self.output_rating_seq = []

        for i_line_item, i_line_rating, o_line_item, o_line_rating in zip(
                                                                        input_lines_item, 
                                                                        input_lines_rating, 
                                                                        output_lines_item, 
                                                                        output_lines_rating):
            items, ratings, o_items, o_ratings = i_line_item, i_line_rating, o_line_item, o_line_rating
            self.input_user_seq.append(items)
            self.input_rating_seq.append(ratings)
            self.output_user_seq.append(o_items)
            self.output_rating_seq.append(o_ratings)

        self.num_users = len(self.input_user_seq)

    def __len__(self):
        return self.num_users # i/o 유저의 수

    def __getitem__(self, index):
        user_id = index
        input_item_seq = self.input_user_seq[user_id]
        input_item_rating = self.input_rating_seq[user_id]
        output_item_seq = self.output_user_seq[user_id]
        output_item_ratings = self.output_rating_seq[user_id]
        # print(len(input_item_seq), len(input_item_rating), len(output_item_seq), len(output_item_ratings))

        pad_len = self.max_len - len(input_item_seq)
        input_item_seq = [0] * pad_len + input_item_seq
        input_item_rating = [0] * pad_len + input_item_rating

        pad_len = self.max_len - len(output_item_seq)
        output_item_seq = [0] * pad_len + output_item_seq
        output_item_ratings = [0] * pad_len + output_item_ratings

        input_item_seq = input_item_seq[-self.max_len :]
        input_item_rating = input_item_rating[-self.max_len :]
        output_item_seq = output_item_seq[-self.max_len :]
        output_item_ratings = output_item_ratings[-self.max_len :]

        cur_tensors = (
            torch.tensor(input_item_seq, dtype=torch.long), # batch_size * max_seq * emb
            torch.tensor(input_item_rating, dtype=torch.float32), # batch_size * max_seq * emb
            torch.tensor(output_item_seq, dtype=torch.long), # batch_size * n_answer
            torch.tensor(output_item_ratings, dtype=torch.float32) # batch_size * n_answer
        )

        return cur_tensors #(input_item_seq, input_item_rating, output_item_seq, output_item_ratings)

    


In [21]:
test_dataset = TestDataset(args)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(
    test_dataset, sampler=test_sampler, batch_size=args.batch_size, drop_last=False
)

In [22]:
device = "cuda:0"

rating_pred_list = []
rating_true_list = []

for batch in test_dataloader:
    print(batch)
    batch = tuple(t.to(device) for t in batch)

    input_items, input_ratings, output_items, ouput_ratings = batch
    recommend_output = model.finetune(input_items,  input_ratings) ############## TODO
    recommend_output = recommend_output[:, -1, :]
        
    # recommend_output : 시퀀스의 마지막 시점에 소비한 맥주까지 포함하여 다른 맥주와의 상호작용을 확인하는 임베딩 벡터 # [batch hidden_size]
    # answers : 해당 유저의 뒤에서 두 번째 아이템   # [batch * 1]
    # ratings_answer : 해당 아이템을 해당 유저가 매길것으로 예상되는 점수 # [batch hidden_size]
    # print(output_items[output_items > 0])
    # print(output_items[output_items != -1])

    # print("recommend_output.size() :", recommend_output.size())
    # print("output_items.size() :", output_items.size())

    output_items_emb = model.item_embeddings(output_items)
    # print("output_items_emb.size() :", output_items_emb.size())

    rating_pred = torch.bmm(output_items_emb, recommend_output.unsqueeze(2)).squeeze(2)
    masking_non_value = (output_items != 0)

    rating_pred = rating_pred*masking_non_value

    rating_pred_list += rating_pred[masking_non_value].cpu().data.numpy().copy().tolist()    
    rating_true_list += ouput_ratings[masking_non_value].cpu().data.numpy().copy().tolist()


    # answers_emb = model.item_embeddings(answers).squeeze(1) # [batch * hidden_size]

    # print(answers_emb.size)
    # rating_pred = torch.sum(answers_emb * recommend_output, dim = 1)
    # print(rating_pred)

    # metric_fn = torch.nn.MSELoss()
    # score = metric_fn(rating_pred, ratings_answer.squeeze(1))
    # break

[tensor([[   0,    0,    0,  ..., 6101,  415, 3056],
        [   0,    0,    0,  ...,  554, 1672, 2918],
        [   0,    0,    0,  ...,  206,  615, 6814],
        ...,
        [   0,    0,    0,  ...,  610,   38, 6855],
        [   0,    0,    0,  ..., 3226,   38, 9326],
        [   0,    0,    0,  ..., 7579,  855, 2402]]), tensor([[0.0000, 0.0000, 0.0000,  ..., 3.5000, 2.3000, 3.8000],
        [0.0000, 0.0000, 0.0000,  ..., 3.2000, 3.6000, 4.2000],
        [0.0000, 0.0000, 0.0000,  ..., 3.5000, 4.2000, 3.6000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 3.5000, 3.5000, 3.3000],
        [0.0000, 0.0000, 0.0000,  ..., 2.9000, 1.9000, 2.6000],
        [0.0000, 0.0000, 0.0000,  ..., 3.6000, 4.0000, 4.4000]]), tensor([[   0,    0,    0,  ...,  433,   24,  259],
        [   0,    0,    0,  ...,  416,  693,  433],
        [   0,    0,    0,  ...,  202,  613,   38],
        ...,
        [   0,    0,    0,  ...,    0,    0,  259],
        [   0,    0,    0,  ...,    0,    0,  613],


In [27]:
for i, (a, b) in enumerate(zip(rating_pred_list, rating_true_list)):
    print(round(a,3), round(b,3))
    if i == 100: break

1.749 1.7
2.198 1.9
1.505 1.7
3.315 3.3
2.419 2.0
1.756 2.5
3.183 3.4
2.946 2.1
3.413 3.9
3.291 4.0
1.726 0.9
2.46 3.0
2.167 0.5
2.195 1.2
2.192 0.5
1.837 0.7
2.821 3.5
2.622 2.9
2.833 3.0
2.711 2.8
3.65 3.8
2.34 2.5
2.916 2.9
2.25 1.5
2.32 2.2
2.28 2.7
2.57 2.3
3.202 3.1
2.33 2.0
1.934 0.8
3.467 3.6
3.734 3.0
3.494 3.9
3.655 3.6
2.99 2.7
3.69 3.6
2.215 1.7
1.92 1.1
2.136 1.5
2.911 3.2
2.551 2.6
1.947 2.0
3.159 2.9
2.169 2.3
3.411 3.6
2.449 2.0
2.167 2.1
1.96 2.7
3.02 3.4
1.914 2.5
2.358 3.2
2.299 3.0
2.287 2.2
1.781 1.6
2.593 2.1
3.171 3.1
2.397 2.2
1.74 1.5
3.668 3.8
1.856 1.6
2.178 2.2
3.391 3.3
3.228 3.3
3.077 3.0
3.421 3.9
3.506 3.3
3.252 3.4
3.498 3.6
2.896 3.2
2.358 2.4
1.769 1.5
3.686 3.5
3.517 4.3
3.21 3.1
1.715 1.0
3.226 3.7
3.437 3.7
2.643 2.9
2.736 3.0
1.83 1.8
2.932 2.8
2.164 1.2
3.148 3.0
2.804 2.3
3.377 3.2
2.096 1.2
2.562 2.4
3.383 3.1
2.34 1.8
2.034 1.7
3.557 3.1
2.254 1.9
3.78 3.7
2.387 1.3
2.514 2.5
3.649 2.9
3.417 3.3
3.503 3.0
3.757 3.6
3.54 3.4
3.115 3.2


In [35]:
def rmse(y_pred_arr, y_true_arr):
    return np.sqrt(((y_pred_arr - y_true_arr) ** 2).mean())

In [36]:
rating_pred_arr = np.array(rating_pred_list)
rating_true_arr = np.array(rating_true_list)

In [37]:
rmse(rating_pred_arr, rating_true_arr)

0.6471637365085615

In [38]:
len(rating_pred_arr)

3631

### 일반적인 것으로 
- 단순 평균
- 선형 모형

In [39]:
# 일반적인 함수로 테스트
train_df = pd.read_csv("/opt/ml/workspace/final-project-level3-recsys-10/data/standard/train_ratebeer_without_text.csv")

In [40]:
mean_model_df = train_df.groupby("beer_id")["review_score"].mean().reset_index()
mean_model_df.columns = ["beer_id", "pred_review_score"]
mean_model_df

Unnamed: 0,beer_id,pred_review_score
0,1,3.395875
1,2,2.709655
2,3,2.956899
3,4,2.925490
4,5,2.878067
...,...,...
9331,580686,4.100000
9332,614833,2.500000
9333,621308,2.775862
9334,632627,2.670270


In [53]:
test_mean_model_df = pd.read_csv("/opt/ml/workspace/final-project-level3-recsys-10/data/test/test_output_ratebeer_without_text.csv")
test_mean_model_df = test_mean_model_df[["beer_id", "review_score"]]
test_mean_model_df.columns = ["beer_id", "true_review_score"]
test_mean_model_df = test_mean_model_df.reset_index()
test_mean_model_df

Unnamed: 0,index,beer_id,true_review_score
0,0,37,1.9
1,1,473,1.7
2,2,742,1.7
3,3,1478,2.0
4,4,717,3.3
...,...,...,...
3626,3626,2514,3.4
3627,3627,2516,3.9
3628,3628,2516,2.9
3629,3629,1267,3.2


In [57]:
get_test_mean_model_df = test_mean_model_df.merge(mean_model_df, on = "beer_id", how = "inner")
get_test_mean_model_df = get_test_mean_model_df.sort_values("index")
get_test_mean_model_df.index = range(len(get_test_mean_model_df))
get_test_mean_model_df

Unnamed: 0,index,beer_id,true_review_score,pred_review_score
0,0,37,1.9,2.169665
1,1,473,1.7,1.543605
2,2,742,1.7,1.792759
3,3,1478,2.0,2.549066
4,4,717,3.3,3.296014
...,...,...,...,...
3626,3626,2514,3.4,3.383548
3627,3627,2516,3.9,3.480714
3628,3628,2516,2.9,3.480714
3629,3629,1267,3.2,3.457336


In [61]:
get_test_mean_model_df["model_based"] = rating_pred_arr
get_test_mean_model_df = get_test_mean_model_df[["beer_id", "true_review_score", "model_based", "pred_review_score"]]
get_test_mean_model_df

Unnamed: 0,beer_id,true_review_score,model_based,pred_review_score
0,37,1.9,1.748652,2.169665
1,473,1.7,2.197562,1.543605
2,742,1.7,1.505415,1.792759
3,1478,2.0,3.314546,2.549066
4,717,3.3,2.419085,3.296014
...,...,...,...,...
3626,2514,3.4,3.360144,3.383548
3627,2516,3.9,3.529265,3.480714
3628,2516,2.9,3.522529,3.480714
3629,1267,3.2,3.448052,3.457336


In [62]:
get_test_mean_model_df.head(50)

Unnamed: 0,beer_id,true_review_score,model_based,pred_review_score
0,37,1.9,1.748652,2.169665
1,473,1.7,2.197562,1.543605
2,742,1.7,1.505415,1.792759
3,1478,2.0,3.314546,2.549066
4,717,3.3,2.419085,3.296014
5,742,2.5,1.756427,1.792759
6,1267,3.9,3.183325,3.457336
7,71469,3.4,2.945865,3.141362
8,390,2.1,3.413313,3.010999
9,55,4.0,3.291302,3.233394


In [43]:
rmse(get_test_mean_model_df["true_review_score"], get_test_mean_model_df["pred_review_score"])

0.6441041552565331

In [45]:
get_test_mean_model_df["model_based"] = rating_pred_arr

In [50]:
get_test_mean_model_df.head(60)

Unnamed: 0,beer_id,true_review_score,pred_review_score,model_based
0,37,1.9,2.169665,1.748652
1,37,2.3,2.169665,2.197562
2,37,2.4,2.169665,1.505415
3,37,1.9,2.169665,3.314546
4,37,1.9,2.169665,2.419085
5,37,2.1,2.169665,1.756427
6,37,2.0,2.169665,3.183325
7,37,2.0,2.169665,2.945865
8,37,1.7,2.169665,3.413313
9,37,1.7,2.169665,3.291302


In [64]:
test_df = pd.read_csv("/opt/ml/workspace/final-project-level3-recsys-10/data/standard/test_ratebeer_without_text.csv")

In [65]:
test_df = pd.read_csv("/opt/ml/workspace/final-project-level3-recsys-10/data/standard/test_ratebeer_without_text.csv")
test_df = test_df[["beer_id", "review_score"]]
test_df.columns = ["beer_id", "true_review_score"]
test_df = test_df.reset_index()
test_df

Unnamed: 0,index,beer_id,true_review_score
0,0,131594,4.0
1,1,131594,4.3
2,2,131594,4.1
3,3,131594,4.0
4,4,131594,3.4
...,...,...,...
314547,314547,3324,3.6
314548,314548,3324,4.0
314549,314549,3324,3.2
314550,314550,3324,2.4


In [66]:
get_test_df = test_df.merge(mean_model_df, on = "beer_id", how = "inner")
get_test_df = get_test_df.sort_values("index")
get_test_df.index = range(len(get_test_df))
get_test_df

Unnamed: 0,index,beer_id,true_review_score,pred_review_score
0,0,131594,4.0,3.986297
1,1,131594,4.3,3.986297
2,2,131594,4.1,3.986297
3,3,131594,4.0,3.986297
4,4,131594,3.4,3.986297
...,...,...,...,...
314547,314547,3324,3.6,3.105983
314548,314548,3324,4.0,3.105983
314549,314549,3324,3.2,3.105983
314550,314550,3324,2.4,3.105983


In [67]:
rmse(get_test_df["true_review_score"], get_test_df["pred_review_score"])

0.49836598385990016