In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


df = pd.read_pickle("../../data/processed/rating_engage.pkl")
df

Unnamed: 0,user_id,item_id,rating,item_len
0,0,28833,1.0,123
0,0,29361,5.0,123
0,0,32319,1.0,123
0,0,37990,5.0,123
0,0,40014,5.0,123
...,...,...,...,...
999,999,44930,0,193
999,999,14950,0,193
999,999,50080,0,193
999,999,22395,0,193


In [2]:
data = df.to_numpy()[:, :3].astype(int)
data = torch.from_numpy(data)
data

tensor([[    0, 28833,     1],
        [    0, 29361,     5],
        [    0, 32319,     1],
        ...,
        [  999, 50080,     0],
        [  999, 22395,     0],
        [  999, 65599,     0]])

In [3]:
# data = data[data[:, 2] > 0]
# data

In [4]:
data[:, 2][data[:, 2] > 1] = 1
data

tensor([[    0, 28833,     1],
        [    0, 29361,     1],
        [    0, 32319,     1],
        ...,
        [  999, 50080,     0],
        [  999, 22395,     0],
        [  999, 65599,     0]])

In [5]:
data[:, 0].max(), data[:, 1].max(), data[:, 2].max(), data[:, 2].min()

(tensor(999), tensor(72318), tensor(1), tensor(0))

In [6]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset

user_id_lst = torch.unique(data[:, 0])

train_uid_lst, test_uid_lst = train_test_split(user_id_lst[:], test_size=0.2, random_state=42)
test_uid_lst.shape

torch.Size([200])

In [7]:
train_data = data[:]
test_data = data[torch.isin(data[:, 0], test_uid_lst)]

train_data.shape, test_data.shape, test_data

(torch.Size([328728, 3]),
 torch.Size([70432, 3]),
 tensor([[   10,   677,     1],
         [   10,   833,     1],
         [   10,   901,     1],
         ...,
         [  998, 60146,     0],
         [  998, 65120,     0],
         [  998, 15879,     0]]))

In [8]:
user_shape = np.unique(data[:, 0]).shape[0]
item_shape = np.unique(data[:, 1]).shape[0]
user_shape, item_shape

(1000, 72319)

In [9]:
total_item_id_lst = torch.unique(data[:, 1])
total_item_id_lst

tensor([    0,     1,     2,  ..., 72316, 72317, 72318])

In [10]:

from torch.utils.data import DataLoader, Dataset


class TwoTwoerDataset(Dataset):
    def __init__(self, interactions):
        self.interactions = interactions

    def __len__(self):
        return len(self.interactions)

    def __getitem__(self, idx):
        inter = self.interactions[idx]
        return inter[0], inter[1], inter[2].float()
    
class TwoTwoerTestDataset(Dataset):
    def __init__(self, test_data, total_item_id_lst):
        self.users = torch.unique(test_data[:, 0])
        self.test_data = test_data
        self.total_item_id_lst = total_item_id_lst

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        user_idx = self.users[idx]
        labels = self.test_data[self.test_data[:, 0]==user_idx]

        full_rank_data = torch.zeros((self.total_item_id_lst.shape[0], 3)).long()
        full_rank_data[:, 0] = user_idx
        full_rank_data[:, 1] = self.total_item_id_lst

        for item in (labels):
            indicies = ((full_rank_data[:, 0] == item[0]) & (full_rank_data[:, 1] == item[1]))
            full_rank_data[indicies, 2] = item[2]
        
        return full_rank_data


# Hyperparameters
embedding_dim = 128
num_epochs = 1000
learning_rate = 5e-4
batch_size = 2024

train_loader = DataLoader(TwoTwoerDataset(train_data), batch_size=batch_size, shuffle=True)
test_loader = DataLoader(TwoTwoerTestDataset(test_data, total_item_id_lst), batch_size=32, shuffle=False)

In [11]:
import torch.optim as optim
import sys
sys.path.append("../")
from model import TTRecommender

    
num_uesrs = torch.unique(data[:, 0]).max() + 1
num_items = torch.unique(data[:, 1]).max() + 1

# 모델 인스턴스화
model = TTRecommender(num_uesrs, num_items, embedding_dim).cuda()

# Loss와 Optimizer 정의
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [12]:
from tqdm import tqdm
from copy import deepcopy
import warnings
import gc
early_stopping_patience=5
warnings.filterwarnings('always')

model_parameters = deepcopy(model.state_dict())

def recall_pre_at_k_per_user(user_lst, item_lst, pred_lst, target_lst, k=20):
    unique_user_id = torch.unique(user_lst)
    recall_lst = []
    pre_lst = []
    
    for user_id in tqdm(unique_user_id):
        idx = torch.where(user_lst==user_id)[0]
        preds = pred_lst[idx]
        targets = target_lst[idx]
        items = item_lst[idx]

        top_k_preds = items[preds.topk(k, dim=0).indices.cpu().numpy()]

        gt = items[targets==1]
        if(gt.shape[0] == 0):
            continue
        
        recall_lst.append((np.isin(top_k_preds, gt))/(gt.shape[0]))
        pre_lst.append((np.isin(top_k_preds, gt))/(k))
            
    return np.mean(recall_lst), np.mean(pre_lst)



best_val_recall = -float('inf')
patience_counter = 0

for epoch in range(num_epochs):

    model.train()
    total_loss = 0
    for user_idx, item_idx, target in tqdm(train_loader):
        optimizer.zero_grad()
        outputs = model(user_idx.cuda(), item_idx.cuda())
        loss = criterion(outputs, target.cuda())

        loss.backward()
        optimizer.step()
        total_loss += loss.detach().cpu().item()

    model.eval()
    with torch.no_grad():
        pred_lst, target_lst, user_lst, item_lst = [], [], [], []
        for full_rank_data in tqdm(test_loader):
            user_idx, item_idx, target  = full_rank_data[:, :, 0], full_rank_data[:, :, 1], full_rank_data[:, :, 2]
            preds = model(user_idx.cuda(), item_idx.cuda())

            pred_lst.append(preds.view(-1).detach().cpu())
            target_lst.append(target.view(-1).detach().cpu())
            user_lst.append(user_idx.view(-1).detach().cpu())
            item_lst.append(item_idx.view(-1).detach().cpu())

    pred_lst = torch.cat(pred_lst)
    target_lst = torch.cat(target_lst)
    user_lst = torch.cat(user_lst)
    item_lst = torch.cat(item_lst)
    recall_20, pre_20 = recall_pre_at_k_per_user(user_lst, item_lst, pred_lst, target_lst, k=20)
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader):.4f}, '
            f'Precision@20: {pre_20:.4f}, Recall@20: {recall_20:.10f}')
    
    # Early stopping
    if recall_20 > best_val_recall:
        best_val_recall = recall_20
        patience_counter = 0
        model_parameters = deepcopy(model.state_dict())
    else:
        patience_counter += 1
        if patience_counter >= early_stopping_patience:
            print("Early stopping triggered")
            break


100%|██████████| 163/163 [00:05<00:00, 29.14it/s]
100%|██████████| 7/7 [00:24<00:00,  3.55s/it]
100%|██████████| 200/200 [00:05<00:00, 36.03it/s]


Epoch [1/1000], Loss: 4.5656, Precision@20: 0.0002, Recall@20: 0.0000211052


100%|██████████| 163/163 [00:05<00:00, 29.38it/s]
100%|██████████| 7/7 [00:18<00:00,  2.61s/it]
100%|██████████| 200/200 [00:05<00:00, 38.90it/s]


Epoch [2/1000], Loss: 4.3364, Precision@20: 0.0002, Recall@20: 0.0000211052


100%|██████████| 163/163 [00:05<00:00, 29.31it/s]
100%|██████████| 7/7 [00:20<00:00,  2.95s/it]
100%|██████████| 200/200 [00:05<00:00, 38.43it/s]


Epoch [3/1000], Loss: 4.1009, Precision@20: 0.0002, Recall@20: 0.0000211052


100%|██████████| 163/163 [00:05<00:00, 29.45it/s]
100%|██████████| 7/7 [00:19<00:00,  2.82s/it]
100%|██████████| 200/200 [00:04<00:00, 40.35it/s]


Epoch [4/1000], Loss: 3.8717, Precision@20: 0.0002, Recall@20: 0.0000211052


100%|██████████| 163/163 [00:05<00:00, 27.48it/s]
100%|██████████| 7/7 [00:21<00:00,  3.01s/it]
100%|██████████| 200/200 [00:06<00:00, 32.09it/s]


Epoch [5/1000], Loss: 3.6472, Precision@20: 0.0002, Recall@20: 0.0000211052


100%|██████████| 163/163 [00:05<00:00, 30.05it/s]
100%|██████████| 7/7 [00:17<00:00,  2.44s/it]
100%|██████████| 200/200 [00:05<00:00, 38.35it/s]

Epoch [6/1000], Loss: 3.4331, Precision@20: 0.0002, Recall@20: 0.0000211052
Early stopping triggered





In [13]:
model.load_state_dict(model_parameters)

model.eval()
with torch.no_grad():
    pred_lst, target_lst, user_lst, item_lst = [], [], [], []
    for full_rank_data in tqdm(test_loader):
        user_idx, item_idx, target  = full_rank_data[:, :, 0], full_rank_data[:, :, 1], full_rank_data[:, :, 2]
        preds = model(user_idx.cuda(), item_idx.cuda())

        pred_lst.append(preds.view(-1).detach().cpu())
        target_lst.append(target.view(-1).detach().cpu())
        user_lst.append(user_idx.view(-1).detach().cpu())
        item_lst.append(item_idx.view(-1).detach().cpu())

pred_lst = torch.cat(pred_lst)
target_lst = torch.cat(target_lst)
user_lst = torch.cat(user_lst)
item_lst = torch.cat(item_lst)
recall_20, pre_20 = recall_pre_at_k_per_user(user_lst, item_lst, pred_lst, target_lst, k=100)

recall_20, pre_20

100%|██████████| 7/7 [00:17<00:00,  2.51s/it]
100%|██████████| 200/200 [00:06<00:00, 31.71it/s]


(1.3759706604139124e-05, 2.25e-05)

In [14]:
torch.save({
    "state_dict": model.cpu().state_dict(), 
    "num_users": num_uesrs, 
    "num_items": num_items, 
    "embedding_dim": embedding_dim, 
}, "../parameters/twotower.pth")