In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


df = pd.read_pickle("../../data/processed/rating_engage.pkl")
df

Unnamed: 0,user_id,item_id,rating,item_len
0,0,556,2.0,93
0,0,843,2.0,93
0,0,1039,5.0,93
0,0,3865,5.0,93
0,0,4646,3.0,93
...,...,...,...,...
1999,1999,72419,0,106
1999,1999,82072,0,106
1999,1999,7212,0,106
1999,1999,19116,0,106


In [2]:
data = df.to_numpy()[:, :3].astype(int)
data = torch.from_numpy(data)
data

tensor([[    0,   556,     2],
        [    0,   843,     2],
        [    0,  1039,     5],
        ...,
        [ 1999,  7212,     0],
        [ 1999, 19116,     0],
        [ 1999, 84812,     0]])

In [3]:
# data = data[data[:, 2] > 0]
# data

In [4]:
data[:, 2][data[:, 2] > 1] = 1
data

tensor([[    0,   556,     1],
        [    0,   843,     1],
        [    0,  1039,     1],
        ...,
        [ 1999,  7212,     0],
        [ 1999, 19116,     0],
        [ 1999, 84812,     0]])

In [5]:
data[:, 0].max(), data[:, 1].max(), data[:, 2].max(), data[:, 2].min()

(tensor(1999), tensor(97717), tensor(1), tensor(0))

In [6]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset

user_id_lst = torch.unique(data[:, 0])

train_uid_lst, test_uid_lst = train_test_split(user_id_lst[:], test_size=0.2, random_state=42)
test_uid_lst.shape

torch.Size([400])

In [7]:
train_data = data[:]
test_data = data[torch.isin(data[:, 0], test_uid_lst)]

train_data.shape, test_data.shape, test_data

(torch.Size([527772, 3]),
 torch.Size([106014, 3]),
 tensor([[   23,  1219,     1],
         [   23,  1541,     1],
         [   23,  1930,     1],
         ...,
         [ 1990, 67701,     0],
         [ 1990, 36109,     0],
         [ 1990, 34740,     0]]))

In [8]:
user_shape = np.unique(data[:, 0]).shape[0]
item_shape = np.unique(data[:, 1]).shape[0]
user_shape, item_shape

(2000, 97718)

In [9]:
total_item_id_lst = torch.unique(data[:, 1])
total_item_id_lst

tensor([    0,     1,     2,  ..., 97715, 97716, 97717])

In [10]:

from torch.utils.data import DataLoader, Dataset


class TwoTwoerDataset(Dataset):
    def __init__(self, interactions):
        self.interactions = interactions

    def __len__(self):
        return len(self.interactions)

    def __getitem__(self, idx):
        inter = self.interactions[idx]
        return inter[0], inter[1], inter[2].float()
    
class TwoTwoerTestDataset(Dataset):
    def __init__(self, test_data, total_item_id_lst):
        self.users = torch.unique(test_data[:, 0])
        self.test_data = test_data
        self.total_item_id_lst = total_item_id_lst

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        user_idx = self.users[idx]
        labels = self.test_data[self.test_data[:, 0]==user_idx]

        full_rank_data = torch.zeros((self.total_item_id_lst.shape[0], 3)).long()
        full_rank_data[:, 0] = user_idx
        full_rank_data[:, 1] = self.total_item_id_lst

        for item in (labels):
            indicies = ((full_rank_data[:, 0] == item[0]) & (full_rank_data[:, 1] == item[1]))
            full_rank_data[indicies, 2] = item[2]
        
        return full_rank_data


# Hyperparameters
embedding_dim = 128
num_epochs = 1000
learning_rate = 5e-4
batch_size = 2024

train_loader = DataLoader(TwoTwoerDataset(train_data), batch_size=batch_size, shuffle=True)
test_loader = DataLoader(TwoTwoerTestDataset(test_data, total_item_id_lst), batch_size=32, shuffle=False)

In [11]:
import torch.optim as optim
import sys
sys.path.append("../")
from model import TTRecommender

    
num_uesrs = torch.unique(data[:, 0]).max() + 1
num_items = torch.unique(data[:, 1]).max() + 1

# 모델 인스턴스화
model = TTRecommender(num_uesrs, num_items, embedding_dim).cuda()

# Loss와 Optimizer 정의
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [12]:
from tqdm import tqdm
from copy import deepcopy
import warnings
import gc
early_stopping_patience=5
warnings.filterwarnings('always')

model_parameters = deepcopy(model.state_dict())

def recall_pre_at_k_per_user(user_lst, item_lst, pred_lst, target_lst, k=20):
    unique_user_id = torch.unique(user_lst)
    recall_lst = []
    pre_lst = []
    
    for user_id in tqdm(unique_user_id):
        idx = torch.where(user_lst==user_id)[0]
        preds = pred_lst[idx]
        targets = target_lst[idx]
        items = item_lst[idx]

        top_k_preds = items[preds.topk(k, dim=0).indices.cpu().numpy()]

        gt = items[targets==1]
        if(gt.shape[0] == 0):
            continue
        
        recall_lst.append((np.isin(top_k_preds, gt))/(gt.shape[0]))
        pre_lst.append((np.isin(top_k_preds, gt))/(k))
            
    return np.mean(recall_lst), np.mean(pre_lst)



best_val_recall = -float('inf')
patience_counter = 0

for epoch in range(num_epochs):

    model.train()
    total_loss = 0
    for user_idx, item_idx, target in tqdm(train_loader):
        optimizer.zero_grad()
        outputs = model(user_idx.cuda(), item_idx.cuda())
        loss = criterion(outputs, target.cuda())

        loss.backward()
        optimizer.step()
        total_loss += loss.detach().cpu().item()

    model.eval()
    with torch.no_grad():
        pred_lst, target_lst, user_lst, item_lst = [], [], [], []
        for full_rank_data in tqdm(test_loader):
            user_idx, item_idx, target  = full_rank_data[:, :, 0], full_rank_data[:, :, 1], full_rank_data[:, :, 2]
            preds = model(user_idx.cuda(), item_idx.cuda())

            pred_lst.append(preds.view(-1).detach().cpu())
            target_lst.append(target.view(-1).detach().cpu())
            user_lst.append(user_idx.view(-1).detach().cpu())
            item_lst.append(item_idx.view(-1).detach().cpu())

    pred_lst = torch.cat(pred_lst)
    target_lst = torch.cat(target_lst)
    user_lst = torch.cat(user_lst)
    item_lst = torch.cat(item_lst)
    recall_20, pre_20 = recall_pre_at_k_per_user(user_lst, item_lst, pred_lst, target_lst, k=20)
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader):.4f}, '
            f'Precision@20: {pre_20:.4f}, Recall@20: {recall_20:.10f}')
    
    # Early stopping
    if recall_20 > best_val_recall:
        best_val_recall = recall_20
        patience_counter = 0
        model_parameters = deepcopy(model.state_dict())
    else:
        patience_counter += 1
        if patience_counter >= early_stopping_patience:
            print("Early stopping triggered")
            break


100%|██████████| 261/261 [00:08<00:00, 29.05it/s]
100%|██████████| 13/13 [00:32<00:00,  2.50s/it]
100%|██████████| 400/400 [00:22<00:00, 17.89it/s]


Epoch [1/1000], Loss: 4.5454, Precision@20: 0.0001, Recall@20: 0.0000080145


100%|██████████| 261/261 [00:08<00:00, 30.24it/s]
100%|██████████| 13/13 [00:24<00:00,  1.91s/it]
100%|██████████| 400/400 [00:21<00:00, 18.96it/s]


Epoch [2/1000], Loss: 4.2734, Precision@20: 0.0001, Recall@20: 0.0000080145


100%|██████████| 261/261 [00:08<00:00, 30.24it/s]
100%|██████████| 13/13 [00:24<00:00,  1.91s/it]
100%|██████████| 400/400 [00:20<00:00, 19.47it/s]


Epoch [3/1000], Loss: 3.9931, Precision@20: 0.0001, Recall@20: 0.0000080145


100%|██████████| 261/261 [00:08<00:00, 29.70it/s]
100%|██████████| 13/13 [00:36<00:00,  2.84s/it]
100%|██████████| 400/400 [00:20<00:00, 19.90it/s]


Epoch [4/1000], Loss: 3.7205, Precision@20: 0.0001, Recall@20: 0.0000080145


100%|██████████| 261/261 [00:08<00:00, 29.81it/s]
100%|██████████| 13/13 [00:29<00:00,  2.24s/it]
100%|██████████| 400/400 [00:19<00:00, 20.57it/s]


Epoch [5/1000], Loss: 3.4593, Precision@20: 0.0000, Recall@20: 0.0000067768


100%|██████████| 261/261 [00:08<00:00, 30.53it/s]
100%|██████████| 13/13 [00:32<00:00,  2.52s/it]
100%|██████████| 400/400 [00:21<00:00, 19.02it/s]

Epoch [6/1000], Loss: 3.2107, Precision@20: 0.0000, Recall@20: 0.0000067768
Early stopping triggered





In [13]:
model.load_state_dict(model_parameters)

model.eval()
with torch.no_grad():
    pred_lst, target_lst, user_lst, item_lst = [], [], [], []
    for full_rank_data in tqdm(test_loader):
        user_idx, item_idx, target  = full_rank_data[:, :, 0], full_rank_data[:, :, 1], full_rank_data[:, :, 2]
        preds = model(user_idx.cuda(), item_idx.cuda())

        pred_lst.append(preds.view(-1).detach().cpu())
        target_lst.append(target.view(-1).detach().cpu())
        user_lst.append(user_idx.view(-1).detach().cpu())
        item_lst.append(item_idx.view(-1).detach().cpu())

pred_lst = torch.cat(pred_lst)
target_lst = torch.cat(target_lst)
user_lst = torch.cat(user_lst)
item_lst = torch.cat(item_lst)
recall_20, pre_20 = recall_pre_at_k_per_user(user_lst, item_lst, pred_lst, target_lst, k=100)

recall_20, pre_20

100%|██████████| 13/13 [00:30<00:00,  2.32s/it]
100%|██████████| 400/400 [00:20<00:00, 19.52it/s]


(9.614821276692984e-06, 1.25e-05)

In [14]:
torch.save({
    "state_dict": model.cpu().state_dict(), 
    "num_users": num_uesrs, 
    "num_items": num_items, 
    "embedding_dim": embedding_dim, 
}, "../parameters/twotower.pth")