In [None]:
from typing import Union, Tuple, List

import numpy as np
import random
import pandas as pd
from datetime import datetime, date
from sklearn.metrics import log_loss, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import pickle

import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
from torch.nn.init import normal_
from torch.utils.data import TensorDataset, DataLoader

# One seed shared by the whole notebook: the NumPy and stdlib RNGs are fixed
# here; the same value is reused later for the train/test split and for torch.
seed = 42
np.random.seed(seed)
random.seed(seed)

In [None]:
# Raw interaction log; the columns used below are `hashed_ip`, `products` and `local_time`.
df = pd.read_csv('../asset/inter_240129.csv')
# Keep only interactions from 2024-02-05 onwards.
# NOTE(review): `local_time` is compared as a string, so this relies on the
# timestamps being stored in one consistent, lexicographically sortable format — confirm.
# `.copy()` detaches the filtered frame from the original so that the column
# assignments in the following cells do not raise SettingWithCopyWarning.
df = df[df['local_time'] >= '2024-02-05 00:00:00+00:00'].copy()
df

In [None]:
# Product catalogue; its `id` column defines the item index space and `title`
# is used for display at the end of the notebook.
product_data = pd.read_csv(filepath_or_buffer='../asset/item.csv')
product_data

In [None]:
# index -> raw id, numbered in order of first appearance.
# Users come from the interaction log, items from the catalogue.
idx_to_user = dict(enumerate(df['hashed_ip'].unique()))
idx_to_item = dict(enumerate(product_data['id'].unique()))

# raw id -> index: the exact inverses of the two tables above.
user_to_idx = {ip: idx for idx, ip in idx_to_user.items()}
item_to_idx = {pid: idx for idx, pid in idx_to_item.items()}

In [None]:
# Replace raw user / product ids by their contiguous integer indices.
# `assign` builds a new frame instead of writing into a filtered slice, which
# avoids SettingWithCopyWarning; column order is unchanged.
# Products missing from item.csv are not keys of `item_to_idx` and become NaN.
# Run this cell once per data load: the lookup tables are keyed by raw ids, so
# mapping the already-mapped columns again would not find them.
df = df.assign(
    hashed_ip=df['hashed_ip'].map(user_to_idx),
    products=df['products'].map(item_to_idx),
)

## positive sample

In [None]:
# A product must appear at least `min_count` times to be trained as a centre item.
min_count: int = 3
# Number of negative samples drawn for each positive sample.
negative: int = 5

In [None]:
# Interactions without the timestamp, with exact duplicate rows removed.
# (`axis` is dropped from the call: it is ignored when `columns=` is given.)
positive_samples = (
    df.drop(columns='local_time')
      .drop_duplicates(keep='first', ignore_index=True)
)

# Count every product once instead of re-scanning the frame per product.
product_counts = positive_samples['products'].value_counts()
frequent_products = set(product_counts.index[product_counts >= min_count])

# Products eligible as centre items, kept in order of first appearance.
product_ids_for_training = [
    pid for pid in positive_samples['products'].unique()
    if pid in frequent_products
]

In [None]:
# Every product seen in the interaction log is a negative-sampling candidate.
product_ids = df['products'].unique()

# Set membership is O(1); testing against the list was linear per centre item.
trainable_products = set(product_ids_for_training)

# user id -> array of products this user has NOT interacted with.
user_negative_samples = dict()
# (user id, centre item, context item) rows.
pair_rows = list()

# One pass over the per-user groups instead of a boolean-mask scan per user.
# sort=False keeps users in order of first appearance.
user_groups = positive_samples.groupby('hashed_ip', sort=False)
for ip, user_positive_samples in tqdm(user_groups, total=user_groups.ngroups):
    user_products = user_positive_samples['products'].tolist()

    # Store each user's negative pool for sampling at training time.
    user_negative_samples[ip] = product_ids[np.isin(product_ids, user_products, invert=True)]

    # Each product the user touched is a context item for every other product
    # the user touched; centre items below the minimum count are skipped.
    pair_rows.extend(
        (ip, w_pid, c_pid)
        for w_pid in user_products
        if w_pid in trainable_products
        for c_pid in user_products
        if c_pid != w_pid
    )

new_positive_samples = pd.DataFrame(pair_rows, columns=['hashed_ip', 'w_pid', 'c_pid'])

## Creating dataset

In [None]:
# Store all three id columns as pandas categoricals.
for id_column in ('hashed_ip', 'w_pid', 'c_pid'):
    new_positive_samples[id_column] = new_positive_samples[id_column].astype("category")

In [None]:
# Random 80/20 split of the positive pairs, seeded for reproducibility.
# The split is not stratified by user.
train_df, test_df = train_test_split(
    new_positive_samples,
    test_size=0.20,
    random_state=seed,
)
print('학습 데이터 크기:', train_df.shape)
print('테스트 데이터 크기:', test_df.shape)

In [None]:
# Wrap each split as a single (n_rows, 3) long tensor so that a PyTorch
# DataLoader can batch it.
train_dataset, test_dataset = (
    TensorDataset(torch.LongTensor(np.array(split_df)))
    for split_df in (train_df, test_df)
)

## Item2Vec

### Negative Sampler class

In [None]:
class Negative_Sampler(nn.Module):
    """
    Negative Sampler

    Draws, for every user id in a batch, `n_negs` item ids from that user's
    pool of negative candidates using NumPy's global RNG.

    Args:
        - user_negative_samples: (Dict) keys: user id, values: 1-D array of negative item ids
        - n_negs: (int) number of negative samples drawn per user id
    Shape:
        - Input: (torch.Tensor) user ids. Shape: (batch size,)
        - Output: (torch.Tensor) sampled negative item ids. Shape: (batch size, n_negs)
    Raises:
        - ValueError: a user's candidate pool is empty
        - KeyError: a user id has no entry in `user_negative_samples`
    """
    def __init__(self, user_negative_samples, n_negs):
        super(Negative_Sampler, self).__init__()
        self.user_negative_samples = user_negative_samples
        self.n_negs = n_negs

    def _sample_for_user(self, user_id):
        """Return `n_negs` negative item ids for one user as a 1-D array."""
        pool = self.user_negative_samples[user_id]
        if len(pool) == 0:
            raise ValueError(f"user {user_id} has no negative candidates to sample from")
        # Sample without replacement whenever the pool is large enough; a pool
        # smaller than n_negs would make that raise, so fall back to sampling
        # with replacement instead of aborting the whole batch.
        with_replacement = len(pool) < self.n_negs
        return np.random.choice(a=pool, size=self.n_negs, replace=with_replacement)

    def forward(self, user_ids):
        user_ids = user_ids.to('cpu').numpy()
        if len(user_ids) == 0:
            # Keep the documented (batch size, n_negs) shape for an empty batch.
            return torch.empty((0, self.n_negs), dtype=torch.long)
        negative_samples = np.array([self._sample_for_user(user_id) for user_id in user_ids])
        return torch.from_numpy(negative_samples)


### SGNS class

In [None]:
class SGNS(nn.Module):
    """
    Skip-Gram with Negative Sampling

    Args:
        - n_items: (int) total number of items (rows of each embedding table)
        - emb_dim: (int) embedding dimension
        - user_negative_samples: (Dict) per-user (hashed_ip) pool of negative item ids
        - n_negs: (int) number of negative samples per positive pair
        - device: (torch.device) device the module is moved to
    Shape:
        - Input: (torch.Tensor) input features, (hashed_ip, centre item id, context item id). Shape: (batch size, 3)
        - Output: (torch.Tensor) mean BCE loss of the positive pairs plus mean BCE loss
          of the sampled negative pairs. Shape: ()
    """
    def __init__(self, n_items, emb_dim, user_negative_samples, n_negs ,device=torch.device("cpu")):
        super(SGNS, self).__init__()

        # initialize Class attributes
        self.n_items = n_items
        self.emb_dim = emb_dim
        self.user_negative_samples = user_negative_samples
        self.n_negs = n_negs
        self.negative_sampler = Negative_Sampler(self.user_negative_samples, self.n_negs)

        # define embeddings
        # centre items
        self.w_item_embedding = nn.Embedding(self.n_items, self.emb_dim)
        # context items
        self.c_item_embedding = nn.Embedding(self.n_items, self.emb_dim)

        self.sigmoid = nn.Sigmoid()

        self.loss_fn = nn.BCELoss()

        self.apply(self._init_weights)
        self.device = device
        self.to(device)

    # initialize weights
    def _init_weights(self, module):
        """Re-initialise embedding tables from N(0, 0.01^2)."""
        if isinstance(module, nn.Embedding):
            normal_(module.weight.data, mean=0.0, std=0.01)

    def forward(self, input_feature):
        # No-op when the caller already moved the batch to the model's device.
        input_feature = input_feature.to(self.device)

        # Columns: user id, centre item, context item (positive sample).
        user_ids, w_item, c_item = input_feature.unbind(dim=-1)

        # The sampler works on CPU / NumPy, so its output is moved to the model's device.
        neg_c_items = self.negative_sampler(user_ids).to(self.device)

        # Embedding lookups already live on the module's device.
        w_item_e = self.w_item_embedding(w_item).unsqueeze(1)                # (B, 1, D)
        c_item_e = self.c_item_embedding(c_item).unsqueeze(-1)               # (B, D, 1)
        neg_c_items_e = self.c_item_embedding(neg_c_items).transpose(1, 2)   # (B, D, n_negs)

        # Positive pairs: sigmoid(w . c) should be close to 1.
        pos_output = torch.bmm(w_item_e, c_item_e).squeeze(-1).squeeze(-1)   # (B,)
        pos_output = self.sigmoid(pos_output)
        # ones_like / zeros_like create the targets directly on the right device.
        pos_loss = self.loss_fn(pos_output, torch.ones_like(pos_output))

        # Negative pairs: sigmoid(w . c_neg) should be close to 0.
        neg_output = torch.bmm(w_item_e, neg_c_items_e).squeeze(1)           # (B, n_negs)
        neg_output = self.sigmoid(neg_output)
        neg_loss = self.loss_fn(neg_output, torch.zeros_like(neg_output))

        return pos_loss + neg_loss


### utils

In [None]:
def train_loop(dataloader, model, optimizer):
    """
    Run one training epoch.

    :param dataloader: yields 1-tuples `(X,)`; `X` is passed to `model` as-is
    :param model: module whose forward pass returns the scalar loss for a batch
    :param optimizer: optimizer over `model`'s parameters
    :return: (float) mean of the per-batch losses over the epoch
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)

    # Use the model's own device rather than a notebook-level global, so the
    # function does not depend on which cells were run before it.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    train_loss = 0

    for batch, (X,) in enumerate(dataloader):
        X = X.to(device)
        # Compute prediction and loss
        loss = model(X)
        batch_loss = loss.item()
        train_loss += batch_loss

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Progress line every 100 batches.
        if (batch+1) % 100 == 0:
            current = (batch+1) * len(X)
            print(f"Loss: {batch_loss:>7f} | [{current:>5d}/{size:>5d}]")
    train_loss /= num_batches

    return train_loss


def test_loop(dataloader, model):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss= 0

    with torch.no_grad():
        for X, in dataloader:
            X= X.to(device)
            loss = model(X)
            test_loss += loss.item()
    test_loss /= num_batches
    print(f"Test Error:\n\tAvg Loss: {test_loss:>8f}")
    return test_loss


def train_and_test(train_dataloader, test_dataloader, model, optimizer, epochs):
    """
    Alternate one training epoch and one evaluation pass, `epochs` times.

    :param train_dataloader: loader passed to `train_loop`
    :param test_dataloader: loader passed to `test_loop`
    :param model: model to train and evaluate
    :param optimizer: optimizer passed to `train_loop`
    :param epochs: (int) number of epochs
    :return: (list, list) per-epoch training losses and per-epoch test losses
    """
    history = {"train": [], "test": []}

    for epoch in tqdm(range(epochs)):
        print(f"Epoch {epoch+1}\n-------------------------------")
        history["train"].append(train_loop(train_dataloader, model, optimizer))
        history["test"].append(test_loop(test_dataloader, model))
        print("-------------------------------\n")
    print("Done!")

    return history["train"], history["test"]

### train and test

In [None]:
######## Hyperparameter ########

batch_size = 2048
data_shuffle = True   # shuffling applies to the training loader only
emb_dim = 512
epochs = 5
learning_rate = 0.001
gpu_idx = 0

# One embedding row per distinct catalogue item.
n_items = product_data['id'].nunique()

################################
#torch.cuda.empty_cache() # if necessary
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

# Use the selected GPU when available, otherwise fall back to CPU.
device = torch.device("cuda:{}".format(gpu_idx) if torch.cuda.is_available() else "cpu")

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=data_shuffle)
# Evaluation gains nothing from shuffling; a fixed order keeps the batches
# identical across epochs and does not consume random state.
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

model = SGNS(n_items, emb_dim, user_negative_samples, negative, device)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, amsgrad=True)

In [None]:
# Train for `epochs` epochs; both results are per-epoch lists of average loss.
train_loss, test_loss = train_and_test(
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    model=model,
    optimizer=optimizer,
    epochs=epochs,
)

### get_most_similar

In [None]:
def get_most_similar(
    model: "SGNS",
    product_id: int,
    n: int,
    n_products: int
) -> List:
    """
    Return the n products whose centre-item embeddings have the highest cosine
    similarity to the embedding of `product_id`.

    The computation runs on whatever device the embedding table lives on, so it
    works on CPU-only machines as well as on GPU.

    :param model: (SGNS) SGNS model; only `model.w_item_embedding` is read
    :param product_id: (int) index of the reference product
    :param n: (int) number of products to return
    :param n_products: (int) candidates are the indices 0 .. n_products-1
        (capped at the size of the embedding table)
    :return: (List) up to n `(product index, cosine similarity)` tuples, most
        similar first; `product_id` itself is never included
    :raises IndexError: if `product_id` is not a valid embedding index
    """
    embeddings = model.w_item_embedding.weight
    if not 0 <= product_id < embeddings.shape[0]:
        raise IndexError(
            f"product_id {product_id} is out of range for {embeddings.shape[0]} embeddings"
        )

    # A single batched similarity instead of one embedding lookup per candidate.
    with torch.no_grad():
        query = embeddings[product_id].unsqueeze(0)      # (1, D)
        candidates = embeddings[:n_products]              # (n_products, D)
        similarities = torch.nn.functional.cosine_similarity(candidates, query, dim=1).tolist()

    similarity_list = [
        (pid, similarity)
        for pid, similarity in enumerate(similarities)
        if pid != product_id
    ]
    # Stable sort: candidates with equal similarity stay in ascending index order.
    similarity_list.sort(key=lambda x: x[1], reverse=True)

    return similarity_list[:n]

In [None]:
n = 10
product_id = 22
# Candidates are embedding rows, and those are indexed by the catalogue index
# space (`idx_to_item`), not by the products seen in `df`. The number of
# distinct products in `df` is therefore not a valid upper bound for the
# indices to search; the size of the index table is.
n_products = len(idx_to_item)

# Title of the reference product.
print(product_data[product_data['id']==idx_to_item[product_id]]['title'])

similar_products = get_most_similar(model, product_id, n, n_products)
for similar_pid, similarity in similar_products:
    # Translate the embedding index back to the catalogue id to look up the title.
    title = product_data[product_data['id']==idx_to_item[similar_pid]]['title']

    print(f'\ntitle: {title} | cosine similarity: {similarity}\n')