In [45]:
import math
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
import torch
import torch.nn as nn
import implicit
from scipy import sparse
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split as data_split
import time

In [46]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [47]:
class NeuralNetwork(nn.Module):
    def __init__(self, n_items:int=6807, bn_dim:int=100,p:float=0.5):
        super().__init__()
        self.item_encoder = LabelEncoder()
        self.user_encoder = LabelEncoder()
        dims = [n_items, bn_dim, n_items]
        self.layers = nn.ModuleList(
            [nn.Linear(dims[i],dims[i+1]) for i in range(len(dims)-1)]
        )
        # self.linear = nn.Linear(n_items, bn_dim)
        self.drop = nn.Dropout(p)
        self.n_items = n_items

    def preprocess(self, df: pd.DataFrame):
        ucol, icol = df.columns
        items = self.item_encoder.fit_transform(df.loc[:,icol])
        users = self.user_encoder.fit_transform(df.loc[:,ucol])
        return users, items

    def forward(self, x) -> torch.Tensor:
        # with torch.no_grad():
        #     self.linear.weight.fill_diagonal_(0.)
        x = self.drop(x)
        for layer in self.layers:
            x = layer(x)
        # x = self.linear(x)
        return x

    def item_ids2tensor(self, input_ids:list) -> torch.Tensor:
        input_tensor = np.zeros((1, self.n_items), dtype=float)
        items = self.item_encoder.transform(input_ids)
        input_tensor[0,items] = 1.
        input_tensor = torch.FloatTensor(input_tensor).to(device)
        return input_tensor

    def predict_k(self, input_ids:list, k:int=10) -> pd.DataFrame:
        input_tensor = self.item_ids2tensor(input_ids)
        predictions = self.forward(input_tensor)[0].detach().cpu().numpy() # [[predictions]]
        predictions[self.item_encoder.transform(input_ids)] = -np.inf
        result_id = np.argpartition(predictions, -k)[-k:]
        sorted_order = np.argsort(-predictions[result_id])
        result_id = result_id[sorted_order]
        result_pred = predictions[result_id]
        return self.item_encoder.inverse_transform(result_id), result_pred


In [48]:
def split_validation(matrix:sparse.csr_matrix, n:int=10, n_item:int=6807):
    input_rows = []
    input_cols = []
    target_rows = []
    target_cols = []
    n_rows = matrix.shape[0]
    for i, item_seq in tqdm(enumerate(matrix.toarray()),total=n_rows):
        items = np.where(item_seq == 1)[0]
        target_idx = np.random.choice(len(items),size=(n),replace=False)
        # 레이블 저장
        target_items = items[target_idx]
        target_rows.append([i]*n)
        target_cols.append(target_items)
        
        # 입력 데이터 마스킹으로 저장
        mask = np.ones(len(items), dtype=bool)
        mask[target_idx] = False
        input_items = items[mask]

        input_rows.append([i]*len(input_items))
        input_cols.append(input_items)
    input_rows = np.concatenate(input_rows)
    input_cols = np.concatenate(input_cols)
    input_vals = np.ones(len(input_rows))
    target_rows = np.concatenate(target_rows)
    target_cols = np.concatenate(target_cols)
    target_vals = np.ones(len(target_rows))

    return sparse.csr_matrix((input_vals,(input_rows,input_cols)),shape=(n_rows,n_item)),sparse.csr_matrix((target_vals,(target_rows,target_cols)),shape=(n_rows,n_item))

In [49]:
def NDCG_binary_at_k_batch(X_pred, heldout_batch, k=100):
    '''
    Normalized Discounted Cumulative Gain@k for binary relevance
    ASSUMPTIONS: all the 0's in heldout_data indicate 0 relevance
    '''
    batch_users = X_pred.shape[0]
    idx_topk_part = np.argpartition(-X_pred, k, axis=1)
    topk_part = X_pred[np.arange(batch_users)[:, np.newaxis],
                       idx_topk_part[:, :k]]
    idx_part = np.argsort(-topk_part, axis=1)

    idx_topk = idx_topk_part[np.arange(batch_users)[:, np.newaxis], idx_part]

    tp = 1. / np.log2(np.arange(2, k + 2))

    DCG = (heldout_batch[np.arange(batch_users)[:, np.newaxis],
                         idx_topk].toarray() * tp).sum(axis=1)
    IDCG = np.array([(tp[:min(n, k)]).sum()
                     for n in heldout_batch.getnnz(axis=1)])
    return DCG / IDCG


def Recall_at_k_batch(X_pred, heldout_batch, k=100):
    batch_users = X_pred.shape[0]

    idx = np.argpartition(-X_pred, k, axis=1)
    X_pred_binary = np.zeros_like(X_pred, dtype=bool)
    X_pred_binary[np.arange(batch_users)[:, np.newaxis], idx[:, :k]] = True

    X_true_binary = (heldout_batch > 0).toarray()
    tmp = (np.logical_and(X_true_binary, X_pred_binary).sum(axis=1)).astype(
        np.float32)
    recall = tmp / np.minimum(k, X_true_binary.sum(axis=1))
    return recall

In [50]:
def naive_sparse2tensor(data):
    return torch.FloatTensor(data.toarray())

In [51]:
df = pd.read_csv('../../data/train/train_ratings.csv')
df = df[['user','item']]
model = NeuralNetwork()
ui, ii = model.preprocess(df)

In [52]:
values = (np.ones(len(ui)))
full_matrix = sparse.csr_matrix((values,(ui,ii)))
# Split into train and valid set
train_matrix, test_full = data_split(full_matrix, train_size=0.8)
valid_matrix, test_matrix = data_split(test_full, train_size=0.5)
# Split to valid and test set
valid_input, valid_target = split_validation(valid_matrix)
test_input, test_target = split_validation(test_matrix)

100%|██████████| 3136/3136 [00:00<00:00, 19736.83it/s]
100%|██████████| 3136/3136 [00:00<00:00, 19622.62it/s]


In [53]:
from torch.functional import F
def loss_function_dae(recon_x, x):
    BCE = -torch.mean(torch.sum(F.log_softmax(recon_x, 1) * x, -1))
    return BCE

In [54]:
batch_size = 500
epochs = 20
device = "cuda" if torch.cuda.is_available() else "cpu"
criterion = nn.MSELoss()
# criterion = loss_function_dae
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
log_interval = 100
save = './weights/model.pt'

In [55]:
def train(model, epoch, train_data):
    # Turn on training mode
    model.train()
    train_loss = 0.0
    n_rows = train_data.shape[0]
    idx_list = list(range(train_data.shape[0]))
    start_time = time.time()

    np.random.shuffle(idx_list)
    
    for batch_idx, start_idx in enumerate(range(0, n_rows, batch_size)):
        end_idx = min(start_idx + batch_size, n_rows)
        data = train_data[idx_list[start_idx:end_idx]]
        data = naive_sparse2tensor(data).to(device)
        optimizer.zero_grad()

        recon_batch = model(data)
        loss = criterion(recon_batch, data)

        loss.backward()
        train_loss += loss.item()
        optimizer.step()


        if batch_idx % log_interval == 0 and batch_idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:4d}/{:4d} batches | ms/batch {:4.2f} | '
                    'loss {:4.2f}'.format(
                        epoch, batch_idx, len(range(0, n_rows, batch_size)),
                        elapsed * 1000 / log_interval,
                        train_loss / log_interval))
            

            start_time = time.time()
            train_loss = 0.0

In [56]:
def evaluate(model, inputs:sparse.csr_matrix,targets:sparse.csr_matrix, k:int=10):
    recalls = []
    ndcgs = []
    total_loss = 0.0
    n_rows = inputs.shape[0]
    idx_list = list(range(inputs.shape[0]))
    
    model.eval()
    with torch.no_grad():
        for start_idx in range(0, n_rows, batch_size):
            end_idx = min(start_idx + batch_size, n_rows)
            data = inputs[idx_list[start_idx:end_idx]]
            heldout_data = targets[idx_list[start_idx:end_idx]]

            data_tensor = naive_sparse2tensor(data).to(device)
            recon_batch = model(data_tensor)

            loss = criterion(recon_batch, data_tensor)

            total_loss += loss.item()

            # Exclude examples from training set
            recon_batch = recon_batch.cpu().numpy()
            recon_batch[data.nonzero()] = -np.inf

            n10 = NDCG_binary_at_k_batch(recon_batch, heldout_data, 10)
            r10 = Recall_at_k_batch(recon_batch, heldout_data, 10)

            ndcgs.append(n10)
            recalls.append(r10)

    total_loss /= len(range(0, n_rows, batch_size))
    recalls = np.concatenate(recalls)
    ndcgs = np.concatenate(ndcgs)
    return total_loss, np.mean(recalls), np.mean(ndcgs)


In [57]:
def run(model, train_matrix, valid_input, valid_target, test_input, test_target):
    n_rows = train_matrix.shape[0]
    idxlist = list(range(n_rows))

    best_r10 = -np.inf
    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()
        train(model, epoch, train_matrix)
        val_loss, r10, n10= evaluate(model, valid_input, valid_target)
        print('-' * 102)
        print('| end of epoch {:3d} | time: {:4.4f}s | valid loss {:4.4f} | '
                'n10 {:5.4f} | r10 {:5.4f} |'.format(
                    epoch, time.time() - epoch_start_time, val_loss,
                    n10, r10))
        print('-' * 102)

        n_iter = epoch * len(range(0, n_rows, batch_size))


        # Save the self.model if the r10 is the best we've seen so far.
        if r10 > best_r10:
            with open(save, 'wb') as f:
                torch.save(model, f)
            best_r10 = r10


    # Load the best saved self.model.
    with open(save, 'rb') as f:
        model = torch.load(f)

    # Run on test data.
    test_loss, r10, n10 = evaluate(model, test_input, test_target)
    print('=' * 102)
    print('| End of training | test loss {:4.4f} | n10 {:4.4f} | r10 {:4.4f} |'.format(test_loss, n10, r10))
    print('=' * 102)


In [58]:
model = model.to(device)
run(model, train_matrix, valid_input, valid_target, test_input, test_target)

------------------------------------------------------------------------------------------------------
| end of epoch   1 | time: 1.3794s | valid loss 0.0178 | n10 0.0961 | r10 0.0857 |
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
| end of epoch   2 | time: 1.4822s | valid loss 0.0163 | n10 0.1378 | r10 0.1203 |
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
| end of epoch   3 | time: 1.4164s | valid loss 0.0156 | n10 0.1582 | r10 0.1365 |
------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------
| end of epoch   4 | time: 1.4

In [59]:
with open('./weights/model.pt', 'rb') as f:
    model = torch.load(f)

In [33]:
device = 'cuda'

In [43]:
df

Unnamed: 0,user,item
0,11,4643
1,11,170
2,11,531
3,11,616
4,11,2140
...,...,...
5154466,138493,44022
5154467,138493,4958
5154468,138493,68319
5154469,138493,40819


In [60]:
model.predict_k([4643])

(array([ 2571,   318,  2959,   356,   296, 58559,   593,  2858,  4993,
         7153]),
 array([0.21931747, 0.19804136, 0.19775712, 0.18754138, 0.18538935,
        0.14561966, 0.14055654, 0.13715774, 0.12422715, 0.11471848],
       dtype=float32))

In [61]:
np.savetxt('item_classes.txt',model.item_encoder.classes_,fmt="%d")
np.savetxt('user_classes.txt',model.user_encoder.classes_,fmt="%d")

In [76]:
np.loadtxt('item_classes.txt')

dtype('float64')

In [None]:
LabelEncoder().fit()

In [74]:
model.item_encoder.classes_

array([     1,      2,      3, ..., 118997, 119141, 119145])

In [62]:
torch.save(model.state_dict(),'./weights/state.pt')