# CDAE - Movie Recommendation Top-k

## Library

In [1]:
import os
import sys
import random
import time

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import numpy as np
import pandas as pd
from scipy import sparse

## Config

In [2]:
import yaml

def load_config(config_file):
    with open(config_file, 'r') as stream:
        try:
            config = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)
    return config

cfg = load_config('config.yaml')

In [3]:
if torch.cuda.is_available():
    print('CUDA is available')
    cfg['device'] = True

device = torch.device('cuda' if cfg['device'] else 'cpu')
device

CUDA is available


device(type='cuda')

In [4]:
random.seed(cfg['seed'])
np.random.seed(cfg['seed'])
torch.manual_seed(cfg['seed'])


<torch._C.Generator at 0x7fe5fb439590>

In [5]:
raw_data = pd.read_csv(os.path.join(cfg['DATA_DIR'], cfg['data']), header=0, usecols=[0, 1])

In [6]:
raw_data['user'].nunique()

from sklearn.model_selection import train_test_split


# train, test = train_test_split(raw_data, test_size=0.2, random_state=cfg['seed'])

In [10]:
from typing import Iterable, Tuple

import numpy as np
import pandas as pd
import torch.utils.data as data
from sklearn.model_selection import train_test_split


class CDAEData(data.Dataset):
    def __init__(self, data: np.ndarray) -> None:
        super(CDAEData, self).__init__()

        self.data = data

    def __len__(self) -> int:
        return len(self.data)

    def __getitem__(self, index) -> None:
        return index, self.data[index]

def get_count(data: np.ndarray, id: str) -> pd.DataFrame:
    r"""Helper function to get the count of each id.

    Parameters
    ----------
    data : np.ndarray
        Data which consists of user_id, item_id.

    id : str
        The name of the id.

    Returns
    -------
    pd.DataFrame
        The count of each id.
    """
    count_groupby = data[[id]].groupby(id, as_index=False).size()

    return count_groupby


def preprocess(data: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray, int, int]:
    r"""Helper function to preprocess data.

    Parameters
    ----------
    data : pd.DataFrame
        Data which consists of user_id, item_id.

    Returns
    -------
    Tuple[np.ndarray, np.ndarray, int, int]
    """
    num_users = data["user"].nunique()
    num_items = data["item"].nunique()

    user_activity = get_count(data=data, id="user")
    item_popularity = get_count(data=data, id="item")

    # Shuffle User Indices
    unique_uid = user_activity['user'].unique()
    unique_sid = item_popularity['item'].unique()

    # Indexing
    user2id = dict((uid, i) for (i, uid) in enumerate(unique_uid))
    item2id = dict((sid, i) for (i, sid) in enumerate(unique_sid))

    # # np.random.seed(seed)
    # idx_perm = np.random.permutation(unique_uid.size)
    # unique_uid = unique_uid[idx_perm]

    # # Split Train / Test User
    # tr_users = unique_uid[:-3000]
    # te_users = unique_uid[-3000:]

    # train = data.loc[raw_data['user'].isin(tr_users)]
    # test = data.loc[raw_data['user'].isin(te_users)]

    # Create a directory to save preprocessed data
    pro_dir = os.path.join('pro_sg')

    if not os.path.exists(pro_dir):
        os.makedirs(pro_dir)

    with open(os.path.join(pro_dir, 'unique_sid.txt'), 'w') as f:
        for sid in unique_sid:
            f.write('%s\n' % sid)

    with open(os.path.join(pro_dir, 'unique_uid.txt'), 'w') as f:
        for uid in unique_uid:
            f.write('%s\n' % uid)

    train = numerize(data, user2id, item2id)
    # train = numerize(train, user2id, item2id)
    # test = numerize(test, user2id, item2id)

    train_mat = _to_matrix(data=train, num_users=num_users, num_items=num_items)
    test_mat = _to_matrix(data=train, num_users=num_users, num_items=num_items)

    return train_mat, test_mat, num_users, num_items

def numerize(
        tp: pd.DataFrame,
        user2id: dict,
        item2id: dict,
        ) -> pd.DataFrame:
    r"""Helper function to numerize user and item ids.

    Parameters
    ----------
    tp : pd.DataFrame
        Data which consists of user_id, item_id.

    user2id : dict
        A dictionary which maps user_id to user index.

    item2id : dict
        A dictionary which maps item_id to item index.

    Returns
    -------
    pd.DataFrame
        Numerized data.
    """
    uid = tp['user'].apply(lambda x: user2id[x])
    sid = tp['item'].apply(lambda x: item2id[x])
    return pd.DataFrame(data={'uid': uid, 'sid': sid}, columns=['uid', 'sid'])


def _to_matrix(
    data: Iterable,
    num_users: int = None,
    num_items: int = None,
) -> np.ndarray:
    r"""Helper function to convert an iterable object into the form of matrix.

    Parameters
    ----------
    data : Iterable
        Movie Lens prepocessed data which consists of user, item.
    num_users : int, optional
        The number of users, by default None
    num_items : int, optional
        The number of items, by default None

    Returns
    -------
    np.ndarray
        Rating matrix
    """
    if isinstance(data, np.ndarray):
        data = np.array(data, dtype=int)

    # initialize a matrix
    mat = np.zeros((num_users, num_items))

    for user, item in data[['uid', 'sid']].values:
        mat[user, item] = 1

    return mat

In [11]:
train_mat, test_mat, num_users, num_items = preprocess(raw_data)

In [12]:
train_set = CDAEData(train_mat)
test_set = CDAEData(test_mat)

In [13]:
train_loader = data.DataLoader(train_set, batch_size=cfg['batch_size'], shuffle=True)
test_loader = data.DataLoader(test_set, batch_size=cfg['batch_size'], shuffle=False)

In [14]:
class CDAE(nn.Module):
    r"""Collaborative Denoising Auto-Encoder

    Parameters
    ----------
    num_users : int
        _description_
    num_items : int
        _description_
    num_hidden_units : int
        _description_
    corruption_ratio : float
        _description_

    References
    ----------
    [1] Wu, Yao, et al. "Collaborative denoising auto-encoders for top-n
        recommender systems." Proceedings of the ninth ACM international
        conference on web search and data mining. 2016.

    """

    def __init__(
        self,
        num_users: int,
        num_items: int,
        num_hidden_units: int,
        corruption_ratio: float,
    ) -> None:
        super(CDAE, self).__init__()

        self.num_users = num_users
        self.num_items = num_items
        self.num_hidden_units = num_hidden_units
        self.corruption_ratio = corruption_ratio

        # CDAE consists of user embedding, encoder, decoder
        self.user_embedding = nn.Embedding(num_users, num_hidden_units)
        self.encoder = nn.Linear(num_items, num_hidden_units)
        self.decoder = nn.Linear(num_hidden_units, num_items)

        # Set to use GPU
        self.cuda()

    def forward(
        self, user_idx: torch.Tensor, matrix: torch.Tensor
    ) -> torch.Tensor:
        # Apply corruption
        matrix = F.dropout(
            matrix, p=self.corruption_ratio, training=self.training
        )
        encoder = torch.tanh(
            self.encoder(matrix) + self.user_embedding(user_idx)
        )
        return self.decoder(encoder)

    def train_one_epoch(
        self, train_loader: data.DataLoader, optimizer: optim.Optimizer
    ) -> torch.Tensor:
        r"""Train a single epoch.

        Parameters
        ----------
        data_loader : data.DataLoader
            Training data loader
        optimizer : optim.Optimizer
            An optimizer

        Returns
        -------
        torch.Tensor
            Loss value after training one epoch.
        """
        loss = 0
        # Turn training mode on
        self.train()

        # By using BCEWithLogitsLoss, the return value of 'forward()' method
        # is not using sigmoid function.
        loss_f = CDAELoss(model=self, lambda_reg=0.01)

        for (indices, input_mat) in train_loader:
            indices = indices.cuda()
            input_mat = input_mat.float().cuda()
            self.zero_grad()

            predict_mat = self.forward(user_idx=indices, matrix=input_mat)
            batch_loss = loss_f(input=predict_mat, target=input_mat)
            batch_loss.backward()
            optimizer.step()
            loss += batch_loss

        return loss / len(train_loader)

    def predict(self, train_loader: data.DataLoader) -> np.ndarray:
        r"""Predict items per users.
        Observations that already seen by each user are masked with `-inf`.

        Parameters
        ----------
        train_loader : data.DataLoader
            Training data loader

        Returns
        -------
        np.ndarray
            Prediction matrix which of dimension is same as training data.
        """
        with torch.no_grad():
            preds = np.zeros_like(train_loader.dataset.data)

            for (indices, input_mat) in train_loader:
                indices = indices.cuda()
                input_mat = input_mat.float().cuda()
                batch_pred = torch.sigmoid(self.forward(indices, input_mat))
                batch_pred = batch_pred.masked_fill(
                    input_mat.bool(), float("-inf")
                )

                indices = indices.detach().cpu().numpy()
                preds[indices] = batch_pred.detach().cpu().numpy()

        return preds

class CDAELoss(nn.Module):
    def __init__(self, model: nn.Module, lambda_reg=0.01):
        """
        Initialize the CDAE loss module with L2 regularization.

        Parameters:
        - model: The CDAE model instance to apply regularization.
        - lambda_reg: Regularization strength.
        """
        super(CDAELoss, self).__init__()
        self.model = model
        self.lambda_reg = lambda_reg
        self.bce_with_logits_loss = nn.BCEWithLogitsLoss()

    def forward(self, input, target):
        """
        Compute the CDAE loss as the sum of BCE loss and L2 regularization.

        Parameters:
        - input: The predicted values from the CDAE.
        - target: The true values (actual user-item interactions).

        Returns:
        - The computed loss value.
        """
        # Compute the binary cross-entropy loss
        bce_loss = self.bce_with_logits_loss(input, target)

        # Compute the L2 regularization term
        l2_reg = torch.tensor(0.).cuda()
        for param in self.model.parameters():
            l2_reg += torch.norm(param)**2
        l2_reg_loss = self.lambda_reg * l2_reg

        # Total loss
        total_loss = bce_loss + l2_reg_loss
        return total_loss


In [15]:
model = CDAE(
    num_users=num_users,
    num_items=num_items,
    num_hidden_units=cfg['num_hidden_units'],
    corruption_ratio=cfg['corruption_ratio'],
).to(device)

In [16]:
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [17]:
from tqdm import tqdm

for epoch in tqdm(range(cfg['n_epochs'])):
        loss = model.train_one_epoch(train_loader, optimizer)
        print(f"[Epoch {epoch}]:: Loss: {loss}")

  1%|▏         | 1/80 [00:02<03:18,  2.52s/it]

[Epoch 0]:: Loss: 113995.3515625


  2%|▎         | 2/80 [00:04<02:38,  2.03s/it]

[Epoch 1]:: Loss: 93897.765625


  4%|▍         | 3/80 [00:05<02:26,  1.90s/it]

[Epoch 2]:: Loss: 77537.8828125


  5%|▌         | 4/80 [00:07<02:19,  1.84s/it]

[Epoch 3]:: Loss: 64113.58984375


  6%|▋         | 5/80 [00:09<02:17,  1.84s/it]

[Epoch 4]:: Loss: 53045.1484375


  8%|▊         | 6/80 [00:11<02:13,  1.81s/it]

[Epoch 5]:: Loss: 43890.55859375


  9%|▉         | 7/80 [00:13<02:13,  1.83s/it]

[Epoch 6]:: Loss: 36303.33203125


 10%|█         | 8/80 [00:14<02:09,  1.80s/it]

[Epoch 7]:: Loss: 30007.07421875


 11%|█▏        | 9/80 [00:16<02:04,  1.76s/it]

[Epoch 8]:: Loss: 24778.48046875


 12%|█▎        | 10/80 [00:18<02:03,  1.76s/it]

[Epoch 9]:: Loss: 20435.57421875


 14%|█▍        | 11/80 [00:20<02:03,  1.79s/it]

[Epoch 10]:: Loss: 16828.98828125


 15%|█▌        | 12/80 [00:22<02:03,  1.81s/it]

[Epoch 11]:: Loss: 13835.4970703125


 16%|█▋        | 13/80 [00:23<02:03,  1.84s/it]

[Epoch 12]:: Loss: 11352.962890625


 18%|█▊        | 14/80 [00:25<02:00,  1.83s/it]

[Epoch 13]:: Loss: 9296.48046875


 19%|█▉        | 15/80 [00:27<02:00,  1.85s/it]

[Epoch 14]:: Loss: 7595.28857421875


 20%|██        | 16/80 [00:29<01:57,  1.84s/it]

[Epoch 15]:: Loss: 6190.28466796875


 21%|██▏       | 17/80 [00:31<01:56,  1.85s/it]

[Epoch 16]:: Loss: 5032.04541015625


 22%|██▎       | 18/80 [00:33<01:52,  1.82s/it]

[Epoch 17]:: Loss: 4079.205322265625


 24%|██▍       | 19/80 [00:34<01:50,  1.81s/it]

[Epoch 18]:: Loss: 3297.117431640625


 25%|██▌       | 20/80 [00:36<01:47,  1.80s/it]

[Epoch 19]:: Loss: 2656.76953125


 26%|██▋       | 21/80 [00:38<01:45,  1.79s/it]

[Epoch 20]:: Loss: 2133.874267578125


 28%|██▊       | 22/80 [00:40<01:45,  1.82s/it]

[Epoch 21]:: Loss: 1708.106689453125


 29%|██▉       | 23/80 [00:42<01:43,  1.82s/it]

[Epoch 22]:: Loss: 1362.4794921875


 30%|███       | 24/80 [00:43<01:40,  1.79s/it]

[Epoch 23]:: Loss: 1082.8116455078125


 31%|███▏      | 25/80 [00:45<01:36,  1.75s/it]

[Epoch 24]:: Loss: 857.2839965820312


 32%|███▎      | 26/80 [00:47<01:33,  1.73s/it]

[Epoch 25]:: Loss: 676.0653686523438


 34%|███▍      | 27/80 [00:49<01:33,  1.77s/it]

[Epoch 26]:: Loss: 530.9949951171875


 35%|███▌      | 28/80 [00:50<01:32,  1.78s/it]

[Epoch 27]:: Loss: 415.3156433105469


 36%|███▋      | 29/80 [00:52<01:29,  1.76s/it]

[Epoch 28]:: Loss: 323.4482421875


 38%|███▊      | 30/80 [00:54<01:26,  1.74s/it]

[Epoch 29]:: Loss: 250.8002471923828


 39%|███▉      | 31/80 [00:56<01:26,  1.76s/it]

[Epoch 30]:: Loss: 193.60238647460938


 40%|████      | 32/80 [00:57<01:24,  1.76s/it]

[Epoch 31]:: Loss: 148.77346801757812


 41%|████▏     | 33/80 [00:59<01:22,  1.76s/it]

[Epoch 32]:: Loss: 113.80338287353516


 42%|████▎     | 34/80 [01:01<01:20,  1.74s/it]

[Epoch 33]:: Loss: 86.65589141845703


 44%|████▍     | 35/80 [01:03<01:19,  1.76s/it]

[Epoch 34]:: Loss: 65.6859130859375


 45%|████▌     | 36/80 [01:04<01:17,  1.77s/it]

[Epoch 35]:: Loss: 49.57046890258789


 46%|████▋     | 37/80 [01:06<01:16,  1.79s/it]

[Epoch 36]:: Loss: 37.25102615356445


 48%|████▊     | 38/80 [01:08<01:14,  1.79s/it]

[Epoch 37]:: Loss: 27.883724212646484


 49%|████▉     | 39/80 [01:10<01:13,  1.78s/it]

[Epoch 38]:: Loss: 20.800626754760742


 50%|█████     | 40/80 [01:12<01:11,  1.78s/it]

[Epoch 39]:: Loss: 15.475110054016113


 51%|█████▏    | 41/80 [01:13<01:09,  1.79s/it]

[Epoch 40]:: Loss: 11.493955612182617


 52%|█████▎    | 42/80 [01:15<01:08,  1.80s/it]

[Epoch 41]:: Loss: 8.535489082336426


 54%|█████▍    | 43/80 [01:17<01:06,  1.81s/it]

[Epoch 42]:: Loss: 6.350606918334961


 55%|█████▌    | 44/80 [01:19<01:04,  1.78s/it]

[Epoch 43]:: Loss: 4.746762275695801


 56%|█████▋    | 45/80 [01:21<01:02,  1.78s/it]

[Epoch 44]:: Loss: 3.5771305561065674


 57%|█████▊    | 46/80 [01:22<01:00,  1.78s/it]

[Epoch 45]:: Loss: 2.730046510696411


 59%|█████▉    | 47/80 [01:24<00:59,  1.79s/it]

[Epoch 46]:: Loss: 2.1202964782714844


 60%|██████    | 48/80 [01:26<00:57,  1.81s/it]

[Epoch 47]:: Loss: 1.6846855878829956


 61%|██████▏   | 49/80 [01:28<00:55,  1.78s/it]

[Epoch 48]:: Loss: 1.3758313655853271


 62%|██████▎   | 50/80 [01:29<00:53,  1.78s/it]

[Epoch 49]:: Loss: 1.1582449674606323


 64%|██████▍   | 51/80 [01:31<00:51,  1.77s/it]

[Epoch 50]:: Loss: 1.0063291788101196


 65%|██████▌   | 52/80 [01:33<00:50,  1.79s/it]

[Epoch 51]:: Loss: 0.901112973690033


 66%|██████▋   | 53/80 [01:35<00:47,  1.78s/it]

[Epoch 52]:: Loss: 0.828715443611145


 68%|██████▊   | 54/80 [01:37<00:46,  1.78s/it]

[Epoch 53]:: Loss: 0.7794148325920105


 69%|██████▉   | 55/80 [01:38<00:44,  1.78s/it]

[Epoch 54]:: Loss: 0.7460851669311523


 70%|███████   | 56/80 [01:40<00:42,  1.78s/it]

[Epoch 55]:: Loss: 0.7237800359725952


 71%|███████▏  | 57/80 [01:42<00:41,  1.80s/it]

[Epoch 56]:: Loss: 0.7089977264404297


 72%|███████▎  | 58/80 [01:44<00:40,  1.83s/it]

[Epoch 57]:: Loss: 0.699174702167511


 74%|███████▍  | 59/80 [01:46<00:38,  1.82s/it]

[Epoch 58]:: Loss: 0.6928858160972595


 75%|███████▌  | 60/80 [01:47<00:36,  1.81s/it]

[Epoch 59]:: Loss: 0.6888647079467773


 76%|███████▋  | 61/80 [01:49<00:34,  1.84s/it]

[Epoch 60]:: Loss: 0.6861808896064758


 78%|███████▊  | 62/80 [01:51<00:31,  1.78s/it]

[Epoch 61]:: Loss: 0.6845281720161438


 79%|███████▉  | 63/80 [01:53<00:30,  1.77s/it]

[Epoch 62]:: Loss: 0.6834279894828796


 80%|████████  | 64/80 [01:55<00:28,  1.78s/it]

[Epoch 63]:: Loss: 0.6828171014785767


 81%|████████▏ | 65/80 [01:56<00:26,  1.79s/it]

[Epoch 64]:: Loss: 0.682465672492981


 82%|████████▎ | 66/80 [01:58<00:25,  1.80s/it]

[Epoch 65]:: Loss: 0.6822625398635864


 84%|████████▍ | 67/80 [02:00<00:23,  1.79s/it]

[Epoch 66]:: Loss: 0.6821296811103821


 85%|████████▌ | 68/80 [02:02<00:21,  1.76s/it]

[Epoch 67]:: Loss: 0.6820774674415588


 86%|████████▋ | 69/80 [02:04<00:19,  1.79s/it]

[Epoch 68]:: Loss: 0.6819056868553162


 88%|████████▊ | 70/80 [02:05<00:17,  1.79s/it]

[Epoch 69]:: Loss: 0.6819186210632324


 89%|████████▉ | 71/80 [02:07<00:16,  1.79s/it]

[Epoch 70]:: Loss: 0.6819902658462524


 90%|█████████ | 72/80 [02:09<00:14,  1.81s/it]

[Epoch 71]:: Loss: 0.6818622946739197


 91%|█████████▏| 73/80 [02:11<00:12,  1.80s/it]

[Epoch 72]:: Loss: 0.6820031404495239


 92%|█████████▎| 74/80 [02:12<00:10,  1.79s/it]

[Epoch 73]:: Loss: 0.6819106936454773


 94%|█████████▍| 75/80 [02:14<00:09,  1.81s/it]

[Epoch 74]:: Loss: 0.6819629669189453


 95%|█████████▌| 76/80 [02:16<00:07,  1.81s/it]

[Epoch 75]:: Loss: 0.681944727897644


 96%|█████████▋| 77/80 [02:18<00:05,  1.83s/it]

[Epoch 76]:: Loss: 0.6819998025894165


 98%|█████████▊| 78/80 [02:20<00:03,  1.84s/it]

[Epoch 77]:: Loss: 0.6819154024124146


 99%|█████████▉| 79/80 [02:22<00:01,  1.80s/it]

[Epoch 78]:: Loss: 0.6819136738777161


100%|██████████| 80/80 [02:23<00:00,  1.80s/it]

[Epoch 79]:: Loss: 0.6818909645080566





In [18]:
preds = model.predict(train_loader=train_loader)

In [19]:
def recall_at_k(actual: np.ndarray, pred: np.ndarray, top_k: int) -> float:
    r"""_summary_

    Parameters
    ----------
    actual : np.ndarray
        _description_
    pred : np.ndarray
        _description_
    top_k : int
        _description_

    Returns
    -------
    float
        _description_
    """
    actual = np.asarray(actual).astype(np.bool)

    # Get indices of the top_k predicted scores
    top_k_pred_indices = np.argsort(pred)[-top_k:]

    # Create a binary array of the same shape as actual, where only the top k predictions are True
    top_k_preds = np.zeros_like(pred, dtype=np.bool)
    top_k_preds[top_k_pred_indices] = True

    # Calculate the number of relevant items retrieved in the top k predictions
    relevant_and_retrieved = np.sum(actual & top_k_preds)

    # Calculate the total number of relevant items
    total_relevant = np.sum(actual)

    # Handle the case where there are no relevant items
    if total_relevant == 0:
        return 0.0

    # Calculate recall
    recall = relevant_and_retrieved / total_relevant

    return recall

def map_at_k(actual: np.ndarray, pred: np.ndarray, top_k: int) -> float:
    r"""Mean average precision at k.

    Parameters
    ----------
    actual : np.ndarray
        A matrix with actual values.
    pred : np.ndarray
        A matrix with predictions.
    top_k : int

    Returns
    -------
    float
        Mean average precision at k
    """
    if not _assert_same_dimension(actual, pred):
        raise AssertionError("Two input matrices should have same dimension.")

    map_ = 0

    num_users = len(pred)
    top_k_items = _topk(input=pred, k=top_k)

    for i in range(num_users):
        actual_item = set(actual[i].nonzero()[0])
        pred_item = top_k_items[i]

        map_ += _ap_at_k(actual=actual_item, pred=pred_item, top_k=top_k)

    return map_ / num_users

def _ap_at_k(actual: np.array, pred: np.array, top_k: int) -> float:
    r"""Avearge precision at k

    Parameters
    ----------
    actual : np.array
        A list of item are to be predicted
    pred : np.array
        A list of predicted items
    top_k : int

    Returns
    -------
    float
        Average precision at k
    """

    if len(pred) > top_k:
        pred = pred[:top_k]

    p, cnt = 0, 0

    if not actual:
        return 0.0

    for idx, item in enumerate(pred):
        if item in actual:
            cnt += 1
            p += cnt / (idx + 1)

    return 0.0 if cnt == 0 else p / min(top_k, len(actual))

def _topk(input: np.ndarray, k: int) -> np.ndarray:
    r"""Returns indices of k largest element of the given input matrix along
    the horizontal axis.

    Parameters
    ----------
    input : np.ndarray
        _description_
    k : int
        _description_

    Returns
    -------
    np.ndarray
        _description_
    """
    return np.argsort(input)[:, -k:][:, ::-1]

def _assert_same_dimension(actual: np.ndarray, pred: np.ndarray) -> bool:
    r"""Check the actual matrix and the prediction have same dimension.

    Parameters
    ----------
    actual : np.ndarray
        Actual values
    pred : np.ndarray
        Predicted values

    Returns
    -------
    bool
    """
    return actual.shape == pred.shape


In [20]:
eval_result = map_at_k(actual=test_mat, pred=preds, top_k=10)

print(f"MAP@10: {eval_result:.6f}")

MAP@10: 0.000000


In [21]:
eval_result = recall_at_k(actual=test_mat, pred=preds, top_k=10)

print(f"RECALL@10: {eval_result:.6f}")

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  actual = np.asarray(actual).astype(np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  top_k_preds = np.zeros_like(pred, dtype=np.bool)


RECALL@10: 0.219061


In [22]:
watchedm=raw_data.groupby('user')['item'].apply(list)

In [23]:
watchedm

user
11        [4643, 170, 531, 616, 2140, 2722, 2313, 2688, ...
14        [8961, 1396, 471, 2105, 1042, 1947, 1269, 2394...
18        [1952, 1283, 3507, 4280, 51084, 593, 318, 356,...
25        [261, 22, 2161, 3255, 372, 1093, 428, 175, 214...
31        [260, 1196, 1210, 7153, 4993, 5952, 1270, 5855...
                                ...                        
138473    [524, 3354, 1025, 6565, 69757, 2085, 32, 55282...
138475    [1639, 1673, 1148, 246, 2019, 1267, 1172, 1235...
138486    [2694, 1994, 2723, 441, 2288, 637, 2013, 2423,...
138492    [2115, 908, 58, 2700, 2599, 1500, 1358, 1288, ...
138493    [3174, 2872, 48780, 2662, 2840, 1566, 2857, 20...
Name: item, Length: 31360, dtype: object

In [24]:
unique_sid = pd.read_csv(os.path.join('pro_sg/unique_sid.txt'),sep=" ",header=None)
unique_uid = pd.read_csv(os.path.join('pro_sg/unique_uid.txt'),sep=" ",header=None)

In [25]:
id2show = dict((i, sid) for (i, sid) in enumerate(unique_sid.squeeze()))
id2profile = dict((i, pid) for (i, pid) in enumerate(unique_uid.squeeze()))

In [26]:
temp = pd.DataFrame(preds)

column=list(temp.columns)
origin_mid=[id2show[x] for x in column]

row=list(temp.index)
origin_uid=[id2profile[x] for x in row]

temp.columns=origin_mid
temp.index=origin_uid

In [27]:
from tqdm import tqdm
sumbission=dict()
sumbission={'user': [],'item': []}
sumbission

for row in tqdm(temp.iterrows(),total=31360):
    userid=row[0]
    movies=row[1]
    watchedmovies=watchedm.get(userid, [])

    for _ in range(10):
        sumbission['user'].append(userid)

    itemp=[]
    for movie in reversed(list(movies.sort_values().index)):
        if len(itemp)==10:
            break
        else:
            if movie not in watchedmovies:
                itemp.append(movie)

    sumbission['item']+=itemp

sumbission=pd.DataFrame(sumbission)
sumbission.sort_values('user', inplace=True)
sumbission.to_csv('submission.csv', index=False)

100%|██████████| 31360/31360 [01:02<00:00, 499.18it/s]
