In [1]:
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
# pip install transformers
from transformers import BertTokenizer, BertModel
# 만약 주피터 노트북에서 아래와 관계있는 에러가 발생한다면
# pip install ipywidgets

In [16]:
main_df = pd.read_csv('/opt/ml/input/data/train/train_ratings.csv') # user-item-time
title_df = pd.read_csv('/opt/ml/input/data/train/titles.tsv', sep='\t') # item-title
year_df = pd.read_csv('/opt/ml/input/data/train/years.tsv', sep='\t') # item-year
director_df = pd.read_csv('/opt/ml/input/data/train/directors.tsv', sep='\t') # item-director
genre_name_df = pd.read_csv('/opt/ml/input/data/train/genres.tsv', sep='\t') # item-genre(name)
writer_df = pd.read_csv('/opt/ml/input/data/train/writers.tsv', sep='\t') # item-writer

In [17]:
no_dir_item_list = list(set(main_df['item']) - set(director_df['item']))
no_dir_item_df = pd.DataFrame([x for x in zip(no_dir_item_list, ['nm0000000']*len(no_dir_item_list))])
no_dir_item_df.columns=director_df.columns # 컬럼명 동일하게
director_df = pd.concat([director_df, no_dir_item_df]) # 기존 director_df 뒤에 감독없는 영화 추가

In [18]:
user_direct = main_df.merge(director_df, how='left', on='item')

In [19]:
user_direct

Unnamed: 0,user,item,time,director
0,11,4643,1230782529,nm0000318
1,11,170,1230782534,nm0812200
2,11,531,1230782539,nm0002140
3,11,616,1230782542,nm0718627
4,11,2140,1230782563,nm0000568
...,...,...,...,...
5708948,138493,44022,1260209449,nm0757858
5708949,138493,4958,1260209482,nm0601382
5708950,138493,68319,1260209720,nm0004303
5708951,138493,40819,1260209726,nm0003506


In [20]:
user_direct = user_direct.drop(columns=['time'], axis=1)

In [21]:
user_direct['rating'] = 1

In [22]:
# 3. Negative instance 생성 : 각 유저별 true:false = 5:5, 20분정도 소요
print("Create Nagetive instances")
# num_negative = 50
user_group_dfs = list(user_direct.groupby('user')['item'])
first_row = True
user_neg_dfs = pd.DataFrame()
items = set(user_direct.loc[:, 'item'])

for u, u_items in tqdm(user_group_dfs):
    u_items = set(u_items)
    i_user_neg_item = np.random.choice(list(items - u_items), len(u_items), replace=False)
    
    i_user_neg_df = pd.DataFrame({'user': [u]*len(u_items), 'item': i_user_neg_item, 'rating': [0]*len(u_items)})
    if first_row == True:
        user_neg_dfs = i_user_neg_df
        first_row = False
    else:
        user_neg_dfs = pd.concat([user_neg_dfs, i_user_neg_df], axis = 0, sort=False)

user_direct = pd.concat([user_direct, user_neg_dfs], axis = 0, sort=False)
user_direct = user_direct.drop(columns=['director'])
user_direct = user_direct.merge(director_df, how='left', on='item')
user_direct.to_csv("user_direct_neg_sampling.csv", index=False)
user_direct.isna().sum()

Create Nagetive instances


  2%|▏         | 695/31360 [00:01<00:59, 511.67it/s]


KeyboardInterrupt: 

In [4]:
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(device)

cuda


In [3]:
user_direct = pd.read_csv('./user_direct_neg_sampling.csv')

In [None]:
user_direct

In [5]:
def load_data(basepath):
#     path1 = os.path.join(basepath, "train_data.csv")
#     path2 = os.path.join(basepath, "test_data.csv")
#     data1 = pd.read_csv(path1)
#     data2 = pd.read_csv(path2)

#     data = pd.concat([data1, data2])
#     data.drop_duplicates(
#         subset=["userID", "assessmentItemID"], keep="last", inplace=True
#     )

    data = pd.read_csv(basepath)
    return data


def separate_data(data):
    train_data = data[data.rating >= 0]
    test_data = data[data.rating < 0]

    return train_data, test_data


def indexing_data(data):
    userid, itemid = (
        sorted(list(set(data.user))),
        sorted(list(set(data.director))),
    )
    n_user, n_item = len(userid), len(itemid)

    userid_2_index = {v: i for i, v in enumerate(userid)}
    itemid_2_index = {v: i + n_user for i, v in enumerate(itemid)}
    id_2_index = dict(userid_2_index, **itemid_2_index)

    return id_2_index


def process_data(data, id_2_index, device):
    edge, label = [], []
    for user, item, acode in zip(data.user, data.director, data.rating):
        uid, iid = id_2_index[user], id_2_index[item]
        edge.append([uid, iid])
        label.append(acode)

    edge = torch.LongTensor(edge).T
    label = torch.LongTensor(label)

    return dict(edge=edge.to(device), label=label.to(device))

In [6]:
def prepare_dataset(device, basepath, verbose=True, logger=None):
    data = load_data(basepath)
    train_data, test_data = separate_data(data)
    id2index = indexing_data(data)
    train_data_proc = process_data(train_data, id2index, device)
    test_data_proc = process_data(test_data, id2index, device)

    # if verbose:
    #     print_data_stat(train_data, "Train", logger=logger)
    #     print_data_stat(test_data, "Test", logger=logger)

    return train_data_proc, test_data_proc, len(id2index), id2index 


In [7]:
# data prepare
train_data, test_data, n_node, edge_index = prepare_dataset(
    device, './user_direct_neg_sampling.csv', verbose=True
)

  edge = torch.LongTensor(edge).T


In [8]:
n_node

32701

In [None]:
from typing import Optional, Union

import torch
import torch.nn.functional as F
from torch import Tensor
from torch.nn import Embedding, ModuleList
from torch.nn.modules.loss import _Loss
from torch_sparse import SparseTensor

from torch_geometric.nn.conv import LGConv
from torch_geometric.typing import Adj, OptTensor


class LightGCN(torch.nn.Module):
    r"""The LightGCN model from the `"LightGCN: Simplifying and Powering
    Graph Convolution Network for Recommendation"
    <https://arxiv.org/abs/2002.02126>`_ paper.

    :class:`~torch_geometric.nn.models.LightGCN` learns embeddings by linearly
    propagating them on the underlying graph, and uses the weighted sum of the
    embeddings learned at all layers as the final embedding

    .. math::
        \textbf{x}_i = \sum_{l=0}^{L} \alpha_l \textbf{x}^{(l)}_i,

    where each layer's embedding is computed as

    .. math::
        \mathbf{x}^{(l+1)}_i = \sum_{j \in \mathcal{N}(i)}
        \frac{1}{\sqrt{\deg(i)\deg(j)}}\mathbf{x}^{(l)}_j.

    Two prediction heads and trainign objectives are provided:
    **link prediction** (via
    :meth:`~torch_geometric.nn.models.LightGCN.link_pred_loss` and
    :meth:`~torch_geometric.nn.models.LightGCN.predict_link`) and
    **recommendation** (via
    :meth:`~torch_geometric.nn.models.LightGCN.recommendation_loss` and
    :meth:`~torch_geometric.nn.models.LightGCN.recommend`).

    .. note::

        Embeddings are propagated according to the graph connectivity specified
        by :obj:`edge_index` while rankings or link probabilities are computed
        according to the edges specified by :obj:`edge_label_index`.

    Args:
        num_nodes (int): The number of nodes in the graph.
        embedding_dim (int): The dimensionality of node embeddings.
        num_layers (int): The number of
            :class:`~torch_geometric.nn.conv.LGConv` layers.
        alpha (float or Tensor, optional): The scalar or vector specifying the
            re-weighting coefficients for aggregating the final embedding.
            If set to :obj:`None`, the uniform initialization of
            :obj:`1 / (num_layers + 1)` is used. (default: :obj:`None`)
        **kwargs (optional): Additional arguments of the underlying
            :class:`~torch_geometric.nn.conv.LGConv` layers.
    """
    def __init__(
        self,
        num_nodes: int,
        embedding_dim: int,
        num_layers: int,
        alpha: Optional[Union[float, Tensor]] = None,
        **kwargs,
    ):
        super().__init__()

        self.num_nodes = num_nodes
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers

        if alpha is None:
            alpha = 1. / (num_layers + 1)

        if isinstance(alpha, Tensor):
            assert alpha.size(0) == num_layers + 1
        else:
            alpha = torch.tensor([alpha] * (num_layers + 1))
        self.register_buffer('alpha', alpha)

        self.embedding = Embedding(num_nodes, embedding_dim)
        self.convs = ModuleList([LGConv(**kwargs) for _ in range(num_layers)])

        self.reset_parameters()

def reset_parameters(self):
        torch.nn.init.xavier_uniform_(self.embedding.weight)
        for conv in self.convs:
            conv.reset_parameters()


def get_embedding(self, edge_index: Adj) -> Tensor:
        x = self.embedding.weight
        out = x * self.alpha[0]

        for i in range(self.num_layers):
            x = self.convs[i](x, edge_index)
            out = out + x * self.alpha[i + 1]

        return out


def forward(self, edge_index: Adj,
                edge_label_index: OptTensor = None) -> Tensor:
        r"""Computes rankings for pairs of nodes.

        Args:
            edge_index (Tensor or SparseTensor): Edge tensor specifying the
                connectivity of the graph.
            edge_label_index (Tensor, optional): Edge tensor specifying the
                node pairs for which to compute rankings or probabilities.
                If :obj:`edge_label_index` is set to :obj:`None`, all edges in
                :obj:`edge_index` will be used instead. (default: :obj:`None`)
        """
        if edge_label_index is None:
            if isinstance(edge_index, SparseTensor):
                edge_label_index = torch.stack(edge_index.coo()[:2], dim=0)
            else:
                edge_label_index = edge_index

        out = self.get_embedding(edge_index)

        out_src = out[edge_label_index[0]]
        out_dst = out[edge_label_index[1]]
        return (out_src * out_dst).sum(dim=-1)


def predict_link(self, edge_index: Adj, edge_label_index: OptTensor = None,
                     prob: bool = False) -> Tensor:
        r"""Predict links between nodes specified in :obj:`edge_label_index`.

        Args:
            prob (bool): Whether probabilities should be returned. (default:
                :obj:`False`)
        """
        pred = self(edge_index, edge_label_index).sigmoid()
        return pred if prob else pred.round()


def recommend(self, edge_index: Adj, src_index: OptTensor = None,
                  dst_index: OptTensor = None, k: int = 1) -> Tensor:
        r"""Get top-:math:`k` recommendations for nodes in :obj:`src_index`.

        Args:
            src_index (Tensor, optional): Node indices for which
                recommendations should be generated.
                If set to :obj:`None`, all nodes will be used.
                (default: :obj:`None`)
            dst_index (Tensor, optional): Node indices which represent the
                possible recommendation choices.
                If set to :obj:`None`, all nodes will be used.
                (default: :obj:`None`)
            k (int, optional): Number of recommendations. (default: :obj:`1`)
        """
        out_src = out_dst = self.get_embedding(edge_index)

        if src_index is not None:
            out_src = out_src[src_index]

        if dst_index is not None:
            out_dst = out_dst[dst_index]

        pred = out_src @ out_dst.t()
        top_index = pred.topk(k, dim=-1).indices

        if dst_index is not None:  # Map local top-indices to original indices.
            top_index = dst_index[top_index.view(-1)].view(*top_index.size())

        return top_index


def link_pred_loss(self, pred: Tensor, edge_label: Tensor,
                       **kwargs) -> Tensor:
        r"""Computes the model loss for a link prediction objective via the
        :class:`torch.nn.BCEWithLogitsLoss`.

        Args:
            pred (Tensor): The predictions.
            edge_label (Tensor): The ground-truth edge labels.
            **kwargs (optional): Additional arguments of the underlying
                :class:`torch.nn.BCEWithLogitsLoss` loss function.
        """
        loss_fn = torch.nn.BCEWithLogitsLoss(**kwargs)
        return loss_fn(pred, edge_label.to(pred.dtype))


def recommendation_loss(self, pos_edge_rank: Tensor, neg_edge_rank: Tensor,
                            lambda_reg: float = 1e-4, **kwargs) -> Tensor:
        r"""Computes the model loss for a ranking objective via the Bayesian
        Personalized Ranking (BPR) loss.

        .. note::

            The i-th entry in the :obj:`pos_edge_rank` vector and i-th entry
            in the :obj:`neg_edge_rank` entry must correspond to ranks of
            positive and negative edges of the same entity (*e.g.*, user).

        Args:
            pos_edge_rank (Tensor): Positive edge rankings.
            neg_edge_rank (Tensor): Negative edge rankings.
            lambda_reg (int, optional): The :math:`L_2` regularization strength
                of the Bayesian Personalized Ranking (BPR) loss.
                (default: 1e-4)
            **kwargs (optional): Additional arguments of the underlying
                :class:`torch_geometric.nn.models.lightgcn.BPRLoss` loss
                function.
        """
        loss_fn = BPRLoss(lambda_reg, **kwargs)
        return loss_fn(pos_edge_rank, neg_edge_rank, self.embedding.weight)


def __repr__(self) -> str:
        return (f'{self.__class__.__name__}({self.num_nodes}, '
                f'{self.embedding_dim}, num_layers={self.num_layers})')



class BPRLoss(_Loss):
    r"""The Bayesian Personalized Ranking (BPR) loss.

    The BPR loss is a pairwise loss that encourages the prediction of an
    observed entry to be higher than its unobserved counterparts
    (see `here <https://arxiv.org/abs/2002.02126>`__).

    .. math::
        L_{\text{BPR}} = - \sum_{u=1}^{M} \sum_{i \in \mathcal{N}_u}
        \sum_{j \not\in \mathcal{N}_u} \ln \sigma(\hat{y}_{ui} - \hat{y}_{uj})
        + \lambda \vert\vert \textbf{x}^{(0)} \vert\vert^2

    where :math:`lambda` controls the :math:`L_2` regularization strength.
    We compute the mean BPR loss for simplicity.

    Args:
        lambda_reg (float, optional): The :math:`L_2` regularization strength
            (default: 0).
        **kwargs (optional): Additional arguments of the underlying
            :class:`torch.nn.modules.loss._Loss` class.
    """
    __constants__ = ['lambda_reg']
    lambda_reg: float

    def __init__(self, lambda_reg: float = 0, **kwargs) -> None:
        super().__init__(None, None, "sum", **kwargs)
        self.lambda_reg = lambda_reg

    def forward(self, positives: Tensor, negatives: Tensor,
                parameters: Tensor = None) -> Tensor:
        r"""Compute the mean Bayesian Personalized Ranking (BPR) loss.

        .. note::

            The i-th entry in the :obj:`positives` vector and i-th entry
            in the :obj:`negatives` entry should correspond to the same
            entity (*.e.g*, user), as the BPR is a personalized ranking loss.

        Args:
            positives (Tensor): The vector of positive-pair rankings.
            negatives (Tensor): The vector of negative-pair rankings.
            parameters (Tensor, optional): The tensor of parameters which
                should be used for :math:`L_2` regularization
                (default: :obj:`None`).
        """
        n_pairs = positives.size(0)
        log_prob = F.logsigmoid(positives - negatives).mean()
        regularization = 0

        if self.lambda_reg != 0:
            regularization = self.lambda_reg * parameters.norm(p=2).pow(2)

        return (-log_prob + regularization) / n_pairs

In [81]:
#!python -m pip install torch-sparse==0.6.13 -f https://pytorch-geometric.com/whl/torch-1.10.0+cu113.html

Looking in links: https://pytorch-geometric.com/whl/torch-1.10.0+cu113.html
Collecting torch-sparse==0.6.13
  Downloading https://data.pyg.org/whl/torch-1.10.0%2Bcu113/torch_sparse-0.6.13-cp38-cp38-linux_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m51.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.13
[0m

In [2]:
# !conda install pyg -c pyg

Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Solving environment: failed with repodata from current_repodata.json, will retry with next repodata source.
Collecting package metadata (repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Solving environment: / 
Found conflicts! Looking for incompatible packages.
This can take several minutes.  Press CTRL-C to abort.
\ ^C
failed

CondaError: KeyboardInterrupt



In [None]:
from torch_geometric.nn.models import LightGCN

def build(itemnode, n_node, weight=None, logger=None, **kwargs):
    model = LightGCN(n_node, **kwargs)
    
    # if itemnode != "assessmentItemID":
    #     weight = "/opt/ml/dkt_team/code/lightgcn/weight/" + itemnode + "_best_model.pt"
    # else :
    #     weight = "/opt/ml/dkt_team/code/lightgcn/weight/best_model.pt"
        
    if weight:
        if not os.path.isfile(weight):
            logger.fatal("Model Weight File Not Exist")
        logger.info("Load model")
        state = torch.load(weight)["model"]
        model.load_state_dict(state)
        return model
    else:
        logger.info("No load model")
        return model

In [11]:
!python -m pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.2.0.tar.gz (564 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m565.0/565.0 kB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting psutil>=5.8.0
  Downloading psutil-5.9.4-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (280 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.2/280.2 kB[0m [31m42.3 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: torch_geometric
  Building wheel for torch_geometric (setup.py) ... [?25ldone
[?25h  Created wheel for torch_geometric: filename=torch_geometric-2.2.0-py3-none-any.whl size=773285 sha256=21f2ae9d2a095bb95fa5350730d6c4705dd2fb4c3325552d573daa9feb673e8b
  Stored in directory: /opt/ml/.cache/pip/wheels/b5/f0/b1/623215620977e23579933d227d42c0eb8db77489da26727c56
Successfully built torch_geometric
Installing collected packag

In [9]:
# model build
model = build(
    n_node,
    embedding_dim=8,
    num_layers=3,
    alpha=0.005,
    logger=None,
)
model.to(device)

NameError: name 'build' is not defined

In [None]:
# model train
train(
    model,
    train_data,
    n_epoch=20,
    learning_rate=0.005,
    use_wandb=False,
    weight="./weight",
    # logger=logger.getChild("train"),
)
logger.info("Task Complete")

In [None]:
# if not os.path.exists(CFG.output_dir):
#     os.makedirs(CFG.output_dir)

In [None]:
pred = inference(model, test_data)#, logger=logger.getChild("infer"))

In [None]:
pred = pred.detach().cpu().numpy()
pd.DataFrame({"prediction": pred}).to_csv(
        "plus_test_submission.csv", index_label="id"
)

# 임베딩 추출

In [None]:
n_node

In [None]:
model.embedding.weight.shape

In [None]:
model.embedding.weight

In [None]:
embed = model.get_embedding(train_data['edge'])

In [None]:
embed.to(device)

In [None]:
embed.shape

In [None]:
p1 = embed[edge_index[0]]
p2 = embed[edge_index['A060001001']]
print(p1)