In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score

import torch
import torch.nn as nn
from torch_geometric.nn.models import LightGCN

import math
import scipy.sparse as sp

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')
train.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225


In [4]:
concat = pd.concat([train, test]).reset_index(drop=True)
concat.shape

(2526700, 6)

In [5]:
user2vec = {v:k+1 for k, v in enumerate(sorted(concat.userID.unique()))}
n_user = concat.userID.nunique()

item2vec = {v:k+n_user for k, v in enumerate(sorted(concat.assessmentItemID.unique()))}
tag2vec = {v:k for k, v in enumerate(sorted(concat.KnowledgeTag.unique()))}

train['userID'] = train['userID'].apply(lambda x : user2vec[x])
test['userID'] = test['userID'].apply(lambda x : user2vec[x])
concat['userID'] = concat['userID'].apply(lambda x : user2vec[x])
train['assessmentItemID'] = train['assessmentItemID'].apply(lambda x : item2vec[x])
test['assessmentItemID'] = test['assessmentItemID'].apply(lambda x : item2vec[x])
concat['assessmentItemID'] = concat['assessmentItemID'].apply(lambda x : item2vec[x])
train['KnowledgeTag'] = train['KnowledgeTag'].apply(lambda x : tag2vec[x])
test['KnowledgeTag'] = test['KnowledgeTag'].apply(lambda x : tag2vec[x])
concat['KnowledgeTag'] = concat['KnowledgeTag'].apply(lambda x : tag2vec[x])

train.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
0,1,12796,A060000001,1,2020-03-24 00:17:11,556
1,1,12797,A060000001,1,2020-03-24 00:17:14,557
2,1,12798,A060000001,1,2020-03-24 00:17:22,557
3,1,12799,A060000001,1,2020-03-24 00:17:29,557
4,1,12800,A060000001,1,2020-03-24 00:17:36,557


In [6]:
data = concat[concat.answerCode >= 0]

train_index, valid_index = train_test_split(data.index, random_state=0, test_size=0.2)

train = data.loc[train_index, ['userID', 'assessmentItemID', 'KnowledgeTag', 'answerCode']]
userid, itemid, tags, answer = train.userID, train.assessmentItemID, train.KnowledgeTag, train.answerCode
train_edge, train_feature, train_label = [], [], []
for user, item, tag, acode in zip(userid, itemid, tags, answer):
    train_edge.append([user, item])
    train_feature.append(tag)
    train_label.append(acode)

edge_train = torch.LongTensor(train_edge).T
feature_train = torch.LongTensor(train_feature)
y_train = torch.LongTensor(train_label)

valid = data.loc[valid_index, ['userID', 'assessmentItemID', 'KnowledgeTag', 'answerCode']]
userid, itemid, tags, answer = valid.userID, valid.assessmentItemID, valid.KnowledgeTag, valid.answerCode
valid_edge, valid_feature, valid_label = [], [], []
for user, item, tag, acode in zip(userid, itemid, tags, answer):
    valid_edge.append([user, item])
    valid_feature.append(tag)
    valid_label.append(acode)
    
edge_valid = torch.LongTensor(valid_edge).T
feature_valid = torch.LongTensor(valid_feature)
y_valid = torch.LongTensor(valid_label)

train_data = dict(edge=edge_train.to(device), feature=feature_train.to(device), label=y_train.to(device))
valid_data = dict(edge=edge_valid.to(device), feature=feature_valid.to(device), label=y_valid.to(device))

In [9]:
from typing import Optional, Union

import torch
import torch.nn.functional as F
from torch import Tensor
from torch.nn import Embedding, ModuleList
from torch.nn.modules.loss import _Loss
from torch_sparse import SparseTensor

from torch_geometric.nn.conv import LGConv
from torch_geometric.typing import Adj, OptTensor


from typing import Optional, Union

import torch
import torch.nn.functional as F
from torch import Tensor
from torch.nn import Embedding, ModuleList
from torch.nn.modules.loss import _Loss
from torch_sparse import SparseTensor

from torch_geometric.nn.conv import LGConv
from torch_geometric.typing import Adj, OptTensor


class LightGCN(torch.nn.Module):
    r"""The LightGCN model from the `"LightGCN: Simplifying and Powering
    Graph Convolution Network for Recommendation"
    <https://arxiv.org/abs/2002.02126>`_ paper.

    :class:`~torch_geometric.nn.models.LightGCN` learns embeddings by linearly
    propagating them on the underlying graph, and uses the weighted sum of the
    embeddings learned at all layers as the final embedding

    .. math::
        \textbf{x}_i = \sum_{l=0}^{L} \alpha_l \textbf{x}^{(l)}_i,

    where each layer's embedding is computed as

    .. math::
        \mathbf{x}^{(l+1)}_i = \sum_{j \in \mathcal{N}(i)}
        \frac{1}{\sqrt{\deg(i)\deg(j)}}\mathbf{x}^{(l)}_j.

    Two prediction heads and trainign objectives are provided:
    **link prediction** (via
    :meth:`~torch_geometric.nn.models.LightGCN.link_pred_loss` and
    :meth:`~torch_geometric.nn.models.LightGCN.predict_link`) and
    **recommendation** (via
    :meth:`~torch_geometric.nn.models.LightGCN.recommendation_loss` and
    :meth:`~torch_geometric.nn.models.LightGCN.recommend`).

    .. note::

        Embeddings are propagated according to the graph connectivity specified
        by :obj:`edge_index` while rankings or link probabilities are computed
        according to the edges specified by :obj:`edge_label_index`.

    Args:
        num_nodes (int): The number of nodes in the graph.
        embedding_dim (int): The dimensionality of node embeddings.
        num_layers (int): The number of
            :class:`~torch_geometric.nn.conv.LGConv` layers.
        alpha (float or Tensor, optional): The scalar or vector specifying the
            re-weighting coefficients for aggregating the final embedding.
            If set to :obj:`None`, the uniform initialization of
            :obj:`1 / (num_layers + 1)` is used. (default: :obj:`None`)
        **kwargs (optional): Additional arguments of the underlying
            :class:`~torch_geometric.nn.conv.LGConv` layers.
    """
    def __init__(
        self,
        num_nodes: int,
        embedding_dim: int,
        hidden_dim: int,
        feature_len: int,
        num_layers: int,
        alpha: Optional[Union[float, Tensor]] = None,
        **kwargs,
    ):
        super().__init__()

        self.num_nodes = num_nodes
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.feature_len = feature_len
        
        if alpha is None:
            alpha = 1. / (num_layers + 1)

        if isinstance(alpha, Tensor):
            assert alpha.size(0) == num_layers + 1
        else:
            alpha = torch.tensor([alpha] * (num_layers + 1))
        self.register_buffer('alpha', alpha)

        self.embedding = Embedding(self.num_nodes, self.embedding_dim)
        self.convs = ModuleList([LGConv(**kwargs) for _ in range(self.num_layers)])
        self.embedding_feature = Embedding(self.feature_len, self.embedding_dim)
        self.lstm = nn.LSTM(self.embedding_dim*2, self.hidden_dim)
        self.ln = nn.Linear(self.hidden_dim, 1)
        self.reset_parameters()

    def reset_parameters(self):
        torch.nn.init.xavier_uniform_(self.embedding.weight)
        for conv in self.convs:
            conv.reset_parameters()


    def get_embedding(self, edge_index: Adj) -> Tensor:
        x = self.embedding.weight
        out = x * self.alpha[0]

        for i in range(self.num_layers):
            x = self.convs[i](x, edge_index)
            out = out + x * self.alpha[i + 1]

        return out


    def forward(self, edge_index: Adj, feature,
                edge_label_index: OptTensor = None) -> Tensor:
        r"""Computes rankings for pairs of nodes.

        Args:
            edge_index (Tensor or SparseTensor): Edge tensor specifying the
                connectivity of the graph.
            edge_label_index (Tensor, optional): Edge tensor specifying the
                node pairs for which to compute rankings or probabilities.
                If :obj:`edge_label_index` is set to :obj:`None`, all edges in
                :obj:`edge_index` will be used instead. (default: :obj:`None`)
        """
        if edge_label_index is None:
            if isinstance(edge_index, SparseTensor):
                edge_label_index = torch.stack(edge_index.coo()[:2], dim=0)
            else:
                edge_label_index = edge_index

        out = self.get_embedding(edge_index)
        out_src = out[edge_label_index[0]]
        out_dst = out[edge_label_index[1]]
        
        feature = self.embedding_feature(feature)
        out = torch.concat([(out_src * out_dst), feature], 1)
        
        out, _ = self.lstm(out)
        out = self.ln(out)
        return out.view(-1)

    def predict_link(self, edge_index: Adj, edge_label_index: OptTensor = None,
                     prob: bool = False) -> Tensor:
        r"""Predict links between nodes specified in :obj:`edge_label_index`.

        Args:
            prob (bool): Whether probabilities should be returned. (default:
                :obj:`False`)
        """
        pred = self(edge_index, edge_label_index).sigmoid()
        return pred if prob else pred.round()


    def recommend(self, edge_index: Adj, src_index: OptTensor = None,
                  dst_index: OptTensor = None, k: int = 1) -> Tensor:
        r"""Get top-:math:`k` recommendations for nodes in :obj:`src_index`.

        Args:
            src_index (Tensor, optional): Node indices for which
                recommendations should be generated.
                If set to :obj:`None`, all nodes will be used.
                (default: :obj:`None`)
            dst_index (Tensor, optional): Node indices which represent the
                possible recommendation choices.
                If set to :obj:`None`, all nodes will be used.
                (default: :obj:`None`)
            k (int, optional): Number of recommendations. (default: :obj:`1`)
        """
        out_src = out_dst = self.get_embedding(edge_index)

        if src_index is not None:
            out_src = out_src[src_index]

        if dst_index is not None:
            out_dst = out_dst[dst_index]

        pred = out_src @ out_dst.t()
        top_index = pred.topk(k, dim=-1).indices

        if dst_index is not None:  # Map local top-indices to original indices.
            top_index = dst_index[top_index.view(-1)].view(*top_index.size())

        return top_index


    def link_pred_loss(self, pred: Tensor, edge_label: Tensor,
                       **kwargs) -> Tensor:
        r"""Computes the model loss for a link prediction objective via the
        :class:`torch.nn.BCEWithLogitsLoss`.

        Args:
            pred (Tensor): The predictions.
            edge_label (Tensor): The ground-truth edge labels.
            **kwargs (optional): Additional arguments of the underlying
                :class:`torch.nn.BCEWithLogitsLoss` loss function.
        """
        loss_fn = torch.nn.BCEWithLogitsLoss(**kwargs)
        return loss_fn(pred, edge_label.to(pred.dtype))


    def recommendation_loss(self, pos_edge_rank: Tensor, neg_edge_rank: Tensor,
                            lambda_reg: float = 1e-4, **kwargs) -> Tensor:
        r"""Computes the model loss for a ranking objective via the Bayesian
        Personalized Ranking (BPR) loss.

        .. note::

            The i-th entry in the :obj:`pos_edge_rank` vector and i-th entry
            in the :obj:`neg_edge_rank` entry must correspond to ranks of
            positive and negative edges of the same entity (*e.g.*, user).

        Args:
            pos_edge_rank (Tensor): Positive edge rankings.
            neg_edge_rank (Tensor): Negative edge rankings.
            lambda_reg (int, optional): The :math:`L_2` regularization strength
                of the Bayesian Personalized Ranking (BPR) loss.
                (default: 1e-4)
            **kwargs (optional): Additional arguments of the underlying
                :class:`torch_geometric.nn.models.lightgcn.BPRLoss` loss
                function.
        """
        loss_fn = BPRLoss(lambda_reg, **kwargs)
        return loss_fn(pos_edge_rank, neg_edge_rank, self.embedding.weight)


    def __repr__(self) -> str:
        return (f'{self.__class__.__name__}({self.num_nodes}, '
                f'{self.embedding_dim}, num_layers={self.num_layers})')



class BPRLoss(_Loss):
    r"""The Bayesian Personalized Ranking (BPR) loss.

    The BPR loss is a pairwise loss that encourages the prediction of an
    observed entry to be higher than its unobserved counterparts
    (see `here <https://arxiv.org/abs/2002.02126>`__).

    .. math::
        L_{\text{BPR}} = - \sum_{u=1}^{M} \sum_{i \in \mathcal{N}_u}
        \sum_{j \not\in \mathcal{N}_u} \ln \sigma(\hat{y}_{ui} - \hat{y}_{uj})
        + \lambda \vert\vert \textbf{x}^{(0)} \vert\vert^2

    where :math:`lambda` controls the :math:`L_2` regularization strength.
    We compute the mean BPR loss for simplicity.

    Args:
        lambda_reg (float, optional): The :math:`L_2` regularization strength
            (default: 0).
        **kwargs (optional): Additional arguments of the underlying
            :class:`torch.nn.modules.loss._Loss` class.
    """
    __constants__ = ['lambda_reg']
    lambda_reg: float

    def __init__(self, lambda_reg: float = 0, **kwargs) -> None:
        super().__init__(None, None, "sum", **kwargs)
        self.lambda_reg = lambda_reg

    def forward(self, positives: Tensor, negatives: Tensor,
                parameters: Tensor = None) -> Tensor:
        r"""Compute the mean Bayesian Personalized Ranking (BPR) loss.

        .. note::

            The i-th entry in the :obj:`positives` vector and i-th entry
            in the :obj:`negatives` entry should correspond to the same
            entity (*.e.g*, user), as the BPR is a personalized ranking loss.

        Args:
            positives (Tensor): The vector of positive-pair rankings.
            negatives (Tensor): The vector of negative-pair rankings.
            parameters (Tensor, optional): The tensor of parameters which
                should be used for :math:`L_2` regularization
                (default: :obj:`None`).
        """
        n_pairs = positives.size(0)
        log_prob = F.logsigmoid(positives - negatives).mean()
        regularization = 0

        if self.lambda_reg != 0:
            regularization = self.lambda_reg * parameters.norm(p=2).pow(2)

        return (-log_prob + regularization) / n_pairs

In [10]:
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data
        
    def __len__(self):
        return len(self.data['label'])

    def __getitem__(self, idx):
        edge = torch.stack([self.data['edge'][0][idx], self.data['edge'][1][idx]])
        feature = self.data['feature'][idx]
        label = self.data['label'][idx]
        return edge, feature, label

In [11]:
train_set = CustomDataset(train_data)
train_dataloader = DataLoader(train_set, batch_size=2048, shuffle=True)

In [22]:
from tqdm import tqdm

embedding = 256
hidden_dim = 512
layers = 2

valid_edge = torch.stack([i for i in valid_data['edge']])
valid_feature = valid_data['feature']
valid_label = valid_data['label']
feature = max(concat.KnowledgeTag.values)+1

model = LightGCN(concat.userID.nunique()+concat.assessmentItemID.nunique(), embedding, hidden_dim, feature, layers)
model = model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
best_auc = 0
best_acc = 0
patience = 0
for epoch in range(1001):
    total_loss = 0.0
    total_acc = 0.0
    total_auc = 0.0
    model.train()
    for edge, feature, label in tqdm(train_dataloader, total=len(train_dataloader)):
        preds = model(edge.T, feature)
        loss = model.link_pred_loss(preds, label)
        
        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        prob = preds.sigmoid()
        prob = prob.detach().cpu().numpy()
        acc = accuracy_score(label.detach().cpu().numpy(), prob > 0.5)
        auc = roc_auc_score(label.detach().cpu().numpy(), prob)
        total_loss += loss.item()
        total_acc += acc
        total_auc += auc
        
    if not epoch % 1:
        print(f" * In epoch {(epoch+1):04}, loss={total_loss/len(train_dataloader):.03f}, acc={total_acc/len(train_dataloader):.03f}, AUC={total_auc/len(train_dataloader):.03f}")    
    
    with torch.no_grad() :
        model.eval()
        preds = model(valid_edge, valid_feature)
        loss = model.link_pred_loss(preds, valid_label)

        prob = preds.sigmoid()
        prob = prob.detach().cpu().numpy()
        acc = accuracy_score(valid_label.detach().cpu().numpy(), prob > 0.5)
        auc = roc_auc_score(valid_label.detach().cpu().numpy(), prob)
        
    if auc > best_auc :
        best_auc = auc
        best_acc = acc
        torch.save(
            {
                "state_dict": model.state_dict()
            },
            "model_2.pt",
        )
        patience = 0
    else :
        patience += 1
        
    if patience == 3 :
        break
        
    if not epoch % 1:
        print(f" * In epoch {(epoch+1):04}, val_loss={loss:.03f}, val_acc={acc:.03f}, val_AUC={auc:.03f}")
        
print(f"best_acc={best_acc:.03f}, best_AUC={best_auc:.03f}")

100%|██████████| 987/987 [02:26<00:00,  6.73it/s]


 * In epoch 0001, loss=0.607, acc=0.668, AUC=0.663


  0%|          | 0/987 [00:00<?, ?it/s]

 * In epoch 0001, val_loss=0.604, val_acc=0.669, val_AUC=0.671


100%|██████████| 987/987 [02:25<00:00,  6.81it/s]


 * In epoch 0002, loss=0.601, acc=0.675, AUC=0.675


  0%|          | 0/987 [00:00<?, ?it/s]

 * In epoch 0002, val_loss=0.586, val_acc=0.698, val_AUC=0.701


100%|██████████| 987/987 [02:26<00:00,  6.75it/s]


 * In epoch 0003, loss=0.512, acc=0.751, AUC=0.796


  0%|          | 0/987 [00:00<?, ?it/s]

 * In epoch 0003, val_loss=0.488, val_acc=0.767, val_AUC=0.821


100%|██████████| 987/987 [02:26<00:00,  6.75it/s]


 * In epoch 0004, loss=0.432, acc=0.800, AUC=0.864


  0%|          | 0/987 [00:00<?, ?it/s]

 * In epoch 0004, val_loss=0.484, val_acc=0.772, val_AUC=0.831


100%|██████████| 987/987 [02:26<00:00,  6.76it/s]


 * In epoch 0005, loss=0.321, acc=0.860, AUC=0.928


  0%|          | 0/987 [00:00<?, ?it/s]

 * In epoch 0005, val_loss=0.555, val_acc=0.761, val_AUC=0.815


100%|██████████| 987/987 [02:26<00:00,  6.74it/s]


 * In epoch 0006, loss=0.177, acc=0.930, AUC=0.979


  0%|          | 0/987 [00:00<?, ?it/s]

 * In epoch 0006, val_loss=0.744, val_acc=0.749, val_AUC=0.797


100%|██████████| 987/987 [02:25<00:00,  6.78it/s]


 * In epoch 0007, loss=0.087, acc=0.970, AUC=0.995
best_acc=0.772, best_AUC=0.831


In [23]:
test = concat[concat.answerCode == -1]
users, features, items = [], [], []
for _, (user, item, tag) in test[['userID', 'assessmentItemID', 'KnowledgeTag']].iterrows() :
    users.append(user)
    items.append(item)
    features.append(tag)
users = torch.LongTensor(users)
items = torch.LongTensor(items)
features = torch.LongTensor(features)
test_edge = torch.stack([users, items])
test_edge

tensor([[    4,     5,    14,  ...,  7417,  7418,  7440],
        [12407, 15190, 14926,  ..., 12795, 12795, 11174]])

In [24]:
feature = max(data.KnowledgeTag.values)+1

In [25]:
test_model = LightGCN(concat.userID.nunique()+concat.assessmentItemID.nunique(), embedding, hidden_dim, feature, layers)
test_model = test_model.to(device)
test_model.load_state_dict(torch.load('model_2.pt')['state_dict'])

test_model.eval()
test_edge = test_edge.to(device)
features = features.to(device)
prediction = test_model(test_edge, features).sigmoid()
prediction

tensor([0.2905, 0.6706, 0.3809, 0.6843, 0.1597, 0.8890, 0.3735, 0.2440, 0.0682,
        0.9097, 0.5829, 0.3120, 0.9815, 0.2285, 0.5526, 0.9674, 0.0602, 0.6482,
        0.9040, 0.0627, 0.9611, 0.4802, 0.4992, 0.4836, 0.2932, 0.8730, 0.9356,
        0.8392, 0.5730, 0.7386, 0.8369, 0.8767, 0.9242, 0.3176, 0.9703, 0.8153,
        0.1886, 0.3730, 0.2812, 0.3085, 0.6109, 0.1006, 0.1279, 0.1599, 0.3782,
        0.8856, 0.3330, 0.2337, 0.9865, 0.8707, 0.3951, 0.4807, 0.8490, 0.0713,
        0.1354, 0.8612, 0.3778, 0.9026, 0.0641, 0.2295, 0.9163, 0.9140, 0.7971,
        0.2752, 0.1240, 0.3064, 0.6886, 0.4206, 0.1543, 0.1878, 0.6296, 0.8736,
        0.0423, 0.1443, 0.1656, 0.4593, 0.1129, 0.8866, 0.5015, 0.9003, 0.4466,
        0.4766, 0.2121, 0.4335, 0.6643, 0.4071, 0.3626, 0.3372, 0.2966, 0.1143,
        0.3527, 0.0314, 0.1771, 0.7526, 0.9534, 0.7454, 0.0267, 0.5991, 0.9624,
        0.9016, 0.3473, 0.0726, 0.5335, 0.1426, 0.3810, 0.6283, 0.7870, 0.8780,
        0.1149, 0.1987, 0.9290, 0.8416, 

In [26]:
sub = pd.read_csv('sample_submission.csv')
sub['prediction'] = prediction.detach().cpu()
sub.to_csv(f'{embedding}_{hidden_dim}_{layers}_{best_auc:.03f}_tag.csv', index=False)