In [2]:
import random
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np


from sklearn import  preprocessing


import torch
from torch import nn, Tensor

from torch_sparse import SparseTensor

from torch_geometric.nn.conv.gcn_conv import gcn_norm
from torch_geometric.nn.conv import MessagePassing
from torch_geometric.typing import Adj

from lightgcn.datasets import load_data

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Directory handed to the project's load_data helper.
data_dir = "/opt/ml/input/data"
# Use the GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
data = load_data(data_dir=data_dir)

In [6]:
# Preview the first rows of the loaded interaction table.
data.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225


In [7]:
# Total number of rows in the loaded table.
len(data)

2476706

In [12]:
def sampling_func(data, sample_pct, seed=42):
    """Return a reproducible random sample of the rows of ``data``.

    The rows are shuffled with a private random generator and the first
    ``int(len(data) * sample_pct)`` of them are returned, so the same input
    always produces the same sample.

    Args:
        data: pandas object supporting ``len()`` and ``.take()`` (for
            example a DataFrame, or one group of a ``groupby``).
        sample_pct (float): fraction of rows to keep, between 0 and 1.
        seed (int, optional): seed of the private generator. Defaults to 42.

    Returns:
        The sampled rows, in shuffled order, with their original index labels.

    Raises:
        ValueError: if ``sample_pct`` is outside ``[0, 1]``.
    """
    if not 0 <= sample_pct <= 1:
        raise ValueError(f"sample_pct must be within [0, 1], got {sample_pct!r}")

    # A private RandomState yields the same permutation as seeding the global
    # generator with the same seed, but leaves the global NumPy random state
    # untouched for the rest of the notebook.
    rng = np.random.RandomState(seed)

    n_rows = len(data)
    sample_n = int(n_rows * sample_pct)  # truncated to an integer

    return data.take(rng.permutation(n_rows)[:sample_n])

In [14]:
# Draw 20% of every user's rows with sampling_func to form the validation set.
per_user_groups = data.groupby('userID', group_keys=False)
valid_set = per_user_groups.apply(sampling_func, sample_pct=0.2)

In [22]:
# Display the sampled validation set.
valid_set

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
0,0,A080035008,A080000035,0,2020-05-28 00:50:46,4685
1,0,A060056006,A060000056,0,2020-06-09 02:21:10,610
2,0,A060022005,A060000022,1,2020-04-28 00:47:41,593
3,0,A060031002,A060000031,1,2020-05-14 03:00:48,596
4,0,A060089005,A060000089,0,2020-07-14 02:14:52,625
...,...,...,...,...,...,...
492430,7439,A040197001,A040000197,1,2020-08-21 07:35:06,2132
492431,7440,A030136005,A030000136,0,2020-09-23 09:21:53,7691
492432,7440,A030197002,A030000197,0,2020-10-21 08:32:46,1984
492433,7440,A050096001,A050000096,1,2020-08-19 04:57:19,5267


In [21]:
# Replace the inherited row labels with a fresh 0..n-1 index. Rebinding the
# name (instead of inplace=True) keeps the cell free of hidden in-place mutation.
valid_set = valid_set.reset_index(drop=True)

In [17]:
# Number of rows per userID in the full data, most frequent first.
data[["userID"]].value_counts()

userID
730       1860
481       1847
1112      1777
926       1773
1600      1737
          ... 
7398        14
7414        13
7390        13
7252        13
7441         9
Name: count, Length: 7442, dtype: int64

In [16]:
# Number of rows per userID in the validation sample, most frequent first.
valid_set[["userID"]].value_counts()

userID
730       372
481       369
1112      355
926       354
1600      347
         ... 
7329        2
7252        2
7084        2
6981        2
7441        1
Name: count, Length: 7442, dtype: int64

In [13]:
# NOTE(review): this only displays the repr of the GroupBy object; nothing is
# aggregated or stored. Looks like a leftover exploration cell.
data.groupby("userID")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f8dd2663610>

In [4]:
# NOTE(review): same expression as the previous cell; it only displays the
# repr of the GroupBy object. Looks like a leftover exploration cell.
data.groupby("userID")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f8ddf893400>

In [8]:
# Last three entries of the per-userID row counts (the least frequent users).
data["userID"].value_counts().tail(3)

260111    7439
260112    7439
260113    7439
Name: userID, dtype: int64

In [5]:
# One encoder per id column; each maps the raw identifiers to integer codes.
lbl_user = preprocessing.LabelEncoder()
lbl_item = preprocessing.LabelEncoder()

# Overwrite the raw id columns with their encoded values.
data["userID"] = lbl_user.fit_transform(data["userID"].to_numpy())
data["assessmentItemID"] = lbl_item.fit_transform(data["assessmentItemID"].to_numpy())

In [7]:
# Preview the first 20 rows after the id columns were label-encoded.
data.head(20)

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
0,0,5354,A060000001,1,2020-03-24 00:17:11,7224
1,0,5355,A060000001,1,2020-03-24 00:17:14,7225
2,0,5356,A060000001,1,2020-03-24 00:17:22,7225
3,0,5357,A060000001,1,2020-03-24 00:17:29,7225
4,0,5358,A060000001,1,2020-03-24 00:17:36,7225
5,0,5359,A060000001,1,2020-03-24 00:17:47,7225
6,0,5367,A060000003,0,2020-03-26 05:52:03,7226
7,0,5368,A060000003,1,2020-03-26 05:52:10,7226
8,0,5369,A060000003,1,2020-03-26 05:53:14,7226
9,0,5370,A060000003,1,2020-03-26 05:53:29,7226


In [8]:
def load_edge_csv(df, 
                  src_index_col, 
                  dst_index_col, 
                  link_index_col):
    """Builds the user-item edge list from the positive interactions in ``df``.

    A row contributes an edge only when its ``link_index_col`` value, cast to
    an integer, equals 1.

    Args:
        df (pandas.DataFrame): interaction table, one row per interaction
        src_index_col (str): column name of users
        dst_index_col (str): column name of items
        link_index_col (str): column name of user item interaction

    Returns:
        list of list: edge_index -- ``[sources, destinations]``, two lists of
        equal length N holding the node ids of the N kept user-item edges,
        in the row order of ``df``
    """
    # Same integer cast and "== 1" test as before, evaluated for all rows at
    # once instead of indexing a tensor element by element in a Python loop.
    labels = torch.from_numpy(df[link_index_col].values).to(torch.long)
    is_positive = (labels == 1).numpy()

    # Constructing COO format edge_index: keep only the positive rows.
    src = df[src_index_col].to_numpy()[is_positive].tolist()
    dst = df[dst_index_col].to_numpy()[is_positive].tolist()
    return [src, dst]

In [9]:
# Build the [users, items] edge list from the rows whose answerCode equals 1.
edge_index = load_edge_csv(data, src_index_col="userID", dst_index_col="assessmentItemID", link_index_col="answerCode")

In [10]:
# Convert the nested edge list into a 2 x N int64 tensor. torch.as_tensor
# replaces the legacy torch.LongTensor constructor and is a no-op when the
# cell is re-run on an already converted tensor.
edge_index = torch.as_tensor(edge_index, dtype=torch.long)
print(edge_index)
print(edge_index.size())

tensor([[   0,    0,    0,  ..., 7439, 7439, 7439],
        [5354, 5355, 5356,  ..., 3729, 3730, 3731]])
torch.Size([2, 1620024])


In [11]:
# Count the distinct user and item ids present in the data.
num_users = data['userID'].nunique(dropna=False)
num_items = data['assessmentItemID'].nunique(dropna=False)

In [12]:
def convert_r_mat_edge_index_to_adj_mat_edge_index(input_edge_index, n_users=None, n_items=None):
    """Turns user-item edges into the edge index of the symmetric bipartite graph.

    Users keep their ids ``0 .. n_users - 1``; item ``j`` becomes node
    ``n_users + j``. Every user-item pair yields two directed edges
    (user -> item and item -> user). Duplicate pairs are collapsed and the
    result is ordered by row, then column, i.e. the ordering of the indices
    of a coalesced sparse COO adjacency matrix.

    The result is computed directly from the edge list, without building the
    dense ``(n_users + n_items) x (n_users + n_items)`` matrix.

    Args:
        input_edge_index: 2 x N tensor (or nested sequence) whose first row
            holds user ids and whose second row holds item ids.
        n_users (int, optional): number of users. Defaults to the
            module-level ``num_users``.
        n_items (int, optional): number of items. Defaults to the
            module-level ``num_items``.

    Returns:
        torch.Tensor: 2 x M int64 tensor of adjacency edge indices.

    Raises:
        IndexError: if a user or item id is negative or not smaller than
            ``n_users`` / ``n_items`` respectively.
    """
    if n_users is None:
        n_users = num_users
    if n_items is None:
        n_items = num_items

    edges = torch.as_tensor(input_edge_index, dtype=torch.long)
    if edges.numel() == 0:
        return torch.empty((2, 0), dtype=torch.long)

    user_ids, item_ids = edges[0], edges[1]
    if (user_ids.min() < 0 or user_ids.max() >= n_users
            or item_ids.min() < 0 or item_ids.max() >= n_items):
        raise IndexError("edge index contains a user or item id outside the valid range")

    n_nodes = n_users + n_items
    item_nodes = item_ids + n_users  # shift items behind the user block

    # Both directions of every interaction.
    rows = torch.cat([user_ids, item_nodes])
    cols = torch.cat([item_nodes, user_ids])

    # Encode each (row, col) pair as one integer: torch.unique then removes
    # duplicates and sorts, which gives row-major order after decoding.
    keys = torch.unique(rows * n_nodes + cols)
    return torch.stack([
        torch.div(keys, n_nodes, rounding_mode="floor"),
        keys % n_nodes,
    ])



In [13]:
# Expand the user-item edge list into the edge index over all
# (num_users + num_items) nodes of the graph.
# NOTE: this cell overwrites `edge_index` with its own output, so it is not
# idempotent: re-running it would feed the already converted edges back in.
edge_index = convert_r_mat_edge_index_to_adj_mat_edge_index(edge_index)
print(edge_index.size())

torch.Size([2, 4953412])


In [None]:
## Implementing LightGCN ##

In [14]:
class LightGCN(MessagePassing):
    """LightGCN Model as proposed in https://arxiv.org/abs/2002.02126

    Users and items each own a learnable embedding table. The concatenated
    tables are propagated K times over the normalized graph and the K + 1
    resulting embedding matrices are averaged into the final embeddings.
    """

    def __init__(self, num_users, 
                 num_items, 
                 embedding_dim=128, # length of the embedding vector of each node
                 K=3, 
                 add_self_loops=False):
        """Initializes LightGCN Model

        Args:
            num_users (int): Number of users
            num_items (int): Number of items
            embedding_dim (int, optional): Dimensionality of embeddings. Defaults to 128.
            K (int, optional): Number of message passing layers. Defaults to 3.
            add_self_loops (bool, optional): Whether to add self loops for message passing. Defaults to False.
        """
        super().__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.embedding_dim = embedding_dim
        self.K = K
        self.add_self_loops = add_self_loops

        # User and item embedding tables for direct look up.
        # Shapes: num_users x embedding_dim and num_items x embedding_dim.
        self.users_emb = nn.Embedding(num_embeddings=self.num_users, embedding_dim=self.embedding_dim) # e_u^0
        self.items_emb = nn.Embedding(num_embeddings=self.num_items, embedding_dim=self.embedding_dim) # e_i^0

        # Fill both tables with values drawn from a normal distribution;
        # according to the LightGCN paper, this gives better performance.
        nn.init.normal_(self.users_emb.weight, std=0.1)
        nn.init.normal_(self.items_emb.weight, std=0.1)

    def forward(self, edge_index: Tensor):
        """Propagates the embeddings through the graph K times.

        Args:
            edge_index (Tensor): 2 x E edge index over the
                ``num_users + num_items`` nodes, users first.

        Returns:
            tuple: ``(e_u^K, e_u^0, e_i^K, e_i^0)`` -- final user embeddings,
            initial user embeddings, final item embeddings, initial item
            embeddings.
        """
        # Symmetric normalization of the edges. The node count is passed
        # explicitly so it is not inferred from the largest index that
        # happens to appear in edge_index.
        edge_index_norm, edge_weight = gcn_norm(edge_index=edge_index,
                                                num_nodes=self.num_users + self.num_items,
                                                add_self_loops=self.add_self_loops)

        # Concatenate user and item embeddings into the layer-0 matrix E^0
        # of size (num_users + num_items) x embedding_dim.
        emb_0 = torch.cat([self.users_emb.weight, self.items_emb.weight]) # E^0

        embs = [emb_0] # collects the embedding matrix of every layer, starting with layer 0

        # emb_k is the embedding matrix that is pushed through the graph layers,
        # as described in formula 7 of the LightGCN paper.
        emb_k = emb_0

        # Push the embeddings of all users and items through the graph K times.
        for _ in range(self.K):
            emb_k = self.propagate(edge_index=edge_index_norm, x=emb_k, norm=edge_weight)
            embs.append(emb_k)

        # Formula 8 of the LightGCN paper: combine the per-layer embeddings.
        # Stacked shape: n_nodes x (K + 1) x embedding_dim.
        embs = torch.stack(embs, dim=1)

        # From the LightGCN paper: "In our experiments, we find that setting α_k uniformly as 1/(K + 1)
        #    leads to good performance in general."
        emb_final = torch.mean(embs, dim=1) # E^K

        # Split back into e_u^K and e_i^K.
        users_emb_final, items_emb_final = torch.split(emb_final, [self.num_users, self.num_items])

        # Return e_u^K, e_u^0, e_i^K, e_i^0;
        # .weight exposes the parameter tensor of each nn.Embedding.
        return users_emb_final, self.users_emb.weight, items_emb_final, self.items_emb.weight

    def message(self, x_j, norm):
        """Scales each neighbor embedding by the normalization weight of its edge.

        Args:
            x_j: per-edge neighbor embeddings, one row per edge
                (number of edges x embedding_dim).
            norm: per-edge normalization weights as produced by ``gcn_norm``.

        Returns:
            The weighted per-edge messages, same shape as ``x_j``.
        """
        # Element-wise multiplication by the symmetric norm. This is what
        # formula 7 of the LightGCN paper does, expressed on the edge index
        # instead of the adjacency matrix.
        return norm.view(-1, 1) * x_j

# Build the model; `layers` is the number of message passing layers K.
layers = 3
model = LightGCN(num_users=num_users, num_items=num_items, K=layers)

In [15]:
# Call the module itself rather than .forward() so that any registered
# nn.Module hooks run as well.
users_emb_final, users_emb_0, items_emb_final, items_emb_0 = model(edge_index)