# Imports

In [5]:
!pip install torch_geometric
!pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.1.0+cpu.html
!pip install torch_sparse

# import required modules
import random
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import ctypes
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import model_selection, metrics, preprocessing
import copy
from torch_geometric.utils import degree

import torch
from torch import nn, optim, Tensor

from torch_sparse import SparseTensor, matmul

from torch_geometric.utils import structured_negative_sampling
from torch_geometric.data import download_url, extract_zip
from torch_geometric.nn.conv.gcn_conv import gcn_norm
from torch_geometric.nn.conv import MessagePassing
from torch_geometric.typing import Adj

Looking in links: https://data.pyg.org/whl/torch-2.1.0+cpu.html


In [6]:
# Read the ratings table produced by the analysis step and display it.
ratings_csv = "/content/output1.csv"
df = pd.read_csv(ratings_csv)
df

Unnamed: 0,user_id,movie_id,rating,timestamp,age,gender,occupation,zip_code
0,196,242,3,881250949,0.49,1,0.952381,0.365239
1,196,393,4,881251863,0.49,1,0.952381,0.365239
2,196,381,4,881251728,0.49,1,0.952381,0.365239
3,196,251,3,881251274,0.49,1,0.952381,0.365239
4,196,655,5,881251793,0.49,1,0.952381,0.365239
...,...,...,...,...,...,...,...,...
99995,941,919,5,875048887,0.20,1,0.857143,0.013592
99996,941,273,3,875049038,0.20,1,0.857143,0.013592
99997,941,1,5,875049144,0.20,1,0.857143,0.013592
99998,941,294,4,875048532,0.20,1,0.857143,0.013592


In [32]:
# Re-encode user_id and movie_id as contiguous integer labels starting at 0
# (i.e. in [0, unique_count - 1]) so that using them to index embeddings
# cannot go out of bounds.
lbl_user = preprocessing.LabelEncoder()
df["user_id"] = lbl_user.fit_transform(df["user_id"].values)

lbl_movie = preprocessing.LabelEncoder()
df["movie_id"] = lbl_movie.fit_transform(df["movie_id"].values)

# Constructing Edges and Nodes


In [33]:
# load edges between users and movies
def load_edge_csv(df,
                  src_index_col,
                  dst_index_col,
                  link_index_col,
                  rating_threshold=3):
    """Build the user-item edge list from a table of rating events.

    Only interactions whose rating is greater than or equal to
    ``rating_threshold`` are kept as edges.

    Args:
        df (pd.DataFrame): table with one row per rating event
        src_index_col (str): column name of users
        dst_index_col (str): column name of items
        link_index_col (str): column name of user item interaction (the rating)
        rating_threshold (int or float, optional): Threshold to determine
            positivity of edge. Defaults to 3.

    Returns:
        list of list: edge_index -- ``[sources, destinations]``, two parallel
        lists holding the node ids of the N kept user-item edges, in the row
        order of ``df``. N here is the number of interactions that pass the
        threshold.
    """
    # Compare the raw ratings against the threshold. No integer cast is applied
    # first, so a fractional rating such as 3.7 is not truncated to 3 before
    # being compared with a threshold like 3.5.
    is_positive = (df[link_index_col] >= rating_threshold).to_numpy()

    # Select source/destination ids of the kept rows from the columns the
    # caller asked for (not hard-coded column names).
    src = df.loc[is_positive, src_index_col].tolist()
    dst = df.loc[is_positive, dst_index_col].tolist()
    return [src, dst]

In [34]:
# Build the positive-interaction edge list: only rows whose rating is at or
# above the 3.5 cut-off become edges.
edge_index = load_edge_csv(
    df,
    src_index_col="user_id",
    dst_index_col="movie_id",
    link_index_col="rating",
    rating_threshold=3.5,
)

# Report the edge list's shape as "<rows> x <number of kept edges>".
print(f"{len(edge_index)} x {len(edge_index[0])}")


2 x 55375


In [35]:
# Turn the nested edge list into an int64 tensor.
# The .propagate() method in the model needs either a LongTensor or a
# SparseTensor, hence the explicit torch.long dtype.
edge_index = torch.as_tensor(edge_index, dtype=torch.long)
print(edge_index, edge_index.size(), sep="\n")

tensor([[ 195,  195,  195,  ...,  940,  940,  940],
        [ 392,  380,  654,  ...,    0,  293, 1006]])
torch.Size([2, 55375])


In [36]:
# Note: these counts cover every user and movie in df, i.e. they are taken
# before the rating_threshold filter is applied to the edges.
num_users = len(pd.unique(df["user_id"]))
num_movies = len(pd.unique(df["movie_id"]))

In [44]:
# Show the node counts, one per line.
print(num_users, num_movies, sep="\n")

943
1682


In [37]:
num_interactions = edge_index.size(1)

# Split the edges of the graph 80/10/10 into train/validation/test:
# first hold out 20% of the edge positions, then cut that hold-out in half.
all_indices = list(range(num_interactions))

train_indices, holdout_indices = train_test_split(
    all_indices,
    test_size=0.2,
    random_state=1,
)
val_indices, test_indices = train_test_split(
    holdout_indices,
    test_size=0.5,
    random_state=1,
)

# Select the columns (edges) belonging to each split.
train_edge_index = edge_index[:, train_indices]
val_edge_index = edge_index[:, val_indices]
test_edge_index = edge_index[:, test_indices]

In [38]:
# Summarise the graph size and the training split.
print(f"num_users {num_users}, num_movies {num_movies}, num_interactions {num_interactions}")
print(f"train_edge_index {train_edge_index}")
# Total number of nodes in the bipartite graph.
print(num_users + num_movies)
# Number of distinct users (row 0) and movies (row 1) present in the training edges.
print(train_edge_index[0].unique().size())
print(train_edge_index[1].unique().size())


num_users 943, num_movies 1682, num_interactions 55375
train_edge_index tensor([[338, 455,  29,  ...,  58,  15, 520],
        [ 31, 345, 254,  ..., 671, 497, 175]])
2625
torch.Size([942])
torch.Size([1407])


In [39]:
def convert_r_mat_edge_index_to_adj_mat_edge_index(input_edge_index, n_users=None, n_movies=None):
    """Convert interaction-matrix (R) edge indices to adjacency-matrix edge indices.

    The adjacency matrix is the (n_users + n_movies) square matrix
    ``[[0, R], [R^T, 0]]``: user ``u`` stays node ``u`` and movie ``m`` becomes
    node ``n_users + m``. Every (user, movie) interaction therefore produces
    two entries, one in each direction.

    Args:
        input_edge_index: 2 x N tensor (or nested list) whose first row holds
            user ids and whose second row holds movie ids.
        n_users (int, optional): number of users. Defaults to the
            notebook-level ``num_users``.
        n_movies (int, optional): number of movies. Defaults to the
            notebook-level ``num_movies``.

    Returns:
        torch.Tensor: 2 x M int64 tensor with the (row, col) positions of the
        non-zero adjacency entries, without duplicates and sorted in row-major
        order.

    Raises:
        IndexError: if a user id is outside ``[0, n_users)`` or a movie id is
            outside ``[0, n_movies)``.
    """
    # Fall back to the notebook globals only when no explicit size is given.
    n_users = num_users if n_users is None else n_users
    n_movies = num_movies if n_movies is None else n_movies
    n_nodes = n_users + n_movies

    edges = torch.as_tensor(input_edge_index, dtype=torch.long, device="cpu")
    user_ids, movie_ids = edges[0], edges[1]

    # Reject ids that do not fit the interaction matrix instead of letting
    # negative values wrap around silently.
    if edges.numel() > 0:
        if user_ids.min() < 0 or user_ids.max() >= n_users:
            raise IndexError("user index out of range for the interaction matrix")
        if movie_ids.min() < 0 or movie_ids.max() >= n_movies:
            raise IndexError("movie index out of range for the interaction matrix")

    # Upper-right block holds R (user -> movie), lower-left block holds R^T.
    movie_nodes = movie_ids + n_users
    rows = torch.cat([user_ids, movie_nodes])
    cols = torch.cat([movie_nodes, user_ids])

    adj_mat = torch.sparse_coo_tensor(
        torch.stack([rows, cols]),
        torch.ones(rows.numel()),
        (n_nodes, n_nodes),
    )
    # coalesce() merges repeated edges and sorts the indices row-major, which
    # matches the ordering obtained by converting a dense matrix to COO.
    return adj_mat.coalesce().indices()

In [40]:
def convert_adj_mat_edge_index_to_r_mat_edge_index(input_edge_index, n_users=None, n_movies=None):
    """Convert adjacency-matrix edge indices back to interaction-matrix (R) edge indices.

    Only entries located in the user-rows / movie-columns block of the
    adjacency matrix (rows ``[0, n_users)``, columns
    ``[n_users, n_users + n_movies)``) are kept, and their column index is
    shifted back by ``n_users`` so it is a movie id again.

    Args:
        input_edge_index: 2 x M tensor (or nested list) of (row, col) positions
            in the adjacency matrix.
        n_users (int, optional): number of users. Defaults to the
            notebook-level ``num_users``.
        n_movies (int, optional): number of movies. Defaults to the
            notebook-level ``num_movies``.

    Returns:
        torch.Tensor: 2 x N int64 tensor of (user, movie) pairs, without
        duplicates and sorted in row-major order.
    """
    # Fall back to the notebook globals only when no explicit size is given.
    n_users = num_users if n_users is None else n_users
    n_movies = num_movies if n_movies is None else n_movies

    edges = torch.as_tensor(input_edge_index, dtype=torch.long)
    rows, cols = edges[0], edges[1]

    # Keep the entries that fall inside the user x movie block; this is the
    # same region a dense adj_mat[:n_users, n_users:] slice would select.
    in_block = (rows >= 0) & (rows < n_users) & (cols >= n_users) & (cols < n_users + n_movies)
    r_indices = torch.stack([rows[in_block], cols[in_block] - n_users])

    interact_mat = torch.sparse_coo_tensor(
        r_indices,
        torch.ones(r_indices.shape[1], device=edges.device),
        (n_users, n_movies),
    )
    # coalesce() removes repeated pairs and sorts the indices row-major.
    return interact_mat.coalesce().indices()

In [41]:
# Convert each split from r_mat (interaction matrix) edge indices to the
# adjacency matrix's edge indices so it can be fed to the model.
# NOTE: the split variables are overwritten in place, so this cell must be run
# exactly once after the split cell; the converted indices are no longer valid
# input for the conversion function.
train_edge_index, val_edge_index, test_edge_index = (
    convert_r_mat_edge_index_to_adj_mat_edge_index(split_edge_index)
    for split_edge_index in (train_edge_index, val_edge_index, test_edge_index)
)

In [45]:
# Persist the three adjacency-format splits to the working directory.
for split_tensor, split_file in (
    (train_edge_index, 'train_edge_index.pt'),
    (val_edge_index, 'val_edge_index.pt'),
    (test_edge_index, 'test_edge_index.pt'),
):
    torch.save(split_tensor, split_file)

In [43]:
# Show each converted split followed by its shape (train, validation, test).
for shown_edge_index in (train_edge_index, val_edge_index, test_edge_index):
    print(shown_edge_index)
    print(shown_edge_index.size())

tensor([[   0,    0,    0,  ..., 2606, 2606, 2616],
        [ 943,  945,  948,  ...,  781,  869,  839]])
torch.Size([2, 88600])
tensor([[   0,    0,    0,  ..., 2562, 2570, 2585],
        [ 954, 1001, 1033,  ...,  643,  674,  900]])
torch.Size([2, 11074])
tensor([[   0,    0,    0,  ..., 2570, 2598, 2606],
        [ 962,  981,  986,  ...,  706,  882,  879]])
torch.Size([2, 11076])
