In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

# Load the user-item engagement table.
# SECURITY: unpickling can execute arbitrary code, so only load this file when
# it comes from a trusted source.
ratings_df = pd.read_pickle("../data/engage_rating.pkl")
# Every row is treated as one positive interaction, so `rating` is overwritten
# with the constant 1. A scalar broadcast gives the same int64 column as
# building a Python list with len(ratings_df) ones, without the O(n) loop.
ratings_df["rating"] = 1
ratings_df

Unnamed: 0,user_id,item_id,parent_asin,rating,timestamp
0,0,28833,B01G15TGCU,1,1491601677000
0,0,29361,B01HRPJQ2S,1,1475448112000
0,0,32319,B06XQYN77L,1,1658498138614
0,0,37990,B07F9SG3RX,1,1673250487226
0,0,40014,B07L4GYFK9,1,1677469171342
...,...,...,...,...,...
999,999,69440,B0BYSG291N,1,1559594592832
999,999,70383,B0C37XY2JZ,1,1540929869776
999,999,71229,B0C662M3GG,1,1547336147049
999,999,71551,B0C78KPQYH,1,1552672042595


In [2]:
# Lookup tables that translate raw ids into consecutive indices 0..N-1.
# `unique()` keeps ids in order of first appearance, so the mapping follows
# the row order of `ratings_df`.
unique_user_id = pd.DataFrame({'user_id': ratings_df['user_id'].unique()})
unique_user_id['mapped_id'] = pd.RangeIndex(len(unique_user_id))

unique_item_id = pd.DataFrame({'item_id': ratings_df['item_id'].unique()})
unique_item_id['mapped_id'] = pd.RangeIndex(len(unique_item_id))

In [3]:
# Translate every rating row's raw user/item id into its consecutive index.
# A left join keeps one output row per rating, in the original row order.
ratings_user_id = torch.from_numpy(
    ratings_df[['user_id']]
    .merge(unique_user_id, on='user_id', how='left')['mapped_id']
    .to_numpy()
)
ratings_item_id = torch.from_numpy(
    ratings_df[['item_id']]
    .merge(unique_item_id, on='item_id', how='left')['mapped_id']
    .to_numpy()
)

In [4]:
# Stack the two id vectors along a new leading axis: row 0 holds the user
# index of each rating and row 1 the item index, one column per rating.
edge_index_user_to_item = torch.stack((ratings_user_id, ratings_item_id))
edge_index_user_to_item

tensor([[    0,     0,     0,  ...,   999,   999,   999],
        [    0,     1,     2,  ..., 42437,  9920, 14799]])

In [5]:
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T

# Assemble the bipartite user-item graph.
data = HeteroData()
# Nodes carry no features, only their consecutive index (used later to look
# up learned embeddings).
data["user"].node_id = torch.arange(unique_user_id.shape[0])
data["item"].node_id = torch.arange(unique_item_id.shape[0])
# One ("user", "rates", "item") edge per rating row.
data["user", "rates", "item"].edge_index = edge_index_user_to_item
# Add the reverse edges as well so messages can flow item -> user.
data = T.ToUndirected()(data)

In [6]:
from torch_geometric import seed_everything

# The split below is random. Seed Python, NumPy and PyTorch first so that
# Restart & Run All reproduces the same train/val/test partition.
seed_everything(42)

# Split the ("user", "rates", "item") edges for link prediction.
transform = T.RandomLinkSplit(
    num_val=0.1,                        # fraction of edges held out for validation
    num_test=0.1,                       # fraction of edges held out for testing
    disjoint_train_ratio=0.3,           # share of training edges used for supervision only
    neg_sampling_ratio=2.0,             # negatives generated per positive edge
    add_negative_train_samples=False,   # training negatives are sampled later by the loader
    edge_types=("user", "rates", "item"),
    rev_edge_types=("item", "rev_rates", "user"),  # reverse type is split consistently
)
train_data, val_data, test_data = transform(data)

In [7]:
from torch_geometric.loader import LinkNeighborLoader

# Supervision ("seed") edges of the training split and their labels:
edge_label = train_data["user", "rates", "item"].edge_label
edge_label_index = train_data["user", "rates", "item"].edge_label_index

# Mini-batch loader: each batch takes 128 seed edges, samples negatives on the
# fly (ratio 2.0) and a two-hop neighbourhood with fan-outs [20, 10].
train_loader = LinkNeighborLoader(
    data=train_data,
    edge_label_index=(("user", "rates", "item"), edge_label_index),
    edge_label=edge_label,
    num_neighbors=[20, 10],
    neg_sampling_ratio=2.0,
    batch_size=128,
    shuffle=True,
)



In [8]:
from torch_geometric.nn import SAGEConv, to_hetero
from torch import Tensor
import torch.nn.functional as F
class GNN(torch.nn.Module):
    """Two stacked SAGEConv layers with a ReLU in between.

    Both layers map `hidden_channels` -> `hidden_channels`, so the input
    features must already have that width.
    """

    def __init__(self, hidden_channels):
        super().__init__()
        self.conv1 = SAGEConv(hidden_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)

    def forward(self, x: Tensor, edge_index: Tensor):
        """Return the output of the second layer (no final activation)."""
        hidden = self.conv1(x, edge_index).relu()
        return self.conv2(hidden, edge_index)

class Classifier(torch.nn.Module):
    """Scores candidate user-item edges by the dot product of their embeddings."""

    def forward(self, x_user: Tensor, x_item: Tensor, edge_label_index: Tensor):
        """Return one raw score (logit) per column of `edge_label_index`.

        Row 0 of `edge_label_index` indexes into `x_user` and row 1 into
        `x_item`; the paired rows are multiplied element-wise and summed
        over the last dimension.
        """
        user_rows, item_rows = edge_label_index[0], edge_label_index[1]
        return torch.sum(x_user[user_rows] * x_item[item_rows], dim=-1)

class Model(torch.nn.Module):
    """Link predictor: learned node embeddings -> heterogeneous GNN -> dot-product scorer.

    Note that `__init__` reads the notebook-level `data` graph (for node counts
    and metadata), whereas `forward` receives its own `data` argument that
    shadows that global.
    """

    def __init__(self, hidden_channels):
        super().__init__()
        # One learnable embedding row per user / item, since nodes have no features.
        self.user_emb = torch.nn.Embedding(data["user"].num_nodes, hidden_channels)
        self.item_emb = torch.nn.Embedding(data["item"].num_nodes, hidden_channels)
        # Build the homogeneous GNN and convert it to a heterogeneous variant
        # for the node/edge types of the global graph.
        self.gnn = to_hetero(GNN(hidden_channels), metadata=data.metadata())
        self.classifier = Classifier()

    def forward(self, data: HeteroData):
        """Return one score per ("user", "rates", "item") edge in `edge_label_index`."""
        # Initial features of every node type: embedding lookup by node id.
        embeddings = {
            node_type: table(data[node_type].node_id)
            for node_type, table in (("user", self.user_emb), ("item", self.item_emb))
        }
        # Message passing over all edge types of the (sub)graph.
        encoded = self.gnn(embeddings, data.edge_index_dict)
        return self.classifier(
            encoded["user"],
            encoded["item"],
            data["user", "rates", "item"].edge_label_index,
        )
        
# Instantiate the link predictor; 64 is used both as the embedding width and
# as the GNN layer width. Requires the notebook-level `data` graph to exist.
model = Model(hidden_channels=64)

  torch.has_cuda,
  torch.has_cudnn,
  torch.has_mps,
  torch.has_mkldnn,


In [10]:
import tqdm
import torch.nn.functional as F
# Train the link predictor for 5 epochs with Adam and binary cross-entropy
# on the raw edge scores.
# NOTE: iterating `train_loader` needs the 'pyg-lib' or 'torch-sparse' package;
# without one of them the first batch raises
# "ImportError: 'NeighborSampler' requires either 'pyg-lib' or 'torch-sparse'".
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: '{device}'")
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
model.train()
for epoch in range(1, 6):
    total_loss = total_examples = 0
    for sampled_data in tqdm.tqdm(train_loader):
        optimizer.zero_grad()
        sampled_data = sampled_data.to(device)
        pred = model(sampled_data)
        # The supervised edge type in this graph is ("user", "rates", "item");
        # the previous "movie" key does not exist here and has no `edge_label`.
        ground_truth = sampled_data["user", "rates", "item"].edge_label
        loss = F.binary_cross_entropy_with_logits(pred, ground_truth)
        loss.backward()
        optimizer.step()
        # Weight each batch loss by its number of scored edges so the epoch
        # figure is a per-edge average.
        total_loss += float(loss) * pred.numel()
        total_examples += pred.numel()
    print(f"Epoch: {epoch:03d}, Loss: {total_loss / total_examples:.4f}")

Device: 'cuda'


  0%|          | 0/309 [00:00<?, ?it/s]


ImportError: 'NeighborSampler' requires either 'pyg-lib' or 'torch-sparse'

In [None]:
# NOTE(review): `model_parameters`, `evaluate`, `test_loader` and `criterion`
# are not defined in any cell visible above. Presumably they come from cells
# that were removed or never saved; confirm before relying on Restart & Run All.
model.load_state_dict(model_parameters)

# `evaluate` is unpacked here as a (loss, precision) pair; both are printed raw.
avg_loss, avg_precision = evaluate(model, test_loader, criterion)
print(avg_loss, avg_precision)

100%|██████████| 63/63 [00:43<00:00,  1.44it/s]

12.824420323447576 0.0013712416656183166





In [None]:
# NOTE(review): avg_loss / avg_precision come from the evaluate(...) call on
# `test_loader`, yet the label says "Val". Confirm which split is meant, and
# that the precision cutoff really is 20 (not visible in this notebook).
print(f'Val Loss: {avg_loss:.4f}, Precision@20: {avg_precision:.4f}')

Val Loss: 12.8244, Precision@20: 0.0014
