In [1]:
import pandas as pd
import torch
import numpy as np
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.loader import ClusterData, ClusterLoader



In [2]:
processed_co_purchase_df = pd.read_csv('kcore70_title_imurl.csv', index_col=0)
processed_co_purchase_df.head()

Unnamed: 0,asin,also_bought
26843,B0006AAS7E,B000GX8WAG
26844,B0006AAS7E,B000JQJS6W
26845,B0006AAS7E,B000GX8W2O
26846,B0006AAS7E,B00FALPXLU
26847,B0006AAS7E,B00LSYRUVE


In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [4]:
class AsinIdMap:
    def __init__(self, asin_list):
        self.asin_list = asin_list
        self.asin_to_idx = {}
        self.product_idx_to_asin = {}
        self._build()
    
    def _build(self):
        for idx, asin in enumerate(self.asin_list):
            self.asin_to_idx[asin] = idx
            self.product_idx_to_asin[idx] = asin
    
    def get_idx(self, asin):
        return self.asin_to_idx[asin]
    
    def get_asin(self, product_idx):
        return self.product_idx_to_asin[product_idx]
    
    def get_count(self):
        return len(self.asin_to_idx)

In [5]:
asin_list = list(processed_co_purchase_df['asin'].unique())
also_bought_list = list(processed_co_purchase_df['also_bought'].unique())
asin_list.extend(also_bought_list)
asin_list = list(set(asin_list))
asinIdLookup = AsinIdMap(asin_list)

In [6]:
from torch_geometric.data import Data

def build_product_to_product_edge_index(edges_pd, asin_id_map, right_key):
    product_to_product_edge_index = []
    for idx, row in edges_pd.iterrows():
            asin1 = row['asin']
            product1_idx = asin_id_map.get_idx(asin1)
            asin2 = row[right_key]
            product2_idx = asin_id_map.get_idx(asin2)
            if product1_idx == product2_idx:
                continue
            product_to_product_edge_index.append([product1_idx, product2_idx])

    return torch.tensor(product_to_product_edge_index).t().contiguous()

product_feature_dim = 128
num_products = len(asin_list)
x = torch.randn(num_products, product_feature_dim)
edge_index = build_product_to_product_edge_index(processed_co_purchase_df, asinIdLookup, 'also_bought')
print("Edge shape: ", edge_index.shape)

Edge shape:  torch.Size([2, 120987])


In [7]:
import torch_geometric.transforms as T
from torch_geometric.utils import to_undirected

data = Data(x=x, edge_index=edge_index)
data.edge_index = to_undirected(data.edge_index)

# For this, we first split the set of edges into
# training (80%), validation (10%), and testing edges (10%).
# Across the training edges, we use 70% of edges for message passing,
# and 30% of edges for supervision.
# We further want to generate fixed negative edges for evaluation with a ratio of 2:1.
# Negative edges during training will be generated on-the-fly.
# We can leverage the `RandomLinkSplit()` transform for this from PyG:
transform = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    disjoint_train_ratio=0.3,
    neg_sampling_ratio=2.0,
    add_negative_train_samples=True,
)
train_data, val_data, test_data = transform(data)


In [8]:
class EdgeGCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels):
        super(EdgeGCN, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        # Edge prediction MLP
        self.edge_mlp = torch.nn.Sequential(
            torch.nn.Linear(2 * hidden_channels, hidden_channels),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_channels, 1)
        )

    def forward(self, x, edge_index):
        # First GCN layer
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, training=self.training)
        # Second GCN layer
        x = self.conv2(x, edge_index)

        # Apply MLP to pairs of node embeddings to predict edges
        row, col = edge_index
        edge_feat = torch.cat([x[row], x[col]], dim=1)
        return self.edge_mlp(edge_feat).squeeze()


In [9]:
model = EdgeGCN(128, hidden_channels=16).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [10]:
cluster_data = ClusterData(train_data, num_parts=2, recursive=False)
train_loader = ClusterLoader(cluster_data, batch_size=1, shuffle=True)


Computing METIS partitioning...
Done!


In [11]:
def train():
    model.train()
    total_loss = 0
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        edge_pred = model(batch.x, batch.edge_index)
        # Use only the edges used for supervision
        train_edges = batch.edge_label_index[:, batch.train_mask]
        labels = batch.edge_label[batch.train_mask].float()
        loss = F.binary_cross_entropy_with_logits(edge_pred[train_edges], labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)

# Training loop
for epoch in range(300):
    loss = train()
    print(f'Epoch {epoch+1}, Loss: {loss:.4f}')

AttributeError: 'GlobalStorage' object has no attribute 'train_mask'