In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric.nn as pyg_nn
from torch_geometric.transforms import RandomLinkSplit

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class Encoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels, aggr='mean'):
        super(Encoder, self).__init__()
        self.conv1 = pyg_nn.SAGEConv(in_channels, 2 * out_channels, aggr=aggr) # Eventually change layer type
        self.batch1 = pyg_nn.BatchNorm(2 * out_channels)
        self.drop1 = nn.Dropout(p=0.2)
        self.conv2 = pyg_nn.SAGEConv(2 * out_channels, out_channels,  aggr=aggr) # Eventually change layer type
        self.batch2 = pyg_nn.BatchNorm(out_channels)

    def forward(self, x, edge_index): # Modify to adapt to the dataset complexity
        x = self.conv1(x, edge_index)
        x = self.batch1(x)
        x = self.drop1(x)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = self.batch2(x)

        return x
    
class Decoder(nn.Module):
    def __init__(self, in_channels):
        super(Decoder, self).__init__()
        self.linear = nn.Linear(in_channels, 1)

    def forward(self, z, edge_index):
        # By default, it computes the inner product of the node embeddings
        adj = torch.matmul(z[edge_index[0]], z[edge_index[1]].t())
        return adj[edge_index[0], edge_index[1]]
    
# Defining traing and evaluation functions
def train(epoch):
    model.train()
    optimizer.zero_grad()
    z = model.encode(x, train_data.edge_index.to(device))
    loss = model.recon_loss(z, train_data.pos_edge_label_index.to(device))
    loss.backward()
    optimizer.step()

def test(pos_edge_index, neg_edge_index):
    model.eval()
    # Encode the node features to get the embeddings
    with torch.no_grad():
        z = model.encode(x, train_data.edge_index.to(device))
    
    # Compute the reconstruction loss
    return model.test(z, pos_edge_index.to(device), neg_edge_index.to(device))
    

In [3]:
# Model definition
import os
LATENT_DIM = 32
data_path = os.path.join('..', 'data', 'graph_data.pt')
dataset = torch.load(data_path, weights_only=False)
data = dataset['data']

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Instantiate the customized encoder in GAE model
transform = RandomLinkSplit(is_undirected=True, split_labels=True,add_negative_train_samples=True)
train_data, val_data, test_data = transform(data)

model = pyg_nn.GAE(Encoder(data.num_features, LATENT_DIM)).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
x = train_data.x.to(device)

Using device: cuda


In [4]:
# Training loop
for epoch in range(1, 1000):
    loss = train(epoch)
    # AUC tells how well the model is able to distinguish real nodes from fake ones
    # AP - Average Precision is the complementary metric of AUC 
    auc, ap = test(test_data.pos_edge_label_index, test_data.neg_edge_label_index)

    if epoch % 10 == 0:
        print('Epoch: {:03d}, AUC: {:.4f}, AP: {:.4f}'.format(epoch, auc, ap))

Epoch: 010, AUC: 0.7651, AP: 0.7431
Epoch: 020, AUC: 0.6375, AP: 0.5898
Epoch: 030, AUC: 0.6699, AP: 0.6291
Epoch: 040, AUC: 0.7742, AP: 0.7416
Epoch: 050, AUC: 0.8173, AP: 0.7723
Epoch: 060, AUC: 0.8281, AP: 0.7813
Epoch: 070, AUC: 0.8487, AP: 0.8110
Epoch: 080, AUC: 0.8543, AP: 0.8228
Epoch: 090, AUC: 0.8574, AP: 0.8274
Epoch: 100, AUC: 0.8604, AP: 0.8299
Epoch: 110, AUC: 0.8616, AP: 0.8342
Epoch: 120, AUC: 0.8572, AP: 0.8316
Epoch: 130, AUC: 0.8625, AP: 0.8360
Epoch: 140, AUC: 0.8616, AP: 0.8361
Epoch: 150, AUC: 0.8646, AP: 0.8374
Epoch: 160, AUC: 0.8622, AP: 0.8371
Epoch: 170, AUC: 0.8575, AP: 0.8339
Epoch: 180, AUC: 0.8585, AP: 0.8349
Epoch: 190, AUC: 0.8579, AP: 0.8346
Epoch: 200, AUC: 0.8572, AP: 0.8326
Epoch: 210, AUC: 0.8410, AP: 0.8189
Epoch: 220, AUC: 0.8401, AP: 0.8184
Epoch: 230, AUC: 0.8229, AP: 0.8028
Epoch: 240, AUC: 0.8443, AP: 0.8223
Epoch: 250, AUC: 0.8483, AP: 0.8243
Epoch: 260, AUC: 0.8395, AP: 0.8181
Epoch: 270, AUC: 0.8499, AP: 0.8172
Epoch: 280, AUC: 0.8447, AP:

In [5]:
model.eval()
z = model.encode(x, train_data.edge_index.to(device))
emb_sample = z[0]
print(f'Number of nodes: {z.shape[0]}')
print(f'Number of edges: {train_data.edge_index.shape[1]}')
print(f'Embedding sample: {z}')

Number of nodes: 4877
Number of edges: 20618
Embedding sample: tensor([[ 0.3338, -0.5992, -0.1126,  ...,  0.0440, -0.1389, -0.3640],
        [ 0.0763, -0.2880, -0.2657,  ..., -0.4623, -0.2076, -0.0222],
        [ 0.4400, -0.7084,  0.0102,  ..., -0.2066,  0.0652,  0.1009],
        ...,
        [-3.2017,  4.4704, -2.5313,  ...,  3.7020,  4.2714, -3.1474],
        [-3.2012,  4.4690, -2.5299,  ...,  3.7012,  4.2710, -3.1457],
        [-3.2019,  4.4708, -2.5316,  ...,  3.7022,  4.2715, -3.1479]],
       device='cuda:0', grad_fn=<NativeBatchNormBackward0>)


In [6]:
mapping_list = dataset['idx2user']
embeddings = z
embeddings_np = embeddings.cpu().detach().numpy()

df = pd.DataFrame(embeddings_np)
df.insert(loc=0, column='id', value=range(len(embeddings_np)))
df.insert(loc=1, column='user_id', value= df['id'].map(mapping_list))

df.to_csv('user_embeddings.csv', index=False)

NameError: name 'pd' is not defined