In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric.nn as pyg_nn
from torch_geometric.transforms import RandomLinkSplit
import pandas as pd
import os
from tqdm import tqdm

In [None]:
class Encoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels, aggr='lstm'):
        super(Encoder, self).__init__()
        self.conv1 = pyg_nn.SAGEConv(in_channels, 2 * out_channels, aggr=aggr) # Eventually change layer type
        self.batch1 = pyg_nn.BatchNorm(2 * out_channels)
        self.drop1 = nn.Dropout(p=0.2)
        self.conv2 = pyg_nn.SAGEConv(2 * out_channels, out_channels,  aggr=aggr) # Eventually change layer type
        self.batch2 = pyg_nn.BatchNorm(out_channels)
        self.drop2 = nn.Dropout(p=0.2)

    def forward(self, x, edge_index): # Modify to adapt to the dataset complexity
        x = self.conv1(x, edge_index)
        x = self.batch1(x)
        x = self.drop1(x)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = self.batch2(x)
        x = self.drop2(x)
        x = F.relu(x)

        return x

# Defining early stopping callback
class EarlyStopping:
    def __init__(self, patience=10, min_delta=0.0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.best_auc = 0
        self.best_model_state = None

    def __call__(self, score):
        if self.best_score is None:
            self.best_score = score
        elif score < self.best_score + self.min_delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.counter = 0

# Defining traing and evaluation functions
def train(epoch):
    model.train()
    optimizer.zero_grad()
    z = model.encode(x, train_data.edge_index.to(device))
    loss = model.recon_loss(z, train_data.pos_edge_label_index.to(device))
    loss.backward()
    optimizer.step()

    return loss

def test(pos_edge_index, neg_edge_index):
    model.eval()
    # Encode the node features to get the embeddings
    with torch.no_grad():
        z = model.encode(x, train_data.edge_index.to(device))
    
    # Compute the reconstruction loss
    return model.test(z, pos_edge_index.to(device), neg_edge_index.to(device))
    

In [27]:
# Model definition
LATENT_DIM = 32
data_path = os.path.join('..', 'data', 'graph_data.pt')
dataset = torch.load(data_path, weights_only=False)
data = dataset['data']

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Instantiate the customized encoder in GAE model
transform = RandomLinkSplit(is_undirected=True, split_labels=True,add_negative_train_samples=True)
train_data, val_data, test_data = transform(data)

model = pyg_nn.GAE(Encoder(data.num_features, LATENT_DIM)).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
x = train_data.x.to(device)

# Instantiating early stopper
early_stopper = EarlyStopping(patience=10)


Using device: cuda


In [28]:
# Training loop
for epoch in tqdm(range(1, 1001)):
    loss = train(epoch)
    # AUC tells how well the model is able to distinguish real nodes from fake ones
    # AP - Average Precision is the complementary metric of AUC 
    auc_val, ap_val = test(val_data.pos_edge_label_index, val_data.neg_edge_label_index)
    early_stopper(auc_val)

    if epoch % 10 == 0:
        print(f"[Epoch {epoch}] | Val AUC: {auc_val:.4f} | Val AP: {ap_val:.4f}")
    
    if early_stopper.early_stop:
        print("Early stopping triggered.")
        break
    
auc_test, ap_test = test(test_data.pos_edge_label_index, test_data.neg_edge_label_index)
print('Final Test Evaluation: AUC: {:.4f}, AP: {:.4f}'.format(auc_test, ap_test))

  1%|          | 12/1000 [00:00<00:43, 22.95it/s]

[Epoch 10] | Val AUC: 0.6417 | Val AP: 0.6825


  3%|▎         | 26/1000 [00:01<00:34, 28.29it/s]

[Epoch 20] | Val AUC: 0.7659 | Val AP: 0.8089


  3%|▎         | 34/1000 [00:01<00:32, 29.84it/s]

[Epoch 30] | Val AUC: 0.7876 | Val AP: 0.8081


  5%|▍         | 46/1000 [00:01<00:29, 32.17it/s]

[Epoch 40] | Val AUC: 0.8269 | Val AP: 0.8420


  5%|▌         | 54/1000 [00:01<00:32, 28.71it/s]

[Epoch 50] | Val AUC: 0.8278 | Val AP: 0.8423
Early stopping triggered.
Final Test Evaluation: AUC: 0.8187, AP: 0.8279





In [30]:
model.eval()
z = model.encode(x, data.edge_index.to(device))
emb_sample = z[0]
print(f'Number of nodes: {z.shape[0]}')
print(f'Number of edges: {data.edge_index.shape[1]}')
print(f'Embedding sample: {z}')

Number of nodes: 4250
Number of edges: 28266
Embedding sample: tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [2.4177, 2.3349, 2.3633,  ..., 2.2226, 2.3124, 2.1132],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [2.5407, 2.6702, 2.6679,  ..., 2.2492, 2.6820, 2.4381],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)


In [16]:
mapping_list = dataset['idx2user']
embeddings = z
embeddings_np = embeddings.cpu().detach().numpy()

df = pd.DataFrame(embeddings_np)
df.insert(loc=0, column='id', value=range(len(embeddings_np)))
df.insert(loc=1, column='user_id', value= df['id'].map(mapping_list))

df.to_csv('user_embeddings.csv', index=False)