In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
import pandas as pd
import numpy as np

class Embedder(nn.Module):
    """Pairwise link scorer built from two per-node embedding tables.

    The score of a pair ``(i, j)`` is ``sigmoid(<A_i, A_j> - <R_i, R_j>)``.
    The ``R`` table is optional: when ``r_dim <= 0`` it is not created and the
    score reduces to ``sigmoid(<A_i, A_j>)``.

    Parameters
    ----------
    n_nodes : int
        Number of rows in each embedding table.
    a_dim : int
        Width of the ``A`` embeddings (added to the score).
    r_dim : int
        Width of the ``R`` embeddings (subtracted from the score); ``<= 0``
        disables the ``R`` table.
    """

    def __init__(self, n_nodes, a_dim, r_dim):
        super().__init__()
        self.A_embs = nn.Embedding(n_nodes, a_dim)
        # None (not an empty module) marks "no R term"; every use below tests
        # for it explicitly with `is not None`.
        self.R_embs = nn.Embedding(n_nodes, r_dim) if r_dim > 0 else None
        self.sigmoid = nn.Sigmoid()

    def forward(self, lhs, rhs):
        """Score the pairs ``(lhs[k], rhs[k])``.

        ``lhs`` and ``rhs`` are integer index tensors; ``sum(1)`` reduces the
        embedding axis of the ``(batch, dim)`` lookups. ``squeeze()`` is a no-op
        for batches of several pairs and turns a single-pair batch into a 0-d
        tensor.

        Returns a tensor of probabilities in ``(0, 1)``, one per pair.
        """
        scores = (self.A_embs(lhs) * self.A_embs(rhs)).sum(1).squeeze()
        if self.R_embs is not None:
            scores = scores - (self.R_embs(lhs) * self.R_embs(rhs)).sum(1).squeeze()
        return self.sigmoid(scores)

    def adj_mat(self):
        """Return the dense ``(n_nodes, n_nodes)`` matrix of pair probabilities.

        The result is computed from detached weights, so it carries no
        autograd history and can be converted to NumPy directly.
        """
        a_weights = self.A_embs.weight.detach()
        sims = torch.matmul(a_weights, a_weights.t())
        if self.R_embs is not None:
            r_weights = self.R_embs.weight.detach()
            sims = sims - torch.matmul(r_weights, r_weights.t())

        return self.sigmoid(sims)


def evalReconstructionV2(adj_mat, score_mat):
    """Rank every true edge against the non-neighbors of its source row.

    For each row ``i`` and each off-diagonal column ``j`` with
    ``adj_mat[i, j] == 1``, the rank is one plus the number of columns ``k``
    with ``adj_mat[i, k] != 1`` whose score ``score_mat[i, k]`` is strictly
    greater than ``score_mat[i, j]``. A rank of 1 means no non-neighbor
    outscores the edge.

    Diagonal entries are never ranked as edges. They are removed from the
    competing non-neighbors only where ``adj_mat`` marks them with 1, so a
    caller that wants self-scores ignored must set the diagonal to 1.

    Parameters
    ----------
    adj_mat : array-like
        Square 2-D matrix; entries equal to 1 denote edges.
    score_mat : array-like
        Scores with the same shape as ``adj_mat``. Integer input is accepted
        (it is converted to float so that ``-inf`` can be stored).

    Returns
    -------
    list
        One integer rank per ranked edge, in row-major order. Neither input
        is modified.
    """
    adj = np.asarray(adj_mat)
    scores = np.asarray(score_mat, dtype=float)

    is_edge = adj == 1

    # Neighbors can never outrank anything: push their scores to -inf.
    # (np.inf, not np.Inf - the capitalised alias was removed in NumPy 2.0.)
    competitor_scores = np.where(is_edge, -np.inf, scores)

    # Edges to rank: every adjacency entry except self-loops on the diagonal.
    target_mask = is_edge.copy()
    target_mask[np.diag_indices_from(target_mask)] = False

    ranks = []
    for row in range(adj.shape[0]):
        cols = np.flatnonzero(target_mask[row])
        if cols.size == 0:
            continue
        edge_scores = scores[row, cols]
        # Compare all of this row's edges against all competitors at once.
        beaten_by = (competitor_scores[row][None, :] > edge_scores[:, None]).sum(axis=1)
        ranks.extend(beaten_by + 1)

    return ranks
    
def trainLNEAR(lhs_train, rhs_train, a_dim, r_dim, epochs=500, report_every=5, n_nodes=None):
    """Train an ``Embedder`` on an edge list with randomly sampled negatives.

    Every epoch is one full-batch Adam step (``lr=1e-2``). The loss is the sum
    of mean binary cross-entropy terms: the observed edges with target 1, plus
    up to four negative batches with target 0:

    1. each lhs paired with a permuted rhs,
    2. a permuted lhs paired with each rhs,
    3. each lhs paired with a uniformly drawn node index,
    4. a uniformly drawn node index paired with each rhs.

    A negative is dropped only when the substituted endpoint equals the one it
    replaced, i.e. when it reproduces the edge at the same position. It is NOT
    compared against the rest of the edge list, so a sampled pair that happens
    to be a different observed edge is still used as a negative.

    Parameters
    ----------
    lhs_train, rhs_train : torch.LongTensor
        Equal-length 1-D tensors of node indices; position ``k`` is one edge.
    a_dim, r_dim : int
        Embedding widths forwarded to ``Embedder``.
    epochs : int
        Number of optimisation steps.
    report_every : int
        Print the loss at epochs ``1, 1 + report_every, ...``. A value of 1
        prints every epoch; values ``<= 0`` disable printing.
    n_nodes : int, optional
        Number of embedding rows. When omitted it is inferred as the largest
        index in ``lhs_train``/``rhs_train`` plus one, so the function does not
        depend on notebook-level globals.

    Returns
    -------
    (model, loss)
        The trained ``Embedder`` and the loss of the last epoch as a 0-d NumPy
        array (NaN when ``epochs`` is 0).
    """
    if n_nodes is None:
        if lhs_train.numel() == 0 or rhs_train.numel() == 0:
            raise ValueError("cannot infer n_nodes from an empty edge list; pass n_nodes explicitly")
        n_nodes = max(int(lhs_train.max()), int(rhs_train.max())) + 1

    model = Embedder(n_nodes, a_dim, r_dim)
    optimizer = optim.Adam(model.parameters(), lr=1e-2)
    bce = nn.BCELoss()

    # The historical cadence reports when `e % report_every == 1`. That residue
    # does not exist for report_every == 1, so use 0 there (report each epoch).
    report_residue = 1 if report_every > 1 else 0

    loss = None
    for e in range(epochs):
        optimizer.zero_grad()
        pos_scores = model(lhs_train, rhs_train)

        negatives = []

        # Negatives from permuting the positives. Only pairs whose shuffled
        # endpoint differs from the original endpoint at that position are kept.
        perm_rhs = rhs_train[torch.as_tensor(np.random.permutation(rhs_train.shape[0]), dtype=torch.long)]
        keep = perm_rhs != rhs_train
        negatives.append((lhs_train[keep], perm_rhs[keep]))

        perm_lhs = lhs_train[torch.as_tensor(np.random.permutation(lhs_train.shape[0]), dtype=torch.long)]
        keep = perm_lhs != lhs_train
        negatives.append((perm_lhs[keep], rhs_train[keep]))

        # Negatives from uniformly sampled node indices, with the same
        # position-wise filter as above.
        rand_rhs = torch.as_tensor(np.random.randint(0, n_nodes, lhs_train.shape[0]), dtype=torch.long)
        keep = rand_rhs != rhs_train
        negatives.append((lhs_train[keep], rand_rhs[keep]))

        rand_lhs = torch.as_tensor(np.random.randint(0, n_nodes, lhs_train.shape[0]), dtype=torch.long)
        keep = rand_lhs != lhs_train
        negatives.append((rand_lhs[keep], rhs_train[keep]))

        # Unit weight on the positive term and on each negative term.
        loss = bce(pos_scores, torch.ones_like(pos_scores))
        for neg_lhs, neg_rhs in negatives:
            if neg_lhs.numel() == 0:
                # The mean BCE of an empty batch is NaN and would poison the step.
                continue
            neg_scores = model(neg_lhs, neg_rhs)
            loss = loss + bce(neg_scores, torch.zeros_like(neg_scores))

        loss.backward()
        optimizer.step()

        if report_every > 0 and e % report_every == report_residue:
            print(f"Epoch: {e} | Train loss: {loss.item():.2f} ")

    if loss is None:
        # No epoch ran, so there is no loss to report.
        return model, np.array(np.nan, dtype=np.float32)
    return model, loss.detach().numpy()





In [2]:
# Seed both random sources used by training so a Restart & Run All reproduces
# the numbers: torch initialises the embeddings, np.random draws the negatives.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

# dataset
dataset = 'mammal_closure.csv'

edges = pd.read_csv(dataset)
edges.columns = ['id1', 'id2', 'weight']
# For this script a dataset is an edge list with columns ['id1', 'id2', 'weight'];
# only the two id columns are used in the cells shown here.

## Make sure that we have contiguous ids regardless of the range of id1, id2.
## Both lookup tables share one index space built from the ids of both columns.
node_ids = np.unique(edges[['id1', 'id2']])
name2idx_lhs = pd.DataFrame({'id1': node_ids})
name2idx_lhs['lhs_idx'] = np.arange(name2idx_lhs.shape[0])
name2idx_rhs = pd.DataFrame({'id2': node_ids})
name2idx_rhs['rhs_idx'] = np.arange(name2idx_rhs.shape[0])

# Explicit keys: id1 -> lhs_idx, id2 -> rhs_idx.
edges = edges.merge(name2idx_lhs, on='id1').merge(name2idx_rhs, on='id2')

lhs = torch.as_tensor(edges['lhs_idx'].to_numpy(), dtype=torch.long)
rhs = torch.as_tensor(edges['rhs_idx'].to_numpy(), dtype=torch.long)


# Same total embedding width (10) for both models: A only vs. A plus R.
a_model, a_only_loss = trainLNEAR(lhs, rhs,
                               a_dim=10, r_dim=0,
                               epochs=10000, report_every=1000)

ar_model, ar_loss = trainLNEAR(lhs, rhs,
                               a_dim=8, r_dim=2,
                               epochs=10000, report_every=1000)

print(a_only_loss)
print(ar_loss)

Epoch: 1 | Train loss: 6.67 
Epoch: 1001 | Train loss: 2.98 
Epoch: 2001 | Train loss: 2.95 
Epoch: 3001 | Train loss: 3.13 
Epoch: 4001 | Train loss: 3.18 
Epoch: 5001 | Train loss: 3.12 
Epoch: 6001 | Train loss: 3.23 
Epoch: 7001 | Train loss: 3.17 
Epoch: 8001 | Train loss: 3.13 
Epoch: 9001 | Train loss: 3.05 
Epoch: 1 | Train loss: 7.61 
Epoch: 1001 | Train loss: 1.27 
Epoch: 2001 | Train loss: 1.23 
Epoch: 3001 | Train loss: 1.21 
Epoch: 4001 | Train loss: 1.20 
Epoch: 5001 | Train loss: 1.22 
Epoch: 6001 | Train loss: 1.21 
Epoch: 7001 | Train loss: 1.21 
Epoch: 8001 | Train loss: 1.20 
Epoch: 9001 | Train loss: 1.21 
3.2205403
1.201174


In [3]:
# Construct the real adjacency matrix from the edges
node_count = name2idx_rhs.shape[0]
edge_lhs = edges['lhs_idx'].to_numpy()
edge_rhs = edges['rhs_idx'].to_numpy()

adj_mat = np.zeros((node_count, node_count))
# Edges are treated as undirected: mark both orientations in one vectorised step.
adj_mat[edge_lhs, edge_rhs] = 1
adj_mat[edge_rhs, edge_lhs] = 1

# evalReconstructionV2 only removes entries equal to 1 from the competing
# non-neighbors, so the diagonal is set to 1 to keep self-scores out of the ranks.
np.fill_diagonal(adj_mat, 1)

# Get the implied matrices from both models
a_model_adj_est = a_model.adj_mat().detach().numpy()
ar_model_adj_est = ar_model.adj_mat().detach().numpy()

# Compute the rank of every true edge
a_ranks = evalReconstructionV2(adj_mat, a_model_adj_est)
ar_ranks = evalReconstructionV2(adj_mat, ar_model_adj_est)

# Lower mean rank is better (1 is perfect)
print(np.mean(a_ranks))
print(np.mean(ar_ranks))


260.57003058103976
1.1662844036697249
