In [15]:
%matplotlib inline

In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl import DGLGraph
import dgl.function as fn
from functools import partial
import dgl
from dgl.contrib.data import load_data
import pandas as pd
import numpy as np

In [17]:
from dgl.nn.pytorch import RelGraphConv
class BaseRGCN(nn.Module):
    def __init__(self, num_nodes, h_dim, out_dim, num_rels, num_bases,
                 num_hidden_layers=1, dropout=0,
                 use_self_loop=False, use_cuda=False):
        super(BaseRGCN, self).__init__()
        self.num_nodes = num_nodes
        self.h_dim = h_dim
        self.out_dim = out_dim
        self.num_rels = num_rels
        self.num_bases = None if num_bases < 0 else num_bases
        self.num_hidden_layers = num_hidden_layers
        self.dropout = dropout
        self.use_self_loop = use_self_loop
        self.use_cuda = use_cuda

        # create rgcn layers
        self.build_model()

    def build_model(self):
        self.layers = nn.ModuleList()
        # i2h
        i2h = self.build_input_layer()
        if i2h is not None:
            self.layers.append(i2h)
        # h2h
        for idx in range(self.num_hidden_layers):
            h2h = self.build_hidden_layer(idx)
            self.layers.append(h2h)
        # h2o
        h2o = self.build_output_layer()
        if h2o is not None:
            self.layers.append(h2o)

    def build_input_layer(self):
        return None

    def build_hidden_layer(self, idx):
        raise NotImplementedError

    def build_output_layer(self):
        return None

    def forward(self, g, h, r, norm):
        for layer in self.layers:
            h = layer(g, h, r, norm)
        return h

In [18]:
class EmbeddingLayer(nn.Module):
    def __init__(self, num_nodes, h_dim):
        super(EmbeddingLayer, self).__init__()
        self.embedding = torch.nn.Embedding(num_nodes, h_dim)

    def forward(self, g, h, r, norm):
        return self.embedding(h.squeeze())

class RGCN(BaseRGCN):
    def build_input_layer(self):
        return EmbeddingLayer(self.num_nodes, self.h_dim)

    def build_hidden_layer(self, idx):
        act = F.relu if idx < self.num_hidden_layers - 1 else None
        return RelGraphConv(self.h_dim, self.h_dim, self.num_rels, "bdd",
                self.num_bases, activation=act, self_loop=True,
                dropout=self.dropout)

class LinkPredict(nn.Module):
    def __init__(self, in_dim, h_dim, num_rels, num_bases=-1,
                 num_hidden_layers=1, dropout=0, use_cuda=False, reg_param=0):
        super(LinkPredict, self).__init__()
        self.rgcn = RGCN(in_dim, h_dim, h_dim, num_rels * 2, num_bases,
                         num_hidden_layers, dropout, use_cuda)
        self.reg_param = reg_param
        self.w_relation = nn.Parameter(torch.Tensor(num_rels, h_dim))
        nn.init.xavier_uniform_(self.w_relation,
                                gain=nn.init.calculate_gain('relu'))

    def calc_score(self, embedding, triplets):
        # DistMult
        s = embedding[triplets[:,0]]
        r = self.w_relation[triplets[:,1]]
        o = embedding[triplets[:,2]]
        score = torch.sum(s * r * o, dim=1)
        return score

    def forward(self, g, h, r, norm):
        return self.rgcn.forward(g, h, r, norm)

    def regularization_loss(self, embedding):
        return torch.mean(embedding.pow(2)) + torch.mean(self.w_relation.pow(2))

    def get_loss(self, g, embed, triplets, labels):
        # triplets is a list of data samples (positive and negative)
        # each row in the triplets is a 3-tuple of (source, relation, destination)
        score = self.calc_score(embed, triplets)
        predict_loss = F.binary_cross_entropy_with_logits(score, labels)
        reg_loss = self.regularization_loss(embed)
        return predict_loss + self.reg_param * reg_loss
    def evaluate(self, g):
        # get embedding and relation weight without grad
        embedding = self.forward(g)
        return embedding, self.w_relation

def node_norm_to_edge_norm(g, node_norm):
    g = g.local_var()
    # convert to edge norm
    g.ndata['norm'] = node_norm
    g.apply_edges(lambda edges : {'norm' : edges.dst['norm']})
    return g.edata['norm']

In [76]:
data = pd.read_csv('sample_dataset_learnavi.csv', index_col='Unnamed: 0')

In [20]:
num_nodes = len(data.source_name.append(data.target_name).unique())
num_rels = len(data.edge.unique())

In [21]:
num_rels, num_nodes

(7, 4623)

In [22]:
model = LinkPredict(num_nodes,
                    140,
                    num_rels,
                    num_bases=140,
                    num_hidden_layers=2,
                    dropout=0.2,
                    use_cuda=-1,
                    reg_param=0.01)

In [23]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
train_data, valid_data = train_test_split(data[['source','edge','target']].values, test_size=0.33, random_state=42)
valid_data, test_data = train_test_split(valid_data, test_size=0.5, random_state=42)
# data = shuffle(data).reset_index(drop=True)
# train_data = data[:6055]
# valid_data = data[6055:]

In [24]:
train_data = data[['source','edge','target']].values

In [25]:
train_data.shape, valid_data.shape, test_data.shape

((7569, 3), (1249, 3), (1249, 3))

In [26]:
valid_data = torch.LongTensor(valid_data)
test_data = torch.LongTensor(test_data)

In [70]:
def get_adj_and_degrees(num_nodes, triplets):
    """ Get adjacency list and degrees of the graph
    """
    adj_list = [[] for _ in range(num_nodes)]
    for i,triplet in enumerate(triplets):
        adj_list[triplet[0]].append([i, triplet[2]])
        adj_list[triplet[2]].append([i, triplet[0]])

    degrees = np.array([len(a) for a in adj_list])
    adj_list = [np.array(a) for a in adj_list]
    return adj_list, degrees

def sample_edge_neighborhood(adj_list, degrees, n_triplets, sample_size):
    """Sample edges by neighborhool expansion.
    This guarantees that the sampled edges form a connected graph, which
    may help deeper GNNs that require information from more than one hop.
    """
    edges = np.zeros((sample_size), dtype=np.int32)

    #initialize
    sample_counts = np.array([d for d in degrees])
    picked = np.array([False for _ in range(n_triplets)])
    seen = np.array([False for _ in degrees])

    for i in range(0, sample_size):
        weights = sample_counts * seen

        if np.sum(weights) == 0:
            weights = np.ones_like(weights)
            weights[np.where(sample_counts == 0)] = 0

        probabilities = (weights) / np.sum(weights)
        chosen_vertex = np.random.choice(np.arange(degrees.shape[0]),
                                         p=probabilities)
        chosen_adj_list = adj_list[chosen_vertex]
        seen[chosen_vertex] = True

        chosen_edge = np.random.choice(np.arange(chosen_adj_list.shape[0]))
        chosen_edge = chosen_adj_list[chosen_edge]
        edge_number = chosen_edge[0]

        while picked[edge_number]:
            chosen_edge = np.random.choice(np.arange(chosen_adj_list.shape[0]))
            chosen_edge = chosen_adj_list[chosen_edge]
            edge_number = chosen_edge[0]

        edges[i] = edge_number
        other_vertex = chosen_edge[1]
        picked[edge_number] = True
        sample_counts[chosen_vertex] -= 1
        sample_counts[other_vertex] -= 1
        seen[other_vertex] = True

    return edges

def sample_edge_uniform(adj_list, degrees, n_triplets, sample_size):
    """Sample edges uniformly from all the edges."""
    all_edges = np.arange(n_triplets)
    return np.random.choice(all_edges, sample_size, replace=False)

def generate_sampled_graph_and_labels(triplets, sample_size, split_size,
                                      num_rels, adj_list, degrees,
                                      negative_rate, sampler="uniform"):
    """Get training graph and signals
    First perform edge neighborhood sampling on graph, then perform negative
    sampling to generate negative samples
    """
    # perform edge neighbor sampling
    if sampler == "uniform":
        edges = sample_edge_uniform(adj_list, degrees, len(triplets), sample_size)
    elif sampler == "neighbor":
        edges = sample_edge_neighborhood(adj_list, degrees, len(triplets), sample_size)
    else:
        raise ValueError("Sampler type must be either 'uniform' or 'neighbor'.")

    # relabel nodes to have consecutive node ids
    edges = triplets[edges]
    src, rel, dst = edges.transpose()
    uniq_v, edges = np.unique((src, dst), return_inverse=True)
    src, dst = np.reshape(edges, (2, -1))
    relabeled_edges = np.stack((src, rel, dst)).transpose()

    # negative sampling
    samples, labels = negative_sampling(relabeled_edges, len(uniq_v),
                                        negative_rate)

    # further split graph, only half of the edges will be used as graph
    # structure, while the rest half is used as unseen positive samples
    split_size = int(sample_size * split_size)
    graph_split_ids = np.random.choice(np.arange(sample_size),
                                       size=split_size, replace=False)
    src = src[graph_split_ids]
    dst = dst[graph_split_ids]
    rel = rel[graph_split_ids]

    # build DGL graph
    print("# sampled nodes: {}".format(len(uniq_v)))
    print("# sampled edges: {}".format(len(src) * 2))
    g, rel, norm = build_graph_from_triplets(len(uniq_v), num_rels,
                                             (src, rel, dst))
    return g, uniq_v, rel, norm, samples, labels

def comp_deg_norm(g):
    g = g.local_var()
    in_deg = g.in_degrees(range(g.number_of_nodes())).float().numpy()
    norm = 1.0 / in_deg
    norm[np.isinf(norm)] = 0
    return norm

def build_graph_from_triplets(num_nodes, num_rels, triplets):
    """ Create a DGL graph. The graph is bidirectional because RGCN authors
        use reversed relations.
        This function also generates edge type and normalization factor
        (reciprocal of node incoming degree)
    """
    g = dgl.DGLGraph()
    g.add_nodes(num_nodes)
    src, rel, dst = triplets
    src, dst = np.concatenate((src, dst)), np.concatenate((dst, src))
    rel = np.concatenate((rel, rel + num_rels))
    edges = sorted(zip(dst, src, rel))
    dst, src, rel = np.array(edges).transpose()
    g.add_edges(src, dst)
    norm = comp_deg_norm(g)
    print("# nodes: {}, # edges: {}".format(num_nodes, len(src)))
    return g, rel.astype('int64'), norm.astype('int64')

def build_test_graph(num_nodes, num_rels, edges):
    src, rel, dst = edges.transpose()
    print("Test graph:")
    return build_graph_from_triplets(num_nodes, num_rels, (src, rel, dst))

def negative_sampling(pos_samples, num_entity, negative_rate):
    size_of_batch = len(pos_samples)
    num_to_generate = size_of_batch * negative_rate
    neg_samples = np.tile(pos_samples, (negative_rate, 1))
    labels = np.zeros(size_of_batch * (negative_rate + 1), dtype=np.float32)
    labels[: size_of_batch] = 1
    values = np.random.randint(num_entity, size=num_to_generate)
    choices = np.random.uniform(size=num_to_generate)
    subj = choices > 0.5
    obj = choices <= 0.5
    neg_samples[subj, 0] = values[subj]
    neg_samples[obj, 2] = values[obj]

    return np.concatenate((pos_samples, neg_samples)), labels

#######################################################################
#
# Utility functions for evaluations (raw)
#
#######################################################################

def sort_and_rank(score, target):
    _, indices = torch.sort(score, dim=1, descending=True)
    indices = torch.nonzero(indices == target.view(-1, 1))
    indices = indices[:, 1].view(-1)
    return indices

def perturb_and_get_raw_rank(embedding, w, a, r, b, test_size, batch_size=100):
    """ Perturb one element in the triplets
    """
    n_batch = (test_size + batch_size - 1) // batch_size
    ranks = []
    for idx in range(n_batch):
        print("batch {} / {}".format(idx, n_batch))
        batch_start = idx * batch_size
        batch_end = min(test_size, (idx + 1) * batch_size)
        batch_a = a[batch_start: batch_end]
        batch_r = r[batch_start: batch_end]
        emb_ar = embedding[batch_a] * w[batch_r]
        emb_ar = emb_ar.transpose(0, 1).unsqueeze(2) # size: D x E x 1
        emb_c = embedding.transpose(0, 1).unsqueeze(1) # size: D x 1 x V
        # out-prod and reduce sum
        out_prod = torch.bmm(emb_ar, emb_c) # size D x E x V
        score = torch.sum(out_prod, dim=0) # size E x V
        score = torch.sigmoid(score)
        target = b[batch_start: batch_end]
        print(score)
        print(target)
        ranks.append(sort_and_rank(score, target))
        print(ranks)
    return torch.cat(ranks)

# return MRR (raw), and Hits @ (1, 3, 10)
def calc_raw_mrr(embedding, w, test_triplets, hits=[], eval_bz=100):
    with torch.no_grad():
        s = test_triplets[:, 0]
        r = test_triplets[:, 1]
        o = test_triplets[:, 2]
        test_size = test_triplets.shape[0]
        print(s,r,o)
        # perturb subject
        ranks_s = perturb_and_get_raw_rank(embedding, w, o, r, s, test_size, eval_bz)
        # perturb object
        ranks_o = perturb_and_get_raw_rank(embedding, w, s, r, o, test_size, eval_bz)

        ranks = torch.cat([ranks_s, ranks_o])
        ranks += 1 # change to 1-indexed
        print(ranks)
        mrr = torch.mean(1.0 / ranks.float())
        print("MRR (raw): {:.6f}".format(mrr.item()))

        for hit in hits:
            avg_count = torch.mean((ranks <= hit).float())
            print("Hits (raw) @ {}: {:.6f}".format(hit, avg_count.item()))
    return mrr.item()

def filter_o(triplets_to_filter, target_s, target_r, target_o, num_entities):
    target_s, target_r, target_o = int(target_s), int(target_r), int(target_o)
    filtered_o = []
    # Do not filter out the test triplet, since we want to predict on it
    if (target_s, target_r, target_o) in triplets_to_filter:
        triplets_to_filter.remove((target_s, target_r, target_o))
    # Do not consider an object if it is part of a triplet to filter
    for o in range(num_entities):
        if (target_s, target_r, o) not in triplets_to_filter:
            filtered_o.append(o)
    return torch.LongTensor(filtered_o)

def filter_s(triplets_to_filter, target_s, target_r, target_o, num_entities):
    target_s, target_r, target_o = int(target_s), int(target_r), int(target_o)
    filtered_s = []
    # Do not filter out the test triplet, since we want to predict on it
    if (target_s, target_r, target_o) in triplets_to_filter:
        triplets_to_filter.remove((target_s, target_r, target_o))
    # Do not consider a subject if it is part of a triplet to filter
    for s in range(num_entities):
        if (s, target_r, target_o) not in triplets_to_filter:
            filtered_s.append(s)
    return torch.LongTensor(filtered_s)

def perturb_o_and_get_filtered_rank(embedding, w, s, r, o, test_size, triplets_to_filter):
    """ Perturb object in the triplets
    """
    num_entities = embedding.shape[0]
    ranks = []
    for idx in range(test_size):
        if idx % 100 == 0:
            print("test triplet {} / {}".format(idx, test_size))
        target_s = s[idx]
        target_r = r[idx]
        target_o = o[idx]
        filtered_o = filter_o(triplets_to_filter, target_s, target_r, target_o, num_entities)
        target_o_idx = int((filtered_o == target_o).nonzero())
        emb_s = embedding[target_s]
        emb_r = w[target_r]
        emb_o = embedding[filtered_o]
        emb_triplet = emb_s * emb_r * emb_o
        scores = torch.sigmoid(torch.sum(emb_triplet, dim=1))
        _, indices = torch.sort(scores, descending=True)
        rank = int((indices == target_o_idx).nonzero())
        ranks.append(rank)
    return torch.LongTensor(ranks)

def perturb_s_and_get_filtered_rank(embedding, w, s, r, o, test_size, triplets_to_filter):
    """ Perturb subject in the triplets
    """
    num_entities = embedding.shape[0]
    ranks = []
    for idx in range(test_size):
        if idx % 100 == 0:
            print("test triplet {} / {}".format(idx, test_size))
        target_s = s[idx]
        target_r = r[idx]
        target_o = o[idx]
        filtered_s = filter_s(triplets_to_filter, target_s, target_r, target_o, num_entities)
        target_s_idx = int((filtered_s == target_s).nonzero())
        emb_s = embedding[filtered_s]
        emb_r = w[target_r]
        emb_o = embedding[target_o]
        emb_triplet = emb_s * emb_r * emb_o
        scores = torch.sigmoid(torch.sum(emb_triplet, dim=1))
        _, indices = torch.sort(scores, descending=True)
        rank = int((indices == target_s_idx).nonzero())
        ranks.append(rank)
    return torch.LongTensor(ranks)

def calc_filtered_mrr(embedding, w, train_triplets, valid_triplets, test_triplets, hits=[]):
    with torch.no_grad():
        s = test_triplets[:, 0]
        r = test_triplets[:, 1]
        o = test_triplets[:, 2]
        test_size = test_triplets.shape[0]

        triplets_to_filter = torch.cat([train_triplets, valid_triplets, test_triplets]).tolist()
        triplets_to_filter = {tuple(triplet) for triplet in triplets_to_filter}
        print('Perturbing subject...')
        ranks_s = perturb_s_and_get_filtered_rank(embedding, w, s, r, o, test_size, triplets_to_filter)
        print('Perturbing object...')
        ranks_o = perturb_o_and_get_filtered_rank(embedding, w, s, r, o, test_size, triplets_to_filter)

        ranks = torch.cat([ranks_s, ranks_o])
        ranks += 1 # change to 1-indexed

        mrr = torch.mean(1.0 / ranks.float())
        print("MRR (filtered): {:.6f}".format(mrr.item()))

        for hit in hits:
            avg_count = torch.mean((ranks <= hit).float())
            print("Hits (filtered) @ {}: {:.6f}".format(hit, avg_count.item()))
    return mrr.item()

def calc_mrr(embedding, w, train_triplets, valid_triplets, test_triplets, hits=[], eval_bz=100, eval_p="filtered"):
    if eval_p == "filtered":
        mrr = calc_filtered_mrr(embedding, w, train_triplets, valid_triplets, test_triplets, hits)
    else:
        mrr = calc_raw_mrr(embedding, w, test_triplets, hits, eval_bz)
    return mrr

In [45]:
# build test graph
test_graph, test_rel, test_norm = build_test_graph(
    num_nodes, num_rels, train_data)
test_deg = test_graph.in_degrees(range(test_graph.number_of_nodes())).float().view(-1,1)
test_node_id = torch.arange(0, num_nodes, dtype=torch.long).view(-1, 1)
test_rel = torch.from_numpy(test_rel)
test_norm = node_norm_to_edge_norm(test_graph, torch.from_numpy(test_norm).view(-1, 1))
# test_graph.ndata.update({'id': test_node_id, 'norm': test_norm})
# test_graph.edata['type'] = test_rel

Test graph:
# nodes: 4623, # edges: 15138


In [29]:
 # build adj list and calculate degrees for sampling
adj_list, degrees = get_adj_and_degrees(num_nodes, train_data)

# optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)

In [30]:
import time
model_state_file = 'model_state.pth'
forward_time = []
backward_time = []
use_cuda=False
# training loop
print("start training...")

epoch = 0
best_mrr = 0
while True:
    model.train()
    epoch += 1

    # perform edge neighborhood sampling to generate training graph and data
    g, node_id, edge_type, node_norm, data, labels = \
        generate_sampled_graph_and_labels(
            train_data, 20, 0.5,
            num_rels, adj_list, degrees, 10,
            'uniform')
    print("Done edge sampling")

        # set node/edge feature
    node_id = torch.from_numpy(node_id).view(-1, 1).long()
    edge_type = torch.from_numpy(edge_type)
    edge_norm = node_norm_to_edge_norm(g, torch.from_numpy(node_norm).view(-1, 1))
    data, labels = torch.from_numpy(data), torch.from_numpy(labels)
    deg = g.in_degrees(range(g.number_of_nodes())).float().view(-1, 1)
    if use_cuda:
        node_id, deg = node_id.cuda(), deg.cuda()
        edge_type, edge_norm = edge_type.cuda(), edge_norm.cuda()
        data, labels = data.cuda(), labels.cuda()

    t0 = time.time()
    embed = model(g, node_id, edge_type, edge_norm)
    loss = model.get_loss(g, embed, data, labels)
    t1 = time.time()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # clip gradients
    optimizer.step()
    t2 = time.time()

    forward_time.append(t1 - t0)
    backward_time.append(t2 - t1)
    print("Epoch {:04d} | Loss {:.4f} | Best MRR {:.4f} | Forward {:.4f}s | Backward {:.4f}s".
            format(epoch, loss.item(), best_mrr, forward_time[-1], backward_time[-1]))

    optimizer.zero_grad()

        # validation
    if epoch % 2 == 0:
        # perform validation on CPU because full graph is too large
        if use_cuda:
            model.cpu()
        model.eval()
        print("start eval")
        embed = model(test_graph, test_node_id, test_rel, test_norm)
        mrr = calc_mrr(embed, model.w_relation, torch.LongTensor(train_data),
                                valid_data, test_data, hits=[1, 3, 10], eval_bz=500,
                                eval_p='filtered')
        # save best model
        if mrr < best_mrr:
            if epoch >= 30:
                break
        else:
            best_mrr = mrr
            torch.save({'state_dict': model.state_dict(), 'epoch': epoch},
                        model_state_file)
        if use_cuda:
            model.cuda()

print("training done")
print("Mean forward time: {:4f}s".format(np.mean(forward_time)))
print("Mean Backward time: {:4f}s".format(np.mean(backward_time)))

start training...
# sampled nodes: 34
# sampled edges: 20




# nodes: 34, # edges: 20
Done edge sampling
Epoch 0001 | Loss 3.2399 | Best MRR 0.0000 | Forward 0.1469s | Backward 0.0907s
# sampled nodes: 34
# sampled edges: 20
# nodes: 34, # edges: 20
Done edge sampling
Epoch 0002 | Loss 1.4982 | Best MRR 0.0000 | Forward 0.0221s | Backward 0.0536s
start eval
Perturbing subject...
test triplet 0 / 1249
test triplet 100 / 1249
test triplet 200 / 1249
test triplet 300 / 1249
test triplet 400 / 1249
test triplet 500 / 1249
test triplet 600 / 1249
test triplet 700 / 1249
test triplet 800 / 1249
test triplet 900 / 1249
test triplet 1000 / 1249
test triplet 1100 / 1249
test triplet 1200 / 1249
Perturbing object...
test triplet 0 / 1249
test triplet 100 / 1249
test triplet 200 / 1249
test triplet 300 / 1249
test triplet 400 / 1249
test triplet 500 / 1249
test triplet 600 / 1249
test triplet 700 / 1249
test triplet 800 / 1249
test triplet 900 / 1249
test triplet 1000 / 1249
test triplet 1100 / 1249
test triplet 1200 / 1249
MRR (filtered): 0.002905
Hits (f

test triplet 0 / 1249
test triplet 100 / 1249
test triplet 200 / 1249
test triplet 300 / 1249
test triplet 400 / 1249
test triplet 500 / 1249
test triplet 600 / 1249
test triplet 700 / 1249
test triplet 800 / 1249
test triplet 900 / 1249
test triplet 1000 / 1249
test triplet 1100 / 1249
test triplet 1200 / 1249
Perturbing object...
test triplet 0 / 1249
test triplet 100 / 1249
test triplet 200 / 1249
test triplet 300 / 1249
test triplet 400 / 1249
test triplet 500 / 1249
test triplet 600 / 1249
test triplet 700 / 1249
test triplet 800 / 1249
test triplet 900 / 1249
test triplet 1000 / 1249
test triplet 1100 / 1249
test triplet 1200 / 1249
MRR (filtered): 0.209623
Hits (filtered) @ 1: 0.185348
Hits (filtered) @ 3: 0.224980
Hits (filtered) @ 10: 0.247798
# sampled nodes: 27
# sampled edges: 20
# nodes: 27, # edges: 20
Done edge sampling
Epoch 0017 | Loss 0.6592 | Best MRR 0.2171 | Forward 0.0518s | Backward 0.0426s
# sampled nodes: 34
# sampled edges: 20
# nodes: 34, # edges: 20
Done edg

Perturbing object...
test triplet 0 / 1249
test triplet 100 / 1249
test triplet 200 / 1249
test triplet 300 / 1249
test triplet 400 / 1249
test triplet 500 / 1249
test triplet 600 / 1249
test triplet 700 / 1249
test triplet 800 / 1249
test triplet 900 / 1249
test triplet 1000 / 1249
test triplet 1100 / 1249
test triplet 1200 / 1249
MRR (filtered): 0.163005
Hits (filtered) @ 1: 0.130104
Hits (filtered) @ 3: 0.173339
Hits (filtered) @ 10: 0.221777
training done
Mean forward time: 0.045414s
Mean Backward time: 0.055883s


In [31]:
test_node_id.shape

torch.Size([4623, 1])

In [32]:
state = torch.load('model_state.pth')

In [33]:
model = LinkPredict(num_nodes,
                                 140,
                                 num_rels,
                                 num_bases=140,
                                 num_hidden_layers=2,
                                 dropout=0.2,
                                 use_cuda=-1,
                                 reg_param=0.01)

In [34]:
model.load_state_dict(state['state_dict'])

<All keys matched successfully>

In [35]:
coba = 2
coba = torch.LongTensor([coba]).view(-1,1)

In [36]:
embed = model(test_graph, test_node_id, test_rel, test_norm)

In [72]:
coba = torch.LongTensor([1,2,3]).view(-1,3)

In [73]:
train_data

array([[4622,    3, 3653],
       [4622,    3, 3765],
       [4622,    3, 3844],
       ...,
       [1432,    4, 2437],
       [1432,    4, 1838],
       [1513,    4, 2397]], dtype=int64)

In [74]:
calc_raw_mrr(embed, model.w_relation,coba,[1,3,10],100)

tensor([1]) tensor([2]) tensor([3])
batch 0 / 1
tensor([[2.1096e-06, 1.2538e-21, 1.2835e-06,  ..., 1.2513e-11, 1.9853e-11,
         4.2081e-02]])
tensor([1])
[tensor([4587])]
batch 0 / 1
tensor([[5.0884e-08, 7.1135e-25, 4.0413e-06,  ..., 1.8919e-10, 9.3751e-10,
         1.1866e-01]])
tensor([3])
[tensor([4592])]
tensor([4588, 4593])
MRR (raw): 0.000218
Hits (raw) @ 1: 0.000000
Hits (raw) @ 3: 0.000000
Hits (raw) @ 10: 0.000000


0.0002178412687499076

In [52]:
test_data[:,:2]

tensor([[4622,    3],
        [4622,    1],
        [1249,    5],
        ...,
        [1035,    6],
        [4622,    1],
        [1356,    4]])

In [75]:
data

tensor([[27,  3, 18],
        [27,  3, 18],
        [ 3,  0, 15],
        [27,  3, 16],
        [ 7,  2, 11],
        [ 4,  4,  6],
        [27,  3, 19],
        [27,  1,  2],
        [17,  0, 13],
        [27,  1,  5],
        [27,  3, 22],
        [27,  3, 25],
        [12,  5,  0],
        [27,  1,  2],
        [ 1,  6, 27],
        [ 8,  2, 10],
        [20,  4, 26],
        [23,  4, 24],
        [21,  0, 14],
        [ 9,  2, 11],
        [27,  3, 22],
        [ 9,  3, 18],
        [ 3,  0,  6],
        [20,  3, 16],
        [ 9,  2, 11],
        [21,  4,  6],
        [14,  3, 19],
        [19,  1,  2],
        [17,  0, 11],
        [ 3,  1,  5],
        [ 4,  3, 22],
        [12,  3, 25],
        [ 6,  5,  0],
        [27,  1, 12],
        [22,  6, 27],
        [ 8,  2, 24],
        [15,  4, 26],
        [23,  4, 13],
        [ 7,  0, 14],
        [16,  2, 11],
        [27,  3, 23],
        [27,  3, 15],
        [ 3,  0, 12],
        [27,  3, 11],
        [12,  2, 11],
        [ 

In [85]:
source = data.source
target = data.target
num_nodes = len(source.append(target).unique())

In [88]:
data[['source','source_name','target','target_name']]

Unnamed: 0,source,source_name,target,target_name
0,4622,user_0,3653,topic_1499
1,4622,user_0,3765,topic_2414
2,4622,user_0,3844,topic_3056
3,4622,user_0,3324,topic_11898
4,4622,user_0,3844,topic_3056
...,...,...,...,...
7564,1382,externalResources_2244,2281,library_5838
7565,1376,externalResources_1921,2414,library_7664
7566,1432,externalResources_5436,2437,library_7977
7567,1432,externalResources_5436,1838,library_13770


In [91]:
dicti = source.append(target)
dict_name = data.source_name.append(data.target_name)

In [97]:
dict(zip(dicti,dict_name))

4623

In [99]:
dict(zip(data.edge_name,data.edge))

{'INTERESTED_IN': 3,
 'BOOKMARK': 1,
 'TALK_OF': 5,
 'ASSUME_UNDERSTANDING_OF': 0,
 'WRITTEN_BY': 6,
 'ENCAPSULATES': 2,
 'SYNONYM': 4}

In [105]:
search = data[(data.source==4622)&(data.edge==3)].reset_index(drop=True)

In [121]:
dest = set(search.target)
dest_name = set(search.target_name)

In [122]:
result = []
for a in dest:
    result.append(4622+3+a)

In [123]:
set(zip(result,dest_name))

{(7722, 'topic_6501'),
 (7725, 'topic_5395'),
 (7732, 'topic_4031'),
 (7739, 'topic_2911'),
 (7748, 'topic_7377'),
 (7751, 'topic_3366'),
 (7762, 'topic_160'),
 (7798, 'topic_10316'),
 (7805, 'topic_13398'),
 (7816, 'topic_10580'),
 (7835, 'topic_5913'),
 (7843, 'topic_8123'),
 (7870, 'topic_823'),
 (7878, 'topic_11173'),
 (7883, 'topic_3364'),
 (7949, 'topic_1002'),
 (7961, 'topic_7704'),
 (7998, 'topic_11990'),
 (8001, 'topic_2882'),
 (8014, 'topic_2747'),
 (8058, 'topic_4617'),
 (8071, 'topic_10721'),
 (8074, 'topic_12872'),
 (8081, 'topic_606'),
 (8126, 'topic_2925'),
 (8134, 'topic_216'),
 (8150, 'topic_10060'),
 (8176, 'topic_12955'),
 (8210, 'topic_14476'),
 (8238, 'topic_5295'),
 (8256, 'topic_9065'),
 (8261, 'topic_1943'),
 (8267, 'topic_12897'),
 (8278, 'topic_5904'),
 (8296, 'topic_13506'),
 (8327, 'topic_12351'),
 (8338, 'topic_10198'),
 (8339, 'topic_6759'),
 (8359, 'topic_11898'),
 (8386, 'topic_14428'),
 (8390, 'topic_6980'),
 (8392, 'topic_2505'),
 (8401, 'topic_7124'),