# Import Modules

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import dgl
import torch
import scipy.sparse as sp
import torch.nn as nn
from node2vec import Node2Vec
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
import itertools

In [3]:
from dgl.nn import SAGEConv
import dgl.function as fn

# ----------- 2. create model -------------- #
# build a two-layer GraphSAGE model
class GraphSAGE(nn.Module):
    """Two-layer GraphSAGE encoder built from mean-aggregating ``SAGEConv`` layers.

    Parameters
    ----------
    in_feats : int
        Width of the input node features (first layer's input size).
    h_feats : int
        Width of the hidden representation and of the returned embedding.
    """

    def __init__(self, in_feats, h_feats):
        super().__init__()
        aggregator = 'mean'
        self.conv1 = SAGEConv(in_feats, h_feats, aggregator)
        self.conv2 = SAGEConv(h_feats, h_feats, aggregator)

    def forward(self, g, in_feat):
        """Return the second layer's output for graph ``g`` and features ``in_feat``.

        ``in_feat`` goes through ``conv1``, a ReLU, then ``conv2``; no
        activation is applied to the final output.
        """
        hidden = F.relu(self.conv1(g, in_feat))
        return self.conv2(g, hidden)

class DotPredictor(nn.Module):
    """Edge scorer: an edge's score is the dot product of its endpoint embeddings."""

    def forward(self, g, h):
        """Score every edge of ``g`` from the node embeddings ``h``.

        Inside ``g.local_scope()`` the embeddings are stored as node data 'h'
        and ``fn.u_dot_v`` writes the source/destination dot product of each
        edge to edge data 'score'.
        """
        with g.local_scope():
            g.ndata['h'] = h
            dot_product = fn.u_dot_v('h', 'h', 'score')
            g.apply_edges(dot_product)
            edge_scores = g.edata['score']
            # u_dot_v yields a 1-element vector per edge, so keep only column 0.
            return edge_scores[:, 0]

In [4]:
# Path of the GEXF graph file to load.
GEXF_PATH = "../graph_creation/2020-07-01__to__2020-07-05__15-overlap__hashmap.gexf"
G = nx.read_gexf(GEXF_PATH)

In [5]:
# Load the feature dictionary that main.ipynb pickles in the feature computation
# directory.
# NOTE: pickle.load can execute arbitrary code -- only load files you produced yourself.
import pickle

FEATURES_PKL = 'feature_combination/final_node_features.pkl'
with open(FEATURES_PKL, 'rb') as features_file:
    device_to_features = pickle.load(features_file)

In [6]:
# Attach each node's entry of device_to_features to the graph as the node
# attribute "features".
nx.set_node_attributes(G, device_to_features, "features")

In [7]:
# Collect the "features" attribute of every node into a dict keyed by node id.
# A node without a "features" attribute raises KeyError here.
node_features = {node: attrs['features'] for node, attrs in G.nodes(data=True)}

In [8]:
# Node2Vec precomputes transition probabilities and generates the random walks.
# **ON WINDOWS THIS ONLY WORKS WITH workers=1**
node2vec = Node2Vec(G, workers=4)

# Embed nodes. Any keyword accepted by gensim.Word2Vec can be passed to fit();
# `dimensions` and `workers` are passed automatically from the Node2Vec constructor.
fit_kwargs = dict(window=10, min_count=1, batch_words=4)
model = node2vec.fit(**fit_kwargs)

# One embedding row per node, in G.nodes() order.
embedding_rows = [model.wv.get_vector(str(node_id)) for node_id in G.nodes()]
node_embeddings = np.array(embedding_rows)
node_embedding_tensor = torch.tensor(node_embeddings)
node_features = node_embedding_tensor

Computing transition probabilities:   0%|          | 0/14964 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 3/3 [01:26<00:00, 28.72s/it]
Generating walks (CPU: 2): 100%|██████████| 3/3 [01:34<00:00, 31.65s/it]
Generating walks (CPU: 3): 100%|██████████| 2/2 [01:03<00:00, 31.66s/it]
Generating walks (CPU: 4): 100%|██████████| 2/2 [00:41<00:00, 20.53s/it]


In [9]:
# Build the node2vec embedding matrix (rows follow G.nodes() order), convert it
# to a tensor and persist the tensor with pickle.
import pickle

node2vec_embeds = np.array(
    [model.wv.get_vector(str(node_id)) for node_id in G.nodes()]
)
node2vec_embeds_tensor = torch.tensor(node2vec_embeds)
with open("node2vec_tensors.pkl", "wb") as tensor_file:
    pickle.dump(node2vec_embeds_tensor, tensor_file)

In [13]:
# Node ids as strings, in G.nodes() order, persisted with pickle.
nodes_list = list(map(str, G.nodes()))
with open("nodes_list.pkl", "wb") as nodes_file:
    pickle.dump(nodes_list, nodes_file)

In [14]:
# Map each node id to its row of node2vec_embeds_tensor; row i belongs to the
# i-th node yielded by G.nodes().
node2vec_embeds_dict = {
    node: node2vec_embeds_tensor[row] for row, node in enumerate(G.nodes())
}

In [8]:
# Round-trip test: reload the pickled node list and node2vec embedding tensor.
# NOTE: pickle.load can execute arbitrary code -- only load files you produced yourself.
with open("nodes_list.pkl", "rb") as nodes_file:
    nodes_list_test = pickle.load(nodes_file)
with open("node2vec_tensors.pkl", "rb") as tensor_file:
    node2vec_embeds_tensor_test = pickle.load(tensor_file)

In [8]:
# Spot check: the node id stored at position 100 of the reloaded list.
nodes_list_test[100]

'4ea409ee4ba5a873e6539f4eae068b5ba2e4eef079bbb5f6172e1fab8a84a356'

In [9]:
# Spot check: row 100 of the reloaded embedding tensor.
node2vec_embeds_tensor_test[100]

tensor([ 0.0678, -0.0978,  0.1176,  0.2491,  0.4019, -0.0688, -0.2275, -0.1570,
         0.1999,  0.3366, -0.1170, -0.2604,  0.1415,  0.2174,  0.1607,  0.4422,
        -0.2545,  0.4776, -0.0220,  0.3182,  0.2983,  0.1519,  0.0132, -0.3153,
        -0.1475,  0.2473, -0.2500, -0.1532,  0.0027,  0.0490,  0.4991, -0.1282,
         0.1540,  0.5315, -0.1423,  0.3636,  0.5508,  0.1003, -0.1884,  0.0398,
         0.1673, -0.1138, -0.1999,  0.0361,  0.3141, -0.7169,  0.0681, -0.2923,
         0.2735, -0.4227, -0.0826, -0.1850, -0.0240,  0.1338,  0.1711,  0.0237,
        -0.2996,  0.3038,  0.1666,  0.3178,  0.2808,  0.0485, -0.4842, -0.0363,
         0.1714,  0.3209, -0.1156, -0.3539, -0.1264,  0.0433, -0.0614, -0.3220,
         0.0480,  0.2568, -0.0291, -0.2346,  0.1942, -0.0291, -0.0733, -0.3597,
        -0.0826, -0.1162, -0.0592, -0.1016,  0.2694,  0.4005,  0.0024, -0.5245,
        -0.0310,  0.4137,  0.0034, -0.1947, -0.1438,  0.4507, -0.1649,  0.5285,
        -0.0175, -0.1653, -0.0440, -0.01

In [10]:
# Look up one node's embedding by id in node2vec_embeds_dict.
# NOTE: the recorded output is a NameError -- node2vec_embeds_dict was not defined in
# the kernel session that produced it, so the cell that builds the dict must run first.
node2vec_embeds_dict['4ea409ee4ba5a873e6539f4eae068b5ba2e4eef079bbb5f6172e1fab8a84a356']

NameError: name 'node2vec_embeds_dict' is not defined

In [78]:
# Look up one node's embedding directly in the trained node2vec model.
model.wv.get_vector('34189a0454ae546a57a3823c62136acda961c3f7bad59f6323e43992152f4bca')

array([-0.5912568 , -0.13132314,  0.28953677, -0.5395234 , -0.11881539,
       -0.05290396,  0.42854133, -0.23255344, -0.68026453,  0.2876868 ,
        0.06303207, -0.06183719, -0.6032361 , -0.11136108, -0.4981049 ,
       -0.28247738, -0.13522837, -0.50488794, -0.34460413, -0.158043  ,
       -0.13343477,  0.27773163,  0.23683423, -0.6131927 ,  0.10993141,
       -0.17654072, -0.46577018,  0.16274883,  0.39340004, -0.24064618,
        0.13003385,  0.0938566 , -0.42562392,  0.40010926, -0.19992991,
        0.1533329 , -0.11335753, -0.14831966,  0.01840584,  0.13027258,
       -0.46312875,  0.2404868 , -0.00921797, -0.4054955 ,  0.62533545,
       -0.3304973 ,  0.19512108, -0.7676782 ,  0.3637736 ,  0.11157789,
       -0.18401256, -0.11701671, -0.06234975, -0.00643352, -0.19885735,
       -0.15239982,  0.11919613,  0.16333309,  0.19894603, -0.20397334,
        0.13950363, -0.161765  , -0.22726528,  0.22587793,  0.16542503,
        0.02220782,  0.5635086 , -0.3135901 ,  0.10524624,  0.16

In [60]:
# Concatenate each node's node2vec embedding with its entry of device_to_features.
# Feature columns follow one fixed key order (taken from the first node) rather than
# each dict's own insertion order, so columns cannot silently misalign across nodes.
graph_nodes = list(G.nodes())
if not graph_nodes:
    raise ValueError("G has no nodes; cannot build the node feature matrix")
feature_keys = list(device_to_features[graph_nodes[0]])

feature_rows = []
for node_id in graph_nodes:
    node_feats = device_to_features[node_id]
    if len(node_feats) != len(feature_keys):
        raise ValueError(
            "node {!r} has {} features, expected {}".format(
                node_id, len(node_feats), len(feature_keys)
            )
        )
    # A node missing one of the expected keys raises KeyError here.
    feature_rows.append([node_feats[key] for key in feature_keys])

embedding_matrix = np.array(
    [model.wv.get_vector(str(node_id)) for node_id in graph_nodes]
)
feature_matrix = np.array(feature_rows, dtype=np.float32)
# Rows follow G.nodes() order; columns are [embedding | features].
node_embeddings_np = np.concatenate((embedding_matrix, feature_matrix), axis=1)
node_embedding_tensor = torch.tensor(node_embeddings_np)
node_features = node_embedding_tensor

In [9]:
# Concatenation of the reloaded node2vec embeddings with the per-node features.
# NOTE(review): nothing in this cell normalizes anything; "normalized" presumably
# describes the contents of device_to_features -- confirm.
nodes = nodes_list_test
node2vec_embeddings = node2vec_embeds_tensor_test.numpy()
device_features = device_to_features

if len(nodes) == 0:
    raise ValueError("nodes_list_test is empty; cannot build the node feature matrix")
# Row i of the embedding matrix must belong to nodes[i]; a length mismatch means the
# two pickles were not produced from the same graph.
if len(nodes) != len(node2vec_embeddings):
    raise ValueError(
        "nodes_list_test has {} entries but node2vec_embeds_tensor_test has {} rows".format(
            len(nodes), len(node2vec_embeddings)
        )
    )

# Use one fixed key order (taken from the first node) instead of each dict's own
# insertion order, so feature columns cannot silently misalign across nodes.
feature_keys = list(device_features[nodes[0]])
feature_rows = []
for node_id in nodes:
    node_feats = device_features[node_id]
    if len(node_feats) != len(feature_keys):
        raise ValueError(
            "node {!r} has {} features, expected {}".format(
                node_id, len(node_feats), len(feature_keys)
            )
        )
    # A node missing one of the expected keys raises KeyError here.
    feature_rows.append([node_feats[key] for key in feature_keys])

feature_matrix = np.array(feature_rows, dtype=np.float32)
# Columns are [embedding | features].
node_embeddings_np_normalized = np.concatenate((node2vec_embeddings, feature_matrix), axis=1)
node_embedding_tensor_normalized = torch.tensor(node_embeddings_np_normalized)
node_features = node_embedding_tensor_normalized

In [10]:
# Width of the node feature matrix (this is the encoder's input size).
node_features.shape[1]

134

In [12]:
# Convert the graph to DGL and split its edges into train/test positives, then
# sample an equal number of negative (non-existent) edges for each split.
dgl_G = dgl.from_networkx(G)
u, v = dgl_G.edges()
num_nodes = dgl_G.number_of_nodes()
num_edges = dgl_G.number_of_edges()

# Seeded generator so the split is reproducible across kernel restarts.
SPLIT_SEED = 0
split_rng = np.random.default_rng(SPLIT_SEED)

eids = split_rng.permutation(num_edges)
test_size = int(len(eids) * 0.1)
train_size = num_edges - test_size
test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]

# Find all negative edges and split them for training and testing.
# An explicit shape keeps the matrix num_nodes x num_nodes even when the
# highest-index nodes have no edges; otherwise subtracting np.eye() would fail.
adj = sp.coo_matrix(
    (np.ones(len(u)), (u.numpy(), v.numpy())), shape=(num_nodes, num_nodes)
)
adj_neg = 1 - adj.todense() - np.eye(num_nodes)
# Only entries equal to 1 are true non-edges between distinct nodes. Self-loops and
# duplicate edges make entries negative, which a `!= 0` test would wrongly accept.
neg_u, neg_v = np.where(adj_neg == 1)

# Sample without replacement so the train and test negatives are disjoint; fall back
# to sampling with replacement only when there are fewer non-edges than edges.
sample_with_replacement = len(neg_u) < num_edges
neg_eids = split_rng.choice(len(neg_u), num_edges, replace=sample_with_replacement)
test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]


: 

In [64]:
# Training graph: the full graph minus the edges held out as test positives.
# NOTE(review): if G is undirected, from_networkx presumably stores every edge in both
# directions, so the reverse of a held-out edge may remain in train_g -- confirm.
train_g = dgl.remove_edges(dgl_G, eids[:test_size])

In [65]:
# Inspect the node data stored on the DGL graph (empty in the recorded output).
dgl_G.ndata

{}

In [66]:
# Wrap each edge split in its own graph. All four graphs are given the node count of
# dgl_G, so node ids mean the same thing in every graph.
n_nodes = dgl_G.number_of_nodes()

train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=n_nodes)
train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=n_nodes)

test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=n_nodes)
test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=n_nodes)

In [67]:
# Encoder: input width taken from the feature matrix, 16 hidden/output units.
in_feats = node_features.shape[1]
dgl_model = GraphSAGE(in_feats, 16)
# Edge scorer. DotPredictor can be replaced with MLPPredictor:
# pred = MLPPredictor(16)
pred = DotPredictor()

def compute_loss(pos_score, neg_score):
    """Binary cross-entropy (on logits) over positive and negative edge scores.

    Parameters
    ----------
    pos_score : torch.Tensor
        Raw scores of edges that exist; each is given the label 1.
    neg_score : torch.Tensor
        Raw scores of sampled non-edges; each is given the label 0.

    Returns
    -------
    torch.Tensor
        Scalar mean loss over the concatenated scores.
    """
    scores = torch.cat([pos_score, neg_score])
    # ones_like/zeros_like create the labels with the scores' device and dtype, so
    # the loss also works for scores on a GPU or in float64.
    labels = torch.cat([torch.ones_like(pos_score), torch.zeros_like(neg_score)])
    return F.binary_cross_entropy_with_logits(scores, labels)

def compute_auc(pos_score, neg_score):
    """ROC AUC of the edge scores, with positives labelled 1 and negatives 0.

    Parameters
    ----------
    pos_score : torch.Tensor
        Scores of edges that exist.
    neg_score : torch.Tensor
        Scores of sampled non-edges.

    Returns
    -------
    float
        The value returned by ``roc_auc_score`` for the concatenated scores.
    """
    # detach().cpu() makes the NumPy conversion work for tensors that require grad
    # or live on another device; calling .numpy() directly raises in those cases.
    scores = torch.cat([pos_score, neg_score]).detach().cpu().numpy()
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
    return roc_auc_score(labels, scores)

In [69]:
# ----------- 3. set up loss and optimizer -------------- #
# The loss is computed inside the training loop with compute_loss().
NUM_EPOCHS = 500
LEARNING_RATE = 0.01
LOG_EVERY = 5  # print the loss every LOG_EVERY epochs
optimizer = torch.optim.Adam(
    itertools.chain(dgl_model.parameters(), pred.parameters()), lr=LEARNING_RATE
)

# ----------- 4. training -------------------------------- #
all_logits = []
loss_values = []  # loss per epoch, kept for inspection/plotting
dgl_model.train()
for e in range(NUM_EPOCHS):
    # forward: embed nodes on the training graph, then score both edge sets
    h = dgl_model(train_g, node_features)
    pos_score = pred(train_pos_g, h)
    neg_score = pred(train_neg_g, h)
    loss = compute_loss(pos_score, neg_score)
    loss_values.append(loss.item())

    # backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if e % LOG_EVERY == 0:
        print('In epoch {}, loss: {}'.format(e, loss.item()))

# ----------- 5. check results ------------------------ #
dgl_model.eval()
with torch.no_grad():
    # Recompute the embeddings with the final weights: the `h` left over from the
    # loop was produced before the last optimizer step and carries autograd history.
    h = dgl_model(train_g, node_features)
    pos_score = pred(test_pos_g, h)
    neg_score = pred(test_neg_g, h)
    print('AUC', compute_auc(pos_score, neg_score))


# Thumbnail credits: Link Prediction with Neo4j, Mark Needham
# sphinx_gallery_thumbnail_path = '_static/blitz_4_link_predict.png'

In epoch 0, loss: 1.5259660482406616
In epoch 5, loss: 8.507662773132324
In epoch 10, loss: 5.302502632141113
In epoch 15, loss: 4.197474479675293
In epoch 20, loss: 3.1162357330322266
In epoch 25, loss: 1.9709445238113403
In epoch 30, loss: 1.4767886400222778
In epoch 35, loss: 1.3032402992248535
In epoch 40, loss: 1.1380212306976318
In epoch 45, loss: 1.1069239377975464
In epoch 50, loss: 1.0788660049438477
In epoch 55, loss: 1.008415699005127
In epoch 60, loss: 0.9744167923927307
In epoch 65, loss: 0.9500839710235596
In epoch 70, loss: 0.9250357747077942
In epoch 75, loss: 0.9073923826217651
In epoch 80, loss: 0.8924516439437866
In epoch 85, loss: 0.8792530298233032
In epoch 90, loss: 0.8693866729736328
In epoch 95, loss: 0.8604459762573242
In epoch 100, loss: 0.8523284792900085
In epoch 105, loss: 0.8452860713005066
In epoch 110, loss: 0.8388813138008118
In epoch 115, loss: 0.8329708576202393
In epoch 120, loss: 0.8275228142738342
In epoch 125, loss: 0.8224677443504333
In epoch 130