# Import Modules

In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import torch
import scipy.sparse as sp
import torch.nn as nn
from node2vec import Node2Vec
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
import itertools
import dgl

In [56]:
from dgl.nn import SAGEConv
from dgl.nn import GraphConv
import dgl.function as fn

# ----------- 2. create model -------------- #
# build a two-layer GraphSAGE model
class GraphSAGE(nn.Module):
    """Two-layer GraphSAGE encoder with mean aggregation.

    Parameters
    ----------
    in_feats : int
        Size of the input feature vector passed to the first layer.
    h_feats : int
        Output size of both layers (hidden and final representation).
    """

    def __init__(self, in_feats, h_feats):
        super().__init__()
        aggregator = 'mean'
        self.conv1 = SAGEConv(in_feats, h_feats, aggregator)
        self.conv2 = SAGEConv(h_feats, h_feats, aggregator)

    def forward(self, g, in_feat):
        """Run both layers on graph ``g`` with a ReLU between them."""
        hidden = F.relu(self.conv1(g, in_feat))
        return self.conv2(g, hidden)

class GraphGCN(nn.Module):
    """Two-layer graph convolutional encoder.

    Both layers are built with ``allow_zero_in_degree=True``.

    Parameters
    ----------
    in_feats : int
        Size of the input feature vector passed to the first layer.
    h_feats : int
        Output size of both layers (hidden and final representation).
    """

    def __init__(self, in_feats, h_feats):
        super().__init__()
        self.conv1 = GraphConv(in_feats, h_feats, allow_zero_in_degree=True)
        self.conv2 = GraphConv(h_feats, h_feats, allow_zero_in_degree=True)

    def forward(self, g, in_feat):
        """Run both layers on graph ``g`` with a ReLU between them."""
        hidden = F.relu(self.conv1(g, in_feat))
        return self.conv2(g, hidden)

class DotPredictor(nn.Module):
    """Scores every edge of a graph by the dot product of its endpoint vectors."""

    def forward(self, g, h):
        """Return one score per edge of ``g`` given node representations ``h``."""
        # local_scope: the 'h' and 'score' fields set below are temporary.
        with g.local_scope():
            g.ndata['h'] = h
            # Edge feature 'score' = dot(source 'h', destination 'h').
            dot_src_dst = fn.u_dot_v('h', 'h', 'score')
            g.apply_edges(dot_src_dst)
            edge_scores = g.edata['score']
            # u_dot_v yields a 1-element vector per edge; keep its only column.
            return edge_scores[:, 0]

In [57]:
# Read the graph from its GEXF export.
gexf_path = "../graph_creation/2020-07-01__to__2020-07-05__15-overlap__hashmap.gexf"
G = nx.read_gexf(gexf_path)

In [58]:
# Load the feature dictionary that main.ipynb pickles in the feature
# computation directory.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
import pickle

features_path = 'feature_combination/final_node_features.pkl'
with open(features_path, 'rb') as features_file:
    device_to_features = pickle.load(features_file)

In [59]:
# Attach the loaded feature entries to the graph nodes under the "features" key.
nx.set_node_attributes(G, values=device_to_features, name="features")

In [60]:
# Map every node to the value of its "features" attribute.
node_features = {node: attrs['features'] for node, attrs in G.nodes(data=True)}

In [30]:
# Precompute transition probabilities and generate the random walks.
# **ON WINDOWS ONLY WORKS WITH workers=1**
node2vec = Node2Vec(G, workers=4)

# Embed nodes. Any keyword accepted by gensim.Word2Vec can be passed here;
# `dimensions` and `workers` are passed automatically from the Node2Vec constructor.
model = node2vec.fit(window=10, min_count=1, batch_words=4)

# One embedding vector per node, in G.nodes() order.
embedding_rows = [model.wv.get_vector(str(node)) for node in G.nodes()]
node_embeddings = np.array(embedding_rows)
node_embedding_tensor = torch.tensor(node_embeddings)
node_features = node_embedding_tensor

Computing transition probabilities:   0%|          | 0/14964 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 3/3 [01:58<00:00, 39.55s/it]
Generating walks (CPU: 3): 100%|██████████| 2/2 [01:18<00:00, 39.39s/it]
Generating walks (CPU: 2): 100%|██████████| 3/3 [02:02<00:00, 40.80s/it]
Generating walks (CPU: 4): 100%|██████████| 2/2 [00:36<00:00, 18.04s/it]


In [31]:
# node2vec embeddings: collect one vector per node (G.nodes() order) and
# pickle the resulting tensor to node2vec_tensors.pkl.
import pickle

embedding_vectors = [model.wv.get_vector(str(node)) for node in G.nodes()]
node2vec_embeds = np.array(embedding_vectors)
node2vec_embeds_tensor = torch.tensor(node2vec_embeds)
with open("node2vec_tensors.pkl", "wb") as f:
    pickle.dump(node2vec_embeds_tensor, f)

In [32]:
# String form of every node id, in G.nodes() order.
nodes_list = list(map(str, G.nodes()))
# Pickle the list to nodes_list.pkl.
with open("nodes_list.pkl", "wb") as f:
    pickle.dump(nodes_list, f)

In [33]:
# Dictionary from node to its row of the node2vec embedding tensor
# (row i belongs to the i-th node of G.nodes()).
node2vec_embeds_dict = {
    node: node2vec_embeds_tensor[i] for i, node in enumerate(G.nodes())
}

In [61]:
# Read both pickles back to test them: first the node id list ...
with open("nodes_list.pkl", "rb") as nodes_file:
    nodes_list = pickle.load(nodes_file)
# ... then the node2vec embedding tensor.
with open("node2vec_tensors.pkl", "rb") as tensors_file:
    node2vec_embeds_tensor = pickle.load(tensors_file)

In [8]:
# Spot-check one entry of the reloaded node list.
# (`nodes_list_test` is never defined in this notebook; the reload cell stores
# the list in `nodes_list`.)
nodes_list[100]

'4ea409ee4ba5a873e6539f4eae068b5ba2e4eef079bbb5f6172e1fab8a84a356'

In [9]:
# Spot-check one row of the reloaded embedding tensor.
# (`node2vec_embeds_tensor_test` is never defined in this notebook; the reload
# cell stores the tensor in `node2vec_embeds_tensor`.)
node2vec_embeds_tensor[100]

tensor([ 0.0678, -0.0978,  0.1176,  0.2491,  0.4019, -0.0688, -0.2275, -0.1570,
         0.1999,  0.3366, -0.1170, -0.2604,  0.1415,  0.2174,  0.1607,  0.4422,
        -0.2545,  0.4776, -0.0220,  0.3182,  0.2983,  0.1519,  0.0132, -0.3153,
        -0.1475,  0.2473, -0.2500, -0.1532,  0.0027,  0.0490,  0.4991, -0.1282,
         0.1540,  0.5315, -0.1423,  0.3636,  0.5508,  0.1003, -0.1884,  0.0398,
         0.1673, -0.1138, -0.1999,  0.0361,  0.3141, -0.7169,  0.0681, -0.2923,
         0.2735, -0.4227, -0.0826, -0.1850, -0.0240,  0.1338,  0.1711,  0.0237,
        -0.2996,  0.3038,  0.1666,  0.3178,  0.2808,  0.0485, -0.4842, -0.0363,
         0.1714,  0.3209, -0.1156, -0.3539, -0.1264,  0.0433, -0.0614, -0.3220,
         0.0480,  0.2568, -0.0291, -0.2346,  0.1942, -0.0291, -0.0733, -0.3597,
        -0.0826, -0.1162, -0.0592, -0.1016,  0.2694,  0.4005,  0.0024, -0.5245,
        -0.0310,  0.4137,  0.0034, -0.1947, -0.1438,  0.4507, -0.1649,  0.5285,
        -0.0175, -0.1653, -0.0440, -0.01

In [34]:
# Concatenate each node's node2vec embedding with the values of its feature
# dictionary (cast to float32), one row per node in G.nodes() order.
combined_rows = []
for node in G.nodes():
    embedding = model.wv.get_vector(str(node))
    feature_values = np.array(list(device_to_features[node].values()), dtype=np.float32)
    combined_rows.append(np.concatenate((embedding, feature_values), axis=0))
node_embeddings_np = np.array(combined_rows)
node_embedding_tensor = torch.tensor(node_embeddings_np)
node_features = node_embedding_tensor

In [62]:
# Concatenation with normalized features, built from the reloaded node list and
# embedding tensor instead of the in-memory node2vec model.
nodes = nodes_list
node2vec_embeddings = node2vec_embeds_tensor.numpy()
device_features = device_to_features

normalized_rows = []
for row_idx, node_id in enumerate(nodes_list):
    feature_values = np.array(list(device_features[node_id].values()), dtype=np.float32)
    normalized_rows.append(
        np.concatenate((node2vec_embeddings[row_idx], feature_values), axis=0)
    )
node_embeddings_np_normalized = np.array(normalized_rows)
node_embedding_tensor_normalized = torch.tensor(node_embeddings_np_normalized)
node_features = node_embedding_tensor_normalized

In [37]:
# Size of the second dimension of node_features; the model cell below passes
# this value as the encoder's input feature size.
node_features.shape[1]

134

In [63]:
# Convert the graph to DGL; u / v hold the source / destination id of every edge.
dgl_G = dgl.from_networkx(G)
u, v = dgl_G.edges()
num_nodes = dgl_G.number_of_nodes()
num_edges = dgl_G.number_of_edges()

# Seeded generator so the train/test split is the same on every run.
SPLIT_SEED = 42
split_rng = np.random.default_rng(SPLIT_SEED)

# Shuffle the edge ids and hold out 10% of the edges as positive test examples.
eids = split_rng.permutation(num_edges)
test_size = int(len(eids) * 0.1)
train_size = num_edges - test_size
test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]

# Find all negative edges and split them for training and testing.
# The shape is given explicitly: without it coo_matrix infers the size from the
# largest node id that has an edge, which breaks the subtraction of np.eye
# when the highest-numbered nodes are isolated.
adj = sp.coo_matrix(
    (np.ones(len(u)), (u.numpy(), v.numpy())),
    shape=(num_nodes, num_nodes),
)
adj_neg = 1 - adj.todense() - np.eye(num_nodes)
# Only true non-edges are positive here: existing edges give 0, and self-loops
# or repeated edges give negative values, which must not count as negatives.
neg_u, neg_v = np.where(adj_neg > 0)

# Draw as many negatives as there are edges. Sample without replacement so the
# same negative pair cannot appear in both the train and the test set; fall
# back to replacement only if there are fewer candidates than requested.
needs_replacement = len(neg_u) < num_edges
neg_eids = split_rng.choice(len(neg_u), size=num_edges, replace=needs_replacement)
test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]


In [24]:
# Inspect the first entry of node_features.
node_features[0]

tensor([-0.0081,  0.0378,  0.0752,  0.0053,  0.1754,  0.0295, -0.0224,  0.2999,
         0.0721, -0.0512, -0.2032, -0.0236,  0.0942, -0.2013, -0.4150, -0.0148,
        -0.2460,  0.1249, -0.0174,  0.2050,  0.3013,  0.2295, -0.1425,  0.2067,
        -0.0708, -0.1469, -0.1771,  0.5667,  0.1792, -0.1908,  0.4507, -0.1286,
         0.4415,  0.1812,  0.3994, -0.0927,  0.0656, -0.1599, -0.1556,  0.2036,
         0.0573,  0.0811,  0.0896, -0.0717,  0.1291,  0.0311,  0.0117,  0.1952,
         0.3511, -0.0533,  0.2512, -0.0193,  0.2186,  0.0161, -0.2315,  0.0807,
         0.1363, -0.1678,  0.0364, -0.1003, -0.2711, -0.1683,  0.1880,  0.1512,
         0.0233, -0.1649,  0.2490, -0.1761, -0.1515, -0.0136, -0.1970, -0.1427,
        -0.4788, -0.4810,  0.2824, -0.2517, -0.1407, -0.1000, -0.0960, -0.0625,
        -0.0250, -0.0551, -0.0850,  0.0331, -0.1444,  0.5126,  0.0960, -0.0399,
        -0.0298,  0.4612,  0.1812, -0.1223,  0.3441, -0.0591, -0.1406, -0.0399,
        -0.4342,  0.2252,  0.1658,  0.07

In [41]:
# Look up the feature dictionary stored for one specific device id.
device_to_features['ad9adea8e7d63428e9372e0d670244e5033d4d2988e5546f8801692bf9d40646']

{'avg_locations_per_day': -0.754601226993865,
 'avg_distance_per_day': -0.472425051270307,
 'age': 2,
 'score': -0.8855280414619378,
 'female': 1,
 'male': 0}

In [39]:
# node2vec vector of the same device id, for comparison with node_features[0].
model.wv.get_vector('ad9adea8e7d63428e9372e0d670244e5033d4d2988e5546f8801692bf9d40646')

array([-0.00813514,  0.03777379,  0.07524519,  0.0052942 ,  0.1754264 ,
        0.0294719 , -0.02243937,  0.29994112,  0.07209021, -0.05122218,
       -0.2031843 , -0.02355146,  0.09417741, -0.20132823, -0.4149674 ,
       -0.01476257, -0.24599603,  0.12489565, -0.01735623,  0.20502837,
        0.3013142 ,  0.22954972, -0.14247212,  0.20670253, -0.07077803,
       -0.14691035, -0.17707963,  0.56672496,  0.17919561, -0.19083361,
        0.45068824, -0.12860635,  0.44148946,  0.1812185 ,  0.39938694,
       -0.09265961,  0.06558739, -0.15990822, -0.1556166 ,  0.20360014,
        0.05728378,  0.08111922,  0.08961319, -0.07174372,  0.1290746 ,
        0.03112129,  0.01166186,  0.19517194,  0.3511475 , -0.05327895,
        0.2511663 , -0.01925827,  0.21864109,  0.01611416, -0.23149537,
        0.08066583,  0.13633803, -0.16776836,  0.036378  , -0.10026155,
       -0.2711056 , -0.16832137,  0.18802607,  0.1512132 ,  0.02330018,
       -0.16486892,  0.24897288, -0.17610708, -0.15153402, -0.01

In [37]:
# Inspect the first entry of node_features.
node_features[0]

tensor([-0.0081,  0.0378,  0.0752,  0.0053,  0.1754,  0.0295, -0.0224,  0.2999,
         0.0721, -0.0512, -0.2032, -0.0236,  0.0942, -0.2013, -0.4150, -0.0148,
        -0.2460,  0.1249, -0.0174,  0.2050,  0.3013,  0.2295, -0.1425,  0.2067,
        -0.0708, -0.1469, -0.1771,  0.5667,  0.1792, -0.1908,  0.4507, -0.1286,
         0.4415,  0.1812,  0.3994, -0.0927,  0.0656, -0.1599, -0.1556,  0.2036,
         0.0573,  0.0811,  0.0896, -0.0717,  0.1291,  0.0311,  0.0117,  0.1952,
         0.3511, -0.0533,  0.2512, -0.0193,  0.2186,  0.0161, -0.2315,  0.0807,
         0.1363, -0.1678,  0.0364, -0.1003, -0.2711, -0.1683,  0.1880,  0.1512,
         0.0233, -0.1649,  0.2490, -0.1761, -0.1515, -0.0136, -0.1970, -0.1427,
        -0.4788, -0.4810,  0.2824, -0.2517, -0.1407, -0.1000, -0.0960, -0.0625,
        -0.0250, -0.0551, -0.0850,  0.0331, -0.1444,  0.5126,  0.0960, -0.0399,
        -0.0298,  0.4612,  0.1812, -0.1223,  0.3441, -0.0591, -0.1406, -0.0399,
        -0.4342,  0.2252,  0.1658,  0.07

In [64]:
# Training graph: the full graph with the held-out test edges removed.
held_out_eids = eids[:test_size]
train_g = dgl.remove_edges(dgl_G, held_out_eids)

In [43]:
# Show the node data currently stored on the DGL graph.
dgl_G.ndata

{}

In [65]:
# One graph per edge set, all defined over the full node set of dgl_G.
n_nodes = dgl_G.number_of_nodes()

train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=n_nodes)
train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=n_nodes)

test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=n_nodes)
test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=n_nodes)

In [66]:
# Encoder with 16 hidden units; the GraphSAGE line is the alternative encoder.
# dgl_model = GraphSAGE(node_features.shape[1], 16)
dgl_model = GraphGCN(in_feats=node_features.shape[1], h_feats=16)
# Edge scorer. You can replace DotPredictor with MLPPredictor.
#pred = MLPPredictor(16)
pred = DotPredictor()

def compute_loss(pos_score, neg_score):
    """Binary cross-entropy (with logits) over positive and negative edge scores.

    Parameters
    ----------
    pos_score : torch.Tensor
        Raw scores of the positive edges (target label 1).
    neg_score : torch.Tensor
        Raw scores of the negative edges (target label 0).

    Returns
    -------
    torch.Tensor
        Scalar loss averaged over all scores.
    """
    scores = torch.cat([pos_score, neg_score])
    # Build the labels from the scores themselves so they share dtype and
    # device; torch.ones/zeros would always produce CPU float32 tensors.
    labels = torch.cat([torch.ones_like(pos_score), torch.zeros_like(neg_score)])
    return F.binary_cross_entropy_with_logits(scores, labels)

def compute_auc(pos_score, neg_score):
    """ROC AUC of the edge scores, with positive edges labelled 1 and negatives 0.

    Parameters
    ----------
    pos_score : torch.Tensor
        Scores of the positive edges.
    neg_score : torch.Tensor
        Scores of the negative edges.

    Returns
    -------
    The value returned by ``roc_auc_score(labels, scores)``.
    """
    # detach() and cpu() make the NumPy conversion work even when the scores
    # still require grad or live on another device.
    scores = torch.cat([pos_score, neg_score]).detach().cpu().numpy()
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
    return roc_auc_score(labels, scores)

In [67]:
# ----------- 3. set up loss and optimizer -------------- #
# The loss itself is computed inside the training loop; only the optimizer is
# created here, over the parameters of both the encoder and the predictor.
optimizer = torch.optim.Adam(itertools.chain(dgl_model.parameters(), pred.parameters()), lr=0.01)

# ----------- 4. training -------------------------------- #
all_logits = []  # unused in this cell; kept in case other cells reference it
loss_values = []  # loss of every epoch
dgl_model.train()
for e in range(1000):
    # forward: embed the nodes on the training graph, then score both edge sets
    h = dgl_model(train_g, node_features)
    pos_score = pred(train_pos_g, h)
    neg_score = pred(train_neg_g, h)
    loss = compute_loss(pos_score, neg_score)
    loss_values.append(loss.item())

    # backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if e % 5 == 0:
        print('In epoch {}, loss: {}'.format(e, loss.item()))

# ----------- 5. check results ------------------------ #
dgl_model.eval()
with torch.no_grad():
    # Recompute the embeddings with the final weights: the `h` left over from
    # the loop was produced before the last optimizer step.
    h = dgl_model(train_g, node_features)
    pos_score = pred(test_pos_g, h)
    neg_score = pred(test_neg_g, h)
    print('AUC', compute_auc(pos_score, neg_score))


# Thumbnail credits: Link Prediction with Neo4j, Mark Needham
# sphinx_gallery_thumbnail_path = '_static/blitz_4_link_predict.png'

In epoch 0, loss: 0.7945080399513245
In epoch 5, loss: 0.6341795921325684
In epoch 10, loss: 0.6351076364517212
In epoch 15, loss: 0.6233730912208557
In epoch 20, loss: 0.6124357581138611
In epoch 25, loss: 0.6022491455078125
In epoch 30, loss: 0.593620777130127
In epoch 35, loss: 0.5854993462562561
In epoch 40, loss: 0.5779070258140564
In epoch 45, loss: 0.5874858498573303
In epoch 50, loss: 0.5741157531738281
In epoch 55, loss: 0.5693366527557373
In epoch 60, loss: 0.5647937059402466
In epoch 65, loss: 0.5628263354301453
In epoch 70, loss: 0.5611305236816406
In epoch 75, loss: 0.5601641535758972
In epoch 80, loss: 0.5592981576919556
In epoch 85, loss: 0.5586416125297546
In epoch 90, loss: 0.5580713748931885
In epoch 95, loss: 0.5576263666152954
In epoch 100, loss: 0.5570919513702393
In epoch 105, loss: 0.5565720796585083
In epoch 110, loss: 0.5559881925582886
In epoch 115, loss: 0.5553585290908813
In epoch 120, loss: 0.5546765923500061
In epoch 125, loss: 0.5542374849319458
In epoch 