In [143]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import torch
import scipy.sparse as sp
import torch.nn as nn
from node2vec import Node2Vec
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
import itertools
import dgl
import random

In [36]:
from dgl.nn import SAGEConv
from dgl.nn import GraphConv
import dgl.function as fn

# ----------- 2. create model -------------- #
# build a two-layer GraphSAGE model
class GraphSAGE(nn.Module):
    """Two-layer GraphSAGE encoder built from ``SAGEConv`` layers using the 'mean' aggregator.

    Args:
        in_feats: Feature size passed to the first layer as its input width.
        h_feats: Output width of both layers (hidden and final representation).
    """

    def __init__(self, in_feats, h_feats):
        super().__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, 'mean')
        self.conv2 = SAGEConv(h_feats, h_feats, 'mean')

    def forward(self, g, in_feat):
        """Apply conv1 -> ReLU -> conv2 on graph ``g`` and return the node representations."""
        hidden = F.relu(self.conv1(g, in_feat))
        return self.conv2(g, hidden)

class GraphGCN(nn.Module):
    """Two-layer GCN encoder built from ``GraphConv`` layers.

    Both layers are created with ``allow_zero_in_degree=True``.

    Args:
        in_feats: Feature size passed to the first layer as its input width.
        h_feats: Output width of both layers (hidden and final representation).
    """

    def __init__(self, in_feats, h_feats):
        super().__init__()
        self.conv1 = GraphConv(in_feats, h_feats, allow_zero_in_degree=True)
        self.conv2 = GraphConv(h_feats, h_feats, allow_zero_in_degree=True)

    def forward(self, g, in_feat):
        """Apply conv1 -> ReLU -> conv2 on graph ``g`` and return the node representations."""
        hidden = F.relu(self.conv1(g, in_feat))
        return self.conv2(g, hidden)

class DotPredictor(nn.Module):
    """Edge scorer that takes the dot product of the two endpoint representations."""

    def forward(self, g, h):
        """Score every edge of ``g`` from node representations ``h``.

        Returns:
            The first (only) column of the per-edge 'score' feature, i.e. one
            score per edge of ``g``.
        """
        # Feature writes happen inside g.local_scope() (intended, per DGL's API,
        # to keep 'h' and 'score' from persisting on g after the block).
        with g.local_scope():
            g.ndata['h'] = h
            # 'score' for each edge = <h[source], h[destination]>
            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            edge_scores = g.edata['score']
        # u_dot_v yields a 1-element vector per edge; keep just that element.
        return edge_scores[:, 0]

In [37]:
# Load the two GEXF snapshots: 2020-07-01..07-05 and 2020-07-01..07-06.
G_5day, G_6day = (
    nx.read_gexf(gexf_path)
    for gexf_path in (
        "../graph_creation/2020-07-01__to__2020-07-05__15-overlap__hashmap.gexf",
        "../graph_creation/2020-07-01__to__2020-07-06__15-overlap__hashmap.gexf",
    )
)

In [151]:
# Fixed (sorted) node order for the 6-day graph, plus the label -> position lookup.
sorted_6day = list(G_6day.nodes())
sorted_6day.sort()
node2id_6day = dict(zip(sorted_6day, range(len(sorted_6day))))

In [152]:
def sample_non_edges(graph, k, rng=random):
    """Draw ``k`` distinct node pairs that are not connected in ``graph``.

    Uses rejection sampling so the full list of non-edges (quadratic in the
    number of nodes) is never materialised. Falls back to exhaustive
    enumeration when ``k`` is more than half of all non-edges, where rejection
    sampling would be slow. Assumes a simple (non-multi) graph.

    Args:
        graph: NetworkX graph (directed or undirected).
        k: Number of non-edges to sample.
        rng: Object exposing ``sample`` (the ``random`` module, or a
            ``random.Random`` instance, e.g. seeded for reproducibility).

    Returns:
        List of ``k`` ``(u, v)`` tuples with ``u != v`` and no edge ``u``-``v``.

    Raises:
        ValueError: If ``k`` exceeds the number of available non-edges.
    """
    if k == 0:
        return []

    nodes = list(graph.nodes())
    directed = graph.is_directed()
    n_pairs = len(nodes) * (len(nodes) - 1)
    if not directed:
        n_pairs //= 2
    # Self-loops are edges but never candidate non-edges, so leave them out of the count.
    n_non_edges = n_pairs - (graph.number_of_edges() - nx.number_of_selfloops(graph))
    if k > n_non_edges:
        raise ValueError("Sample larger than the number of non-edges")
    if 2 * k > n_non_edges:
        # Dense request: most random pairs would be rejected, so enumerate instead.
        return rng.sample(list(nx.non_edges(graph)), k)

    seen = set()
    sampled = []
    while len(sampled) < k:
        u, v = rng.sample(nodes, 2)
        if graph.has_edge(u, v):
            continue
        # Undirected pairs are orientation-free; directed pairs are ordered.
        key = (u, v) if directed else frozenset((u, v))
        if key in seen:
            continue
        seen.add(key)
        sampled.append((u, v))
    return sampled


# Create the training set
positive_edges_train = list(G_5day.edges())
negative_edges_train = sample_non_edges(G_5day, len(positive_edges_train))

# Create the test set: edges of the 6-day graph that are absent from the 5-day graph.
# has_edge() is orientation-agnostic on undirected graphs, unlike comparing (u, v)
# tuples in a set, where the same edge reported as (v, u) would look "new".
# dict.fromkeys drops repeated pairs while keeping a deterministic order.
positive_edges_test = list(dict.fromkeys(
    (u, v) for u, v in G_6day.edges() if not G_5day.has_edge(u, v)
))
negative_edges_test = sample_non_edges(G_6day, len(positive_edges_test))

# The link prediction model is trained and tested on these sets in the cells below.

In [155]:
def _edges_to_index_tensor(edges, node2id):
    """Map ``(u, v)`` node-label pairs to an ``(E, 2)`` int64 tensor of node indices.

    The result is always 2-D, even for an empty edge list, so column slicing
    (``[:, 0]`` / ``[:, 1]``) keeps working.

    Args:
        edges: Iterable of ``(u, v)`` node-label pairs.
        node2id: Mapping from node label to integer index.

    Raises:
        KeyError: If an endpoint is missing from ``node2id``.
    """
    index_pairs = [[node2id[u], node2id[v]] for u, v in edges]
    return torch.tensor(index_pairs, dtype=torch.long).reshape(-1, 2)


# Create the training set
positive_edges_train_tensors = _edges_to_index_tensor(positive_edges_train, node2id_6day)
negative_edges_train_tensors = _edges_to_index_tensor(negative_edges_train, node2id_6day)

# Create the test set
positive_edges_test_tensors = _edges_to_index_tensor(positive_edges_test, node2id_6day)
negative_edges_test_tensors = _edges_to_index_tensor(negative_edges_test, node2id_6day)

In [159]:
# Split each (E, 2) training edge tensor into its two endpoint columns.
train_pos_u, train_pos_v = positive_edges_train_tensors.unbind(dim=1)
train_neg_u, train_neg_v = negative_edges_train_tensors.unbind(dim=1)

In [197]:
# Split each (E, 2) test edge tensor into its two endpoint columns.
test_pos_u, test_pos_v = positive_edges_test_tensors.unbind(dim=1)
test_neg_u, test_neg_v = negative_edges_test_tensors.unbind(dim=1)

In [160]:
# Precompute probabilities and generate walks - **ON WINDOWS ONLY WORKS WITH workers=1**
node2vec = Node2Vec(G_6day, workers=4)
# Embed nodes
model = node2vec.fit(window=10, min_count=1, batch_words=4)  # Any keywords acceptable by gensim.Word2Vec can be passed, `dimensions` and `workers` are automatically passed (from the Node2Vec constructor)

# Stack the embeddings in sorted_6day order so that row i belongs to the node whose
# index in node2id_6day is i (G_6day.nodes() order is not guaranteed to match).
node_embeddings = np.array([model.wv.get_vector(str(node)) for node in sorted_6day])
node_embedding_tensor = torch.tensor(node_embeddings)
node_features = node_embedding_tensor

Computing transition probabilities: 100%|██████████| 16237/16237 [20:33<00:00, 13.16it/s] 
Generating walks (CPU: 1): 100%|██████████| 3/3 [02:36<00:00, 52.05s/it]
Generating walks (CPU: 2): 100%|██████████| 3/3 [02:43<00:00, 54.64s/it]
Generating walks (CPU: 3): 100%|██████████| 2/2 [01:33<00:00, 46.53s/it]
Generating walks (CPU: 4): 100%|██████████| 2/2 [00:42<00:00, 21.29s/it]


In [187]:
# Look up each node's node2vec vector (keyed by str(label)) in sorted_6day order.
node_embeddings = np.array(
    [model.wv.get_vector(str(node_label)) for node_label in sorted_6day]
)
# Both names refer to the same tensor.
node_features = node_embedding_tensor = torch.tensor(node_embeddings)

In [188]:
# Node label -> its row of node_embedding_tensor (rows follow sorted_6day).
node2vec_embeds_dict = dict(zip(sorted_6day, node_embedding_tensor))

In [189]:
import pickle

# Persist the embedding tensor together with the node order its rows follow.
nodes_list = sorted(G_6day.nodes())
for pkl_path, payload in (
    ("node2vec_tensors_6.pkl", node_embedding_tensor),
    ("nodes_list_6.pkl", nodes_list),
):
    with open(pkl_path, "wb") as f:
        pickle.dump(payload, f)

In [190]:
# Load the feature dictionary pickled in the feature computation directory by main.ipynb.
# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
import pickle
with open('feature_combination/final_node_features.pkl', mode='rb') as features_file:
    device_to_features = pickle.load(features_file)

In [192]:
# Dictionary of node2vec embeddings: node label -> row of node_embedding_tensor,
# where row position follows sorted_6day.
node2vec_embeds_dict = {
    node_label: node_embedding_tensor[row]
    for row, node_label in enumerate(sorted_6day)
}

In [193]:
# Per node (sorted label order): node2vec vector followed by the values of its
# device_to_features entry, cast to float32.
node_embeddings_np = np.array([
    np.concatenate(
        (
            model.wv.get_vector(str(node_label)),
            np.array(list(device_to_features[node_label].values()), dtype=np.float32),
        ),
        axis=0,
    )
    for node_label in sorted(G_6day.nodes())
])
# Both names refer to the same tensor; this replaces the node2vec-only features.
node_features = node_embedding_tensor = torch.tensor(node_embeddings_np)

In [194]:
# Convert the 6-day NetworkX graph into a DGLGraph.
# NOTE(review): later cells index this graph with ids from node2id_6day (sorted-label
# order); that presumes from_networkx numbers nodes in the same sorted order —
# confirm against the DGL documentation.
dgl_6 = dgl.from_networkx(G_6day)

In [198]:
# Edge ids (in dgl_6) of the held-out positive test edges.
test_pos_edge_ids = dgl_6.edge_ids(test_pos_u, test_pos_v)
if not G_6day.is_directed():
    # from_networkx stores an undirected edge as two directed edges (u->v and v->u).
    # Include the reverse copies as well, otherwise the test edges would still be
    # present in the graph used for message passing during training.
    reverse_edge_ids = dgl_6.edge_ids(test_pos_v, test_pos_u)
    # unique() guards against duplicate ids (e.g. a self-loop is its own reverse).
    test_pos_edge_ids = torch.unique(torch.cat([test_pos_edge_ids, reverse_edge_ids]))

In [199]:
# Graph the encoder runs on during training: dgl_6 without the edges whose ids
# are listed in test_pos_edge_ids.
train_g = dgl.remove_edges(dgl_6, test_pos_edge_ids)

In [200]:
# Graphs that hold only the positive / negative sample edges for scoring.
# They share dgl_6's node count so node ids line up with the encoder output.
n_nodes = dgl_6.number_of_nodes()

train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=n_nodes)
train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=n_nodes)

test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=n_nodes)
test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=n_nodes)

In [201]:
# Encoder: GCN with a 16-dimensional hidden/output size.
# Alternative encoder: dgl_model = GraphSAGE(node_features.shape[1], 16)
dgl_model = GraphGCN(in_feats=node_features.size(1), h_feats=16)
# Edge scorer: dot product of endpoint representations.
# Alternative scorer (MLPPredictor is not defined in the visible cells): pred = MLPPredictor(16)
pred = DotPredictor()

def compute_loss(pos_score, neg_score):
    """Binary cross-entropy (on logits) over positive and negative edge scores.

    Positive scores are labelled 1 and negative scores 0. Labels are created
    with ``ones_like`` / ``zeros_like`` so they share the scores' device and
    dtype (e.g. when training on GPU).

    Args:
        pos_score: Raw scores (logits) of the positive edges.
        neg_score: Raw scores (logits) of the negative edges.

    Returns:
        Scalar tensor with the mean loss over all edges.
    """
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat([torch.ones_like(pos_score), torch.zeros_like(neg_score)])
    return F.binary_cross_entropy_with_logits(scores, labels)

def compute_auc(pos_score, neg_score):
    """ROC AUC of the scores, treating positive edges as label 1 and negatives as 0.

    Scores are detached and moved to CPU before the NumPy conversion, so the
    function also accepts tensors that track gradients or live on another device.

    Args:
        pos_score: Scores of the positive edges.
        neg_score: Scores of the negative edges.

    Returns:
        The value returned by ``roc_auc_score(labels, scores)``.
    """
    scores = torch.cat([pos_score, neg_score]).detach().cpu().numpy()
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
    return roc_auc_score(labels, scores)

In [202]:
# ----------- 3. set up loss and optimizer -------------- #
# The loss is computed inside the training loop (see compute_loss).
optimizer = torch.optim.Adam(itertools.chain(dgl_model.parameters(), pred.parameters()), lr=0.01)

# ----------- 4. training -------------------------------- #
all_logits = []  # kept for compatibility; never populated in this cell
loss_values = []  # training loss per epoch
for e in range(1000):
    # forward: encode nodes on the training graph, then score the sample edges
    h = dgl_model(train_g, node_features)
    pos_score = pred(train_pos_g, h)
    neg_score = pred(train_neg_g, h)
    loss = compute_loss(pos_score, neg_score)
    loss_values.append(loss.item())

    # backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if e % 5 == 0:
        print('In epoch {}, loss: {}'.format(e, loss))

# ----------- 5. check results ------------------------ #
# (roc_auc_score is already imported at the top of the notebook.)
with torch.no_grad():
    # Recompute the representations with the final weights: the `h` left over from
    # the loop was produced before the last optimizer step.
    h = dgl_model(train_g, node_features)
    pos_score = pred(test_pos_g, h)
    neg_score = pred(test_neg_g, h)
    print('AUC', compute_auc(pos_score, neg_score))


# Thumbnail credits: Link Prediction with Neo4j, Mark Needham
# sphinx_gallery_thumbnail_path = '_static/blitz_4_link_predict.png'

In epoch 0, loss: 0.6915445923805237
In epoch 5, loss: 0.6326662302017212
In epoch 10, loss: 0.5815366506576538
In epoch 15, loss: 0.5476576685905457
In epoch 20, loss: 0.5250999331474304
In epoch 25, loss: 0.5027560591697693
In epoch 30, loss: 0.49401551485061646
In epoch 35, loss: 0.4869621694087982
In epoch 40, loss: 0.48078566789627075
In epoch 45, loss: 0.4753807485103607
In epoch 50, loss: 0.47119441628456116
In epoch 55, loss: 0.4675285220146179
In epoch 60, loss: 0.46456998586654663
In epoch 65, loss: 0.4621361196041107
In epoch 70, loss: 0.4601689875125885
In epoch 75, loss: 0.45835423469543457
In epoch 80, loss: 0.4566519558429718
In epoch 85, loss: 0.4553396999835968
In epoch 90, loss: 0.45435768365859985
In epoch 95, loss: 0.45353588461875916
In epoch 100, loss: 0.4527946412563324
In epoch 105, loss: 0.45209404826164246
In epoch 110, loss: 0.45144057273864746
In epoch 115, loss: 0.45086023211479187
In epoch 120, loss: 0.45035451650619507
In epoch 125, loss: 0.44991886615753