<a href="https://colab.research.google.com/github/HarlinLee/science4cast/blob/main/node_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%pip install -q stellargraph[demos]==1.2.1

In [2]:
import os

from google.colab import drive

# Mount Google Drive inside the Colab runtime; the input pickle and the saved
# model / embedding files in this notebook all live under DRIVE_PATH.
drive.mount("/content/drive")

# Project root on the mounted Drive.
DRIVE_PATH = "/content/drive/My Drive/science4cast"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import matplotlib.pyplot as plt
from math import isclose
from scipy import sparse
import networkx as nx
import numpy as np
import pickle
import pandas as pd
import random
import time
from datetime import date
from collections import Counter
import multiprocessing
from IPython.display import display, HTML
from sklearn.model_selection import train_test_split
import stellargraph as sg
from stellargraph import StellarGraph, datasets
from stellargraph.data import EdgeSplitter

In [4]:
NUM_OF_VERTICES = 64719  # number of vertices of the semantic net

# Data set to load. To use the 2014 training set instead, point data_source at
# os.path.join(DRIVE_PATH, 'TrainSet2014_3.pkl').
data_source = os.path.join(DRIVE_PATH, 'competition_data', 'CompetitionSet2017_3.pkl')

# NOTE(review): pickle.load can execute arbitrary code -- only load files from a trusted source.
# The context manager guarantees the file handle is closed once the data is read.
with open(data_source, "rb") as data_file:
    full_dynamic_graph_sparse, unconnected_vertex_pairs, year_start, years_delta = pickle.load(data_file)

print(f"{data_source} has {len(full_dynamic_graph_sparse)} edges between a total of {NUM_OF_VERTICES} vertices.\n\n")
print(f"The goal is to predict which of {len(unconnected_vertex_pairs)} unconnectedvertex-pairs\nin unconnected_vertex_pairs will be connected until {year_start+years_delta}.")


/content/drive/My Drive/science4cast/competition_data/CompetitionSet2017_3.pkl has 7652945 edges between a total of 64719 vertices.


The goal is to predict which of 1000000 unconnectedvertex-pairs
in unconnected_vertex_pairs will be connected until 2020.


In [5]:
def create_training_data(full_graph,year_start,years_delta,edges_used=500000,vertex_degree_cutoff=10):
    """
    Extract the snapshot of the semantic network as it existed at the end of ``year_start``.

    Relies on the module-level constant ``NUM_OF_VERTICES`` for the adjacency-matrix shape.

    :param full_graph: Full graph, numpy array dim(n,3) [vertex 1, vertex 2, time stamp].
        The time stamp is compared against a number of days counted from 1990-01-01.
    :param year_start: year of graph; only edges whose time stamp is strictly before
        Dec 31 of this year are kept
    :param years_delta: distance for prediction in years. Currently unused by this function.
    :param edges_used: size of a random edge subset for rapid prototyping (default: 500,000).
        Currently unused by this function.
    :param vertex_degree_cutoff: minimal vertex degree filter (default: 10).
        Currently unused by this function.
    :return: tuple ``(all_edge_list, unconnected_vertex_pairs, unconnected_vertex_pairs_solution)``

    all_edge_list: edges of the year_start snapshot, numpy array with the same columns as
        ``full_graph`` (the time-stamp column is NOT dropped; callers slice it off themselves)
    unconnected_vertex_pairs: empty numpy array (no candidate pairs are generated here)
    unconnected_vertex_pairs_solution: empty numpy array (no labels are generated here)
    """
    day_origin = date(1990, 1, 1)
    cutoff_days = (date(year_start, 12, 31) - day_origin).days

    print('    Create Graph for ', year_start)

    # Keep only the edges created strictly before the cutoff day.
    snapshot_edges = full_graph[full_graph[:, 2] < cutoff_days]

    # The sparse adjacency matrix and the NetworkX graph are built only to report the
    # number of distinct edges (parallel edges are merged into one).
    adj_mat_sparse = sparse.csr_matrix(
        (np.ones(len(snapshot_edges)), (snapshot_edges[:, 0], snapshot_edges[:, 1])),
        shape=(NUM_OF_VERTICES, NUM_OF_VERTICES),
    )
    # NetworkX 3.0 removed from_scipy_sparse_matrix; prefer its replacement when present
    # and fall back to the legacy name on older installations.
    build_graph = getattr(nx, 'from_scipy_sparse_array', None) or nx.from_scipy_sparse_matrix
    snapshot_graph = build_graph(adj_mat_sparse, parallel_edges=False, create_using=None, edge_attribute='weight')

    print('    Done: Create Graph for ', year_start)
    print('    num of edges: ', snapshot_graph.number_of_edges())

    all_edge_list = np.array(snapshot_edges)
    unconnected_vertex_pairs = np.array([])
    unconnected_vertex_pairs_solution = np.array([])

    return all_edge_list, unconnected_vertex_pairs, unconnected_vertex_pairs_solution

# Arguments handed to create_training_data below.
# Ideally every vertex would be used to create more training data, but that is slow and
# needs a huge amount of memory, so the intent here is to work with a random subset.
edges_used = 1_000_000
vertex_degree_cutoff = 10

In [6]:
years = [1994]  # CHANGE ME TO DIFFERENT YEARS
graphs = []  # one StellarGraph per entry in `years`, in the same order

for yy in years:
    # Edge list of the snapshot for year `yy`; the two other return values are empty arrays.
    train_dynamic_graph_sparse, train_edges_for_checking, train_edges_solution = create_training_data(
        full_dynamic_graph_sparse,
        yy,
        years_delta,
        edges_used=edges_used,
        vertex_degree_cutoff=vertex_degree_cutoff,
    )

    print(train_dynamic_graph_sparse[:, :-1].shape, train_edges_for_checking.shape, train_edges_solution.shape)

    # Drop the last (time stamp) column so only the source/target vertex ids remain,
    # and register all NUM_OF_VERTICES nodes even if they have no edge yet.
    graph_train = StellarGraph(
        nodes=sg.IndexedArray(index=range(NUM_OF_VERTICES)),
        edges=pd.DataFrame(train_dynamic_graph_sparse[:, :-1], columns=["source", "target"]),
    )
    print(graph_train.info())
    graphs.append(graph_train)

    Create Graph for  1994
    Done: Create Graph for  1994
    num of edges:  671
(671, 2) (0,) (0,)
StellarGraph: Undirected multigraph
 Nodes: 64719, Edges: 671

 Node types:
  default: [64719]
    Features: none
    Edge types: default-default->default

 Edge types:
    default-default->default: [671]
        Weights: all 1 (default)
        Features: none


## Node Embedding

https://stellargraph.readthedocs.io/en/stable/demos/link-prediction/node2vec-link-prediction.html

In [7]:
from stellargraph.data import BiasedRandomWalk


def create_biased_random_walker(graph, walk_num, walk_length, p=2.0, q=1.0, seed=None):
    """
    Build the StellarGraph ``BiasedRandomWalk`` used to generate Node2Vec random walks.

    :param graph: graph to walk on; passed straight to ``BiasedRandomWalk``
    :param walk_num: number of walks, forwarded as ``n``
    :param walk_length: length of each walk, forwarded as ``length``
    :param p: Node2Vec "p" parameter (default 2.0, the value previously hard-coded here)
    :param q: Node2Vec "q" parameter (default 1.0, the value previously hard-coded here)
    :param seed: optional random seed forwarded to ``BiasedRandomWalk`` so walks can be
        reproduced; ``None`` (default) keeps the previous unseeded behaviour
    :return: the configured ``BiasedRandomWalk`` instance
    """
    return BiasedRandomWalk(graph, n=walk_num, length=walk_length, p=p, q=q, seed=seed)

In [8]:
# Training settings read as globals by node2vec_embedding.
walk_length = 5   # length of each random walk
epochs = 100      # upper bound on training epochs passed to model.fit
batch_size = 512  # batch size for the Node2Vec link and node generators

In [9]:
from stellargraph.data import UnsupervisedSampler
from stellargraph.mapper import Node2VecLinkGenerator, Node2VecNodeGenerator
from stellargraph.layer import Node2Vec, link_classification
from tensorflow import keras

def node2vec_embedding(graph, name, year_start):
    """
    Train a Node2Vec model on ``graph`` and return the embeddings predicted for all its nodes.

    Reads the module-level settings ``walk_length``, ``batch_size``, ``epochs`` and
    ``DRIVE_PATH``. Side effects: writes a Keras checkpoint named after today's date and a
    pickle of the embeddings named after ``year_start``, both under ``DRIVE_PATH``.

    :param graph: graph to embed; its nodes seed the random walks
    :param name: label used only in the progress message
    :param year_start: printed, and used in the file name of the pickled embeddings
    :return: output of the embedding model's ``predict`` over every node of ``graph``
    """
    # Embedding size and number of walks per node.
    embedding_dim = 128
    walks_per_node = 20

    print(f"Training Node2Vec for '{name}':")
    print(year_start)

    all_nodes = list(graph.nodes())

    # Random walks -> (target, context) training pairs.
    walker = create_biased_random_walker(graph, walks_per_node, walk_length)
    pair_sampler = UnsupervisedSampler(graph, nodes=all_nodes, walker=walker)

    # Batches of node pairs feed a Node2Vec encoder with one input/output per pair member.
    link_generator = Node2VecLinkGenerator(graph, batch_size)
    encoder = Node2Vec(embedding_dim, generator=link_generator)
    x_inp, x_out = encoder.in_out_tensors()

    # Score each pair with a sigmoid over the dot product of the two node embeddings.
    pair_scorer = link_classification(
        output_dim=1, output_act="sigmoid", edge_embedding_method="dot"
    )
    prediction = pair_scorer(x_out)

    model = keras.Model(inputs=x_inp, outputs=prediction)
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-3),
        loss=keras.losses.binary_crossentropy,
        metrics=[keras.metrics.binary_accuracy],
    )

    # All callbacks watch the training loss: stop early, checkpoint the best model so far,
    # and shrink the learning rate when the loss stops improving.
    training_callbacks = [
        keras.callbacks.EarlyStopping(monitor='loss', patience=2, restore_best_weights=True),
        keras.callbacks.ModelCheckpoint(
            filepath=os.path.join(DRIVE_PATH, 'node2vec'+str(date.today())+'.h5'),
            monitor='loss',
            save_best_only=True,
        ),
        keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.2, patience=1, min_lr=1e-4),
    ]

    model.fit(
        generator.flow(pair_sampler) if False else link_generator.flow(pair_sampler),
        epochs=epochs,
        verbose=2,
        callbacks=training_callbacks,
        use_multiprocessing=True,
        workers=8,
    ) if False else model.fit(
        link_generator.flow(pair_sampler),
        epochs=epochs,
        verbose=2,
        callbacks=training_callbacks,
        use_multiprocessing=True,
        workers=8,
    )

    # Re-use the first ("source") branch of the trained pair model as a node-id -> embedding model.
    embedding_model = keras.Model(inputs=x_inp[0], outputs=x_out[0])

    node_flow = Node2VecNodeGenerator(graph, batch_size).flow(all_nodes)
    node_embeddings = embedding_model.predict(node_flow, workers=8, verbose=2, use_multiprocessing=True)

    # Persist the embeddings so later notebooks can load them without retraining.
    embeddings_path = os.path.join(DRIVE_PATH, 'node2vec-'+str(year_start)+'.pkl')
    with open(embeddings_path, "wb") as output_file:
        pickle.dump(node_embeddings, output_file, protocol=pickle.HIGHEST_PROTOCOL)

    return node_embeddings

In [10]:
# Train one embedding per year on THAT year's graph. Passing `graph_train` here would embed
# only the last graph built and save it under every year's file name.
assert len(graphs) == len(years), "graphs and years are out of sync; re-run the graph-building cell"
for yy, graph_for_year in zip(years, graphs):
    node_embeddings = node2vec_embedding(graph_for_year, "Node2Vec", yy)

Training Node2Vec for 'Node2Vec':
1994
link_classification: using 'dot' method to combine node embeddings into edge embeddings
Epoch 1/100
37/37 - 12s - loss: 0.6877 - binary_accuracy: 0.5537 - lr: 0.0010 - 12s/epoch - 329ms/step
Epoch 2/100
37/37 - 13s - loss: 0.6345 - binary_accuracy: 0.6322 - lr: 0.0010 - 13s/epoch - 343ms/step
Epoch 3/100
37/37 - 12s - loss: 0.5954 - binary_accuracy: 0.6862 - lr: 0.0010 - 12s/epoch - 317ms/step
Epoch 4/100
37/37 - 13s - loss: 0.5669 - binary_accuracy: 0.7137 - lr: 0.0010 - 13s/epoch - 350ms/step
Epoch 5/100
37/37 - 12s - loss: 0.5394 - binary_accuracy: 0.7363 - lr: 0.0010 - 12s/epoch - 321ms/step
Epoch 6/100
37/37 - 11s - loss: 0.5183 - binary_accuracy: 0.7533 - lr: 0.0010 - 11s/epoch - 287ms/step
Epoch 7/100
37/37 - 11s - loss: 0.5004 - binary_accuracy: 0.7668 - lr: 0.0010 - 11s/epoch - 299ms/step
Epoch 8/100
37/37 - 11s - loss: 0.4831 - binary_accuracy: 0.7865 - lr: 0.0010 - 11s/epoch - 309ms/step
Epoch 9/100
37/37 - 12s - loss: 0.4681 - binary_a