In [14]:
import os
import pandas as pd
import numpy as np
import networkx as nx
import stellargraph as sg
from stellargraph.layer.graphsage import MeanAggregator, AttentionalAggregator,\
                        MaxPoolingAggregator, MeanPoolingAggregator
# from stellargraph import globalvar
from stellargraph.mapper import GraphSAGELinkGenerator
# from stellargraph.layer import GraphSAGE, link_classification
from datetime import datetime

In [2]:
from keras.models import load_model

In [5]:
DATA_DIR = os.getenv("DATA_DIR")
if DATA_DIR is None:
    DATA_DIR = '../../data'
PREDICT_DIR = os.path.join(DATA_DIR, "predict_network")
MODELS_DIR = os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), "models")

print(os.listdir(PREDICT_DIR))
print(os.listdir(MODELS_DIR))

predict_file = os.path.join(PREDICT_DIR, "predict_top50_vs_all.csv.gz")
node_data_file = os.path.join(PREDICT_DIR, "node_data_labelled_tfidf_2K.csv.gz")
model_file = os.path.join(MODELS_DIR, "graphsage_attentional_20meanpool_20e.h5")

['node_data_labelled_tfidf_2K.csv.gz', 'predict_top50_vs_all.csv.gz']
['.gitkeep', 'graphsage_20maxpool_20e_105_01d.h5', 'graphsage_attentional_20meanpool_20e.h5', 'graphsage_attentional_32layer.h5', 'vectorizer_v3.pickle', 'vectorizer.pickle', 'graphsage.h5']


In [18]:
predict_test = pd.read_csv(predict_file, compression="gzip")
node_data = pd.read_csv(node_data_file, compression="gzip", index_col=0)

In [19]:
node_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
268029,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
198002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
157685,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
263649,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.68613,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
166164,0.0,0.0,0.0,0.0,0.0,0.320345,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
model = load_model(model_file, custom_objects={'MeanPoolingAggregator': MeanPoolingAggregator})
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_14 (InputLayer)           (None, 20, 2000)     0                                            
__________________________________________________________________________________________________
input_15 (InputLayer)           (None, 200, 2000)    0                                            
__________________________________________________________________________________________________
input_17 (InputLayer)           (None, 20, 2000)     0                                            
__________________________________________________________________________________________________
input_18 (InputLayer)           (None, 200, 2000)    0                                            
__________________________________________________________________________________________________
input_13 (

In [21]:
node_features = node_data[node_data.columns].values

In [73]:
batch_size = 1
num_samples = [20, 10]

def batched_predict(start,end):
    G = nx.from_pandas_edgelist(predict_test[['source','target','label']][start:end], edge_attr="label")

    for nid, f in zip(node_data.index, node_features):
        if nid in G.node.keys():
            G.node[nid][globalvar.TYPE_ATTR_NAME] = "page"  # specify node type
            G.node[nid]["feature"] = f

    G_predict = sg.StellarGraph(G, node_features="feature")

    edge_ids_test = [(e[0],e[1]) for e in G_predict.edges()]

    predict_gen = GraphSAGELinkGenerator(G_predict, batch_size, num_samples).flow(edge_ids_test)

    print(datetime.now().strftime("%H:%M:%S"))

    pred = model.predict_generator(predict_gen, verbose=1, workers=8, use_multiprocessing=True, 
                                   max_queue_size=100)

    print(datetime.now().strftime("%H:%M:%S"))
    print("Got maximum prediction: {}".format(max(pred)))
    return [p[0] for p in pred], edge_ids_test

In [74]:
def chunker(length, chunksize):
    return [[i,i+chunksize] if i+chunksize < length else [i,length-1] for i in range(0, length, chunksize)]

In [None]:
predictions = []
test_ids = []
for indices in chunker(predict_test[0:20000].shape[0], 10000):
    print(indices[0],indices[1])
    pred_batch, id_batch = batched_predict(indices[0],indices[1])
#     print(pred_batch)
    predictions.extend(pred_batch)
    test_ids.extend(id_batch)

0 10000
11:07:08

In [None]:
predictions[0:10]

In [None]:
tempo = predict_test[0:len(predictions)].copy(deep=True)
tempo['predictions'] = predictions 
tempo['test_ids'] = test_ids
print(tempo.shape)
tempo.sort_values("predictions",ascending=False)

In [77]:
predict_test['predictions'] = predictions 
predict_test['test_ids'] = test_ids
# print(tempo.shape)

ValueError: Length of values does not match length of index