In [1]:
import os
from datetime import datetime

import networkx as nx
import numpy as np
import pandas as pd
import stellargraph as sg

In [2]:
from stellargraph.layer.graphsage import MeanAggregator, AttentionalAggregator, MaxPoolingAggregator
from stellargraph import globalvar
from stellargraph.mapper import GraphSAGELinkGenerator
from stellargraph.layer import GraphSAGE, link_classification

In [None]:
import pickle

In [None]:
from keras.models import load_model

In [None]:
# Root directory of the input data, taken from the DATA_DIR environment variable.
DATA_DIR = os.getenv("DATA_DIR")
if DATA_DIR is None:
    # Fail here with an actionable message; otherwise the os.path.join calls
    # below raise an opaque TypeError because a path component is None.
    raise RuntimeError("DATA_DIR environment variable is not set; it must point to the data directory")

# "models" directory located two levels above the current working directory.
MODELS_DIR = os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), "models")
content_api = os.path.join(DATA_DIR, "content_api")

# Input files read later in the notebook.
api_extract_file = os.path.join(content_api,"07-02-19", "content_json.csv.gz")
content_file = os.path.join(content_api,"content.json.gz")
labelled_file = os.path.join(content_api,"labelled.csv.gz")
edgefile = os.path.join(DATA_DIR, "processed_network", "graphsage_test.csv.gz")

# Pretrained artefacts loaded below.
model_file = os.path.join(MODELS_DIR, "graphsage.h5")
vectorizer_file = os.path.join(MODELS_DIR, "vectorizer.pickle")

## Load pretrained `graphSAGE` and `tfidfVectorizer`

In [None]:
# Load the saved Keras model from model_file. MeanAggregator is supplied through
# custom_objects so that load_model can resolve that class name while
# deserialising (presumably the saved network contains MeanAggregator layers).
model = load_model(model_file, custom_objects={'MeanAggregator': MeanAggregator})

In [None]:
model.summary()

In [None]:
# NOTE: unpickling can execute arbitrary code; only load vectorizer files that
# come from a trusted source.
# The context manager closes the file handle once the object has been loaded.
with open(vectorizer_file, "rb") as vectorizer_fh:
    vectorizer = pickle.load(vectorizer_fh)

In [None]:
vectorizer.transform(["this is a test sentence"])

## Load data

In [None]:
labelled = pd.read_csv(labelled_file, compression="gzip")

In [None]:
labelled.columns

In [None]:
labelled.shape, labelled[labelled.publishing_app=="publisher"].shape

In [None]:
edges = pd.read_csv(edgefile, compression='gzip', sep='\t')

In [None]:
edges.head()

In [None]:
# Give every distinct edge endpoint a consecutive integer id, numbered in order
# of first appearance while walking the edge list row by row (source, then target).
node_id = {}
for endpoints in zip(edges.source.values, edges.target.values):
    for endpoint in endpoints:
        # len(node_id) is the next unused id; setdefault leaves known endpoints untouched.
        node_id.setdefault(endpoint, len(node_id))
counter = len(node_id)

# Attach the integer ids of both endpoints to the edge list.
edges['source_id'] = edges.source.map(lambda url: int(node_id[url]))
edges['target_id'] = edges.target.map(lambda url: int(node_id[url]))

### Map edge pairs and ids to original base_path/content_id

In [None]:
api_extrach = pd.read_csv(api_extract_file, compression="gzip")

In [None]:
# Keep only the rows that have a content_id.
api_extrach = api_extrach.dropna(subset=['content_id'])

In [None]:
url_id = dict(zip(api_extrach.url,api_extrach.content_id))

In [None]:
# Translate each endpoint into its content_id via url_id; endpoints that are
# not keys of url_id become NaN.
edges['source_cid'] = edges.source.map(lambda url: url_id.get(url, np.nan))
edges['target_cid'] = edges.target.map(lambda url: url_id.get(url, np.nan))

In [None]:
edges.head()

In [None]:
def count_missing(ids, known_ids=None):
    """Report which of ``ids`` are absent from a reference collection of ids.

    Prints a one-line summary ("included: N missing: M") and returns the
    absent ids in the order they were encountered.

    Parameters
    ----------
    ids : iterable
        Ids to check. Any iterable is accepted, including one-shot iterators.
    known_ids : iterable, optional
        Reference ids to check against. When omitted, the ``content_id``
        column of the notebook-level ``labelled`` frame is used, which is the
        original behaviour.

    Returns
    -------
    list
        The members of ``ids`` that are not in the reference collection.
    """
    if known_ids is None:
        known_ids = labelled.content_id.values
    set_id = set(known_ids)

    # Materialise once so that generators can be both filtered and counted.
    ids = list(ids)
    missing_list = [id1 for id1 in ids if id1 not in set_id]

    print("included: {} missing: {}".format(len(ids) - len(missing_list), len(missing_list)))
    return missing_list

In [None]:
missing_list1 = count_missing(set(edges.source_cid.values))
missing_list2 = count_missing(set(edges.target_cid.values))

In [None]:
missing_list1[0:10]

In [None]:
# "5ef7560d-7631-11e4-a3cb-005056011aef" in labelled.content_id.values
# labelled[labelled.content_id=="5ef7560d-7631-11e4-a3cb-005056011aef"]

In [None]:
mainstream = set(labelled[labelled.publishing_app=="publisher"].content_id.values)

In [None]:
len(mainstream)

In [None]:
"{} out of {} mainstream cids in edgelist".format(len(mainstream.intersection(set(edges.source_cid.values))),
                                                  len(mainstream))

## Initialize generator data

In [None]:
df_main = labelled[labelled.publishing_app=="publisher"].copy(deep=True)

In [None]:
labelled_cid = set(labelled.content_id.values)

In [None]:
import itertools

In [None]:
len(labelled_cid)

In [None]:
cid_base = dict(zip(labelled.content_id, labelled.base_path))

In [None]:
# Pair one mainstream content_id (element 1 of the set's iteration order) with
# every other labelled content_id, and build the matching base_path pairs.
source_cid = list(mainstream)[1]
cid_pairs = [(source_cid, target_cid) for target_cid in labelled_cid if target_cid != source_cid]
basepath_pairs = [(cid_base[src], cid_base[tgt]) for src, tgt in cid_pairs]
len(cid_pairs), len(basepath_pairs)

In [None]:
# One row per candidate link, carrying both the content_id and the base_path
# of each endpoint.
predict_test = pd.DataFrame({
    'source_cid': [pair[0] for pair in cid_pairs],
    'target_cid': [pair[1] for pair in cid_pairs],
    'source_bp': [pair[0] for pair in basepath_pairs],
    'target_bp': [pair[1] for pair in basepath_pairs],
})

In [None]:
predict_test.head()

#### *TODO:* Base node_ids on base_paths, not content_id

In [None]:
# Continue the integer numbering for base_paths that are not yet in node_id,
# visiting them row by row (source first, then target).
max_node_id = max(node_id.values()) + 1
row_pairs = zip(predict_test.source_bp.values, predict_test.target_bp.values)
for base_path in itertools.chain.from_iterable(row_pairs):
    if base_path not in node_id:
        node_id[base_path] = max_node_id
        max_node_id += 1

In [None]:
predict_test['source'] = predict_test['source_bp'].map(node_id)
predict_test['target'] = predict_test['target_bp'].map(node_id)

In [None]:
# Map node id -> "<title> <description>" (trailing whitespace stripped) for
# every labelled page whose base_path has a node id.
text_dict = {
    node_id[base_path]: "{} {}".format(title, description).rstrip()
    for base_path, title, description in zip(labelled.base_path, labelled.title, labelled.description)
    if base_path in node_id
}

# Parallel lists: index[i] is the node id whose text is text[i].
text_list = list(text_dict.items())
index = list(text_dict.keys())
text = list(text_dict.values())

# Vectorise the texts with the pretrained vectorizer; show the number of rows.
X = vectorizer.transform(text)
X.shape[0]

In [None]:
text_dict[412]

In [None]:
labelled[labelled.content_id=='0f2e8c41-78fa-40f9-9eea-857c07bacd80'][['content_id', 'title','description']]

### Initialize `node_data`

In [None]:
# Feature table with one row per node, indexed by the integer node ids collected
# in `index`; the columns are the vectorizer outputs.
# NOTE(review): todense() materialises the whole matrix in memory (X is
# presumably a scipy sparse matrix) — confirm this fits for the full corpus.
node_data = pd.DataFrame(X.todense(),index=index)
node_data.head()

In [None]:
predict_test.columns

In [None]:
predict_test['label'] = "go_to"

### Set up stellargraph graph object

In [None]:
# All feature columns as one 2-D array whose rows line up with node_data.index.
node_features = node_data.values

In [None]:
# Both values are passed to GraphSAGELinkGenerator inside batched_predict.
batch_size = 1
# presumably the number of neighbours sampled per hop (20, then 10) — TODO
# confirm this matches the settings the loaded model was trained with.
num_samples = [20, 10]

def batched_predict(start,end):
    """Score the candidate links in rows ``start:end`` of ``predict_test``.

    Builds a graph from that slice of the edge list, attaches the node type and
    feature vector to every node of the graph, and runs the loaded model over
    the links through a ``GraphSAGELinkGenerator``.

    Parameters
    ----------
    start, end : int
        Positional slice bounds into ``predict_test`` (``end`` is exclusive).

    Returns
    -------
    The array returned by ``model.predict_generator``: one prediction per row
    of the slice, in row order.

    Notes
    -----
    Reads the notebook-level names ``predict_test``, ``node_data``,
    ``node_features``, ``model``, ``batch_size`` and ``num_samples``.
    Prints wall-clock timestamps before and after prediction, and the maximum
    prediction of the chunk.
    """
    chunk = predict_test[['source','target','label']][start:end]
    G = nx.from_pandas_edgelist(chunk, edge_attr="label")

    # G.nodes is used instead of G.node, which was removed in networkx 2.4.
    for nid, f in zip(node_data.index, node_features):
        if nid in G:
            G.nodes[nid][globalvar.TYPE_ATTR_NAME] = "page"  # specify node type
            G.nodes[nid]["feature"] = f

    G_predict = sg.StellarGraph(G, node_features="feature")

    # Take the link ids from the rows rather than from G_predict.edges(): the
    # caller writes the predictions back to predict_test row by row, and the
    # graph's edge iteration may reorder, flip or merge duplicate pairs.
    edge_ids_test = list(zip(chunk['source'], chunk['target']))

    predict_gen = GraphSAGELinkGenerator(G_predict, batch_size, num_samples).flow(edge_ids_test)

    print(datetime.now().strftime("%H:%M:%S"))

    pred = model.predict_generator(predict_gen, verbose=1, workers=8, use_multiprocessing=True, 
                                   max_queue_size=100)

    print(datetime.now().strftime("%H:%M:%S"))
    print(max(pred))
    return pred

In [None]:
predict_test.shape[0]

In [None]:
def chunker(length, chunksize):
    """Split ``range(length)`` into consecutive ``[start, end]`` slice bounds.

    ``end`` is exclusive, so each pair can be used directly as
    ``frame[start:end]``. Every chunk covers ``chunksize`` items except
    possibly the last, which stops at ``length``. Together the chunks cover
    every index exactly once, and no empty chunk is produced.

    Parameters
    ----------
    length : int
        Total number of items to cover.
    chunksize : int
        Maximum number of items per chunk.

    Returns
    -------
    list of [int, int]
        The ``[start, end]`` bounds in ascending order; empty when ``length``
        is 0.
    """
    # Clamp to `length`, not `length - 1`: the bounds are exclusive slice ends,
    # so stopping one short would drop the final item.
    return [[i, min(i + chunksize, length)] for i in range(0, length, chunksize)]

In [None]:
# Run the model chunk by chunk (10000 rows at a time) and collect the
# predictions in a single list.
predictions = []
for start, end in chunker(predict_test.shape[0], 10000):
    print(start, end)
    predictions.extend(batched_predict(start, end))

In [None]:
predict_test['pred'] = predictions

### Old implementation

In [None]:
# node_features[0]

# G = nx.from_pandas_edgelist(predict_test[['source','target','label']], edge_attr="label")

# len(G.nodes)

# for nid, f in zip(node_data.index, node_features):
#     if nid in G.node.keys():
#         G.node[nid][globalvar.TYPE_ATTR_NAME] = "page"  # specify node type
#         G.node[nid]["feature"] = f

# G_predict = sg.StellarGraph(G, node_features="feature")

# batch_size = 1
# num_samples = [20, 10]

# edge_ids_test = [(e[0],e[1]) for e in G_predict.edges()]
# len(edge_ids_test)

# edge_ids_test[0:2]

# predict_gen = GraphSAGELinkGenerator(G_predict,  batch_size, num_samples).flow(edge_ids_test, [0]*len(edge_ids_test))

# type(predict_gen), len(predict_gen.ids)

# pred = model.predict_generator(predict_gen, verbose=1)