In [1]:
import os
import pandas as pd
import numpy as np
import networkx as nx
import stellargraph as sg

In [18]:
from stellargraph.layer.graphsage import MeanAggregator

In [29]:
import pickle

In [15]:
from keras.models import load_model

Using TensorFlow backend.


## Load data

In [54]:
# --- Locations of input data and pretrained artefacts ------------------------
# DATA_DIR must be supplied through the environment. os.getenv returns None when
# the variable is unset, which would otherwise only surface as an opaque
# TypeError inside os.path.join below, so fail early with a clear message.
DATA_DIR = os.getenv("DATA_DIR")
if DATA_DIR is None:
    raise EnvironmentError("DATA_DIR environment variable is not set")

# The models directory sits two levels above the current working directory.
MODELS_DIR = os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), "models")
content_api = os.path.join(DATA_DIR, "content_api")

# Input files (all gzip-compressed).
api_extract_file = os.path.join(content_api,"07-02-19", "content_json.csv.gz")
content_file = os.path.join(content_api,"content.json.gz")
labelled_file = os.path.join(content_api,"labelled.csv.gz")
edgefile = os.path.join(DATA_DIR, "processed_network", "graphsage_test.csv.gz")

# Pretrained artefacts loaded further down the notebook.
model_file = os.path.join(MODELS_DIR, "graphsage.h5")
vectorizer_file = os.path.join(MODELS_DIR, "vectorizer.pickle")

In [21]:
# content = pd.read_json(os.path.join(content_api,'content.json.gz'),compression='gzip')

In [None]:
# content.head()

In [44]:
# Load the labelled content extract (gzip-compressed CSV).
# NOTE(review): the output fragment under this cell looks like the tail of a
# pandas DtypeWarning (mixed-type columns) — consider passing explicit dtype=
# or low_memory=False; confirm against the full warning text.
labelled = pd.read_csv(labelled_file, compression="gzip")

  interactivity=interactivity, compiler=compiler, result=result)


In [46]:
labelled.columns

Index(['base_path', 'content_id', 'description', 'document_type',
       'first_published_at', 'locale', 'primary_publishing_organisation',
       'publishing_app', 'title', 'body', 'combined_text', 'taxon_id',
       'taxon_base_path', 'taxon_name', 'level1taxon', 'level2taxon',
       'level3taxon', 'level4taxon', 'level5taxon'],
      dtype='object')

In [48]:
labelled.shape, labelled[labelled.publishing_app=="publisher"].shape

((302474, 19), (3563, 19))

In [51]:
# Load the tab-separated, gzip-compressed edge list (source, target, weight).
# The file's first column has no header and is read in as `Unnamed: 0`
# (presumably a saved index — TODO confirm; it is not used below).
edges = pd.read_csv(edgefile, compression='gzip', sep='\t')

In [52]:
edges.head()

Unnamed: 0,source,target,weight
0,/government/publications/guidance-for-dependan...,/visa-fees,24
1,/visa-fees,/find-a-visa-application-centre,433
2,/entering-staying-uk/family-visas,/uk-family-visa,148
3,/uk-family-visa,/uk-family-visa/partner-spouse,10863
4,/uk-family-visa/partner-spouse,/government/publications/application-for-uk-vi...,1037


In [53]:
# Give every node path a consecutive integer id in order of first appearance,
# visiting each edge's source before its target.
node_id = {}
for endpoints in zip(edges.source.values, edges.target.values):
    for path in endpoints:
        # setdefault leaves the id of an already-seen path untouched
        node_id.setdefault(path, len(node_id))
# Number of distinct nodes seen (the next id that would be handed out).
counter = len(node_id)

# Attach the integer ids to both ends of every edge.
edges['source_id'] = edges.source.map(lambda path: int(node_id[path]))
edges['target_id'] = edges.target.map(lambda path: int(node_id[path]))

### Map edge pairs and ids to original base_path/content_id

In [55]:
api_extrach = pd.read_csv(api_extract_file, compression="gzip")

In [56]:
# Drop rows that have no content_id. This mutates `api_extrach` in place, so
# the frame no longer matches the raw file after this cell has run.
api_extrach.dropna(subset=['content_id'],inplace=True)

In [57]:
# Lookup table: url -> content_id. If a url occurs in more than one row, the
# last occurrence wins (later dict entries overwrite earlier ones).
url_id = dict(zip(api_extrach.url,api_extrach.content_id))

In [60]:
# Translate each endpoint url into its content_id; urls that are absent from
# `url_id` become NaN.
edges['source_cid'] = edges.source.map(lambda url: url_id.get(url, np.nan))
edges['target_cid'] = edges.target.map(lambda url: url_id.get(url, np.nan))

In [61]:
edges.head()

Unnamed: 0,source,target,weight,source_id,target_id,source_cid,target_cid
0,/government/publications/guidance-for-dependan...,/visa-fees,24,0,1,5ef7560d-7631-11e4-a3cb-005056011aef,1e333395-5dd5-4452-96a3-fbe939928761
1,/visa-fees,/find-a-visa-application-centre,433,1,2,1e333395-5dd5-4452-96a3-fbe939928761,aa055bd6-fde1-4471-8ac2-ac86241b4898
2,/entering-staying-uk/family-visas,/uk-family-visa,148,3,4,d612c61e-22f4-4922-8bb2-b04b9202126e,305acc88-488e-44ec-8e5a-1a5de7819ba9
3,/uk-family-visa,/uk-family-visa/partner-spouse,10863,4,5,305acc88-488e-44ec-8e5a-1a5de7819ba9,305acc88-488e-44ec-8e5a-1a5de7819ba9
4,/uk-family-visa/partner-spouse,/government/publications/application-for-uk-vi...,1037,5,6,305acc88-488e-44ec-8e5a-1a5de7819ba9,5ef421be-7631-11e4-a3cb-005056011aef


In [93]:
def count_missing(ids, known_ids=None):
    """Report which of `ids` are absent from a reference collection of ids.

    Prints a one-line tally ("included: X missing: Y") and returns the ids
    that were not found, in the order they were encountered.

    Parameters
    ----------
    ids : iterable
        Ids to check. It is materialised once, so one-shot iterators work.
    known_ids : iterable, optional
        Reference ids to check against. When omitted, the notebook-level
        `labelled.content_id` values are used (the original behaviour).

    Returns
    -------
    list
        Every element of `ids` that is not in the reference collection.
        No special-casing is applied: a placeholder such as NaN is reported
        as missing unless it is itself present in `known_ids`.
    """
    if known_ids is None:
        # Fall back to the labelled frame defined earlier in the notebook.
        known_ids = labelled.content_id.values
    # Build the set once so each membership test is O(1).
    known = set(known_ids)

    ids = list(ids)
    missing_list = [candidate for candidate in ids if candidate not in known]

    print("included: {} missing: {}".format(len(ids) - len(missing_list), len(missing_list)))
    return missing_list

In [94]:
# Check which distinct source / target content ids do not occur in
# `labelled.content_id`. Each call prints its included/missing tally and
# returns the list of ids that were not found.
# Note: urls without a content_id were mapped to NaN above, so NaN can appear
# among the "missing" ids.
missing_list1 = count_missing(set(edges.source_cid.values))
missing_list2 = count_missing(set(edges.target_cid.values))

included: 20504 missing: 21225
included: 22577 missing: 23927


In [95]:
missing_list1[0:10]

[nan,
 '90d0f130-6165-5a40-8f12-004674c157b6',
 'ccd89a28-9e6c-402d-b76e-4f876b475efb',
 '5f4a9e83-7631-11e4-a3cb-005056011aef',
 '569a9ee5-c195-4b7f-b9dc-edc17a09113f',
 '5afc3c2c-2ee4-5b0f-a374-88a38b3f36a6',
 '5f5a18d7-7631-11e4-a3cb-005056011aef',
 'afd91d8d-d570-4900-8fad-7a377cc97a1a',
 '194ee00d-9fc7-439e-b9d1-832aa57e4076',
 '391bd20f-5cb9-400c-9754-5c0214f53b2d']

In [None]:
# "5ef7560d-7631-11e4-a3cb-005056011aef" in labelled.content_id.values
# labelled[labelled.content_id=="5ef7560d-7631-11e4-a3cb-005056011aef"]

In [101]:
# Distinct content ids of the rows whose publishing_app is "publisher".
mainstream = set(labelled.loc[labelled.publishing_app == "publisher", "content_id"].values)

In [103]:
len(mainstream)

2239

In [105]:
"{} out of {} mainstream cids in edgelist".format(len(mainstream.intersection(set(edges.source_cid.values))),
                                                  len(mainstream))

'1881 out of 2239 mainstream cids in edgelist'

## Load pretrained `graphSAGE` and `tfidfVectorizer`

In [24]:
# Load the saved Keras model. `custom_objects` maps the name 'MeanAggregator'
# to the stellargraph layer class so the loader can resolve it
# (presumably the saved graph references that layer — confirm with model.summary()).
model = load_model(model_file, custom_objects={'MeanAggregator': MeanAggregator})

In [38]:
# model.summary()

In [42]:
# Load the pretrained vectorizer. The context manager guarantees the file
# handle is closed once unpickling finishes (the bare open() left it dangling).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load vectorizer files that come from a trusted source.
with open(vectorizer_file, "rb") as vectorizer_fh:
    vectorizer = pickle.load(vectorizer_fh)

In [43]:
vectorizer.transform(["this is a test sentence"])

<1x1500 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

## Initialize generator data

In [106]:
# Independent (deep) copy of the rows whose publishing_app is "publisher", so
# later edits to df_main cannot affect `labelled`.
df_main = labelled[labelled.publishing_app=="publisher"].copy(deep=True)

In [116]:
labelled_cid = set(labelled.content_id.values)

In [109]:
import itertools

In [120]:
# for v,w in itertools.product([1],[1,2,3]):
#     print(v,w)

1 1
1 2
1 3


In [122]:
len(labelled_cid)

206038

In [121]:
# Pair a single mainstream content id with every *other* labelled content id.
# NOTE: `list(mainstream)[0]` picks whichever element the set yields first;
# set iteration order is arbitrary, so the chosen id may differ between runs.
cid_pairs = [
    (src_cid, dst_cid)
    for src_cid, dst_cid in itertools.product([list(mainstream)[0]], labelled_cid)
    if src_cid != dst_cid
]
len(cid_pairs)

206037

In [125]:
# Tabulate the candidate (source, target) content-id pairs, one pair per row.
predict_test = pd.DataFrame(cid_pairs, columns=['source_cid', 'target_cid'])

In [127]:
edges.head()

Unnamed: 0,source,target,weight,source_id,target_id,source_cid,target_cid
0,/government/publications/guidance-for-dependan...,/visa-fees,24,0,1,5ef7560d-7631-11e4-a3cb-005056011aef,1e333395-5dd5-4452-96a3-fbe939928761
1,/visa-fees,/find-a-visa-application-centre,433,1,2,1e333395-5dd5-4452-96a3-fbe939928761,aa055bd6-fde1-4471-8ac2-ac86241b4898
2,/entering-staying-uk/family-visas,/uk-family-visa,148,3,4,d612c61e-22f4-4922-8bb2-b04b9202126e,305acc88-488e-44ec-8e5a-1a5de7819ba9
3,/uk-family-visa,/uk-family-visa/partner-spouse,10863,4,5,305acc88-488e-44ec-8e5a-1a5de7819ba9,305acc88-488e-44ec-8e5a-1a5de7819ba9
4,/uk-family-visa/partner-spouse,/government/publications/application-for-uk-vi...,1037,5,6,305acc88-488e-44ec-8e5a-1a5de7819ba9,5ef421be-7631-11e4-a3cb-005056011aef


In [37]:
# FIXME: this call fails as written — predict_generator() requires a
# `generator` argument (see the TypeError in this cell's output), and no
# generator has been built in the cells shown here.
model.predict_generator()

TypeError: predict_generator() missing 1 required positional argument: 'generator'