In [1]:
import os
import pandas as pd
import numpy as np
import networkx as nx
import itertools

In [2]:
import pickle

In [20]:
# os.listdir('../../data')

['not_found_urls.csv',
 '.DS_Store',
 'content_ids.csv',
 'content_api',
 'processed_network',
 'content_json.csv.gz',
 'processed',
 'content_api_links.csv.gz']

In [21]:
# Root of the data tree: the DATA_DIR environment variable when it is set,
# otherwise the relative default below.
DATA_DIR = os.getenv("DATA_DIR", '../../data')

# Models live in a "models" directory two levels above the current working directory.
cwd_grandparent = os.path.dirname(os.path.dirname(os.getcwd()))
MODELS_DIR = os.path.join(cwd_grandparent, "models")

# Inputs that sit under <DATA_DIR>/content_api.
content_api = os.path.join(DATA_DIR, "content_api")
api_extract_file = os.path.join(content_api, "07-02-19", "content_json.csv.gz")
content_file = os.path.join(content_api, "content.json.gz")
labelled_file = os.path.join(content_api, "labelled.csv.gz")

# Edge list used to build the graph.
edgefile = os.path.join(DATA_DIR, "processed_network", "graphsage_test.csv.gz")

# Pickled, already-fitted text vectorizer.
vectorizer_file = os.path.join(MODELS_DIR, "vectorizer_v3.pickle")

## Load pretrained `tfidfVectorizer`

In [22]:
# Load the fitted vectorizer from disk. The context manager closes the file
# handle as soon as unpickling finishes instead of leaving it to the garbage
# collector.
# NOTE: pickle.load can execute arbitrary code -- only load model files from a
# trusted source.
with open(vectorizer_file, "rb") as vectorizer_handle:
    vectorizer = pickle.load(vectorizer_handle)

In [23]:
# Smoke test: the loaded vectorizer should turn one sentence into a single
# sparse feature row.
vectorizer.transform(["this is a test sentence"])

<1x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

## Load data

In [24]:
# Read the gzip-compressed labelled-content CSV into a DataFrame.
# NOTE(review): the saved output of this cell is the tail of a warning; if it
# is pandas' mixed-dtype warning, passing explicit dtypes would silence it --
# confirm before changing.
labelled = pd.read_csv(labelled_file, compression="gzip")

  interactivity=interactivity, compiler=compiler, result=result)


In [25]:
labelled.columns

Index(['base_path', 'content_id', 'description', 'document_type',
       'first_published_at', 'locale', 'primary_publishing_organisation',
       'publishing_app', 'title', 'body', 'combined_text', 'taxon_id',
       'taxon_base_path', 'taxon_name', 'level1taxon', 'level2taxon',
       'level3taxon', 'level4taxon', 'level5taxon'],
      dtype='object')

In [26]:
# Size of the whole table next to the size of the subset whose publishing_app
# is "publisher".
labelled.shape, labelled[labelled.publishing_app=="publisher"].shape

((302474, 19), (3563, 19))

In [27]:
edges = pd.read_csv(edgefile, compression='gzip', sep='\t')

In [28]:
edges.head()

Unnamed: 0,source,target,weight
0,/government/publications/guidance-for-dependan...,/visa-fees,24
1,/visa-fees,/find-a-visa-application-centre,433
2,/entering-staying-uk/family-visas,/uk-family-visa,148
3,/uk-family-visa,/uk-family-visa/partner-spouse,10863
4,/uk-family-visa/partner-spouse,/government/publications/application-for-uk-vi...,1037


In [29]:
# Give every distinct path in the edge list a consecutive integer id, numbered
# in order of first appearance (source before target within each row).
node_id = {}
counter = 0
for path in itertools.chain.from_iterable(zip(edges.source.values, edges.target.values)):
    if path in node_id:
        continue
    node_id[path] = counter
    counter += 1

# Attach the integer ids of both endpoints to every edge.
edges['source_id'] = edges.source.map(lambda path: int(node_id[path]))
edges['target_id'] = edges.target.map(lambda path: int(node_id[path]))

### Map edge pairs and ids to original base_path/content_id

In [30]:
api_extrach = pd.read_csv(api_extract_file, compression="gzip")

In [31]:
# Keep only the API rows that carry a content_id.
api_extrach = api_extrach.dropna(subset=['content_id'])

In [32]:
url_id = dict(zip(api_extrach.url,api_extrach.content_id))

In [33]:
# Translate each edge endpoint to its content_id; paths with no entry in
# url_id become NaN.
edges['source_cid'] = edges.source.map(lambda path: url_id.get(path, np.nan))
edges['target_cid'] = edges.target.map(lambda path: url_id.get(path, np.nan))

In [34]:
edges.head()

Unnamed: 0,source,target,weight,source_id,target_id,source_cid,target_cid
0,/government/publications/guidance-for-dependan...,/visa-fees,24,0,1,5ef7560d-7631-11e4-a3cb-005056011aef,1e333395-5dd5-4452-96a3-fbe939928761
1,/visa-fees,/find-a-visa-application-centre,433,1,2,1e333395-5dd5-4452-96a3-fbe939928761,aa055bd6-fde1-4471-8ac2-ac86241b4898
2,/entering-staying-uk/family-visas,/uk-family-visa,148,3,4,d612c61e-22f4-4922-8bb2-b04b9202126e,305acc88-488e-44ec-8e5a-1a5de7819ba9
3,/uk-family-visa,/uk-family-visa/partner-spouse,10863,4,5,305acc88-488e-44ec-8e5a-1a5de7819ba9,305acc88-488e-44ec-8e5a-1a5de7819ba9
4,/uk-family-visa/partner-spouse,/government/publications/application-for-uk-vi...,1037,5,6,305acc88-488e-44ec-8e5a-1a5de7819ba9,5ef421be-7631-11e4-a3cb-005056011aef


In [35]:
def count_missing(ids, known_ids=None):
    """Report which of ``ids`` are absent from a collection of known ids.

    Prints a one-line ``included: N missing: M`` summary and returns the
    absent ids.

    Parameters
    ----------
    ids : iterable
        Ids to check. It is materialised once, so a one-shot iterator works.
    known_ids : iterable, optional
        Ids to check against. When omitted, the ``content_id`` column of the
        notebook-level ``labelled`` DataFrame is used, which is what the
        single-argument form has always done.

    Returns
    -------
    list
        The members of ``ids`` not found in ``known_ids``, in the order they
        were encountered.
    """
    if known_ids is None:
        known_ids = labelled.content_id.values
    known = set(known_ids)

    candidates = list(ids)
    missing_list = [candidate for candidate in candidates if candidate not in known]

    print("included: {} missing: {}".format(len(candidates) - len(missing_list),
                                            len(missing_list)))
    return missing_list

In [36]:
# Check the distinct source and target content ids against the labelled data.
# Endpoints that had no content_id were mapped to NaN earlier, so NaN can show
# up among the missing ids.
missing_list1 = count_missing(set(edges.source_cid.values))
missing_list2 = count_missing(set(edges.target_cid.values))

included: 20504 missing: 21225
included: 22577 missing: 23927


In [37]:
missing_list1[0:10]

[nan,
 '300a48c3-9cac-5a9f-bf56-97c8971d869d',
 'b24fca20-4c63-5042-bfc9-faddf1f1d95c',
 '9fa4215f-1034-485d-b5d4-6cde152509cc',
 'ea379f11-df97-416a-b355-7b03b4bb479a',
 'e4491ccf-20c1-4950-a043-579c8cbba644',
 '95a1d0dd-aad3-4c85-a117-1b52f05ea4dd',
 '4fea23c6-5052-55bc-845d-d13ea20c501e',
 '3df8e25b-a46d-5ea0-8932-2ca0a50d3950',
 'f4e9e92d-9192-4e17-90c6-553339bc04c3']

In [38]:
# "5ef7560d-7631-11e4-a3cb-005056011aef" in labelled.content_id.values
# labelled[labelled.content_id=="5ef7560d-7631-11e4-a3cb-005056011aef"]

In [43]:
# df_main = labelled[labelled.publishing_app=="publisher"].copy(deep=True)

In [39]:
# Distinct content ids of the rows whose publishing_app is "publisher".
mainstream = set(labelled[labelled.publishing_app=="publisher"].content_id.values)

In [40]:
len(mainstream)

2239

In [42]:
"{} out of {} mainstream cids in edgelist".format(len(mainstream.intersection(set(edges.source_cid.values))),
                                                  len(mainstream))

'1881 out of 2239 mainstream cids in edgelist'

## Read in top GOV.UK pages for February 2019
Set up a subsample to predict for

In [69]:
top = pd.read_csv(os.path.join(DATA_DIR, "top-pages-govuk-feb19.tsv"), sep='\t', usecols=['Page', 'Page Views'])

In [71]:
# Remove the integration-environment host from each Page value, leaving only
# what follows it.
top['Page'] = top['Page'].map(lambda x: x.replace("https://www.integration.publishing.service.gov.uk",""))

#### Check which top pages are included in the `labelled` dataset.

In [76]:
# Keep only the top pages whose path occurs as a base_path in `labelled`.
# .copy() makes `top` an independent frame, so assigning new columns to it
# afterwards does not trigger pandas' SettingWithCopyWarning.
top = top[top.Page.isin(labelled.base_path.values)].copy()

In [77]:
top.head()

Unnamed: 0,Page,Page Views
2,/sign-in-universal-credit,8221833
3,/log-in-register-hmrc-online-services,6564918
4,/sold-bought-vehicle,4722768
5,/vehicle-tax,4670761
6,/check-mot-history,4666636


## Initialize generator data

In [61]:
labelled_cid = set(labelled.content_id.values)
len(labelled_cid)

206038

In [80]:
# Two-way lookups between content_id and base_path.
# dict() keeps the last value seen for a repeated key, so when a key occurs on
# several rows of `labelled` the last such row wins.
cid_base = dict(zip(labelled.content_id, labelled.base_path))
base_cid = dict(zip(labelled.base_path, labelled.content_id))

In [79]:
# cid_pairs = []
# basepath_pairs = []
# for v,w in itertools.product([list(mainstream)[1]],labelled_cid):
#     if v!=w:
#         cid_pairs.append((v,w))
#         basepath_pairs.append((cid_base[v], cid_base[w]))
# len(cid_pairs), len(basepath_pairs)

### Set up dataframe with top 50 pages

In [90]:
### top base_paths to content_ids
top['content_id'] = top['Page'].map(base_cid)
# NOTE(review): despite the name, nothing here truncates to 50 entries -- every
# row of `top` is kept, ordered by descending page views. Confirm that is the
# intended selection.
top_50_ids = top.sort_values("Page Views", ascending=False)['content_id'].values

In [91]:
# Pair every top-page content id with every labelled content id, skipping
# pairs of an id with itself.
cid_pairs = [
    (source_cid, target_cid)
    for source_cid, target_cid in itertools.product(top_50_ids, labelled_cid)
    if source_cid != target_cid
]
# The same pairs, in the same order, expressed as base paths.
basepath_pairs = [
    (cid_base[source_cid], cid_base[target_cid])
    for source_cid, target_cid in cid_pairs
]
len(cid_pairs), len(basepath_pairs)

(28639143, 28639143)

### Set up test set for prediction, top pages against the entirety of `labelled`

In [92]:
# One row per (source, target) pair: the content-id columns come from
# cid_pairs and the base-path columns from basepath_pairs, which are aligned
# position by position.
cid_columns = pd.DataFrame(cid_pairs, columns=['source_cid', 'target_cid'])
bp_columns = pd.DataFrame(basepath_pairs, columns=['source_bp', 'target_bp'])
predict_test = pd.concat([cid_columns, bp_columns], axis=1)

In [93]:
predict_test.head()

Unnamed: 0,source_cid,target_cid,source_bp,target_bp
0,7ae28e91-b918-4b9a-898c-fcdee95a83d0,5d36d60b-7631-11e4-a3cb-005056011aef,/sign-in-childcare-account,/government/publications/caribbean-regional-pr...
1,7ae28e91-b918-4b9a-898c-fcdee95a83d0,5dc410b9-7631-11e4-a3cb-005056011aef,/sign-in-childcare-account,/government/publications/adult-cancers-near-ov...
2,7ae28e91-b918-4b9a-898c-fcdee95a83d0,6057480d-7631-11e4-a3cb-005056011aef,/sign-in-childcare-account,/government/news/shanghai-creativity-is-great-...
3,7ae28e91-b918-4b9a-898c-fcdee95a83d0,5e11f679-7631-11e4-a3cb-005056011aef,/sign-in-childcare-account,/government/news/maria-miller-calls-on-busines...
4,7ae28e91-b918-4b9a-898c-fcdee95a83d0,3f491408-350b-4b98-aa8c-916b9df0b01e,/sign-in-childcare-account,/government/publications/cm0-7dz-g-b-finch-lim...


#### *TODO:* Base node_ids on base_paths, not content_id

In [94]:
# Continue numbering after the ids already handed out, so base paths that only
# occur in the prediction pairs receive fresh node ids.
max_node_id = max(node_id.values()) + 1
pair_endpoints = itertools.chain.from_iterable(
    zip(predict_test.source_bp.values, predict_test.target_bp.values)
)
for base_path in pair_endpoints:
    if base_path in node_id:
        continue
    node_id[base_path] = max_node_id
    max_node_id += 1

In [95]:
predict_test['source'] = predict_test['source_bp'].map(node_id)
predict_test['target'] = predict_test['target_bp'].map(node_id)

In [96]:
def _text_or_blank(value):
    """Return ``value`` as text, or an empty string when it is missing (NaN/None)."""
    return "" if pd.isnull(value) else str(value)


# node id -> "<title> <description>" text for every labelled row whose
# base_path has a node id.
text_dict = {}
for row in labelled.itertuples():
    if row.base_path in node_id:
        # Empty CSV fields are read back by pandas as NaN; formatting NaN
        # directly would put the literal token "nan" into the text that gets
        # vectorised, so missing fields are blanked first.
        text_dict[node_id[row.base_path]] = "{} {}".format(
            _text_or_blank(row.title), _text_or_blank(row.description)
        ).rstrip()

# Split into parallel lists so row i of X belongs to node id index[i].
text_list = list(text_dict.items())
text = [value for _, value in text_list]
index = [key for key, _ in text_list]

X = vectorizer.transform(text)
X.shape[0]

206038

In [97]:
text_dict[412]

'get information about property and land how to search for information about property and land in england and wales - find out who owns it how much was paid for it how to get a copy of the deeds and how to check the property boundaries'

In [98]:
labelled[labelled.content_id=='0f2e8c41-78fa-40f9-9eea-857c07bacd80'][['content_id', 'title','description']]

Unnamed: 0,content_id,title,description
296378,0f2e8c41-78fa-40f9-9eea-857c07bacd80,get information about property and land,how to search for information about property a...
299770,0f2e8c41-78fa-40f9-9eea-857c07bacd80,get information about property and land,how to search for information about property a...


### Initialize `node_data`

In [99]:
# Dense feature table: one row per node id (the index), one column per
# vectorizer feature. toarray() yields a plain ndarray rather than the
# np.matrix that todense() returns; the resulting frame is the same.
node_data = pd.DataFrame(X.toarray(), index=index)
node_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
268029,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
198002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
157685,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
263649,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.68613,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
166164,0.0,0.0,0.0,0.0,0.0,0.320345,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [100]:
predict_test['label'] = "go_to"

In [101]:
predict_test.shape, node_data.shape

((28639143, 7), (206038, 2000))

## Save to file

In [103]:
predict_file = os.path.join(DATA_DIR, "predict_top50_vs_all.csv.gz")
node_data_file = os.path.join(DATA_DIR, "node_data_labelled_tfidf_2K.csv.gz")

In [None]:
# Write both tables as gzip-compressed CSV. predict_test is saved without its
# index; node_data keeps its index, which holds the node ids.
predict_test.to_csv(predict_file, index=False, compression="gzip")
node_data.to_csv(node_data_file, compression="gzip")

In [None]:
# Read the saved node features back as a sanity check. index_col=0 restores the
# node ids as the index; without it they come back as an extra unnamed column.
n2 = pd.read_csv(node_data_file, index_col=0)
n2.head()