In [97]:
import os
import pandas as pd
import numpy as np
import gzip
import ijson
from ast import literal_eval
import networkx as nx
import itertools
import re

## Directories

In [2]:
DATA_DIR = os.getenv("DATA_DIR")
PREDICT_DIR = os.path.join(DATA_DIR, "predict_network")
content_api = os.path.join(DATA_DIR, "content_api")
sub_content_api = os.path.join(content_api,"all_dates")
# api_links_file = os.path.join(content_api,"07-02-19", "content_api_links.csv.gz")
labelled_file = os.path.join(content_api,"labelled.csv.gz")

## Initialized labelled/content

In [3]:
labelled = pd.read_csv(labelled_file, compression="gzip", usecols=['base_path', 'content_id', 'title'])

In [4]:
labelled.drop_duplicates("content_id", inplace=True)

In [5]:
len(labelled.base_path)

206038

In [6]:
labelled[labelled.base_path=="/sign-in-universal-credit"]

Unnamed: 0,base_path,content_id,title
272249,/sign-in-universal-credit,b220a437-0d51-4390-9993-63345d0c83ad,sign in to your universal credit account


## Read in links data

In [187]:
api_links_files = [os.path.join(sub_content_api, f) for f in os.listdir(sub_content_api) if "api_links" in f]
api_links_files

['/Users/felisialoukou/Documents/govuk-network-embedding/data/content_api/all_dates/missing_urls_240219_content_api_links.csv.gz',
 '/Users/felisialoukou/Documents/govuk-network-embedding/data/content_api/all_dates/labelled2_content_api_links.csv.gz',
 '/Users/felisialoukou/Documents/govuk-network-embedding/data/content_api/all_dates/pt01of9_content_api_links.csv.gz',
 '/Users/felisialoukou/Documents/govuk-network-embedding/data/content_api/all_dates/content_api_links07.csv.gz',
 '/Users/felisialoukou/Documents/govuk-network-embedding/data/content_api/all_dates/content_api_links_remote.csv.gz',
 '/Users/felisialoukou/Documents/govuk-network-embedding/data/content_api/all_dates/content_api_links_22.csv.gz',
 '/Users/felisialoukou/Documents/govuk-network-embedding/data/content_api/all_dates/labelled1_content_api_links.csv.gz',
 '/Users/felisialoukou/Documents/govuk-network-embedding/data/content_api/all_dates/pt02of9_content_api_links.csv.gz',
 '/Users/felisialoukou/Documents/govuk-netwo

In [188]:
links = pd.concat([pd.read_csv(f, compression="gzip") for f in api_links_files], ignore_index=True)
links.drop_duplicates("url", inplace=True)
links.shape, links.columns

((231145, 7),
 Index(['url', 'embedded_links', 'related_links', 'collection_links', 'num_rel',
        'num_emb', 'num_coll'],
       dtype='object'))

In [189]:
missing_urls = set(labelled.base_path).difference(set(links.url))
print(len(missing_urls))

4


In [190]:
with gzip.open(os.path.join(sub_content_api, "missing_urls_250219.csv.gz"), "wt") as w:
    w.write("Node\n")
    for missing in missing_urls:
        w.write("{}\n".format(missing))

In [191]:
link_types = ['embedded_links',  'collection_links', 'related_links']

def lit_eval(x):
    return [l.strip() for l in re.sub("\[|\]|'","",x).split(",")]

for ltype in link_types:
    print(ltype)
    links[ltype] = links[ltype].map(lambda x: lit_eval(x))

embedded_links
collection_links
related_links


In [192]:
links[links.url=="/sign-in-universal-credit"].related_links.values[0][1]

'/apply-universal-credit'

In [193]:
links.columns.get_loc("embedded_links")

1

In [194]:
adj_list = []
for tup in links.itertuples(index=False):
    for ltype in link_types:
        tup_ind = links.columns.get_loc(ltype)
        if len(tup[tup_ind]) > 0:
            for dest in tup[tup_ind]:
#                 if dest in labelled.base_path.values:
                adj_list.append((tup.url, dest, ltype))   

In [195]:
# ((231145, 7), 887717)
links.shape, len(adj_list)

((231145, 7), 887717)

In [196]:
graph = nx.DiGraph()

In [197]:
for s,d,t in adj_list:
    graph.add_edge(s,d,link_type=t)    

In [198]:
nx.info(graph)

'Name: \nType: DiGraph\nNumber of nodes: 437808\nNumber of edges: 512879\nAverage in degree:   1.1715\nAverage out degree:   1.1715'

In [199]:
base_cid = dict(zip(labelled.base_path, labelled.content_id))

In [200]:
rowlist = []
for s,d,t in adj_list:
    row = {'src_node':s,
    'dest_node': d,
    'link_type': t,
    'src_cid': base_cid[s] if s in base_cid.keys() else np.nan,
    'dest_cid': base_cid[d] if d in base_cid.keys() else np.nan}
    rowlist.append(row)

In [201]:
df_structural = pd.DataFrame(rowlist)

In [202]:
df_structural.shape, df_structural.dropna(subset=["dest_cid","src_cid"]).shape

((887717, 5), (62660, 5))

In [203]:
structural_file = os.path.join(content_api, "structural_training_data_no_node_id.csv.gz")
df_struct_filter = df_structural.dropna(subset=["dest_cid","src_cid"]).copy(deep=True)
df_struct_filter.to_csv(structural_file, index=False, compression="gzip")

## Merge structural with functional data

In [204]:
edgefile = os.path.join(DATA_DIR, "processed_network", "edges_graphsagetest_feb_01_18_doo_min15weight_wtext.csv.gz")
edges = pd.read_csv(edgefile, compression="gzip")
edges.head()

Unnamed: 0,src_node,dest_node,weight,src_cid,dest_cid,source,target
0,/government/publications/guidance-for-dependan...,/visa-fees,66,5ef7560d-7631-11e4-a3cb-005056011aef,1e333395-5dd5-4452-96a3-fbe939928761,33265,58314
1,/visa-fees,/find-a-visa-application-centre,1164,1e333395-5dd5-4452-96a3-fbe939928761,aa055bd6-fde1-4471-8ac2-ac86241b4898,58314,12343
2,/entering-staying-uk/family-visas,/uk-family-visa,377,d612c61e-22f4-4922-8bb2-b04b9202126e,305acc88-488e-44ec-8e5a-1a5de7819ba9,23720,25053
3,/uk-family-visa,/uk-family-visa/partner-spouse,26227,305acc88-488e-44ec-8e5a-1a5de7819ba9,305acc88-488e-44ec-8e5a-1a5de7819ba9,25053,14044
4,/uk-family-visa/partner-spouse,/government/publications/application-for-uk-vi...,2733,305acc88-488e-44ec-8e5a-1a5de7819ba9,5ef421be-7631-11e4-a3cb-005056011aef,14044,33797


In [205]:
node_id = dict(zip(edges.src_node, edges.source))
for node,nid in zip(edges.dest_node, edges.target):
    if node not in node_id.keys():
        node_id[node] = nid

In [206]:
df_struct_filter['source'] = df_struct_filter['src_node'].map(lambda x: node_id[x] if x in node_id.keys() else -1)
df_struct_filter['target'] = df_struct_filter['dest_node'].map(lambda x: node_id[x] if x in node_id.keys() else -1)

In [207]:
df_struct_filter.head(2)

Unnamed: 0,dest_cid,dest_node,link_type,src_cid,src_node,source,target
194,5ef78617-7631-11e4-a3cb-005056011aef,/government/publications/transxchange-examples...,collection_links,5ef7544d-7631-11e4-a3cb-005056011aef,/government/collections/transxchange,-1,-1
195,5ef78888-7631-11e4-a3cb-005056011aef,/government/publications/transxchange-overview,collection_links,5ef7544d-7631-11e4-a3cb-005056011aef,/government/collections/transxchange,-1,-1


In [208]:
df_struct_filter.shape

(62660, 7)

In [209]:
# df_struct_filter[df_struct_filter.source==-1].src_node
# df_struct_filter[df_struct_filter.target==-1].dest_node
# df_struct_filter.dest_node
def compute_coverage(src_node, dest_node, miss_src, miss_dest):
    ### Number of unique nodes
    unique_nodes = set(list(src_node) + list(dest_node))
    print(len(unique_nodes))
    missing_nodes = list(set(miss_src))
    missing_nodes.extend(list(set(miss_dest)))
    print(len(missing_nodes))
    print("\n {} nodes in funct, but not in struct".format(len(set(node_id.keys()).difference(unique_nodes))),"\n",
    "{} nodes both in funct and struct data".format(len(set(node_id.keys()).intersection(unique_nodes))),"\n",
    "{} nodes in struct, but not in funct".format(len(set(unique_nodes).difference(node_id.keys()))))

In [210]:
compute_coverage(df_struct_filter.src_node, 
                 df_struct_filter.dest_node, 
                 df_struct_filter[df_struct_filter.source==-1].src_node, 
                 df_struct_filter[df_struct_filter.target==-1].dest_node)

52323
41416

 55541 nodes in funct, but not in struct 
 11078 nodes both in funct and struct data 
 41245 nodes in struct, but not in funct


     55541 nodes in funct, but not in struct 
     11078 nodes both in funct and struct data 
     41245 nodes in struct, but not in funct

In [211]:
max_node = max(node_id.values())+1
for node in missing_nodes:
    if node not in node_id.keys():
        node_id[node] = max_node
        max_node+=1

In [212]:
df_struct_filter['source'] = df_struct_filter['src_node'].map(lambda x: node_id[x] if x in node_id.keys() else -1)
df_struct_filter['target'] = df_struct_filter['dest_node'].map(lambda x: node_id[x] if x in node_id.keys() else -1)

In [214]:
compute_coverage(df_struct_filter.src_node, 
                 df_struct_filter.dest_node, 
                 df_struct_filter[df_struct_filter.source==-1].src_node, 
                 df_struct_filter[df_struct_filter.target==-1].dest_node)

52323
0

 55541 nodes in funct, but not in struct 
 52323 nodes both in funct and struct data 
 0 nodes in struct, but not in funct


In [216]:
df_struct_filter.shape

(62660, 7)

In [220]:
merged_edges = pd.concat([edges, df_struct_filter.drop('link_type', axis=1)], ignore_index=True, sort=True)

In [223]:
edges.shape, merged_edges.shape

((213446, 7), (276106, 7))

In [None]:
### Figure out whicj

In [227]:
merged_edges['edge-id'] = merged_edges[['source','target']].apply(lambda x: "{}-{}".format(x[0],x[1]), axis=1)
print(merged_edges.drop_duplicates("edge-id").shape)
ids = merged_edges['edge-id']
merged_edges[ids.isin(ids[ids.duplicated()])].sort_values("edge-id")

(263756, 8)


Unnamed: 0,dest_cid,dest_node,source,src_cid,src_node,target,weight,edge-id
125396,5f20dfc0-7631-11e4-a3cb-005056011aef,/guidance/comply-with-the-compulsory-beef-labe...,10016,5f4a692c-7631-11e4-a3cb-005056011aef,/government/collections/beef-and-veal-labellin...,46581,32.0,10016-46581
268771,5f20dfc0-7631-11e4-a3cb-005056011aef,/guidance/comply-with-the-compulsory-beef-labe...,10016,5f4a692c-7631-11e4-a3cb-005056011aef,/government/collections/beef-and-veal-labellin...,46581,,10016-46581
133206,0f82316f-4fd5-45f9-b992-c14ad7504f25,/government/publications/visit-guidance,10039,afbc5dfd-68c0-4eba-ae41-ee7993ec2e1f,/government/collections/visit-visas,16460,66.0,10039-16460
265252,0f82316f-4fd5-45f9-b992-c14ad7504f25,/government/publications/visit-guidance,10039,afbc5dfd-68c0-4eba-ae41-ee7993ec2e1f,/government/collections/visit-visas,16460,,10039-16460
265251,2aa74b54-b119-4dd7-8a75-ddbb030458e8,/government/publications/considering-human-rig...,10039,afbc5dfd-68c0-4eba-ae41-ee7993ec2e1f,/government/collections/visit-visas,61143,,10039-61143
200859,2aa74b54-b119-4dd7-8a75-ddbb030458e8,/government/publications/considering-human-rig...,10039,afbc5dfd-68c0-4eba-ae41-ee7993ec2e1f,/government/collections/visit-visas,61143,15.0,10039-61143
242475,48c0c30d-5f04-4095-8804-d1ee8989761d,/what-different-qualification-levels-mean,10059,a2dde07e-6552-43d3-8c17-22602c38a398,/find-a-regulated-qualification,10185,,10059-10185
242474,48c0c30d-5f04-4095-8804-d1ee8989761d,/what-different-qualification-levels-mean,10059,a2dde07e-6552-43d3-8c17-22602c38a398,/find-a-regulated-qualification,10185,,10059-10185
46092,48c0c30d-5f04-4095-8804-d1ee8989761d,/what-different-qualification-levels-mean,10059,a2dde07e-6552-43d3-8c17-22602c38a398,/find-a-regulated-qualification,10185,432.0,10059-10185
237727,1d5c1538-8599-4174-8540-8fb14ba3185c,/government/publications/application-to-recons...,1008,4816fa06-9e6c-4414-96b6-aa7ec7f24968,/right-of-abode,12850,,1008-12850


In [228]:
merged_edges.drop_duplicates("edge-id", inplace=True)

In [231]:
merged_edges[(merged_edges.src_node=="/universal-credit") & (merged_edges.weight.isna())]

Unnamed: 0,dest_cid,dest_node,source,src_cid,src_node,target,weight,edge-id
234998,0b0ff1cc-1532-4cd3-836b-0b7ab0359a4b,/call-charges,26857,f790dc71-386e-4440-9689-31f94e7ac64d,/universal-credit,18430,,26857-18430
235003,28774875-4c69-41f0-a9a3-49abb015b34d,/funeral-payments,26857,f790dc71-386e-4440-9689-31f94e7ac64d,/universal-credit,51325,,26857-51325
235004,84f68c01-c5f3-4d20-9b31-d46fec04498c,/legal-aid,26857,f790dc71-386e-4440-9689-31f94e7ac64d,/universal-credit,14677,,26857-14677
235006,a5d44e01-f9b1-48e4-87e0-96eba6def07b,/help-with-prison-visits,26857,f790dc71-386e-4440-9689-31f94e7ac64d,/universal-credit,12935,,26857-12935
235008,145d29c4-cb5c-4b1c-8664-094912f8a34e,/disabled-facilities-grants,26857,f790dc71-386e-4440-9689-31f94e7ac64d,/universal-credit,34166,,26857-34166
235009,2e89e098-f5a8-4a6c-9dc9-ebe0c7b02ae6,/energy-company-obligation,26857,f790dc71-386e-4440-9689-31f94e7ac64d,/universal-credit,39394,,26857-39394
235010,5ef648f9-7631-11e4-a3cb-005056011aef,/government/publications/claiming-discretionar...,26857,f790dc71-386e-4440-9689-31f94e7ac64d,/universal-credit,2099,,26857-2099
235017,c1b3a2fe-9bf5-44ee-a76c-7d34d4880023,/court-fees-what-they-are,26857,f790dc71-386e-4440-9689-31f94e7ac64d,/universal-credit,50126,,26857-50126
235018,4479426d-51c1-4b51-8930-543d994e8818,/apply-free-school-meals,26857,f790dc71-386e-4440-9689-31f94e7ac64d,/universal-credit,54857,,26857-54857
235020,73d634c6-e3d8-4810-abb1-61b8a9a70c88,/mandatory-reconsideration,26857,f790dc71-386e-4440-9689-31f94e7ac64d,/universal-credit,60461,,26857-60461


## Check embeddings file

In [244]:
node_data_file_predict = os.path.join(PREDICT_DIR, "predict_top50_vs_all_node_data_embs_300.csv.gz")
node_data_file_train = os.path.join(content_api, "training_node_data_fixd.csv.gz")

In [None]:
embeddings = pd.read_csv(node_data_file_predict, compression="gzip", index_col=0)

In [238]:
embeddings.shape

(206038, 513)

In [245]:
embeddings1 = pd.read_csv(node_data_file_train, compression="gzip", index_col=0)

In [249]:
c_all = set(list(embeddings.content_id) + list(embeddings1.content_id))
len(c_all) 

220941

In [250]:
content_ids_edges = set(list(merged_edges.src_cid)+list(merged_edges.dest_cid)) 
# content_ids_functedges = set(list(edges.src_cid)+list(edges.dest_cid)) 
len(content_ids_edges)

73173

In [251]:
len(content_ids_edges.difference(c_all))

0

### Count duplicate entries in embeddings

In [252]:
all_embeddings = pd.concat([embeddings, embeddings1], sort=True, ignore_index=True)

In [253]:
ids = all_embeddings['content_id']
all_embeddings[ids.isin(ids[ids.duplicated()])].sort_values("content_id")

Unnamed: 0,0,1,10,100,101,102,103,104,105,106,...,91,92,93,94,95,96,97,98,99,content_id
3994,-0.016188,0.031590,0.012646,-0.021106,-0.000546,-0.053907,-0.013014,-0.051070,0.063367,0.055796,...,-0.026592,-0.030728,-0.063242,0.002924,0.007913,0.036202,-0.012893,0.069854,0.009103,000b3ccc-51a6-4bd4-a332-f2b573e4ded7
221573,-0.016188,0.031590,0.012646,-0.021106,-0.000546,-0.053907,-0.013014,-0.051070,0.063367,0.055796,...,-0.026592,-0.030728,-0.063242,0.002924,0.007913,0.036202,-0.012893,0.069854,0.009103,000b3ccc-51a6-4bd4-a332-f2b573e4ded7
259292,-0.055287,-0.020550,0.025845,0.049996,-0.049346,-0.053477,0.036689,0.055123,0.033551,0.042913,...,0.055387,-0.050334,0.002952,-0.037240,-0.055529,0.026132,0.054479,-0.023638,-0.055453,000c3ffa-6e21-49e4-b283-2650f0187e82
266762,-0.055287,-0.020550,0.025845,0.049996,-0.049346,-0.053477,0.036689,0.055123,0.033551,0.042913,...,0.055387,-0.050334,0.002952,-0.037240,-0.055529,0.026132,0.054479,-0.023638,-0.055453,000c3ffa-6e21-49e4-b283-2650f0187e82
230390,-0.055287,-0.020550,0.025845,0.049996,-0.049346,-0.053477,0.036689,0.055123,0.033551,0.042913,...,0.055387,-0.050334,0.002952,-0.037240,-0.055529,0.026132,0.054479,-0.023638,-0.055453,000c3ffa-6e21-49e4-b283-2650f0187e82
6234,-0.055287,-0.020550,0.025845,0.049996,-0.049346,-0.053477,0.036689,0.055123,0.033551,0.042913,...,0.055387,-0.050334,0.002952,-0.037240,-0.055529,0.026132,0.054479,-0.023638,-0.055453,000c3ffa-6e21-49e4-b283-2650f0187e82
249902,-0.055287,-0.020550,0.025845,0.049996,-0.049346,-0.053477,0.036689,0.055123,0.033551,0.042913,...,0.055387,-0.050334,0.002952,-0.037240,-0.055529,0.026132,0.054479,-0.023638,-0.055453,000c3ffa-6e21-49e4-b283-2650f0187e82
259432,-0.055287,-0.020550,0.025845,0.049996,-0.049346,-0.053477,0.036689,0.055123,0.033551,0.042913,...,0.055387,-0.050334,0.002952,-0.037240,-0.055529,0.026132,0.054479,-0.023638,-0.055453,000c3ffa-6e21-49e4-b283-2650f0187e82
247853,-0.048413,0.046790,-0.057513,-0.045078,0.002419,-0.018005,0.054131,0.029876,0.049948,0.016554,...,0.044341,-0.019702,-0.032436,0.058858,-0.052162,-0.035839,-0.006849,0.070634,-0.048735,0012a263-e552-4240-b0af-29730e8c1e03
10696,-0.048413,0.046790,-0.057513,-0.045078,0.002419,-0.018005,0.054131,0.029876,0.049948,0.016554,...,0.044341,-0.019702,-0.032436,0.058858,-0.052162,-0.035839,-0.006849,0.070634,-0.048735,0012a263-e552-4240-b0af-29730e8c1e03


In [254]:
all_embeddings.drop_duplicates("content_id", inplace=True)

### Map `content_id` to `node_id` again

In [255]:
nid_cid = dict(zip(merged_edges.source, merged_edges.src_cid))
for nid,cid in zip(merged_edges.target, merged_edges.dest_cid):
    if nid not in nid_cid.keys():
        nid_cid[nid] = cid

In [259]:
cols = [str(i) for i in range(0,512)]
cols[-1]

'511'

In [260]:
embeddings_dict = dict(zip(all_embeddings.content_id, all_embeddings[cols].values))

In [265]:
all_embeddings[all_embeddings.content_id=="80f7a69c-5cdc-4bd5-a16c-15d669ba2cc5"]["3"]

0    0.008009
Name: 3, dtype: float64

In [266]:
rowlist = []
index = []
for nid,cid in nid_cid.items():
    row_dict = {}
    row_dict["content_id"] = cid
    for i,emb in enumerate(embeddings_dict[cid]):
        row_dict[i] = emb
    rowlist.append(row_dict)  
    index.append(nid)
new_embeddings = pd.DataFrame(rowlist, index=index)

In [267]:
new_embeddings.head()

Unnamed: 0,content_id,0,1,2,3,4,5,6,7,8,...,502,503,504,505,506,507,508,509,510,511
33265,5ef7560d-7631-11e4-a3cb-005056011aef,-0.047676,-0.027639,0.056349,-0.009413,-0.020528,-0.045546,0.068116,0.018213,0.016371,...,0.019222,0.066869,0.036356,-0.010981,0.030189,-0.045008,0.007033,-0.047787,-0.001289,-0.063644
58314,1e333395-5dd5-4452-96a3-fbe939928761,-0.033119,0.049818,0.042104,-0.017883,-0.007372,-0.054978,0.064178,0.062335,0.011622,...,-0.034333,0.064,0.063919,-0.037945,0.054777,0.023922,0.00265,-0.063124,-0.019809,-0.057015
23720,d612c61e-22f4-4922-8bb2-b04b9202126e,-0.067857,-0.015453,-0.028865,0.020038,0.017298,-0.064374,0.081447,-0.008263,0.021375,...,0.028531,0.019493,0.058379,-0.001819,0.070202,-0.067259,0.041385,-0.066728,0.001454,-0.014244
25053,305acc88-488e-44ec-8e5a-1a5de7819ba9,-0.055095,0.003975,0.034079,-0.016298,-0.041664,-0.046474,0.056651,0.056367,0.050351,...,0.02253,0.056369,0.056647,-0.03575,0.054273,0.048701,0.021803,-0.055919,0.002437,-0.052509
14044,305acc88-488e-44ec-8e5a-1a5de7819ba9,-0.055095,0.003975,0.034079,-0.016298,-0.041664,-0.046474,0.056651,0.056367,0.050351,...,0.02253,0.056369,0.056647,-0.03575,0.054273,0.048701,0.021803,-0.055919,0.002437,-0.052509


In [268]:
merged_edges[merged_edges.source==25053]

Unnamed: 0,dest_cid,dest_node,source,src_cid,src_node,target,weight,edge-id
3,305acc88-488e-44ec-8e5a-1a5de7819ba9,/uk-family-visa/partner-spouse,25053,305acc88-488e-44ec-8e5a-1a5de7819ba9,/uk-family-visa,14044,26227.0,25053-14044
1087,32d4d2c8-2f9c-406f-96e4-65222074642a,/settle-in-the-uk,25053,305acc88-488e-44ec-8e5a-1a5de7819ba9,/uk-family-visa,28920,2729.0,25053-28920
1216,686e6d1d-78c2-4c9d-b863-be5c5d8ed787,/family-permit,25053,305acc88-488e-44ec-8e5a-1a5de7819ba9,/uk-family-visa,43124,543.0,25053-43124
1220,32d4d2c8-2f9c-406f-96e4-65222074642a,/settle-in-the-uk/y/none-of-these,25053,305acc88-488e-44ec-8e5a-1a5de7819ba9,/uk-family-visa,44491,126.0,25053-44491
1221,d6c2de5d-ef90-45d1-82d4-5f2438369eea,/government/brexit,25053,305acc88-488e-44ec-8e5a-1a5de7819ba9,/uk-family-visa,64440,37.0,25053-64440
2019,305acc88-488e-44ec-8e5a-1a5de7819ba9,/uk-family-visa/private-life,25053,305acc88-488e-44ec-8e5a-1a5de7819ba9,/uk-family-visa,51911,2659.0,25053-51911
3593,305acc88-488e-44ec-8e5a-1a5de7819ba9,/uk-family-visa/provide-information,25053,305acc88-488e-44ec-8e5a-1a5de7819ba9,/uk-family-visa,29781,1125.0,25053-29781
3608,305acc88-488e-44ec-8e5a-1a5de7819ba9,/uk-family-visa/knowledge-of-english,25053,305acc88-488e-44ec-8e5a-1a5de7819ba9,/uk-family-visa,32219,1252.0,25053-32219
5181,b3fb8ff7-ac2e-4440-8c0f-227eb038c436,/healthcare-immigration-application,25053,305acc88-488e-44ec-8e5a-1a5de7819ba9,/uk-family-visa,13119,2096.0,25053-13119
6088,58b05bc2-fde5-4a0b-af73-8edc532674f8,/contact,25053,305acc88-488e-44ec-8e5a-1a5de7819ba9,/uk-family-visa,56268,62.0,25053-56268


In [269]:
train_network = os.path.join(DATA_DIR,"train_network")
merged_file = os.path.join(train_network, "merged_edges_struct_funct.csv.gz")
new_embeddings_file = os.path.join(train_network, "merged_embs_struct_funct.csv.gz")

In [270]:
merged_edges.to_csv(merged_file, compression="gzip", index=False)
new_embeddings.to_csv(new_embeddings_file, compression="gzip")