In [1]:
import os
import pandas as pd

In [23]:
DATA_DIR = os.getenv("DATA_DIR")
content_api = os.path.join(DATA_DIR, "content_api")
processed_network = os.path.join(DATA_DIR, "processed_network")

## Load up embeddings

In [10]:
embeddings_filename = os.path.join(content_api, "text_for_embedding_reviewed_cid_embeds_300.csv.gz")

In [14]:
embeddings = pd.read_csv(embeddings_filename, compression="gzip")

In [17]:
embeddings.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,503,504,505,506,507,508,509,510,511,content_id
0,-0.047676,-0.027639,0.056349,-0.009413,-0.020528,-0.045546,0.068116,0.018213,0.016371,0.047445,...,0.066869,0.036356,-0.010981,0.030189,-0.045008,0.007033,-0.047787,-0.001289,-0.063644,5ef7560d-7631-11e4-a3cb-005056011aef
1,-0.033119,0.049818,0.042104,-0.017883,-0.007372,-0.054978,0.064178,0.062335,0.011622,0.006318,...,0.064,0.063919,-0.037945,0.054777,0.023922,0.00265,-0.063124,-0.019809,-0.057015,1e333395-5dd5-4452-96a3-fbe939928761


In [19]:
embeddings.shape

(31932, 513)

In [20]:
embeddings.columns

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '503', '504', '505', '506', '507', '508', '509', '510', '511',
       'content_id'],
      dtype='object', length=513)

In [21]:
cid_emb = dict(zip(embeddings['content_id'], embeddings[embeddings.columns[0:-1]].values))

## Load up edges file for `node_id` mapping to embeddings df

In [24]:
edges_file_filtered = os.path.join(processed_network, "edges_graphsagetest_feb_01_18_doo_min15weight_wtext.csv.gz")
edges = pd.read_csv(edges_file_filtered, compression="gzip")

### Set up `node_id` to `content_id` dictionary

In [25]:
nid_cid = dict(zip(edges['source'], edges['src_cid']))
for nid, cid in zip(edges['target'], edges['dest_cid']):
    if nid not in nid_cid.keys():
        nid_cid[nid] = cid
len(nid_cid)

In [35]:
edges[edges.src_node=="/sign-in-universal-credit"][0:2]

Unnamed: 0,src_node,dest_node,weight,src_cid,dest_cid,source,target
177,/sign-in-universal-credit,/,46821,b220a437-0d51-4390-9993-63345d0c83ad,f3bbdec2-0e62-4520-a7fd-6ffd5d36e03a,5947,4412
1694,/sign-in-universal-credit,/jobsearch,29431,b220a437-0d51-4390-9993-63345d0c83ad,a01fcb59-5dc8-4bf5-b06d-dd567a6d2f5f,5947,42642


In [31]:
target_id = 5947
target_node = "/sign-in-universal-credit"
nid_cid[target_id], edges[edges.src_node==target_node].src_cid.value_counts(),\
                        edges[edges.src_node==target_node].src_cid.value_counts()

('b220a437-0d51-4390-9993-63345d0c83ad',
 b220a437-0d51-4390-9993-63345d0c83ad    388
 Name: src_cid, dtype: int64,
 b220a437-0d51-4390-9993-63345d0c83ad    388
 Name: src_cid, dtype: int64)

'b220a437-0d51-4390-9993-63345d0c83ad' /sign-in-universal-credit 5947

### Map embeddings to `node_id`

In [37]:
rowlist = []
index = []
for nid in set(list(edges['source']) + list(edges['target'])):
    row_dict = {}
    for i,emb in enumerate(cid_emb[nid_cid[nid]]):
        row_dict[i] = emb
    row_dict["content_id"] = nid_cid[nid]
    rowlist.append(row_dict)  
    index.append(nid)

In [38]:
df_embs = pd.DataFrame(rowlist, index=index)

In [44]:
df_embs.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,503,504,505,506,507,508,509,510,511,content_id
0,0.014993,0.024639,0.058578,0.008009,0.008096,-0.062776,0.059777,0.005778,-0.008614,0.059473,...,0.065568,0.04427,-0.046852,0.035862,-0.043941,0.009524,-0.037714,-0.001525,-0.06366,80f7a69c-5cdc-4bd5-a16c-15d669ba2cc5
1,-0.057149,0.025875,0.049524,0.024483,-0.007833,-0.035528,0.056479,0.009408,-0.04437,-0.040844,...,0.056229,0.056662,-0.038329,0.056141,-0.052406,-0.038637,-0.047737,0.046157,-0.054258,81fb4151-bc2b-49b1-b86e-22f2a1d14208
2,-0.002469,-0.00908,0.014424,-0.018611,0.0046,-0.048021,0.067327,0.030089,-0.014283,0.058513,...,0.057214,0.01574,-0.032861,0.047954,-0.016275,0.011618,-0.041357,0.011032,-0.018847,5ef9b7da-7631-11e4-a3cb-005056011aef


#### Verified results, match across training and predict data embeddings...

In [45]:
embeddings_file = os.path.join(content_api, "training_node_data_fixd.csv.gz")
df_embs.to_csv(embeddings_file, compression="gzip")