In [None]:
import deepdish as dd
import csv
import json
import numpy as np

# Embed Document Content Alongside Network Structure 

This notebook directly uses this [implementation](https://github.com/benedekrozemberczki/BANE) of [BANE](https://www.researchgate.net/publication/328688614_Binarized_Attributed_Network_Embedding) (Binarized Attributed Network Embedding).

In [None]:
# Load the precomputed sBERT embeddings for the services from the HDF5 file.
embeddings_path = './embeddings/sBERT/services.h5'
embeddings = dd.io.load(embeddings_path)

# Load the service node metadata from JSON.
with open('./data/services_nodes.json') as nodes_file:
    serv_nodes = json.load(nodes_file)


In [None]:
# Re-index the node numbers found in serv_nodes to contiguous 0-based indices.
#   node_num_map: original node number -> new index
#   node_idx_map: new index -> original node number
node_num_map = {}
node_idx_map = {}
for idx, node_num in enumerate(serv_nodes):
    node_num_map[node_num] = idx
    node_idx_map[idx] = node_num

edges = []
# Rewrite the edge list using the new indices.
# newline='' is what the csv module requires for files passed to csv.reader /
# csv.writer; without it the writer emits blank rows on Windows.
with open('./data/services_edgelist.csv', newline='') as edge_file, \
        open('./data/services_BANE_edgelist.csv', 'w', newline='') as new_edges:
    serv_edges_reader = csv.reader(edge_file)
    edge_writer = csv.writer(new_edges, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    # The first row is the header: copy it through unchanged (if the file is not empty).
    header = next(serv_edges_reader, None)
    if header is not None:
        edge_writer.writerow(header)

    for row_num, (src, dst) in enumerate(serv_edges_reader, start=1):
        edge = (node_num_map[src], node_num_map[dst])
        # Echo the first 20 re-indexed edges as a sanity check.
        if row_num <= 20:
            print(edge)
        edge_writer.writerow(edge)
        edges.append(edge)

    # Undirected adjacency list over the new indices, used to find isolated nodes.
    adj_li = {idx: [] for idx in range(len(serv_nodes))}
    for n1, n2 in edges:
        adj_li[n1].append(n2)
        adj_li[n2].append(n1)

    no_neigh_nodes = []
    for n, neighbors in adj_li.items():
        if not neighbors:
            node_name = serv_nodes[node_idx_map[n]]['name']
            print("Node {} ({}) has no neighbors".format(n, node_name))
            no_neigh_nodes.append(n)

    # add self-edges to see if that fixes dimensionality problem
    for n_idx in no_neigh_nodes:
        edge = (n_idx, n_idx)
        edges.append(edge)
        edge_writer.writerow(edge)

In [None]:
# Ask whether the BANE feature file should be regenerated.
# The raw answer is kept in its own variable so that remake_feats ends up a real
# boolean and is not clobbered by the format prompt below.
remake_choice = ''
while remake_choice not in ('Y', 'N'):
    remake_choice = input("Would you like to create a new embeddings file for BANE? [Y/N]: ").upper()
remake_feats = remake_choice == 'Y'

# see here: https://github.com/benedekrozemberczki/BANE#datasets
# feat_type stays '' when nothing is regenerated; the format is only asked for
# when a file will actually be written.
feat_type = ''
if remake_feats:
    feat_choice = ''
    while feat_choice not in ('S', 'D'):
        feat_choice = input("Would you like to save as sparse or dense BANE feats? [S/D]: ").upper()
    feat_type = 'sparse' if feat_choice == 'S' else 'dense'

    if feat_type == 'sparse':
        # NOTE(review): the BANE README appears to describe the sparse JSON as
        # node -> list of feature *indices* of a binary matrix; here the raw float
        # embedding values are written instead -- confirm this is what BANE expects.
        sparse_feats = {
            node_num_map[node_num]: [float(e) for e in embeddings[i]]
            for i, node_num in enumerate(serv_nodes)
        }
        with open('./data/BANE_sparse_feats.json', 'w') as feat_json:
            # Serialize exactly once: dumping an already-dumped string would store
            # a JSON string literal instead of a JSON object.
            json.dump(sparse_feats, feat_json)
    else:
        # One header column for the node id plus one per embedding dimension.
        n_dims = len(embeddings[0])
        head = ['Node ID'] + ["Dim{} Feat".format(d) for d in range(n_dims)]
        head_size = len(head)
        print("Header of Length {}".format(head_size))

        lines = []
        for i, node_num in enumerate(serv_nodes):
            # TODO: fix that node_num is saved as str in service_nodes and int in service_edgelist
            line = [node_num_map[node_num]] + list(embeddings[i])
            if len(line) != head_size:
                print("Line with node {} has wrong length of {}".format(node_num, len(line)))
            lines.append(line)

        # Rows are written sorted by the re-indexed node id.
        sorted_lines = sorted(lines, key=lambda row: row[0])
        print(len(sorted_lines))

        # newline='' is what the csv module requires for files passed to csv.writer.
        with open('./data/BANE_dense_feats.csv', 'w', newline='') as embed_file:
            embed_writer = csv.writer(embed_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            embed_writer.writerow(head)
            embed_writer.writerows(sorted_lines)

## NOTE: The BANE implementation raises a runtime error when the network contains nodes with no neighbors

\* The error is due to the SciPy COO sparse matrix function.

To fix:
1. Add self-edges to the nodes with no neighbors (currently implemented; **consequences unknown**)
2. Remove nodes with no neighbors  
(probably the sanest solution, but concerning long-term, since newly added services [especially informal ones] are unlikely to be tagged with taxonomy codes)

In [None]:
# Run the BANE script (BANE/src/main.py) on the re-indexed edge list and the dense feature CSV,
# writing the result to ./embeddings/BANE/bane_dense_embeddings_out.bin.
# The command is hard-coded to the dense feature file (`--features dense`); it does not
# look at the sparse/dense choice made when the feature file was written.
# keeping embedding dimension same as one used in s-BERT but should play around with this later
!python BANE/src/main.py --edge-path ./data/services_BANE_edgelist.csv --feature-path ./data/BANE_dense_feats.csv --output-path ./embeddings/BANE/bane_dense_embeddings_out.bin --features dense --dimensions 768