In [14]:
import deepdish as dd
import csv
import json
import numpy as np

In [15]:
# Load the precomputed service embeddings (HDF5 file read through deepdish).
embeddings_path = './embeddings/sBERT/services.h5'
embeddings = dd.io.load(embeddings_path)
# embeddings = np.asarray(embeddings)

# Load the service node records; the later cells iterate over this object's
# keys to assign each node a contiguous integer index.
with open('./data/services_nodes.json') as nodes_file:
    serv_nodes = json.load(nodes_file)


In [17]:
# Re-index the service nodes to contiguous integer ids (0..N-1).
#   node_num_map: original node key -> new integer index
#   node_idx_map: new integer index -> original node key
node_num_map = {}
node_idx_map = {}
for i, node_num in enumerate(serv_nodes):
    node_num_map[node_num] = i
    node_idx_map[i] = node_num

# How many re-indexed edges to echo as a quick visual sanity check.
N_EDGES_PREVIEW = 20

edges = []
# Rewrite the edge list using the new indices.
# newline='' is what the csv module requires for files handed to
# csv.reader / csv.writer (otherwise extra blank rows appear on Windows).
with open('./data/services_edgelist.csv', newline='') as edge_file:
    serv_edges_reader = csv.reader(edge_file)
    with open('./data/services_BANE_edgelist.csv', 'w', newline='') as new_edges:
        edge_writer = csv.writer(new_edges, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

        # The first row is the header: copy it through unchanged.
        header = next(serv_edges_reader, None)
        if header is not None:
            edge_writer.writerow(header)

        for row in serv_edges_reader:
            if not row:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            src, dst = row
            edge = (node_num_map[src], node_num_map[dst])
            if len(edges) < N_EDGES_PREVIEW:
                print(edge)
            edge_writer.writerow(edge)
            edges.append(edge)

        # Undirected adjacency list over the new indices. Appending in place
        # avoids copying the whole neighbor list for every edge.
        adj_li = {idx: [] for idx in range(len(serv_nodes))}
        for n1, n2 in edges:
            adj_li[n1].append(n2)
            adj_li[n2].append(n1)

        # Collect nodes that never appear in any edge.
        no_neigh_nodes = []
        for n, neighbors in adj_li.items():
            if not neighbors:
                node_name = serv_nodes[node_idx_map[n]]
                print("Node {} ({}) has no neighbors".format(n, node_name))
                no_neigh_nodes.append(n)

        # add self-edges to see if that fixes dimensionality problem
        for n_idx in no_neigh_nodes:
            edge = (n_idx, n_idx)
            edges.append(edge)
            edge_writer.writerow(edge)

(9884, 3531)
(6495, 9972)
(12014, 3323)
(12032, 12032)
(108, 5188)
(7137, 1863)
(988, 6782)
(3350, 4845)
(5063, 6027)
(3276, 15476)
(12092, 16249)
(6145, 291)
(9196, 8849)
(9350, 2428)
(14718, 13998)
(4878, 15902)
(6889, 16164)
(10233, 4574)
(1122, 1808)
(5440, 16369)
Node 146 ({'AGENCY_ID': 73, 'SERV_ID': 6718, 'name': 'Adult Immunizations', 'SERV_AKA': '', 'description': 'Offers limited adult vaccinations, depending on the general need of the community. Vaccines may range from measles (MMR), tetanus (Tdap), hepatitis A, hepatitis B, and seasonal flu shots. Recommended to check availability in advance; vaccines subject to change without notice.', 'SERV_SITES': '[{"Site_Id":92,"Name":"Elkhart County Health Department"}]', 'SERV_GEO_TAG': '[{"scope":"county","county":"Elkhart","state":"IN"}]', 'SERV_TAXONOMY_CODES': '[{"taxonomy_code":"X_LT-3400.0100"}]', 'SERV_OP_HOURS': 'Mon 8am-5pm; Tues-Fri 8am-4pm', 'SERV_CLOSED_TIMES': '', 'SERV_WEB_URL': 'www.elkhartcountyhealth.org', 'SERV_EMAIL

In [18]:
def _prompt_choice(prompt, choices):
    """Ask `prompt` repeatedly until the upper-cased answer is one of `choices`.

    Returns the accepted answer (upper-cased, surrounding whitespace removed).
    """
    answer = ''
    while answer not in choices:
        answer = input(prompt).strip().upper()
    return answer


# Y/N decision is kept in its own variable; it must not be reused for the
# S/D answer, otherwise the `if remake_feats:` guard below is always truthy.
remake_feats = _prompt_choice(
    "Would you like to create a new embeddings file for BANE? [Y/N]: ", ('Y', 'N')
) == 'Y'

# see here: https://github.com/benedekrozemberczki/BANE#datasets
feat_type = ''

if remake_feats:
    # Only ask for the output format when a file is actually going to be written.
    feat_choice = _prompt_choice(
        "Would you like to save as sparse or dense BANE feats? [S/D]: ", ('S', 'D')
    )
    feat_type = 'sparse' if feat_choice == 'S' else 'dense'

    if feat_type == 'sparse':
        # Map each re-indexed node id to its embedding as plain Python floats
        # so the values are JSON-serializable.
        sparse_feats = {
            node_num_map[node_num]: [float(e) for e in embeddings[i]]
            for i, node_num in enumerate(serv_nodes)
        }
        with open('./data/BANE_sparse_feats.json', 'w') as feat_json:
            # Serialize exactly once: dumping an already-dumped string would
            # store a single JSON string instead of a JSON object.
            json.dump(sparse_feats, feat_json)
    else:
        # One "Node ID" column followed by one column per embedding dimension.
        n_dims = len(embeddings[0])
        head = ['Node ID'] + ["Dim{} Feat".format(dim) for dim in range(n_dims)]
        head_size = len(head)
        print("Header of Length {}".format(head_size))
        # newline='' is required by the csv module for files passed to csv.writer.
        with open('./data/BANE_dense_feats.csv', 'w', newline='') as embed_file:
            embed_writer = csv.writer(embed_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            embed_writer.writerow(head)
            lines = []
            for i, node_num in enumerate(serv_nodes):
                # TODO: fix that node_num is saved as str in service_nodes and int in service_edgelist
                line = [node_num_map[node_num]] + list(embeddings[i])
                if len(line) != head_size:
                    print("Line with node {} has wrong length of {}".format(node_num, len(line)))
                lines.append(line)
            # Rows are written in ascending order of the re-indexed node id.
            sorted_lines = sorted(lines, key=lambda row: row[0])
            print(len(sorted_lines))
            embed_writer.writerows(sorted_lines)

Would you like to create a new embeddings file for BANE? [Y/N]:  y
Would you like to save as sparse or dense BANE feats? [S/D]:  d


Header of Length 769
16547


## NOTE: Both algorithms have issues when the network contains nodes with no neighbors

To fix:
1. Add self-edges to the nodes with no neighbors (Currently implemented) (**unknown consequences**)
2. Remove nodes with no neighbors (probably the sanest solution, but concerning long-term, since newly added services,
especially informal ones, are unlikely to be tagged with taxonomy codes)

In [23]:
!python BANE/src/main.py --edge-path /home/asvnpr/Documents/ND_CSE/workspace/Care-Net_Research/code_and_data/data/services_BANE_edgelist.csv --feature-path /home/asvnpr/Documents/ND_CSE/workspace/Care-Net_Research/code_and_data/data/BANE_dense_feats.csv --output-path /home/asvnpr/Documents/ND_CSE/workspace/Care-Net_Research/code_and_data/embeddings/BANE/bane_dense_embeddings_out.bin --features dense --dimensions 512

+----------------------+-------------------------------------------------------+
|        Alpha         |                         0.01                          |
| Approximation rounds | 5                                                     |
+----------------------+-------------------------------------------------------+
| Binarization rounds  | 10                                                    |
+----------------------+-------------------------------------------------------+
| Dimensions           | 512                                                   |
+----------------------+-------------------------------------------------------+
| Edge path            | /home/asvnpr/Documents/ND_CSE/workspace/Care-Net_Rese |
|                      | arch/code_and_data/data/services_BANE_edgelist.csv    |
+----------------------+-------------------------------------------------------+
| Feature path         | /home/asvnpr/Documents/ND_CSE/workspace/Care-         |
|                      | Net

In [24]:
!python TENE/src/main.py --edge-path /home/asvnpr/Documents/ND_CSE/workspace/Care-Net_Research/code_and_data/data/services_BANE_edgelist.csv --feature-path /home/asvnpr/Documents/ND_CSE/workspace/Care-Net_Research/code_and_data/data/BANE_dense_feats.csv --output-path /home/asvnpr/Documents/ND_CSE/workspace/Care-Net_Research/code_and_data/embeddings/TENE/tene_dense_embeddings_out.bin --features dense --dimensions 768 

+---------------+--------------------------------------------------------------+
|   Parameter   |                            Value                             |
| Alpha         | 1                                                            |
+---------------+--------------------------------------------------------------+
| Beta          | 1                                                            |
+---------------+--------------------------------------------------------------+
| Dimensions    | 768                                                          |
+---------------+--------------------------------------------------------------+
| Edge path     | /home/asvnpr/Documents/ND_CSE/workspace/Care-                |
|               | Net_Research/code_and_data/data/services_BANE_edgelist.csv   |
+---------------+--------------------------------------------------------------+
| Feature path  | /home/asvnpr/Documents/ND_CSE/workspace/Care-                |
|               | Net_Resear

In [19]:
# Sanity check: compare the number of node records against the node indices
# that actually occur as an endpoint of some edge.
print("There are {} nodes in serv_nodes".format(len(serv_nodes)))

# Every distinct index seen on either side of an edge.
u_edges = {endpoint for n1, n2 in edges for endpoint in (n1, n2)}

# Largest index seen, with 0 as the floor (also the result for an empty edge list).
mx = max(u_edges | {0})

print("The largest node index in edgelist is {}".format(mx))
print("There are {} unique nodes in the network".format(len(u_edges)))

There are 16547 nodes in serv_nodes
The largest node index in edgelist is 16546
There are 16547 unique nodes in the network


In [92]:
# Number of distinct keys in serv_nodes.
print(len(set(serv_nodes.keys())))

16547


Node 146 has no neighbors
Node 450 has no neighbors
Node 1023 has no neighbors
Node 1566 has no neighbors
Node 2025 has no neighbors
Node 2110 has no neighbors
Node 2482 has no neighbors
Node 2779 has no neighbors
Node 2837 has no neighbors
Node 2873 has no neighbors
Node 2895 has no neighbors
Node 2936 has no neighbors
Node 3224 has no neighbors
Node 3924 has no neighbors
Node 3955 has no neighbors
Node 4154 has no neighbors
Node 4226 has no neighbors
Node 4695 has no neighbors
Node 5010 has no neighbors
Node 5645 has no neighbors
Node 5646 has no neighbors
Node 5647 has no neighbors
Node 6210 has no neighbors
Node 6246 has no neighbors
Node 6247 has no neighbors
Node 6671 has no neighbors
Node 8361 has no neighbors
Node 8362 has no neighbors
Node 8363 has no neighbors
Node 8365 has no neighbors
Node 8367 has no neighbors
Node 8456 has no neighbors
Node 8485 has no neighbors
Node 8987 has no neighbors
Node 9444 has no neighbors
Node 10111 has no neighbors
Node 10395 has no neighbors
N