# Graph Nodes Embedding

- Use Doc2Vec to embed the description, name, etc of services 
- cluster these embeddings and measure cluster performance

later:
- use [BANE](https://github.com/benedekrozemberczki/BANE) code to embed the nodes taking into account their network structure
- compare cluster performance


In [1]:
import json
import os
import csv
import random
import pandas as pd
import nltk
import typing
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm, trange
# %%capture
# tqdm().pandas()

In [2]:
# open our datasets
with open('./data/services_nodes.json') as sn:
    serv_nodes = json.loads(sn.read())
    
with open('./data/services_edgelist.csv') as se:
    serv_edges = csv.reader(se)

with open('./data/HIN_nodes.json') as taxo:
    taxo_nodes = json.loads(taxo.read())

with open('./data/code_to_node_num.json') as cn:
    code_trans = json.loads(cn.read())

In [3]:
# following: https://towardsdatascience.com/machine-learning-text-processing-1d5a2d638958
# other ref: https://medium.com/@datamonsters/text-preprocessing-in-python-steps-tools-and-examples-bf025f872908

# dictionary used to store our services node content to be embeded
serv_cont = {}

# service features to be embedded as text or tags
# maybe fix to get actual agency name instead of id
text_feats = [
    'name', 
    'akas', 
    'description', 
    'codes',
    'eligibility'
]

from nltk.stem import WordNetLemmatizer
import string
# returns a 
def preprocessText(text: str) -> typing.List :
    # normalize: convert to lower, remove 
    # numbers, remove punctuation
    text = text.lower()
#     text = text.translate(str.maketrans('','', string.digits))
#     text = text.translate(str.maketrans('','',string.punctuation))
    words = []
    # tokenize into sentences
    text_sents = nltk.sent_tokenize(text)
#     for sent in text_sents:
#         words.extend(nltk.word_tokenize(sent))
    
#     stop_words = set(nltk.corpus.stopwords.words())
    # remove stopwords like 'the', 'is', 'a', etc
#     words = [w for w in words if w not in stop_words]
#     # get base form of words
#     lemmer = WordNetLemmatizer()
#     words = [lemmer.lemmatize(w) for w in words]
    
    return text_sents

In [4]:
docs = []
tagged_sents = {}
serv_num = len(serv_nodes)

# for each service node
with tqdm(total=serv_num, desc='Pre-processing Text') as pbar:
    for node_num in serv_nodes:

        node = serv_nodes[node_num]
        sents = []
        # preprocess the text content of the node and it's taxonomy code nodes
        for feat in text_feats:

            text = node[feat]
            if feat in ('akas', 'eligibility'):
                text = ' '.join(text)

            elif feat == 'codes':
                for cn in node[feat]:
                    code = taxo_nodes[str(code_trans[cn])]
                    sents.extend(preprocessText(code['name']))
                    sents.extend(preprocessText(code['description']))
                    if 'keywords' in code:
                        keywords = '. '.join(code['keywords'])
                        sents.extend(preprocessText(keywords))

            # some nodes have empty features
            elif text:
                sents.extend(preprocessText(text))
            # preprocess our node raw text feats

#             docs.append(TaggedDocument(words=words, tags=[node_num]))
        pbar.update(1)
        tagged_sents[node_num] = sents

HBox(children=(IntProgress(value=0, description='Pre-processing Text', max=16547, style=ProgressStyle(descript…




In [5]:
from sentence_transformers import SentenceTransformer

# see this section for more info: https://github.com/UKPLab/sentence-transformers#pretrained-models
models = [
    'roberta-base-nli-stsb-mean-tokens'
]
choice = '-1'
choice_list = '\n'.join(["{}. {}".format(i+1, models[i]) for i in range(len(models))])
while int(choice)-1 not in list(range(len(models))) or not choice.isdigit():
    choice = input("Choose one of the following pre-trained models to use: \n{}".format(choice_list))
choice = int(choice)-1

model = SentenceTransformer(models[choice])

Choose one of the following pre-trained models to use: 
1. roberta-base-nli-stsb-mean-tokens 0
Choose one of the following pre-trained models to use: 
1. roberta-base-nli-stsb-mean-tokens 1


In [7]:
# model generates embeddings per sentence but need embedding per "document" ie service
# use mean of all vector embeddings 
# (relatively safe assumption since it is used by default in their paper https://arxiv.org/pdf/1908.10084.pdf 
# to get the embedding of a sentence from words. Mean of vector has also seen good performance for representing documents 
# with models such as word2vec )
import numpy as np

tagged_embeds = {}

with tqdm(total=serv_num, desc='Embedding Service Sentences') as pbar:
    for node_num in tagged_sents:
        serv_sents = tagged_sents[node_num]
        sents_embeds = model.encode(serv_sents)
        serv_embed = np.mean(sents_embeds, axis=0)
        tagged_embeds[node_num] = {'sents': sents_embeds, 'serv': serv_embed}
        pbar.update()

HBox(children=(IntProgress(value=0, description='Embedding Service Sentences', max=16547, style=ProgressStyle(…




In [17]:
from scipy.spatial.distance import cdist

# check that each service is similar to itself
serv_embeds = [tagged_embeds[node_num]['serv'] for node_num in serv_nodes]
# choose n random services
n = 5
n_servs = np.random.choice(list(serv_nodes.keys()), size=n)

def sanityCheck():
    for node_num in n_servs:
        serv_embed = tagged_embeds[node_num]['serv']
        # get cosine distance to all embeddings
        distances = cdist([serv_embed], serv_embeds, "cosine")[0]
        
        # zip a tuple of (nserv_nodes dist)
        results = zip(list(serv_nodes.keys()), distances)
        # sort the tuples according to distance
        results = sorted(results, key=lambda x: x[1])
        
        top_k = 15
        print("\n======================\n")
        print("Service:", serv_nodes[node_num]['name'])
        print("\nTop {} most similar services in corpus:".format(top_k))
        
        for node_num, dist in results[0:top_k]:
            print("{} (with Cosine Similarity: {})".format(serv_nodes[node_num]['name'], 1-dist))

In [18]:
sanityCheck()



Service: Pregnancy Testing - Saint Vincent Mercy

Top 15 most similar services in corpus:
Pregnancy Testing - Saint Vincent Mercy (with Cosine Similarity: 0.9999999999999999)
Pregnancy Testing - Saint Vincent Anderson Regional (with Cosine Similarity: 0.9806146493840792)
Pregnancy Testing (with Cosine Similarity: 0.9316876916628067)
Pregnancy Testing And Counseling (with Cosine Similarity: 0.9289860625566775)
Pregnancy Testing (with Cosine Similarity: 0.9210653057524528)
Pregnancy Testing (with Cosine Similarity: 0.9200712840953946)
Pregnancy Counseling (with Cosine Similarity: 0.9186404460541996)
Pregnancy Testing (with Cosine Similarity: 0.9184896378615415)
Pregnancy Testing (with Cosine Similarity: 0.9166372404421487)
Pregnancy Testing - Peoples (with Cosine Similarity: 0.9154319036847974)
Pregnancy Testing (with Cosine Similarity: 0.9145147001617011)
Pregnancy Testing (with Cosine Similarity: 0.9137234447906675)
Pregnancy Testing (with Cosine Similarity: 0.9137234447906675)
Pregn

In [19]:
from sklearn.cluster import AgglomerativeClustering

# Perform kmean clustering
num_clusters = 5
clustering_model = AgglomerativeClustering(n_clusters=num_clusters)
clustering_model.fit(serv_embeds)
cluster_assignment = clustering_model.labels_
cluster_assignment

array([4, 3, 3, ..., 3, 0, 0])

In [20]:
clustered_servs = [[] for i in range(num_clusters)]
node_nums = list(serv_nodes.keys())
for idx, cluster_id in enumerate(cluster_assignment):
    serv_name = serv_nodes[node_nums[idx]]['name']
    clustered_servs[cluster_id].append(serv_name)

In [39]:
from tabulate import tabulate

for i, cluster in enumerate(clustered_servs):
    print("Cluster ", i+1)
    np.random.shuffle(cluster)
    rows, cols = (3,5)
    table_cluster = np.asarray(cluster[0:rows*cols]).reshape(cols, rows)
    
    print(tabulate(table_cluster, tablefmt='fancy_grid', headers=["Service{}".format(i+1) for i in range(rows)]))
    print("")

Cluster  1
╒══════════════════════════╤═════════════════════════════════════════╤══════════════════════════════════════╕
│ Service1                 │ Service2                                │ Service3                             │
╞══════════════════════════╪═════════════════════════════════════════╪══════════════════════════════════════╡
│ Clothes Closet           │ Supportive Housing                      │ Animal Control                       │
├──────────────────────────┼─────────────────────────────────────────┼──────────────────────────────────────┤
│ Public Library           │ Homeless Shelter                        │ Senior Center                        │
├──────────────────────────┼─────────────────────────────────────────┼──────────────────────────────────────┤
│ Eye Care                 │ Health Insurance                        │ Career Academy                       │
├──────────────────────────┼─────────────────────────────────────────┼──────────────────────────────────────┤

In [40]:
# save embeddings to hdf5 file for more efficient storage and loading
import deepdish as dd
embeddings_path = './embeddings/sBERT/services.h5'
dd.io.save(embeddings_path, serv_embeds, compression=('blosc', 9))

In [60]:
# test random example 
import random

for i in range(10):
    doc_id = random.choice(list(serv_nodes.keys()))
    doc_idx = random.randint(0,serv_num)
    inferred_vector = model.infer_vector(docs[doc_idx].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=100)
    print('Service {} ({}): «{}»\n'.format(doc_id, serv_nodes[doc_id]['name'], serv_nodes[doc_id]['description']))
    print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
    for i in range(0,3):
        label = "{}-Most Similar".format(i+1)
        print("{} {} {}".format(label, sims[i], serv_nodes[sims[i][0]]['name']))
    print()



Service 34003 (Street Department): «Provides street and sidewalk maintenance, storm-sewer maintenance, snow and ice control, dead animal pick-up, street-sign maintenance, mosquito control, and groundskeeping of town properties in the town of Edinburgh in Bartholomew, Johnson, and Shelby counties. Provides curbside collection of trash, leaves, and yard waste for town residents.»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dbow+w,d150,n10,w5,mc5,s0.001,t4):

1-Most Similar ('41789', 0.9580173492431641) Housing Payment Assistance - Michigan City
2-Most Similar ('41790', 0.9179282784461975) Housing Payment Assistance - Hammond
3-Most Similar ('41788', 0.9020909070968628) Housing Payment Assistance - Gary

Service 32029 (Police Department): «Provides law enforcement, crime investigation, crime prevention, emergency assistance, and other police duties for the city of New Haven in Allen County. Persons in need of emergency police assistance should call 9-1-1.»

SIMILAR/DISSIMILAR DOCS PER MODE

In [27]:
counter[0]

0

In [1]:
model.similarity('Police', 'Law')

NameError: name 'model' is not defined