# Graph Nodes Embedding

- Use Doc2Vec to embed the description, name, etc of services 
- cluster these embeddings and measure cluster performance

later:
- use [BANE](https://github.com/benedekrozemberczki/BANE) code to embed the nodes taking into account their network structure
- compare cluster performance


In [24]:
import json
import os
import csv
import random
import pandas as pd
import nltk
import typing
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm, trange
# %%capture
# tqdm().pandas()
%matplotlib inline

In [25]:
# open our datasets
with open('./data/services_nodes.json') as sn:
    serv_nodes = json.loads(sn.read())
    
with open('./data/services_edgelist.csv') as se:
    serv_edges = csv.reader(se)

with open('./data/HIN_nodes.json') as taxo:
    taxo_nodes = json.loads(taxo.read())

with open('./data/code_to_node_num.json') as cn:
    code_trans = json.loads(cn.read())

In [28]:
# following: https://towardsdatascience.com/machine-learning-text-processing-1d5a2d638958
# other ref: https://medium.com/@datamonsters/text-preprocessing-in-python-steps-tools-and-examples-bf025f872908

# dictionary used to store our services node content to be embeded
serv_cont = {}

# service features to be embedded as text or tags
# maybe fix to get actual agency name instead of id
text_feats = [
    'name', 
    'akas', 
    'description', 
#     'codes',
#     'eligibility'
]

from nltk.stem import WordNetLemmatizer
import string
# returns a 
def preprocessText(text: str) -> typing.List :
    # normalize: convert to lower, remove 
    # numbers, remove punctuation
    text = text.lower()
    text = text.translate(str.maketrans('','', string.digits))
    text = text.translate(str.maketrans('','',string.punctuation))
    words = []
    # tokenize into sentences
    text_sents = nltk.sent_tokenize(text)
    for sent in text_sents:
        words.extend(nltk.word_tokenize(sent))
    
    stop_words = set(nltk.corpus.stopwords.words())
    # remove stopwords like 'the', 'is', 'a', etc
    words = [w for w in words if w not in stop_words]
    # get base form of words
    lemmer = WordNetLemmatizer()
    words = [lemmer.lemmatize(w) for w in words]
    
    return words

In [29]:
# word2vec ref: http://jalammar.github.io/illustrated-word2vec/

# "documents" consisting of the direct and related text content from each node
# each elem is a TaggedDocument class from gensim
docs = []
serv_num = len(serv_nodes)
# for each service node
with tqdm(total=serv_num, desc='Pre-processing Text') as pbar:
    for node_num in serv_nodes:

        node = serv_nodes[node_num]
        words = []
        # preprocess the text content of the node and it's taxonomy code nodes
        for feat in text_feats:

            text = node[feat]
            if feat in ('akas', 'eligibility'):
                text = ' '.join(text)

            elif feat == 'codes':
                for cn in node[feat]:
                    code = taxo_nodes[str(code_trans[cn])]
                    words.extend(preprocessText(code['name']))
                    words.extend(preprocessText(code['description']))
                    if 'keywords' in code:
                        keywords = ' '.join(code['keywords'])
                        words.extend(preprocessText(keywords))

            # some nodes have empty features
            elif text:
                words.extend(preprocessText(text))
            # preprocess our node raw text feats

        docs.append(TaggedDocument(words=words, tags=[node_num]))
        pbar.update(1)
random.shuffle(docs)

HBox(children=(IntProgress(value=0, description='Pre-processing Text', max=16547, style=ProgressStyle(descript…




In [30]:
# ref for clustering embeddings: https://towardsdatascience.com/automatic-topic-clustering-using-doc2vec-e1cea88449c
# some discussion of model parameter tuning: https://stackoverflow.com/questions/47890052/improving-gensim-doc2vec-results
# explaining negative sampling vs hierarchical-softmax: https://stackoverflow.com/questions/46860197/doc2vec-and-word2vec-with-negative-sampling
# more useful info on parameters: https://stackoverflow.com/questions/56323377/which-method-dm-or-dbow-works-well-for-document-similarity-using-doc2vec
# some discussion on embedding size: https://datascience.stackexchange.com/questions/51404/word2vec-how-to-choose-the-embedding-size-parameter
# followed this guide : https://ai.intelligentonlinetools.com/ml/text-clustering-doc2vec-word-embedding-machine-learning/
# and gensim examples: https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html
# gensim docs: https://radimrehurek.com/gensim/models/doc2vec.html

# use default alpha/min-alpha learning rates. 
# use skip-gram to train word vector alongside doc vectors using dbow
trn_prompt = ''
while trn_prompt not in ('Y', 'N'):
    trn_prompt = input("Would you like to train a model from scratch with our {} documents?: ".format(serv_num))
    trn_prompt = trn_prompt.upper()
train = True if trn_prompt == 'Y' else False

if train:
    model_fname = './models/doc2vec_services'
    # model = Doc2Vec(dm=0, dbow_words=1, workers=4, negative=10, min_count=5, vector_size=150, epochs=30)
    model = Doc2Vec(dm=1, dm_mean=1, workers=8, negative=10, min_count=5, vector_size=150, epochs=30)
    model.build_vocab(docs)
    model.train(docs, total_examples=model.corpus_count, epochs=model.epochs)

Would you like to train a model from scratch with our 16547 documents?:  Y


In [31]:
# finished training/updating model. delete training data to reduce memory use when loaded
model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
model.save(model_fname)

In [32]:
# confirm model was saved correctly
model = Doc2Vec.load(model_fname)

In [None]:
# sanity check: check if given each doc in the same training data
# the model can return that same doc as the most similar
def sanityCheck(model):
    ranks = []
    second_ranks = []
    model_len = len(model.docvecs)
    with tqdm(total=serv_num) as pbar:
        for idx, doc_id in enumerate(serv_nodes):
            inferred_vector = model.infer_vector(docs[idx].words)
            sims = model.docvecs.most_similar([inferred_vector], topn=model_len)
            rank = [docid for docid, sim in sims].index(doc_id)
            ranks.append(rank)

            second_ranks.append(sims[1])
            pbar.update(1)
        
    import collections
    counter = collections.Counter(ranks)
    return counter

HBox(children=(IntProgress(value=0, max=16547), HTML(value='')))

In [60]:
# test random example 
import random

for i in range(10):
    doc_id = random.choice(list(serv_nodes.keys()))
    doc_idx = random.randint(0,serv_num)
    inferred_vector = model.infer_vector(docs[doc_idx].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=100)
    print('Service {} ({}): «{}»\n'.format(doc_id, serv_nodes[doc_id]['name'], serv_nodes[doc_id]['description']))
    print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
    for i in range(0,3):
        label = "{}-Most Similar".format(i+1)
        print("{} {} {}".format(label, sims[i], serv_nodes[sims[i][0]]['name']))
    print()



Service 34003 (Street Department): «Provides street and sidewalk maintenance, storm-sewer maintenance, snow and ice control, dead animal pick-up, street-sign maintenance, mosquito control, and groundskeeping of town properties in the town of Edinburgh in Bartholomew, Johnson, and Shelby counties. Provides curbside collection of trash, leaves, and yard waste for town residents.»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dbow+w,d150,n10,w5,mc5,s0.001,t4):

1-Most Similar ('41789', 0.9580173492431641) Housing Payment Assistance - Michigan City
2-Most Similar ('41790', 0.9179282784461975) Housing Payment Assistance - Hammond
3-Most Similar ('41788', 0.9020909070968628) Housing Payment Assistance - Gary

Service 32029 (Police Department): «Provides law enforcement, crime investigation, crime prevention, emergency assistance, and other police duties for the city of New Haven in Allen County. Persons in need of emergency police assistance should call 9-1-1.»

SIMILAR/DISSIMILAR DOCS PER MODE

In [27]:
counter[0]

0

In [1]:
model.similarity('Police', 'Law')

NameError: name 'model' is not defined