# Services Content Embedding: Similarity Search

- uses our [Universal Sentence Encoder](https://tfhub.dev/google/universal-sentence-encoder/4) embeddings of Services descriptions and other auxiliary data
- embed the user given query and find the top-k most similar services

[Try your own query](#User-Query)  
[Explore Services Similarity](#Services-Similarity-Heatmap)



## Imports and Global Config



In [0]:
# Install the third-party packages this notebook imports below.
# Only tensorflow_text carries a version pin on this line.
!pip3 install annoy h2o4gpu tqdm tensorflow_text==2.0rc0 gdown

In [0]:
#following this guide: https://colab.research.google.com/drive/1t4bi7X7zRzwIjdxUrU2hUs7LneqgYLVK#scrollTo=a73qer_zPJLy
# and: https://github.com/tensorflow/hub/blob/master/examples/colab/tf2_semantic_approximate_nearest_neighbors.ipynb

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
  
import tensorflow as tf
import tensorflow_text
import tensorflow_hub as hub
import numpy as np
from h2o4gpu.metrics.pairwise import cosine_similarity
import seaborn as sns
import annoy
import gdown
import json
import os
import pickle
import pandas as pd
import random
from tqdm import tqdm, trange
import logging

TensorFlow 2.x selected.


In [0]:
# make sure we're using a gpu or hw acceleration before we start embedding!!
# NOTE: works inconsistently
# `tf.test.is_gpu_available()` is deprecated; TensorFlow's own warning asks for
# `tf.config.list_physical_devices('GPU')`, which returns a (possibly empty) list.
gpu_devices = tf.config.list_physical_devices('GPU')
# Colab exposes the TPU address through this environment variable.
tpu_attached = 'COLAB_TPU_ADDR' in os.environ
if not gpu_devices and not tpu_attached:
  print("WARNING!: This notebook is not connected to a GPU nor TPU runtime.")
else:
  print("HW Acceleration should work :)")

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
HW Acceleration should work :)


In [0]:
# Kind of node being embedded; used as a prefix in the artifact file names.
NODE_TYPE = 'services'

# Short model tag and the TF-Hub URL the encoder is loaded from.
MODEL = 'USE'
MODEL_URL = 'https://tfhub.dev/google/universal-sentence-encoder-large/5'
# The last two URL path segments are the model variant and its version.
MODEL_TYPE = MODEL_URL.split('/')[-2]
MODEL_VER = MODEL_URL.split('/')[-1]

print("Using Model {}_{}_v{}".format(MODEL, MODEL_TYPE, MODEL_VER))

Using Model USE_universal-sentence-encoder-large_v5


## Get our service texts, service embeddings, and taxonomy codes

In [0]:
# Service texts: JSON file downloaded from Google Drive.
texts_url = 'https://drive.google.com/uc?id=1-4eT0u1Cdg4fE8MLXdfEsaLWeljXDKeE'
texts_file = 'tagged_texts.json'
tagged_texts = {}
gdown.download(texts_url, texts_file, quiet=False)

# The handle must NOT be called `tf`: that would rebind the TensorFlow module
# alias for every cell that runs (or is re-run) after this one.
with open(texts_file) as texts_fh:
    tagged_texts = json.load(texts_fh)
    print("Loaded {} Services text docs...".format(len(tagged_texts)))

# Pre-computed embeddings: pickle file keyed by node id, each value holding an 'embed' vector.
embed_url = 'https://drive.google.com/uc?id=1li8eF5VCJ2wxl9FpfeQky8Z_1K3gx-bf'
embed_file = "{}_{}_v{}.pkl".format(NODE_TYPE, MODEL_TYPE, MODEL_VER)
gdown.download(embed_url, embed_file, quiet=False)

# SECURITY: pickle.load executes arbitrary code from the file; only use it on
# downloads whose origin is trusted.
with open(embed_file, 'rb') as f:
    tagged_embeds = pickle.load(f)
    # Peek at one embedding to report (and later reuse) the vector size.
    e_key = list(tagged_embeds.keys())[0]
    e_smpl = tagged_embeds[e_key]['embed']
    print("Loaded {} {}-dimensional embeddings...".format(len(tagged_embeds), len(e_smpl)))

# Taxonomy nodes: JSON file indexed by node id.
taxo_url = 'https://drive.google.com/uc?id=10gfognNI2_Q4epqApYVrFSemIZblQKjy'
taxo_file = 'HIN_nodes.json'
gdown.download(taxo_url, taxo_file, quiet=False)

with open(taxo_file) as taxo:
    taxo_nodes = json.load(taxo)
    print("Loaded taxonomy codes...")

Downloading...
From: https://drive.google.com/uc?id=1-4eT0u1Cdg4fE8MLXdfEsaLWeljXDKeE
To: /content/tagged_texts.json
13.2MB [00:00, 247MB/s]


Loaded 16547 Services text docs...


Downloading...
From: https://drive.google.com/uc?id=1li8eF5VCJ2wxl9FpfeQky8Z_1K3gx-bf
To: /content/services_universal-sentence-encoder-large_v5.pkl
35.2MB [00:00, 188MB/s]


Loaded 16547 512-dimensional embeddings...


Downloading...
From: https://drive.google.com/uc?id=10gfognNI2_Q4epqApYVrFSemIZblQKjy
To: /content/HIN_nodes.json
59.9MB [00:00, 233MB/s]


Loaded taxonomy codes...


In [0]:
# reformat data for easier use
# Every parallel list is built from the same id sequence so that position i in
# each list refers to the same node. Iterating `tagged_texts` for one list and
# `tagged_embeds` for the others could pair a name with another node's data
# if the two containers differ in order or size.
node_ids = list(tagged_embeds.keys())
node_names = [taxo_nodes[idx]['name'] for idx in node_ids]
node_embeds = [tagged_embeds[idx]['embed'] for idx in node_ids]
node_texts = ['\n'.join(tagged_texts[idx]) for idx in node_ids]
# get size of vectors
embed_dim = len(e_smpl)
# consolidate all our data into a single var: (id, name, text, embedding) per node
embed_data = list(zip(node_ids, node_names, node_texts, node_embeds))

## Semantic Similarity Search: 
get the most similar items for each of our queries

### Approximate Nearest Neighbors

Using [ANNOY](https://github.com/spotify/annoy) to build a search index so we can have a real-time search instead of searching through the 10K+ embeddings for every query.

Load our Search Index file. 

In [0]:
# Download the prebuilt ANNOY index file.
index_url = 'https://drive.google.com/uc?id=1--4qmz9oaP-AgWZxgt9gp_qmn7X0oYul'
index_file = "{}_{}_{}_ANNOY_index".format(NODE_TYPE, MODEL, MODEL_TYPE)
gdown.download(index_url, index_file, quiet=False)

# Download the companion mapping file (same name with a '.mapping' suffix).
mapping_url = 'https://drive.google.com/uc?id=1--H7X_OErsUypKoaBSACE1TVtSDbqlO2'
mapping_file = index_file + '.mapping'
gdown.download(mapping_url, mapping_file, quiet=False)

# adapted from: https://github.com/tensorflow/hub/blob/master/examples/colab/tf2_semantic_approximate_nearest_neighbors.ipynb
# uses an approximate nearest neighbor index (with ANNOY lib)
# used to avoid searching through the entirety of our data when comparing queries
# TODO: experiment with param values

# load the index and mapping files
# The index must be created with the same vector size as the stored embeddings.
index = annoy.AnnoyIndex(embed_dim, metric='angular')
index.load(index_file, prefault=True)
print('Annoy index is loaded.')
# NOTE(review): pickle.load can execute code embedded in the file; only load trusted downloads.
with open(mapping_file, 'rb') as handle:
  mapping = pickle.load(handle)
print('Mapping file is loaded.')

Downloading...
From: https://drive.google.com/uc?id=1--4qmz9oaP-AgWZxgt9gp_qmn7X0oYul
To: /content/services_USE_universal-sentence-encoder-large_ANNOY_index
63.1MB [00:00, 248MB/s]
Downloading...
From: https://drive.google.com/uc?id=1--H7X_OErsUypKoaBSACE1TVtSDbqlO2
To: /content/services_USE_universal-sentence-encoder-large_ANNOY_index.mapping
13.3MB [00:00, 274MB/s]

Annoy index is loaded.
Mapping file is loaded.





In [0]:
# Finds similar items to a given embedding in the ANN index
def find_similar_items_filter(embedding, num_matches=100, k=10):
    """Return up to ``k`` nearest items, skipping near-duplicate services.

    ``num_matches`` neighbours are fetched from the global ANNOY ``index`` and
    walked in the order the index returned them. A candidate is kept only if
    its name, its description and every one of its taxonomy codes (truncated
    at the first '.') have not already been seen on a kept candidate.

    Args:
        embedding: query vector passed to ``index.get_nns_by_vector``.
        num_matches: number of neighbours requested from the index.
        k: maximum number of filtered results returned.

    Returns:
        Tuple ``(items, distances)`` of equal-length lists, where ``items`` are
        entries of the global ``mapping`` and ``distances`` come from the index.
    """
    neighbour_ids, neighbour_dists = index.get_nns_by_vector(
        embedding, num_matches, search_k=-1, include_distances=True)
    candidates = [mapping[nid] for nid in neighbour_ids]

    seen_names, seen_descrs, seen_codes = set(), set(), set()
    kept_items, kept_dists = [], []

    for candidate, dist in zip(candidates, neighbour_dists):
        if len(seen_names) >= num_matches:
            break

        name = candidate['name']
        # get node with item info
        node = taxo_nodes[candidate['id']]
        descr = node['description']
        # compare codes without their optional '.xxxx' suffix
        short_codes = [code.split('.')[0] for code in node['codes']]

        if name in seen_names or descr in seen_descrs:
            continue
        if not seen_codes.isdisjoint(short_codes):
            continue

        seen_names.add(name)
        seen_descrs.add(descr)
        seen_codes.update(short_codes)
        kept_items.append(candidate)
        kept_dists.append(dist)

    return kept_items[:k], kept_dists[:k]

def find_similar_items(embedding, num_matches=100, k=10):
    """Return the first ``k`` of ``num_matches`` nearest neighbours, unfiltered.

    Args:
        embedding: query vector passed to ``index.get_nns_by_vector``.
        num_matches: number of neighbours requested from the global ``index``.
        k: number of leading results to keep.

    Returns:
        Tuple ``(items, distances)``: the matching ``mapping`` entries and the
        distances reported by the index, both cut to ``k`` elements.
    """
    neighbour_ids, neighbour_dists = index.get_nns_by_vector(
        embedding,
        num_matches,
        search_k=-1,
        include_distances=True,
    )
    # resolve every returned id first, then keep only the leading k
    resolved = [mapping[nid] for nid in neighbour_ids]
    top_items = resolved[:k]
    top_dists = neighbour_dists[:k]
    return top_items, top_dists

In [0]:
# Load the TF-Hub module
# `embed_fn` is the callable later used by extract_embed(); %time reports how long the load took.
print("Loading the TF-Hub {} module...".format(MODEL))
%time embed_fn = hub.load(MODEL_URL)
print("TF-Hub module is loaded.")

def extract_embed(queries):
    """Embed ``queries`` with the loaded TF-Hub module and return the result of ``.numpy()``."""
    return embed_fn(queries).numpy()

def _select_varied_items(items, dists, k, dl, du):
    """Greedily pick up to ``k`` items whose pairwise similarity stays within [dl, du].

    The first (closest) item is always selected and is exempt from the bounds.
    Each later candidate is kept only if its cosine similarity to every item
    selected so far lies in ``dl <= sim <= du``.
    """
    if not items or k <= 0:
        return [], []

    var_items, var_dists = [items[0]], [dists[0]]
    res_embeds = [tagged_embeds[items[0]['id']]['embed']]
    for item, dist in zip(items[1:], dists[1:]):
        if len(var_items) >= k:
            break
        embed = tagged_embeds[item['id']]['embed']
        # TODO: maybe change to cosine distance to be consistent
        # cosine_similarity returns a 1x1 matrix for two single vectors
        sims = [float(cosine_similarity([e], [embed])[0][0]) for e in res_embeds]
        if all(dl <= sim <= du for sim in sims):
            var_items.append(item)
            var_dists.append(dist)
            res_embeds.append(embed)
    return var_items, var_dists


def k_query_matches(queries, sample_size=10, num_res=100, k=10, all_queries=False, out=True, filtr=True, dist_var=False, dl=0.5, du=0.8):
    """Embed queries and look up their most similar services.

    Args:
        queries: sequence of query strings.
        sample_size: number of queries drawn (with replacement) when
            ``all_queries`` is False.
        num_res: number of neighbours requested from the ANN index.
        k: number of results kept per query.
        all_queries: when True, every query is processed exactly once, in order.
        out: when True, print the ranked results for each query.
        filtr: when True, use ``find_similar_items_filter`` (de-duplicated results).
        dist_var: when True (and ``filtr`` is False), diversify the results so that
            selected items are between ``dl`` and ``du`` similar to each other.
        dl: lower cosine-similarity bound used by ``dist_var``.
        du: upper cosine-similarity bound used by ``dist_var``.

    Returns:
        Tuple ``(items, dists)`` for the LAST processed query, or ``([], [])``
        when there is nothing to process.
    """
    if len(queries) == 0:
        return [], []

    if all_queries:
        # process each query once; sampling with replacement would repeat some and skip others
        query_samples = list(queries)
    else:
        query_samples = random.choices(queries, k=sample_size)

    # defined up front so the return below is valid even if no query is drawn
    items, dists = [], []
    for query in query_samples:
        query_embed = extract_embed([query])[0]

        if filtr:
            items, dists = find_similar_items_filter(query_embed, num_res, k)
        elif dist_var:
            # try to add variety by getting items where similarity between
            # new candidate and all currently selected items satisfies
            # dl <= dist <= du
            # keep the full pool of num_res candidates: cutting it to k first
            # would leave nothing to choose from
            candidates, cand_dists = find_similar_items(query_embed, num_res, num_res)
            items, dists = _select_varied_items(candidates, cand_dists, k, dl, du)
        else:
            items, dists = find_similar_items(query_embed, num_matches=k)

        if out:
            print("Top-{} most similar items (w.o. duplicates) to query \"{}\":".format(len(items), query))
            for i in range(len(items)):
                item = items[i]
                dist = dists[i]
                name = item['name']
                print("({}) {} (dist={})".format(i+1, name, dist))
            print()
    return items, dists

Loading the TF-Hub USE module...
CPU times: user 18.6 s, sys: 3.24 s, total: 21.9 s
Wall time: 28 s
TF-Hub module is loaded.


## User Query
Make sure to **run all the code cells above** and enter your own relevant query to find similar social services

In [0]:
# Read one free-text query from the user; it is wrapped in a list because
# k_query_matches expects a sequence of queries.
user_query = [input("Write a query to get recommended services: ")]

# Three alternative search modes; exactly one should be active at a time:
#   filtr=False    -> plain nearest neighbours
#   filtr=True     -> nearest neighbours with duplicates filtered out (active)
#   dist_var=True  -> variety mode
# NOTE(review): filtr defaults to True and is checked before dist_var, so the
# third variant as written would still take the filtered branch - confirm intent.
# %time matches, distances = k_query_matches(user_query, sample_size=len(user_query), filtr=False)
%time matches, distances = k_query_matches(user_query, sample_size=len(user_query), filtr=True)
# %time matches, distances = k_query_matches(user_query, sample_size=len(user_query), dist_var=True)

Write a query to get recommended services: adoption
Top-10 most similar items (w.o. duplicates) to query "adoption":
(1) Adoption Services (dist=0.9472340941429138)
(2) Domestic Infant And International Adoption (dist=1.0427831411361694)
(3) Adoption / Foster Care (dist=1.0706366300582886)
(4) Clerk's Office - Main Street - South Bend (dist=1.088861107826233)
(5) Birth Parent Network - Indiana Birth Parents (dist=1.1011050939559937)
(6) Foster Parenting - Mishawaka (dist=1.102400541305542)
(7) Angels Of Love - (dist=1.1205506324768066)
(8) Family Planning (dist=1.1461135149002075)
(9) Pregnancy Services (dist=1.1584712266921997)
(10) Information and Referral (dist=1.1598827838897705)

CPU times: user 42.2 ms, sys: 2.79 ms, total: 45 ms
Wall time: 29.5 ms


## Broad Service-Service Comparisons
- Each service has one or more [211 taxonomy codes](https://211taxonomy.org/publicfiles/view/Intro-What_is_the_AIRS.pdf) of the form AZ-1234.\[5678\]. The codes are hierarchical: the fewer characters a code has, the broader the social service category it denotes.
- We use the broadest codes (1 letter), which cover 9 categories, to label the x and y axes, and the next level down (2 letters), which covers 96 categories, for a more granular comparison.
- We average the embeddings of all services that share a code and compare these per-code averages in a heatmap of their cosine similarity.

In [0]:
def node_to_n_degree_code(node_num, code_names, n=2):
    """Return the first ``n`` characters of a node's first taxonomy code.

    Args:
        node_num: key of the node in the global ``taxo_nodes``.
        code_names: accepted for call compatibility; not read by this function.
        n: number of leading characters of the code to keep.

    Returns:
        The truncated first code, or ``None`` when the node has no codes.
    """
    node_codes = taxo_nodes[node_num]['codes']
    if len(node_codes) < 1:
        return None
    return node_codes[0][:n]

# Download the JSON file that maps taxonomy codes to readable names.
code_names_url = 'https://drive.google.com/uc?id=1Zummlwwvg3mCKH58Q5uCMtqow2YTjw_e'
code_names_file = 'code_to_name.json'
gdown.download(code_names_url, code_names_file, quiet=False)

# `code_names` is indexed first by code length as a string (e.g. '1', '2'),
# then by the code itself - see its use in getLabelEmbeds.
with open(code_names_file) as ctn:
    code_names = json.loads(ctn.read())

def getLabelEmbeds(code_len='1'):
    """Group node embeddings by taxonomy label and average each group.

    A node's label is the name of the first ``code_len`` characters of its
    first taxonomy code; nodes without codes are labelled 'No Label'.

    Args:
        code_len: taxonomy level as a string key of ``code_names`` ('1', '2', ...).

    Returns:
        Tuple ``(tagged_lbl_emb, lbl_to_avg_emb, lbl_cnt)``:
        - ``tagged_lbl_emb``: node id -> {'embed': vector, 'label': label}
        - ``lbl_to_avg_emb``: label -> element-wise mean of its node embeddings
        - ``lbl_cnt``: label -> number of nodes carrying that label
        Labels with no nodes are omitted from the last two dicts.

    Raises:
        KeyError: if a node's truncated code is missing from ``code_names[code_len]``.
    """
    no_label = 'No Label'
    level_names = code_names[code_len]
    # TODO change how code_names is saved so it only has the lbls from 2 char codes

    # Pre-seed every known label so the key order of the returned dicts is
    # fixed: 'No Label' first, then labels in code_names order.
    embeds_by_lbl = {no_label: []}
    for lbl in level_names.values():
        embeds_by_lbl[lbl] = []

    tagged_lbl_emb = {}
    for node_num, entry in tagged_embeds.items():
        node_codes = taxo_nodes[node_num]['codes']
        main_code = node_to_n_degree_code(node_num, node_codes, n=int(code_len))
        # pseudo label according to the first chars of the taxonomy code
        lbl = no_label if main_code is None else level_names[main_code]
        embed = entry['embed']
        tagged_lbl_emb[node_num] = {'embed': embed, 'label': lbl}
        embeds_by_lbl[lbl].append(embed)

    # keep only labels that received at least one node; the count is simply
    # the group size, so no separate counter has to be maintained
    populated = {lbl: embs for lbl, embs in embeds_by_lbl.items() if embs}
    lbl_to_avg_emb = {lbl: np.mean(embs, axis=0) for lbl, embs in populated.items()}
    lbl_cnt = {lbl: len(embs) for lbl, embs in populated.items()}

    print("Nodes have {} labels".format(len(lbl_to_avg_emb)))
    return tagged_lbl_emb, lbl_to_avg_emb, lbl_cnt

Downloading...
From: https://drive.google.com/uc?id=1Zummlwwvg3mCKH58Q5uCMtqow2YTjw_e
To: /content/code_to_name.json
100%|██████████| 3.76k/3.76k [00:00<00:00, 5.54MB/s]


In [0]:
lvl1_tagged_embs, lvl1_avg_embeds, lvl1_cnt = getLabelEmbeds(code_len='1')
lvl2_tagged_embs, lvl2_avg_embeds, lvl2_cnt = getLabelEmbeds(code_len='2')

# level-1 label -> set of level-2 labels found on the same nodes
label_hierarchy = {}
for lbl in lvl1_avg_embeds:
    label_hierarchy[lbl] = set()

# get order of nodes to display their names when hovering
lbl_node_nums = {}
for lbl in lvl1_avg_embeds:
    lbl_node_nums[lbl] = []

for n_idx in lvl1_tagged_embs:
    lvl1_lbl = lvl1_tagged_embs[n_idx]['label']

    label_hierarchy[lvl1_lbl].add(lvl2_tagged_embs[n_idx]['label'])
    # lbl_node_nums[lvl1_lbl].append(n_idx)

# get the embeddings grouped according to lvl1 labels
reordered_embeds = []
reordered_lbls = []
for lbl in label_hierarchy:

    sub_lbls = label_hierarchy[lbl]
    # A set of strings iterates in a hash-dependent order that changes between
    # kernel sessions; sorting makes the heatmap axis order reproducible.
    for sub in sorted(sub_lbls):
        reordered_embeds.append(lvl2_avg_embeds[sub])
        reordered_lbls.append(sub)

# reordered_names = [taxo_nodes[n_idx]['name'] for n_idx in reordered_node_ids]
# pairwise cosine similarity between the averaged level-2 embeddings
embed_sims = cosine_similarity(reordered_embeds)
tick_vals = []
tick_total = 0
for lbl in lvl1_cnt:
    val = lvl1_cnt[lbl]
    tick_vals.append(val // 2)
    tick_total += val

print("Have {} names for x,y axis".format(len(reordered_lbls)))
# print(reordered_embeds)
print("Have {}x{} similarity matrix".format(len(embed_sims), len(embed_sims[0])))

# print(list(lvl1_avg_embeds.keys()))

# print(list(lvl1_avg_embeds.keys()))

Nodes have 12 labels
Nodes have 73 labels
Have 73 names for x,y axis
Have 73x73 similarity matrix


## Services Similarity Heatmap

In [0]:
import plotly.graph_objects as go
import datetime
import numpy as np
# NOTE(review): this seeds NumPy's RNG only; the stdlib `random` module used
# elsewhere in this notebook is not affected by it.
np.random.seed(1)

ticks = list(label_hierarchy.keys())

# Keep asking until the user supplies a number inside [0.0, 1.0].
# A non-numeric entry re-prompts instead of raising ValueError, and the chained
# comparison also rejects 'nan', which would otherwise blank the whole heatmap.
threshold_prompt = "Enter a minimum similarity threshold (0.0 <= x <= 1.0) for service comparison: "
sim_threshold = None
while sim_threshold is None:
    try:
        entered = float(input(threshold_prompt))
    except ValueError:
        print("Please enter a number.")
        continue
    if 0.0 <= entered <= 1.0:
        sim_threshold = entered

# zero out every similarity below the threshold so only strong links are coloured
threshold_sims = [[sim if sim >= sim_threshold else 0.0 for sim in row] for row in embed_sims]

y = reordered_lbls
x = reordered_lbls
z = threshold_sims

def split_text(text, delim=' '):
    """Return ``text`` re-joined for use as a multi-line plot tick label.

    The text is split on ``delim``. An HTML line break follows every
    even-indexed word and a space follows every odd-indexed word; nothing is
    appended after the last word.

    Args:
        text: label to wrap.
        delim: separator the label is split on.

    Returns:
        The wrapped label string.
    """
    words = text.split(delim)
    last = len(words) - 1
    txt = ''
    for j, word in enumerate(words):
        txt += word
        if j < last:
            # without a separator here neighbouring words would run together
            txt += ' <br>' if j % 2 == 0 else ' '
    return txt

# Build the wrapped level-1 labels explicitly from the return value instead of
# letting the helper write into a global list through the loop variable.
tick_text = [
    split_text(text, delim='/') if '/' in text else split_text(text)
    for text in lvl1_avg_embeds.keys()
]


# Former fixed tick spacing; kept defined in case other cells still read it.
dx_dy = 6

# The heatmap axes list level-2 labels grouped by level-1 label, in
# label_hierarchy order. Put each level-1 tick at the first position of its
# group rather than assuming every group holds exactly `dx_dy` sub-labels.
group_starts = []
group_offset = 0
for lvl1_lbl in label_hierarchy:
    group_starts.append(group_offset)
    group_offset += len(label_hierarchy[lvl1_lbl])

x_axis = dict(
        tickmode = 'array',
        tickvals = list(group_starts),
        ticktext = tick_text
)

y_axis = dict(
        tickmode = 'array',
        tickvals = list(group_starts),
        ticktext = tick_text
)

# Candidate colour scale names; one is picked at random for each run.
color_scales = ['aggrnyl', 'agsunset', 'algae', 'amp', 'armyrose', 'balance',
    'blackbody', 'bluered', 'blues', 'blugrn', 'bluyl', 'brbg',
    'brwnyl', 'bugn', 'bupu', 'burg', 'burgyl', 'cividis', 'curl',
    'darkmint', 'deep', 'delta', 'dense', 'earth', 'edge', 'electric',
    'emrld', 'fall', 'geyser', 'gnbu', 'gray', 'greens', 'greys',
    'haline', 'hot', 'hsv', 'ice', 'icefire', 'inferno', 'jet',
    'magenta', 'magma', 'matter', 'mint', 'mrybm', 'mygbm', 'oranges',
    'orrd', 'oryel', 'peach', 'phase', 'picnic', 'pinkyl', 'piyg',
    'plasma', 'plotly3', 'portland', 'prgn', 'pubu', 'pubugn', 'puor',
    'purd', 'purp', 'purples', 'purpor', 'rainbow', 'rdbu', 'rdgy',
    'rdpu', 'rdylbu', 'rdylgn', 'redor', 'reds', 'solar', 'spectral',
    'speed', 'sunset', 'sunsetdark', 'teal', 'tealgrn', 'tealrose',
    'tempo', 'temps', 'thermal', 'tropic', 'turbid', 'twilight',
    'viridis', 'ylgn', 'ylgnbu', 'ylorbr', 'ylorrd']

# NOTE(review): the choice uses the stdlib `random` module, which is not seeded
# anywhere visible in this notebook, so the scale can differ between runs.
color_scale = random.choice(color_scales)
print("Using color scale: {}".format(color_scale))


# Heatmap of the thresholded similarity matrix; x and y carry the same labels.
fig = go.Figure(data=go.Heatmap(
        z=z,
        x=x,
        y=y,
        colorscale = color_scale))

# Fixed-size layout with category axes; tick positions and texts come from x_axis / y_axis.
fig.update_layout(
    title='Service-to-Service Similarity',
    xaxis_nticks=len(tick_text),
    yaxis_nticks=len(tick_text),
    width = 1000,
    height = 800,
    xaxis_type = 'category',
    yaxis_type = 'category',
    xaxis = x_axis,
    yaxis = y_axis,
    margin=dict(l=20, r=30, t=50, b=40),
    paper_bgcolor="LightSteelBlue",
    font=dict(
        family="Courier New, monospace",
        size=12,
        # color="#7f7f7f"
        )
    )
    

fig.show()

Enter a minimum similarity threshold (0.0 <= x <= 1.0) for service comparison: 0.65
Using color scale: speed
