In [None]:
import pandas as pd
import numpy as np
import os
from annoy import AnnoyIndex

from IPython.core.display import display, HTML


In [None]:
# Move into the processed-data directory so the relative file names used by
# later cells resolve. The guard keeps this cell idempotent: without it,
# re-running the cell on a live kernel applies the relative '../..' hop a
# second time and either fails or lands in the wrong directory.
if not os.getcwd().endswith(os.path.join('data', 'processed')):
    os.chdir('../../data/processed')

In [None]:
# Settings for each embedding run that can be tested, keyed by run name.
# Every entry names the embeddings file, the text file, the embedding vector
# length and the saved Annoy index file.
settings = {
    'use_20201014_v2': dict(
        embeddings='embeddings_20201014_df_v2.csv',
        text='text_use_20201014_df_v2.csv',
        embedding_dim=512,
        ann_index='use_20201014_v2.ann',
    ),
}

In [None]:
# Key into `settings` selecting which embedding configuration the cells below use.
embedding_type = 'use_20201014_v2'

In [None]:
# f is the embedding dimension, i.e. the length of every vector in the index.
f = settings[embedding_type]['embedding_dim']

# Create an empty index using the 'angular' (cosine-based) metric, then
# populate it from the previously trained index file on disk.
u = AnnoyIndex(f, 'angular')
u.load(settings[embedding_type]['ann_index'])
# u.unload() can be called to release the loaded index when finished.

In [None]:
def get_cosine_from_similarity(similarity, dp=4):
    '''
    Convert an Annoy 'angular' distance into a cosine similarity.

    Annoy defines its angular distance as d = sqrt(2 * (1 - cos)), so the
    cosine similarity is recovered as cos = 1 - d**2 / 2.

    Parameters
    ----------
    similarity : float
        Angular distance as returned by the Annoy index (0 means identical
        direction, larger means less similar).
    dp : int or None
        Number of decimal places to round the result to. Pass None to get
        the unrounded value.

    Returns
    -------
    float
        Cosine similarity, rounded to `dp` decimal places unless dp is None.
    '''
    cosine_similarity = 1 - (similarity**2) / 2
    if dp is None:
        return cosine_similarity
    return round(cosine_similarity, dp)

In [None]:
# Load the text table for the selected embedding run.
text_df = pd.read_csv(settings[embedding_type]['text'])

# Two-way mapping between a document's base_path and its row position in
# text_df: position -> base_path first, then the inverse built from it.
idx_base_path_lookup = dict(enumerate(text_df['base_path']))

base_path_idx_lookup = {path: row for row, path in idx_base_path_lookup.items()}

In [None]:
def get_similar_docs(base_path, verbose=True, n_results=3):
    '''
    Print the documents most similar to the content item at `base_path`.

    Looks the item up in `text_df`, asks the Annoy index `u` for its nearest
    neighbours and prints the title, URL, publication date and similarity
    score of each neighbour.

    Parameters
    ----------
    base_path : str
        base_path of the query document, as stored in text_df['base_path'].
    verbose : bool
        Currently unused; kept so existing callers keep working.
    n_results : int
        Maximum number of similar documents to print (default 3).

    Returns
    -------
    str or None
        A message when the base_path is unknown or the item has no text;
        otherwise None (the results are printed, not returned).
    '''
    try:
        source_textdf_idx = base_path_idx_lookup[base_path]
    except KeyError:
        return f'sorry, base_path {base_path} not found in our lookup'
    source_text_data = text_df.iloc[source_textdf_idx]
    # NaN never compares equal to anything (not even itself), so an
    # `== np.nan` test can never detect missing text; pd.isna can.
    if pd.isna(source_text_data['doc_text']):
        return f'sorry, there\'s no text in the content item {base_path}'

    # Ask for one extra neighbour: the query document is normally returned as
    # its own nearest neighbour and is filtered out by id below.
    neighbour_ids, distances = u.get_nns_by_item(
        source_textdf_idx, n_results + 1, include_distances=True)
    neighbours = [
        (idx, distance)
        for idx, distance in zip(neighbour_ids, distances)
        if idx != source_textdf_idx
    ][:n_results]

    print('query doc: ')
    print(source_text_data['title'])
    print(f"https://www.gov.uk{source_text_data['base_path']}")
    # str() keeps the slice safe if the date is missing (NaN) in the table.
    print('first_published_at: ' + str(source_text_data['first_published_at'])[:10])

    print('\n similar content: \n')

    # Iterating over what the index actually returned avoids an IndexError
    # when fewer neighbours than requested are available.
    for idx, distance in neighbours:
        cosine_angle = get_cosine_from_similarity(distance)
        text_data = text_df.iloc[int(idx)]

        print(text_data['title'])
        print(f"https://www.gov.uk{text_data['base_path']}")
        print('first_published_at: ' + str(text_data['first_published_at'])[:10])
        # Show the score to two significant figures.
        print('similarity score: ' + '%s' % float('%.2g' % cosine_angle))
        print("----")

# get similar URLs

In [None]:
# Example query: show the closest matches for one GOV.UK page.
get_similar_docs('/rent-room-in-your-home')