In [None]:
import pandas as pd
import numpy as np
import os
from annoy import AnnoyIndex

In [None]:
# Work from the processed-data directory so the bare file names in `settings` resolve.
# NOTE: the path is relative, so re-running this cell without restarting the kernel
# changes directory a second time.
os.chdir('../../data/processed')

In [None]:
# Settings for each embedding type that can be tested, keyed by a short name.
# Each entry holds:
#   embeddings    - file name of the embeddings CSV (not read in this part of the notebook)
#   text          - file name of the CSV loaded into the text dataframe
#   embedding_dim - length of the embedding vectors, passed to AnnoyIndex
#   ann_index     - file name of the saved Annoy index to load
settings = {
    'dilbert': dict(
        embeddings='embeddings_distilbert_base_df.csv',
        text='text_distilbert_base_df.csv',
        embedding_dim=768,
        ann_index='sbert.ann',
    ),
    'use_large': dict(
        embeddings='embeddings_use_large_df.csv',
        text='text_use_large_df.csv',
        embedding_dim=512,
        ann_index='use.ann',
    ),
    'use_2000': dict(
        embeddings='embeddings_use_large_2000_df.csv',
        text='text_use_large_2000_df.csv',
        embedding_dim=512,
        ann_index='use_2000.ann',
    ),
    'use_20201014': dict(
        embeddings='embeddings_20201014_df.csv',
        text='text_use_20201014_df.csv',
        embedding_dim=512,
        ann_index='use_20201014.ann',
    ),
}

In [None]:
# Which entry of `settings` the rest of the notebook uses; must be one of its keys.
embedding_type = 'use_20201014'


In [None]:
# Set up the Annoy index for the selected embedding type.
# f is the length of the embedding vectors, aka 'embedding dimension'
f = settings[embedding_type]['embedding_dim']

# declare an empty index which is going to be based on cosine similarity, aka 'angular'
u = AnnoyIndex(f, 'angular')

# load a previously saved Annoy index from the file named in the settings
u.load(settings[embedding_type]['ann_index'])
# u.unload()


In [None]:
def get_cosine_from_similarity(similarity, dp=4):
    '''
    turns a distance returned by the index into the value this notebook
    reports as the 'cosine angle'

    computes 1 - similarity**2 / 2. Annoy documents its 'angular' distance as
    sqrt(2 * (1 - cos)), so for that metric the result is the cosine similarity.

    similarity: distance value (scalar or array-like supporting ** and /)
    dp: accepted for backwards compatibility but not used; no rounding is applied
    '''
    half_squared_distance = (similarity ** 2) / 2
    return 1 - half_squared_distance

In [None]:
text_df = pd.read_csv(settings[embedding_type]['text'])

In [None]:
text_df.head()

In [None]:
base_path_idx_lookup = dict(zip(text_df['base_path'], range(text_df.shape[0])))

idx_base_path_lookup = dict(zip(range(text_df.shape[0]), text_df['base_path']))

In [None]:
len(base_path_idx_lookup.keys())

In [None]:
def get_similar_docs(base_path, verbose=True, n_matches=2):
    '''
    prints the nearest neighbours of a content item, looked up by its base_path

    base_path: key into `base_path_idx_lookup`
    verbose: currently unused; kept so existing calls keep working
    n_matches: how many neighbours to print (default 2, the previous fixed value)

    returns a 'sorry, ...' message (str) when the base_path is unknown or the
    item has no text; otherwise prints the results and returns None
    '''
    def _date_part(value):
        # first_published_at can be missing (NaN is filled elsewhere in this
        # notebook); slicing a float NaN would raise a TypeError
        return 'unknown' if pd.isna(value) else str(value)[:10]

    try:
        source_textdf_idx = base_path_idx_lookup[base_path]
    except KeyError:
        return f'sorry, base_path {base_path} not found in our lookup'
    source_text_data = text_df.iloc[source_textdf_idx]
    # NaN never compares equal to anything (including itself), so `== np.nan`
    # can not detect missing text; pd.isna can
    if pd.isna(source_text_data['doc_text']):
        return f'sorry, there\'s no text in the content item {base_path}'

    # imported here so the function does not rely on a later cell having run;
    # IPython.display is the public home of display/HTML
    from IPython.display import HTML, display

    # unpack the (ids, distances) pair instead of wrapping it in np.array,
    # which would coerce the integer item ids to floats
    neighbour_ids, distances = u.get_nns_by_item(
        source_textdf_idx, n_matches + 1, include_distances=True)
    print('query doc: ' + str(source_text_data['content_id']))
    print('first_published_at: ' + _date_part(source_text_data['first_published_at']))

    # the first neighbour returned is skipped, as before (assumed to be the query item itself)
    for neighbour_id, distance in list(zip(neighbour_ids, distances))[1:]:
        cosine_angle = get_cosine_from_similarity(distance)
        text_data = text_df.iloc[neighbour_id]

        print("----")
        print('best match: ')

        display(HTML(f"""<a href="https://www.gov.uk{text_data['base_path']}" target="_blank">text</a>"""))
        print('www.gov.uk' + text_data['base_path'])
        print('first_published_at: ' + _date_part(text_data['first_published_at']))
        print('cosine angle: ' + '%s' % float('%.2g' % cosine_angle))

In [None]:
# NaN never equals itself, so comparing with `== np.nan` is always False;
# pd.isna is the reliable test for missing text
pd.isna(
    text_df.iloc[
        base_path_idx_lookup[
            '/government/publications/covid-19-and-renting-guidance-for-landlords-tenants-and-local-authorities']
    ]['doc_text']
)

In [None]:
# display/HTML live in the public IPython.display module; importing `display`
# from IPython.core.display is deprecated
from IPython.display import display, HTML
display(HTML("""<a href="https://google.at">text</a>"""))

In [None]:
# example: print the nearest neighbours of one content item, identified by its base_path
get_similar_docs(
    '/government/publications/covid-19-and-renting-guidance-for-landlords-tenants-and-local-authorities'
)

In [None]:
text_df.iloc[275735]

In [None]:
results = np.array(u.get_nns_by_item(
    base_path_idx_lookup['/government/publications/covid-19-and-renting-guidance-for-landlords-tenants-and-local-authorities'],
    3, include_distances=True))


In [None]:
results

In [None]:
results[0,1]

In [None]:
np.array(u.get_nns_by_item(base_path_idx_lookup['/coronavirus'], 3, include_distances=True))[0, 0]

In [None]:
np.isnan(text_df.iloc[1060]['doc_text'])

In [None]:
base_path_idx_lookup['/coronavirus']

In [None]:
text_df.iloc[215404]

In [None]:
text_df['base_path'][
    np.array(u.get_nns_by_item(base_path_idx_lookup['/government/statistics/announcements/producing-a-historical-series-for-cpih'],
                               3, include_distances=True))[0, 0]]


In [None]:

def print_cosine_and_texts(text_idx, max_length_string=1000, verbose=True):
    '''
    function for printing out details of query document and the best match

    text_idx: item id in the Annoy index `u`; also used as the row position in
        `text_df` (the lookups in this notebook are built from row positions)
    max_length_string: maximum number of characters of each doc_text to print
    verbose: if True, print the details and return None;
        if False, print nothing and return the cosine value of the best match
    '''
    def _as_text(value):
        # missing values (NaN) can not be sliced or concatenated like strings
        return '' if pd.isna(value) else str(value)

    # unpack the (ids, distances) pair instead of wrapping it in np.array,
    # which would coerce the integer item ids to floats
    item_ids, distances = u.get_nns_by_item(text_idx, 2, include_distances=True)
    # position 0 is reported as the query doc, position 1 as the best match
    cosine_angle = get_cosine_from_similarity(distances[1])
    if not verbose:
        return cosine_angle

    query_row = text_df.iloc[item_ids[0]]
    match_row = text_df.iloc[item_ids[1]]

    print('cosine angle: ' + '%s' % float('%.2g' % cosine_angle))
    print("----")
    print('Index: ' + str(text_idx))
    print("----")
    print('query doc: ' + _as_text(query_row['content_id']))
    print('date: ' + _as_text(query_row['first_published_at'])[:10])
    print("----")
    print(_as_text(query_row['doc_text'])[:max_length_string])
    print("----")
    print('best match: ' + _as_text(match_row['content_id']))
    print('date: ' + _as_text(match_row['first_published_at'])[:10])
    print("----")
    print(_as_text(match_row['doc_text'])[:max_length_string])


# pick a random document, and search the index, printing to screen
print_cosine_and_texts(np.random.randint(0, 100000))
# this is really how you can compare the ability of different embeddings
print_cosine_and_texts(3779, max_length_string=10000)

# collect a list of potentially interesting documents, inspecting results:
# draw n random item ids and keep those whose best match has a cosine
# value strictly between 0.9 and 0.95
list_interesting_indices = []
n = 1000
for _draw in range(n):
    text_idx = np.random.randint(0, 3000)
    cosine_angle = print_cosine_and_texts(text_idx, verbose=False)
    if 0.9 < cosine_angle < 0.95:
        list_interesting_indices.append(text_idx)

# check count of collected docs which match your criteria above
# (printed explicitly: a bare expression in the middle of a cell is not displayed)
print(len(list_interesting_indices))

# set counter
i = 0
# if you highlight the block beneath, and hit 'shift + enter' in vs code
# you can manually inspect the results one at a time
if i < len(list_interesting_indices):
    print_cosine_and_texts(list_interesting_indices[i])
    i += 1
else:
    # avoids an IndexError when nothing (or nothing further) matched the criteria
    print('no more collected documents to inspect')


# LOOKING AT GUIDANCE ON VIRUS CONTENT
# this is just an illustration of finding content based on keywords and then looking for similar
# content

# search by date and content type:
# (a broader alternative: ['press_release', 'news_story', 'speech', 'world_news_story', 'guidance'])
doc_types = ['guidance']
doc_mask = text_df['document_type'].isin(doc_types)
# rows with no first_published_at are treated as year 2000, i.e. excluded
date_mask = text_df['first_published_at'].str[:4].fillna('2000').astype(int) >= 2020
# na=False: rows without text count as non-matches rather than leaving NaN in the mask;
# regex=False: 'virus' is a plain substring, not a pattern
text_mask = text_df['doc_text'].str.lower().str.contains('virus', na=False, regex=False)
content_mask = date_mask & doc_mask & text_mask
cols_keep = ['document_type', 'content_id', 'first_published_at', 'doc_text']
subset_text_df = text_df.loc[content_mask, cols_keep].copy()
# printed explicitly: a bare expression in the middle of a cell is not displayed
print(subset_text_df.shape)

# keep the selected documents whose best match has a cosine value above 0.8
# (the index labels of text_df are used as item ids in the Annoy index)
collected_guidance_ids = []
for i in subset_text_df.index.to_list():
    results = np.array(u.get_nns_by_item(i, 2, include_distances=True))
    cosine_angle = get_cosine_from_similarity(results[1, 1])
    if cosine_angle > 0.8:
        collected_guidance_ids.append(i)

print_cosine_and_texts(96897)
i = 0
if i < len(collected_guidance_ids):
    print_cosine_and_texts(collected_guidance_ids[i])
    i += 1
else:
    # avoids an IndexError when no guidance document passed the threshold
    print('no more collected guidance documents to inspect')


# search by vector
# this illustrates how you can search for similar documents, based on any text
# being converted into an embedding
# requires loading the embedding model, which can take time, as it's 1GB
# flake complains about the import of libraries here, but importing at the top would add to the run time of the script
# happy for this to be chopped and moved etc
'''
from universal_sentence_encoder import document_embedding

test_text = ['Britain will roll out COVID-19 vaccinations when they are ready based on clinical advice about who \
    should be prioritised, health minister Matt Hancock said on Monday, after a report that half the population \
    could miss out on the jabs.',
    'Asked about comments by the chair of the government vaccine taskforce to the Financial Times that vaccines \
    would probably only be available to less than half the UK population, Hancock said the taskforce had done \
    good work in procuring vaccines but that deployment was his department responsibility.',
    '“We will take the advice on the deployment of the vaccine, based on clinical advice from the Joint \
    Committee on vaccinations and immunizations,” Hancock told parliament.']

embedding = document_embedding(test_text)
results = np.array(u.get_nns_by_vector(embedding, 4, include_distances=True))

get_cosine_from_similarity(results[1,0])
text_df['doc_text'][results[0,0]]
'''
