In [None]:
import pandas as pd

import os

## Check what document types there are

In [None]:
# Read the content store export: a gzip-compressed, tab-separated file
content_store_file_path = '../../data/raw/preprocessed_content_store_210920.csv'
content_store_df = pd.read_csv(
    content_store_file_path,
    sep="\t",
    compression='gzip',
    low_memory=False,
)


In [None]:
# Inspect which columns the content store dataframe provides
content_store_df.columns

In [None]:
# Show the current limit on how many rows pandas will display
pd.options.display.max_rows

In [None]:
# Allow up to 140 rows to be displayed before pandas truncates the output
pd.options.display.max_rows = 140

In [None]:
# Number of non-null base_path entries per document type, largest first
(
    content_store_df[['document_type', 'base_path']]
    .groupby('document_type')
    .count()
    .sort_values(by='base_path', ascending=False)
)

## Which document types fail to produce embeddings?

In [None]:
import tensorflow_hub as hub
import tensorflow as tf
import numpy as np
from ast import literal_eval

# Load the embedding model from a local directory
# NOTE(review): the path suggests Universal Sentence Encoder v4 — confirm
model = hub.load('../../data/external/universal-sentence-encoder_4')

In [None]:
def embed(input):
    """Pass `input` to the loaded model and return whatever the model returns.
    """
    model_output = model(input)
    return model_output

In [None]:
# keep documents first published after 2000; a missing date is treated as
# the year 2000 and is therefore filtered out
first_published_year = (
    content_store_df['first_published_at']
    .str[:4]
    .fillna('2000')
    .astype(int)
)
date_mask = first_published_year > 2000

# keep live documents only, i.e. rows whose withdrawn flag equals False
live_mask = content_store_df['withdrawn'].eq(False)

In [None]:
# a row is kept only when it passes both the date filter and the live filter
content_mask = date_mask & live_mask

cols_keep = ['document_type', 'content_id', 'first_published_at', 'details']
# select the kept rows/columns and parse the details strings into Python
# objects with literal_eval (the result is a new dataframe)
subset_content_df = (
    content_store_df
    .loc[content_mask, cols_keep]
    .assign(details=lambda frame: frame['details'].map(literal_eval))
)

In [None]:
def cos_sim(a, b):
    """Return the cosine similarity of two vectors a and b:
    their dot product divided by the product of their Euclidean norms.
    """
    magnitude_product = np.linalg.norm(a) * np.linalg.norm(b)
    return np.dot(a, b) / magnitude_product


def clean_xml(original_text):
    ''' strips out xml tagging from string

    Every '<...>' span is removed and the text between the spans is joined
    back together. A '<' with no later '>' is not a tag, so it and the text
    after it are kept unchanged.

    original_text: string that may contain xml/html tags
    returns: the string with the tags removed
    '''
    extracted_sentence = []
    cursor = 0  # position in original_text up to which text has been handled
    while True:
        tag_open = original_text.find('<', cursor)
        if tag_open < 0:
            break
        # look for the closing bracket only after the opening one, so a stray
        # '>' earlier in the text is never mistaken for the end of this tag
        tag_close = original_text.find('>', tag_open + 1)
        if tag_close < 0:
            break
        extracted_sentence.append(original_text[cursor:tag_open])
        cursor = tag_close + 1
    # whatever follows the last complete tag is plain text
    extracted_sentence.append(original_text[cursor:])
    return str(''.join(extracted_sentence))


# sentences may be more performant
def extract_paragraphs(original_text):
    ''' takes raw string text from gov uk and returns extracted paragraphs

    Each paragraph is the text between a '<p>' and the next '</p>', passed
    through clean_xml. Only the bare '<p>' opening tag is recognised, and
    empty paragraphs ('<p></p>') are skipped.

    original_text: string to search for paragraphs
    returns: list of paragraph strings, in document order
    raises: AttributeError if original_text has no .find method
    '''
    open_tag = '<p>'
    close_tag = '</p>'
    extracted_paragraphs = []
    # search forward from a cursor instead of re-slicing the remaining text
    cursor = 0
    while True:
        start_idx = original_text.find(open_tag, cursor)
        end_idx = original_text.find(close_tag, cursor)
        if start_idx < 0 or end_idx < 0:
            break
        # a closing tag before the opening tag, or an empty paragraph, yields
        # nothing; it is skipped by moving the cursor past it
        if end_idx - start_idx > len(open_tag):
            paragraph_markup = original_text[start_idx + len(open_tag):end_idx]
            extracted_paragraphs.append(clean_xml(paragraph_markup))
        # step over the whole closing tag
        cursor = end_idx + len(close_tag)
    return extracted_paragraphs


def document_embedding(paragraphs):
    """
    embed the given paragraphs and return the mean of the embeddings
    along axis 0, converted to a numpy value
    """
    paragraph_embeddings = embed(paragraphs)
    mean_embedding = tf.math.reduce_mean(paragraph_embeddings, axis=0)
    return mean_embedding.numpy()

In [None]:
# initialise an empty array for embeddings: one row per document, 512 columns
# (the embedding width this notebook assumes); rows of documents that cannot
# be embedded stay as zeros
n_docs = subset_content_df.shape[0]
collected_doc_embeddings = np.zeros((n_docs, 512))

# fill array with embeddings for all docs
# iterate over the two needed columns directly rather than building a row
# Series with .iloc on every iteration
doc_rows = zip(subset_content_df['document_type'], subset_content_df['details'])
for i, (doc_type, details) in enumerate(doc_rows):
    # report progress first so that skipped documents do not suppress it
    if i % 1000 == 0:
        progress = i / n_docs
        print('%s' % float('%.2g' % progress))
    try:
        doc = details['body']
    except KeyError:
        # this document has no body to embed
        continue
    try:
        extracted_paragraphs = extract_paragraphs(doc)
    except AttributeError:
        # the body is not a string; report the doc type and skip the document
        # so that paragraphs left over from a previous document are never
        # embedded into this row
        print(doc_type)
        continue
    if len(extracted_paragraphs) > 0:
        doc_embedding = document_embedding(extracted_paragraphs)
        collected_doc_embeddings[i, :] = doc_embedding


In [None]:
# Document types to drop before the second pass over the documents.
# NOTE(review): presumably these are the types reported as failing in the
# previous run — confirm.
# Every entry needs its own trailing comma: two adjacent string literals are
# silently concatenated into a single (non-existent) document type.
doctype_inverse_mask = subset_content_df.document_type.isin([
    'aaib_report',
    'answer',
    'asylum_support_decision',
    'business_finance_support_scheme',
    'cma_case',
    'countryside_stewardship_grant',
    'drug_safety_update',
    'employment_appeal_tribunal_decision',
    'employment_tribunal_decision',
    'esi_fund',
    'export_health_certificate',
    'help_page',
    'html_publication',
    'international_development_fund',
    'maib_report',
    'manual',
    'manual_section',
    'medical_safety_alert',
    'ministerial_role',
    'person',
    'raib_report',
    'research_for_development_output',
    'residential_property_tribunal_decision',
    'service_standard_report',
    'simple_smart_answer',
    'statutory_instrument',
    'tax_tribunal_decision',
    'utaac_decision',
])
# keep only the rows whose document type is NOT in the list above
subset_content_df_v2 = subset_content_df.loc[~doctype_inverse_mask, cols_keep].copy()

In [None]:
# initialise an empty array for embeddings (left as zeros while the embedding
# step below is commented out)
total_docs_v2 = subset_content_df_v2.shape[0]
collected_doc_embeddings = np.zeros((total_docs_v2, 512))

bad_doc_types = set()
# walk over the remaining docs and record every document type whose body
# raises AttributeError during paragraph extraction
doc_rows_v2 = zip(subset_content_df_v2['document_type'], subset_content_df_v2['details'])
for i, (doc_type, details) in enumerate(doc_rows_v2):
    try:
        doc = details['body']
    except KeyError:
        continue
    try:
        extracted_paragraphs = extract_paragraphs(doc)
    except AttributeError:
        bad_doc_types.add(doc_type)

#     if len(extracted_paragraphs) > 0:
#         doc_embedding = document_embedding(extracted_paragraphs)
#         collected_doc_embeddings[i, :] = doc_embedding
    if i % 1000 == 0:
        print('%s' % float('%.2g' % (i / total_docs_v2)))


In [None]:
# Display the document types whose body raised AttributeError during paragraph extraction
bad_doc_types