# Entity subclasses

There are lots of entities in government that are subclasses of one another.

By this I mean that, for example, a "child passport" is a subclass of "passport": it is a more specific form of a passport and has siblings such as "adult passport", "horse passport", "pet passport" etc.

By extracting these we can get a better understanding of how entities relate to one another.

We can also use this information to help users. For example, we can recognise that a search like "passport for my daughter" relates to "child passport" (well... probably - a daughter can technically be any age - but we do know that a horse or pet passport is not related), and so improve search results.

We can also use it to provide handy hints for users who are doing vague searches. For example, if someone types in "passport" we can suggest "adult passport", "child passport" etc as a way of helping them refine what they mean and thus get better results

The results from this notebook are quite good but some refinement is needed. It's also _painfully_ slow - taking more than 24 hours on my machine

In [None]:
import pandas as pd
import spacy
import os
from py2neo import Graph
import sys
import os
from bs4 import BeautifulSoup


In [None]:

# govuk-language-model uses the MODEL_FILE_PATH environment variable to locate the data for the model,
# so it is set here before the package is imported and the model is instantiated below.
# I have the data in that folder, you might have it elsewhere. Contact me if you're not sure what the data
# is and/or how it all fits together
os.environ['MODEL_FILE_PATH'] = '../../govuk-knowledge-graph/data'
# Requires a checkout of govuk-language-model at this relative path; adding it to sys.path
# is what makes the `sagemaker.container...` import below resolvable
sys.path.append("../../govuk-language-model")
from sagemaker.container.govner.govner import GovNER
# Shared model instance: its predictor is what get_subclasses_for_text calls to label tokens
ner = GovNER()

In [None]:

# Root of the local HTML mirror; a page's file is looked up as <root><base_path>.html
html_content_dir_path = "/Users/oscarwyatt/govuk/govuk-knowledge-extractor/govuk-production-mirror-replica"
# Gzipped, tab-separated dump of the preprocessed content store
preprocessed_content_store_path = "/Users/oscarwyatt/govuk/govuk-knowledge-graph/data/preprocessed_content_store_070920.csv.gz"

all_content_items = pd.read_csv(
    preprocessed_content_store_path,
    sep="\t",
    compression="gzip",
    low_memory=False,
)

print("Finished reading from the preprocessed content store!")

# "Mainstream" content here means the items whose publishing app is 'publisher'
mainstream_content = all_content_items.loc[all_content_items['publishing_app'] == 'publisher']

In [None]:
class Page:
    """A content item together with the text fragments of its mirrored HTML page.

    The HTML file is read and parsed once, at construction time, and the
    resulting strings are stored in ``self.texts``.
    """

    # Inline tags that are unwrapped so the text they contain stays part of
    # the surrounding sentence instead of being split into its own fragment.
    INLINE_TAGS = ('b', 'i', 'u', 'a', 'abbr')

    def __init__(self, content_item, html_content_dir_path):
        """
        :param content_item: mapping-like record of the content item; only its
            ``base_path`` entry is read by this class
        :param html_content_dir_path: root directory of the local HTML mirror
        """
        self.content_item = content_item
        self.html_content_dir_path = html_content_dir_path
        self.texts = self._extract_texts()

    def base_path(self):
        """Return the content item's ``base_path``."""
        return self.content_item['base_path']

    def html_file_path(self):
        """Return the path of this page's HTML file inside the mirror directory."""
        return f"{self.html_content_dir_path}{self.base_path()}.html"

    def _extract_texts(self):
        """Return the page's text fragments, or ``[]`` if no HTML file exists.

        Inline formatting tags are unwrapped and ``<script>`` elements are
        removed before the strings are collected; fragments consisting of a
        single newline are dropped.
        """
        html_file_path = self.html_file_path()
        if not os.path.exists(html_file_path):
            # I have an old copy of the mirrors so sometimes the file won't exist
            return []

        # Decode explicitly as UTF-8 rather than relying on the platform's
        # default encoding, which differs between machines.
        with open(html_file_path, "r", encoding="utf-8") as html_file:
            html = html_file.read()

        soup = BeautifulSoup(html, 'html.parser')
        for tag in self.INLINE_TAGS:
            for match in soup.find_all(tag):
                match.unwrap()
        for script in soup.find_all('script'):
            script.extract()

        # Unwrapping leaves the pieces of a sentence as separate adjacent
        # strings in the tree; serialising and re-parsing joins them back
        # together so that soup.strings yields whole runs of text.
        soup = BeautifulSoup(str(soup), 'html.parser')
        return [text for text in soup.strings if text != '\n']

In [None]:
# One Page per mainstream content item; constructing a Page reads and parses
# its HTML file, so this is where the mirror is actually loaded from disk.
pages = [
    Page(row, html_content_dir_path)
    for _, row in mainstream_content.iterrows()
]

In [None]:
# NOTE: get_subclasses is defined in the cell that follows this one, so that
# cell has to be executed before this one.
# Flat list of every [parent, child] pair found across all pages.
subclasses = [
    pair
    for page in pages
    for pair in get_subclasses(page.texts)
]

In [None]:
def get_subclasses(texts):
    """Return the [parent, child] pairs found in every text, as one flat list.

    :param texts: iterable of text fragments
    :return: the results of get_subclasses_for_text for each fragment,
        concatenated in order
    """
    return [
        pair
        for text in texts
        for pair in get_subclasses_for_text(text)
    ]


def get_subclasses_for_text(text):
    """Extract [parent, child] entity pairs from a single piece of text.

    The text is labelled by the NER predictor, and every run of consecutive
    labelled tokens is read from right to left. Each extra token to the left
    produces a more specific entity, which is recorded as a child of the
    entity built so far. For example, the labelled run "first adult passport"
    yields ``["passport", "adult passport"]`` and
    ``["adult passport", "first adult passport"]``.

    :param text: the text to analyse
    :return: list of ``[parent, child]`` lists of strings
    """
    extracted_matches = ner.predictor.predict(text)
    # NOTE(review): the return value is unused; the call is kept in case it
    # has side effects on `ner` - confirm and remove if it does not.
    ner._create_entity_dict(extracted_matches)

    extracted_subclasses = []
    for left_to_right_tokens, left_to_right_labels in extracted_matches:
        # Start every token/label sequence with a clean slate. Previously the
        # token run survived from one sequence to the next while the parent
        # was reset, which could emit pairs whose parent was None.
        tokens_with_label = []
        last_entity = None
        right_to_left = zip(reversed(left_to_right_tokens),
                            reversed(left_to_right_labels))
        for token, label in right_to_left:
            if label == "O" or label == '[SEP]':
                # Unlabelled token: the current run (if any) has ended
                tokens_with_label = []
                last_entity = None
                continue

            # could be a subclass
            tokens_with_label.append(token)
            if len(tokens_with_label) == 1:
                # A single token is only a potential parent so far
                last_entity = token
                continue

            # Tokens were collected right to left; restore reading order
            entire_entity = " ".join(reversed(tokens_with_label))
            extracted_subclasses.append([last_entity, entire_entity])
            last_entity = entire_entity
    return extracted_subclasses

In [None]:
# Total number of [parent, child] pairs collected, before de-duplication
len(subclasses)

In [None]:
# De-duplicate case-insensitively: the key is the lower-cased "parent - child"
# string, the value is the [parent, child] pair as extracted. When several
# pairs share a key, the last one seen is kept.
unique_subclasses = {
    f"{pair[0].lower()} - {pair[1].lower()}": pair
    for pair in subclasses
}

In [None]:
# Number of distinct (case-insensitive) parent/child pairs
len(unique_subclasses)

In [None]:
# Display the de-duplicated pairs for manual inspection
unique_subclasses

In [None]:
# It takes a long time to run so save them to disk

# json is imported here because the notebook otherwise only imports it in the
# later "load from file" cell; without this, the save fails with a NameError
# after the (very long) extraction has already finished.
import json

with open('subclasses.json', 'w') as json_file:
    json.dump(unique_subclasses, json_file)

### Load subclasses from file and insert into graph

If you're coming back later or with a pregenerated file, you can start from here

In [None]:
import json
import os

# Read back the mapping written by the extraction step:
# "parent - child" (lower-cased) -> [parent, child]
with open('subclasses.json', 'r') as subclasses_file:
    loaded_unique_subclasses = json.loads(subclasses_file.read())
print(len(loaded_unique_subclasses))

This finds the subclass entries for which both sides already exist as entities in the graph and, for each of those, creates a relationship between the two entities.

In [None]:
# Imported here (as well as at the top of the notebook) so that this cell
# works when the notebook is resumed from the "load from file" step in a
# fresh kernel.
from py2neo import Graph
# NOTE(review): the location of ClientError depends on the installed py2neo
# release - confirm against the version in use.
try:
    from py2neo.errors import ClientError
except ImportError:
    from py2neo.database import ClientError

host = os.environ.get('REMOTE_NEO4J_URL')
# Honour NEO4J_USER when it is set (it used to be read and then ignored);
# fall back to the previously hard-coded 'neo4j' user otherwise.
user = os.environ.get('NEO4J_USER') or 'neo4j'
password = os.environ.get('NEO4J_PASSWORD')
graph = Graph(host=host, user=user, password=password, secure=True)

# Entity names are passed as query parameters rather than spliced into the
# Cypher text, so names containing quotes (e.g. apostrophes) cannot break or
# alter the query.
find_pair_query = (
    "MATCH (a:Entity {name: $parent}) "
    "WITH a OPTIONAL MATCH (b:Entity {name: $child}) "
    "RETURN a.name AS parent, b.name AS child"
)
# MERGE instead of CREATE so that re-running this cell does not add duplicate
# relationships between the same two entities.
# NB: I haven't run this query so it may require debugging
link_pair_query = (
    "MATCH (parent:Entity {name: $parent}) "
    "MATCH (child:Entity {name: $child}) "
    "MERGE (parent)-[:HAS_SUBCLASS]->(child) "
    "MERGE (child)-[:HAS_SUPERCLASS]->(parent)"
)

has_both = []
# The file maps "parent - child" keys to [parent, child] lists; the names
# needed here are in the values, not the keys.
# NOTE(review): names are matched with the casing they were extracted in; if
# Entity.name is stored lower-cased, lower-case them here - confirm.
for parent, child in loaded_unique_subclasses.values():
    try:
        entities = graph.run(find_pair_query, parent=parent, child=child).data()
        print(entities)
        if entities and entities[0]['parent'] and entities[0]['child']:
            has_both.append([parent, child])
            graph.run(link_pair_query, parent=parent, child=child)
    except ClientError as error:
        # Best effort: report the pair that failed and carry on with the rest
        print(f"Skipping {parent!r} -> {child!r}: {error}")
        continue

In [None]:
# Display the [parent, child] pairs for which both entities were found in the graph
has_both