# Extract SVO triples from GOV.UK

This notebook explores extracting subject verb object triples from titles on GOV.UK content
The idea is that we'll have triples showing how the entities relate to one another
For example, in the sentence "Apply for a passport" the verb is "apply" and the object is "passport"; there is no subject. It's worth noting that subjects are quite rare in titles, presumably because the subject is usually, implicitly, the reader. Further work could explore assuming the reader is the subject except where a title says otherwise. I think this ought to work, but I reckon there will be edge cases where it is wrong.

Anyway, the cool thing about SVO triples is that we can use them to understand what you can do to various entities. It's worth noting that using the headline title of the page is the easiest and most reliable way to do this. Body text is more comprehensive, but there are all kinds of edge cases: the verb gets negated, there's a condition, or there are cases like "Universal Credit is paid once a month", where extracting "paid" is technically correct but it's not something a user is able to do. It's DWP that does the paying, so we need to understand more about which party in a transaction does what, and whether the subject is government (or some other non-user entity) or not... food for thought.

I've also included work on FrameNet which has massive potential but is a bit more complex

In [None]:
from bs4 import BeautifulSoup
import pandas as pd
import os
import re
from nltk import Tree
import spacy
import json
from py2neo import Graph

# Load spaCy's small English pipeline once; this `nlp` callable is handed to
# Page/Title further down, which call it on each title string.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Hard-coded local paths: the HTML mirror of GOV.UK pages and the
# preprocessed content store export.
html_content_dir_path = "/Users/oscarwyatt/govuk/govuk-knowledge-extractor/govuk-production-mirror-replica"
preprocessed_content_store_path = "/Users/oscarwyatt/govuk/govuk-knowledge-graph/data/preprocessed_content_store_070920.csv.gz"

# The content store is read as a gzip-compressed, tab-separated file.
all_content_items = pd.read_csv(
    preprocessed_content_store_path,
    sep="\t",
    compression="gzip",
    low_memory=False,
)

print("Finished reading from the preprocessed content store!")

# Keep only the rows whose publishing app is 'publisher'.
published_by_publisher = all_content_items['publishing_app'] == 'publisher'
mainstream_content = all_content_items[published_by_publisher]

In [None]:
class SOV:
    """A subject / verb / object triple extracted from a title.

    Each slot is ``None``, a single token-like object exposing ``.text``, or a
    sequence of token-like objects exposing ``.text_with_ws``. The ``cypher_*``
    accessors return the slot as normalised text for use in graph queries.
    """

    def __init__(self):
        self.subject = None
        self.object = None
        self.verb = None

    def cypher_subject(self):
        """Return the subject as normalised text ('' when there is none)."""
        return self._cypher_safe(self.subject)

    def cypher_object(self):
        """Return the object as normalised text ('' when there is none)."""
        return self._cypher_safe(self.object)

    def cypher_verb(self):
        """Return the verb as normalised text ('' when there is none)."""
        return self._cypher_safe(self.verb)

    def _cypher_safe(self, token):
        """Lower-case, strip and remove single quotes from a slot's text.

        Args:
            token: ``None``, one token, or a list/tuple of tokens.

        Returns:
            The normalised text; an empty string for ``None`` or an empty
            sequence.
        """
        if token is None:
            return ""
        # isinstance (rather than an exact type comparison) so that tuples and
        # list subclasses are treated as token sequences too.
        if isinstance(token, (list, tuple)):
            text = "".join(t.text_with_ws for t in token)
        else:
            text = token.text
        # NOTE(review): only single quotes are removed; a backslash in the
        # text would still be an escape character inside a quoted Cypher
        # string literal if this value is concatenated into a query.
        return text.lower().strip().replace("'", "")

class TitleProcessor:
    """Base class holding debugging helpers shared by the title processors."""

    def _to_nltk_tree(self, node):
        """Turn a parse node and its descendants into an ``nltk.Tree``.

        A node without any left or right children is returned as its bare
        ``orth_`` string instead of a Tree.
        """
        is_leaf = (node.n_lefts + node.n_rights) <= 0
        if is_leaf:
            return node.orth_
        subtrees = [self._to_nltk_tree(child) for child in node.children]
        return Tree(node.orth_, subtrees)

    def _debug_token(self, token):
        """Print the token's text, dependency labels and left children."""
        print(f"text: {token.text}")
        print(f"dep: {token.dep_}")
        parent = token.head
        print(f"head dep: {parent.dep_}")
        print(f"head head pos: {parent.head.pos_}")
        print(f"lefts: {[*token.lefts]}")
        print()

        
class SVOProcessor(TitleProcessor):
    """Extract SOV triples (verb + object, sometimes a subject) from a parsed title."""

    def process(self, doc, debug=False):
        """Return every triple found in ``doc``.

        Args:
            doc: the parsed title; iterated token by token (``doc.sents`` is
                only used to print parse trees when ``debug`` is true).
            debug: when true, print parse trees and per-token diagnostics.

        Returns:
            A list of SOV instances. The list is empty when nothing matched,
            and also whenever the title contains the token "if".
        """
        if debug:
            for sent in doc.sents:
                tree = self._to_nltk_tree(sent.root)
                # A single-token sentence comes back as a plain string, which
                # has no pretty_print().
                if isinstance(tree, str):
                    print(tree)
                else:
                    tree.pretty_print()
        triples = []
        for token in doc:
            # If statements can be highly misleading (as a source of Truth)
            if token.text == "if":
                return []
            if debug:
                self._debug_token(token)
            triples.extend(self._find_triples(token, debug))
        return triples

    def _find_triples(self, token, debug=False):
        """Return the triples anchored on ``token`` (an empty list when none).

        The prepositional-phrase rule is tried first; the simple direct-object
        rule is only used when the first one does not match.
        """
        triples = self._is_object_of_prepositional_phrase(token, debug)
        if triples:
            if debug:
                print("is_object_of_prepositional_phrase")
            return triples
        triples = self._is_object(token, debug)
        if triples:
            if debug:
                print("is_object")
            return triples
        return []

    def _verbs(self):
        """Return the part-of-speech tags that count as a verb."""
        return ["VERB", "AUX"]

    def _is_object_of_prepositional_phrase(self, token, debug=False):
        """Match objects reached through a preposition or an xcomp verb.

        eg "Apply online for a UK passport", "Apply for this licence"

        Returns:
            A one-element list holding the triple, or ``None`` when ``token``
            does not match.
        """
        is_prep_object = token.dep_ == "pobj" and token.head.dep_ == "prep"
        is_xcomp_object = token.dep_ == "dobj" and token.head.dep_ == "xcomp"
        if not (is_prep_object or is_xcomp_object):
            return None
        # The grandparent becomes the triple's verb, so it has to be a verb in
        # BOTH cases. The original condition was `A or B and C`, which only
        # applied this check to the xcomp case and let noun heads through as
        # "verbs" (e.g. "Planning permission for farms").
        if token.head.head.pos_ not in self._verbs():
            return None
        if debug:
            print(f"AHA ITS A PREP PHRASE, token is : {token.text}")
        triple = SOV()
        triple.verb = token.head.head
        # experiment: possessive modifiers to the left of the object (nearest
        # first) are recorded as the subject
        reversed_lefts = list(token.lefts)
        reversed_lefts.reverse()
        if debug:
            print(f"reversed lefts are: {reversed_lefts}")
        triple.subject = []
        for left in reversed_lefts:
            if debug:
                print(f"left text: {left.text}")
                print(f"left dep: {left.dep_}")
            if left.dep_ == "poss":
                triple.subject.append(left)
                if debug:
                    print(f"After appending lefts, subject is now: {triple.subject}")
        # end experiment
        triple.object = self._object_with_compounds(token, debug)
        return [triple]

    def _is_object(self, token, debug=False):
        """Match a direct object whose head is a verb.

        eg "Get a passport for your child"
        TODO: should probably extract "for your child" bit as a modifier of some kind

        Returns:
            A one-element list holding the triple, or ``None`` when ``token``
            does not match.
        """
        if token.dep_ != "dobj" or token.head.pos_ not in self._verbs():
            return None
        triple = SOV()
        # The guard above checks token.head, so token.head is the governing
        # verb. The original stored token.head.head, which is only the same
        # token when the verb is the sentence root.
        triple.verb = token.head
        triple.object = self._object_with_compounds(token, debug)
        return [triple]

    def _object_with_compounds(self, token, debug=False):
        """Return ``token`` preceded by its compound modifiers, in text order."""
        compound_lefts = self._compound_left_compounds(token, debug)
        # Collected nearest-first, so flip back into reading order.
        compound_lefts.reverse()
        if debug and compound_lefts:
            print(f"reversed compound lefts are: {compound_lefts}")
        phrase = compound_lefts + [token]
        if debug and compound_lefts:
            print(f"object is now: {phrase}")
        return phrase

    def _compound_left_compounds(self, token, debug=False):
        """Collect the run of "compound" modifiers directly left of ``token``.

        Walks the left children from nearest to farthest, recursing into each
        compound, and stops at the first child that is not a compound.

        Returns:
            The modifiers found, nearest first (possibly empty).
        """
        if debug:
            print(f"compounded lefts for token: {token.text}")
        compounded_lefts = []
        reversed_lefts = list(token.lefts)
        reversed_lefts.reverse()
        if debug:
            print(reversed_lefts)
        for left in reversed_lefts:
            if debug:
                print(f"left text: {left.text}")
                print(f"left dep: {left.dep_}")
            if left.dep_ != "compound":
                break
            compounded_lefts.append(left)
            compounded_lefts += self._compound_left_compounds(left, debug)
        return compounded_lefts
    

    
        
class EntityCombinationProcesor(TitleProcessor):
    """Find "X and Y" style combinations of entities in a parsed title.

    This isn't used at the moment as it's low impact but worth keeping.
    The basic idea is that there are plenty of content items whose title
    is something like "Tourette's syndrome and driving" where the "and"
    combination is important - as it indicates that it's specifically related
    to Tourette's syndrome and its relation to driving.

    Known texts that perform well
        Optic neuritis and driving
        Tourette's syndrome and driving

    Texts that don't work/need further work
        Kindertransport and the State Pension
        Stroke (cerebrovascular accident) and driving
    """

    def process(self, doc, debug=False):
        """Return the conjoined entities in ``doc`` and the word joining them.

        Args:
            doc: the parsed title, iterated token by token.
            debug: when true, print parse trees and per-token diagnostics.

        Returns:
            ``[objects, join]`` when at least one conjunction was found, where
            ``objects`` alternates between a list of tokens (the conjunction's
            head preceded by its adjectival modifiers) and the conjoined token
            itself, and ``join`` is the last coordinating-conjunction token
            seen before a conjunct. An empty list when nothing was found.
        """
        if debug:
            for sent in doc.sents:
                tree = self._to_nltk_tree(sent.root)
                # A single-token sentence comes back as a plain string, which
                # has no pretty_print().
                if isinstance(tree, str):
                    print(tree)
                else:
                    tree.pretty_print()
        last_cc_token = None
        objects = []
        join = None
        for token in doc:
            if debug:
                self._debug_token(token)
            if token.dep_ == "cc":
                last_cc_token = token
            if token.dep_ == "conj" and last_cc_token is not None:
                # The original called a non-existent module-level function
                # here, so this branch raised NameError whenever it ran.
                modifiers = self._compound_left_compounds(token.head, debug)
                # Collected nearest-first, so flip back into reading order.
                modifiers.reverse()
                head_token = modifiers + [token.head]
                if debug:
                    print(f"after adding left compounds, token is now: {head_token}")
                objects.append(head_token)
                objects.append(token)
                join = last_cc_token
        if objects:
            return [objects, join]
        return []

    def _compound_left_compounds(self, token, debug=False):
        """Collect the run of "amod" modifiers directly left of ``token``.

        Walks the left children from nearest to farthest, recursing into each
        adjectival modifier, and stops at the first child that is not one.

        Returns:
            The modifiers found, nearest first (possibly empty).
        """
        compounded_lefts = []
        reversed_lefts = list(token.lefts)
        reversed_lefts.reverse()
        if debug:
            print(f"compounded lefts for token: {token.text} are {reversed_lefts}")
        for left in reversed_lefts:
            if debug:
                print(f"left text: {left.text}")
                print(f"left dep: {left.dep_}")
            if left.dep_ != "amod":
                break
            compounded_lefts.append(left)
            compounded_lefts += self._compound_left_compounds(left, debug)
        return compounded_lefts

class PageTitleExtractor:
    """Read a page's mirrored HTML file and turn its <h1> text into Title objects."""

    def __init__(self, page, nlp):
        self.page = page
        self.nlp = nlp
        self.titles = []

    def extract(self):
        """Return the page's titles, reading the HTML file when none are cached.

        Returns:
            A list of Title instances. It is empty when the HTML file does not
            exist, has no <main> element, or has no non-empty <h1> text.
        """
        if self.titles:
            return self.titles
        html_path = self.page.html_file_path()
        # I have an old copy of the mirrors so sometimes the file won't exist
        if not os.path.exists(html_path):
            return self.titles
        # Encoding is stated explicitly instead of relying on the platform default.
        with open(html_path, "r", encoding="utf-8") as html_file:
            html = html_file.read()
        # Name the parser so results do not depend on which optional parsers
        # happen to be installed.
        soup = BeautifulSoup(html, "html.parser")
        main_content = self._get_main_content(soup)
        # TODO: Sometimes there is no main content, not sure why, should investigate
        # I _suspect_ that it's that the page doesn't have a "main" tag
        if main_content is not None:
            for title in self._extract_titles(main_content):
                self.titles.append(Title(title, self.nlp))
        return self.titles

    def _get_main_content(self, soup):
        """Strip navigation/feedback elements and return the <main> element (or None)."""
        # Sometimes nav. items are in the main content block
        # so we need to remove those too
        for class_pattern in ('nav', 'feedback'):
            for element in soup.find_all(class_=re.compile(class_pattern)):
                element.decompose()
        for element in soup.find_all("nav"):
            element.decompose()
        return soup.find("main")

    def _extract_titles(self, soup):
        """Return the cleaned, non-empty text fragments of every <h1> in ``soup``.

        Each string inside a header has runs of spaces collapsed, newlines
        removed and surrounding whitespace stripped. Fragments that end up
        empty are dropped: they would otherwise become blank titles and could
        sit in front of the real title.
        """
        # Unwrap inline formatting tags so only their text remains in the header.
        for tag in ['b', 'i', 'u', 'a', 'abbr']:
            for match in soup.find_all(tag):
                match.unwrap()
        # Only the h1 is used at the moment; use '^h[1-6]$' to take all headers.
        headers = soup.find_all(re.compile('^h1$'))
        titles = []
        for header in headers:
            for string in header.strings:
                cleaned = re.sub(' +', ' ', string).replace("\n", "").strip()
                if cleaned:
                    titles.append(cleaned)
        return titles
            
class Title:
    """One title string, its parsed form, and lazily computed analyses of it."""

    def __init__(self, title, nlp):
        self.title = title
        # Parse once up front; both analyses below work from this parse.
        self.doc = nlp(title)
        self.triples, self.computed_triples = [], False
        self.combinations, self.computed_combinations = [], False

    def subject_object_triples(self, debug=False):
        """Return the title's SOV triples, computing them on the first call only.

        ``debug`` only has an effect on the call that does the computing.
        """
        if not self.computed_triples:
            print(f"debug at title: {debug}")
            self.triples = SVOProcessor().process(self.doc, debug)
            self.computed_triples = True
        return self.triples

    def entity_combinations(self):
        """Return the title's entity combinations, computing them on the first call only."""
        if not self.computed_combinations:
            self.combinations = EntityCombinationProcesor().process(self.doc)
            self.computed_combinations = True
        return self.combinations
        
class Page:
    """A content item together with what is needed to find and parse its HTML."""

    def __init__(self, content_item, nlp, html_content_dir_path):
        self.html_content_dir_path = html_content_dir_path
        self.content_item = content_item
        self.nlp = nlp
        self.extracted_titles = []

    def base_path(self):
        """Return the content item's 'base_path' value."""
        return self.content_item['base_path']

    def html_file_path(self):
        """Return the HTML file location: directory + base path + '.html'."""
        return "{}{}.html".format(self.html_content_dir_path, self.base_path())

    def titles(self):
        """Return the page's titles, extracting them when none are cached yet."""
        if not any(self.extracted_titles):
            self.extracted_titles = PageTitleExtractor(self, self.nlp).extract()
        return self.extracted_titles


In [None]:

# One Page per row of the mainstream content frame.
pages = [
    Page(content_item, nlp, html_content_dir_path)
    for _row_label, content_item in mainstream_content.iterrows()
]
    

### Insert SVO triples into graph

In [None]:

def create_actions(page, triple):
    """Record ``triple`` as an Action node linked to ``page`` in the graph.

    The page (a ``Cid`` node) and the object/subject ``Entity`` nodes are
    MATCHed, so nothing is written unless they already exist. The ``Verb`` and
    ``Action`` nodes and their relationships are MERGEd. The action is named
    "<verb> <object>", prefixed with the subject when the triple has one.

    Values are sent as query parameters rather than concatenated into the
    Cypher text, so a quote or backslash in a base path or title cannot break
    or alter the query.

    Uses the module-level ``graph`` connection.

    Args:
        page: object exposing ``base_path()``.
        triple: an SOV-like object exposing ``subject`` and the ``cypher_*``
            accessors.
    """
    verb_name = triple.cypher_verb()
    object_name = triple.cypher_object()
    has_subject = bool(triple.subject)

    parameters = {
        "page_name": page.base_path(),
        "object_name": object_name,
        "verb_name": verb_name,
        "action_name": f"{verb_name} {object_name}",
    }
    clauses = [
        "MATCH (page:Cid {name: $page_name})",
        "MATCH (object:Entity {name: $object_name})",
    ]
    if has_subject:
        subject_name = triple.cypher_subject()
        parameters["subject_name"] = subject_name
        parameters["action_name"] = f"{subject_name} {verb_name} {object_name}"
        clauses.append("MATCH (subject:Entity {name: $subject_name})")
    clauses += [
        "MERGE (verb:Verb {name: $verb_name})",
        "MERGE (action:Action {name: $action_name})",
        "MERGE (action)<-[:TITLE_MENTIONS]-(page)",
        "MERGE (action)-[:HAS_VERB]->(verb)",
        "MERGE (action)-[:HAS_OBJECT]->(object)",
    ]
    if has_subject:
        clauses.append("MERGE (action)-[:HAS_SUBJECT]->(subject)")

    graph.run(" ".join(clauses), parameters)

In [None]:
# Insert title actions into graph
# Once it has run (shouldn't take long) inspect the graph to see how it works (in particular look at Action nodes and their linkages)

host = os.environ.get('REMOTE_NEO4J_URL')
# NEO4J_USER used to be read and then ignored in favour of a hard-coded
# 'neo4j'; honour it when set and keep 'neo4j' as the fallback.
user = os.environ.get('NEO4J_USER') or 'neo4j'
password = os.environ.get('NEO4J_PASSWORD')
graph = Graph(host=host, user=user, password=password, secure=True)

# Only the first title of each page is used for actions.
for page in pages:
    page_titles = page.titles()
    if page_titles:
        title = page_titles[0]
        for triple in title.subject_object_triples():
            create_actions(page, triple)

In [None]:
# Make a specific call to find pages that allow you to "renew licence"
# The query looks for an Action node linked (by HAS_VERB, HAS_OBJECT or
# HAS_SUBJECT) to a node named "renew" and to a node named "licence", then
# returns the names of the Cid nodes connected to it by TITLE_MENTIONS.

graph.run('MATCH ({name: "renew"})-[:HAS_VERB|HAS_OBJECT|HAS_SUBJECT]-(n:Action)-[:HAS_VERB|HAS_OBJECT|HAS_SUBJECT]-({name: "licence"}) WITH n MATCH (n)-[:TITLE_MENTIONS]-(c:Cid) return c.name').data()

### Let's look at pages that don't have any triples and see if we can find some

In [None]:
# Pages whose first title produced neither a triple nor an entity combination.
no_triples_or_combinations = []
for page in pages:
    if not any(page.titles()):
        continue
    first_title = page.titles()[0]
    found_triples = len(first_title.subject_object_triples())
    if found_triples == 0 and len(first_title.entity_combinations()) == 0:
        no_triples_or_combinations.append(page)

In [None]:
# Total number of pages, followed by how many of them ended up in
# no_triples_or_combinations.
print(len(pages))
len(no_triples_or_combinations)


### Find all verbs and objects for content (i.e. what you can do to which things)

This is the section to run to export data for the finder frontend prototype

In [None]:
# Two lookups built from every title of every page:
#   objects: object text -> [verb text, base path, title text] entries
#   verbs:   verb text   -> [object text, base path, title text] entries
objects = {}
verbs = {}
a = ""
for page in pages:
    if not any(page.titles()):
        continue
    for title in page.titles():
        for triple in title.subject_object_triples():
            triple_object = triple.cypher_object()
            triple_verb = triple.cypher_verb()
            objects.setdefault(triple_object, []).append(
                [triple_verb, page.base_path(), title.title]
            )
            verbs.setdefault(triple_verb, []).append(
                [triple_object, page.base_path(), title.title]
            )

In [None]:
def _unique_by_first_field(items):
    """Return ``items`` keeping only the first entry for each distinct ``item[0]``.

    Order is preserved. The first field must be hashable (here it is the verb
    or object text).
    """
    seen = set()
    unique_items = []
    for item in items:
        if item[0] not in seen:
            seen.add(item[0])
            unique_items.append(item)
    return unique_items


# Per object keep one entry per verb, and per verb one entry per object.
# Assigning to existing keys while iterating is safe: the dict's size does
# not change.
for k in objects:
    objects[k] = _unique_by_first_field(objects[k])

for k in verbs:
    verbs[k] = _unique_by_first_field(verbs[k])

In [None]:
# Dump out results for usage in finder-frontend

import json

# Each lookup is written to its own JSON file in the working directory.
for output_name, payload in (('objects.json', objects), ('verbs.json', verbs)):
    with open(output_name, 'w') as json_file:
        json.dump(payload, json_file)

### Debugging

In [None]:
# Useful for debugging SOV extraction
# Each assignment below overwrites the previous one, so only the last
# uncommented `text = ...` before the Title is built takes effect; reorder or
# comment lines out to try a different title.

text = "Appeal a Housing Benefit decision"
text = "Apply for Universal Credit"
text = "Housing Benefit can help you pay your rent if you’re unemployed, on a low income or claiming benefits."
text = "Get state pension"
text = "Order a commemorative marriage certificate"

# ands
# Déjà vu and driving
# Severe memory problems and driving
# Kindertransport and the State Pension
# Optic neuritis and driving
# Tourette's syndrome and driving
# Stroke (cerebrovascular accident) and driving

# Vehicle recalls and faults
# VAT visits and inspections
# Farm and livery horses
# Maternity pay and leave

# Tattoo, piercing and electrolysis licence (Northern Ireland)
# Your benefits, tax and pension after the death of a spouse





# # ors
# Laser or intense pulsed light treatment licence (Northern Ireland)
# Brain abscess, cyst or encephalitis and driving
# Dizziness or vertigo and driving

# # fors
# Planning permission for farms
# Claim tax relief for your job expenses
# Master certificate for forest reproductive material (Northern Ireland)
# Driver CPC training for qualified drivers
# Tax relief for community amateur sports clubs (CASCs)
# Support for Mortgage Interest (SMI)
# Compensation for victims of modern slavery and human trafficking
# Reduced rate National Insurance for married women
# PAYE and payroll for employers
# Subsidised transport for 16 to 19 year olds in education
# Vehicle access permit for pedestrian zones (Northern Ireland)

# # other interesting
# Statutory Sick Pay (SSP): employer guide
# Student finance: how to apply
# VAT on services from abroad
# Licence for projections over a highway (England & Wales)
# Haulage jobs in the EU
# Rights and responsibilities for reservists and employers
# Budget for your Self Assessment tax bill if you're self-employed
# Tax on your UK income if you live abroad
# Dispose of waste in sea (Scotland)
# Safety certificates: sports grounds (England, Scotland and Wales)

# # SVO
# Exchange your paper driving licence for a photocard licence
# Check if a vehicle is taxed
# Stop being an employer
# Check if an animal medicine is licensed
# Request CCTV footage of yourself
# Volunteer as a coastguard
# Get your court costs assessed and approved
# Book internet access in your library
# Claim asylum in the UK
# Check if an alcohol wholesaler is approved
# Check if an awarding body is recognised
# Check if a university or college is officially recognised
# Become a motorcycle instructor
# Become a magistrate
# Comment on an alcohol licence
# Become an approved building inspector (England and Wales)

text = "Severe memory problems and driving"
# Kindertransport and the State Pension
# Optic neuritis and driving
# Tourette's syndrome and driving
# Stroke (cerebrovascular accident) and driving

text = "Apply to adopt a child through your council"
t = Title(text, nlp)
# Print every extracted triple. Indexing a fixed position ([1]) raised
# IndexError for any title that yields fewer than two triples.
for sov in t.subject_object_triples(debug=True):
    print(f"subject: {sov.subject}")
    print(f"object: {sov.object}")
    print(f"verb: {sov.verb}")

In [None]:


    
    
text = "Optic neuritis and driving"
text = "Kindertransport and the State Pension"
# Optic neuritis and driving
# Tourette's syndrome and driving
# 
text = "Stroke (cerebrovascular accident) and driving"
# The class defined above is EntityCombinationProcesor (there is no
# EntityCombination), and process() walks parsed tokens, so the text is parsed
# with nlp first instead of being passed in as a raw string.
EntityCombinationProcesor().process(nlp(text))

# Framenet

FrameNet offers a really good framework to 'cluster' the actions that are possible and to help synonymise verbs. For example, in many cases "get" and "apply" for something are effectively synonyms and are used interchangeably. Linking them through frames will help us answer queries that don't use the exact same verb as the one in the content title, as well as offering ways to find all content that (say) is about "applying for something".

It's a little more complex than I'd initially thought so I'm keeping this here for future use as I think it will be useful but it was a bit of trying to run before we could walk

## Inserting FrameNet

In [None]:
from nltk.corpus import framenet as fn
from py2neo import Graph
import os

In [None]:
# Do some inheritance munging
# For every FrameNet frame, gather its own lexical units and frame elements
# plus those of each parent it is the child of in an "Inheritance" relation.

frames_data = []
for frame in fn.frames():
    inherited_parents = [
        relation['Parent']
        for relation in frame['frameRelations']
        if relation['type']['name'] == "Inheritance" and relation['Child'] == frame
    ]
    lexeme_units = list(frame['lexUnit'].keys())
    fe = list(frame["FE"].keys())
    for parent in inherited_parents:
        lexeme_units.extend(parent['lexUnit'].keys())
        fe.extend(parent['FE'].keys())
    frames_data.append({
        'name': frame['name'],
        'lexeme_units': lexeme_units,
        'fe': fe,
    })

In [None]:
# THIS TAKES A LOOOOOONG TIME
# Creates a Frame node per FrameNet frame, links it to a Verb node for each of
# its verb lexical units, and links it to a FrameElement node per frame
# element (re-using an existing Entity with the same name when there is one).
# All values are sent as query parameters rather than concatenated into the
# Cypher text.

host = os.environ.get('REMOTE_NEO4J_URL')
# NEO4J_USER used to be read and then ignored in favour of a hard-coded
# 'neo4j'; honour it when set and keep 'neo4j' as the fallback.
user = os.environ.get('NEO4J_USER') or 'neo4j'
password = os.environ.get('NEO4J_PASSWORD')
graph = Graph(host=host, user=user, password=password, secure=True)

cypher = ""

# Frames before this position are skipped.
# NOTE(review): presumably a resume point left over from earlier partial
# runs - set to 0 to insert every frame.
first_frame_index = 447 + 529

# Start the counter at the offset so the progress line shows the real
# position within frames_data.
for index, frame in enumerate(frames_data[first_frame_index:], start=first_frame_index):
    print(f"{index} of {len(frames_data)}")
    frame_name = frame['name']
    graph.run("CREATE (f:Frame {name: $frame_name})", {"frame_name": frame_name})
    for lexeme_unit in set(frame['lexeme_units']):
        # Lexical unit names look like "<lemma>.<pos>"; split on the LAST dot
        # so a lemma containing a dot keeps its part-of-speech suffix, and a
        # name without any dot is skipped instead of raising IndexError.
        lemma, separator, part_of_speech = lexeme_unit.rpartition(".")
        if part_of_speech == "v":
            word = lemma.replace("'", "")
            graph.run(
                "MATCH (f:Frame {name: $frame_name}) "
                "MERGE (w:Verb {word: $word}) "
                "MERGE (w)<-[:HAS_LEXEME_UNIT]-(f)",
                {"frame_name": frame_name, "word": word},
            )
    for fe in frame['fe']:
        fe = fe.replace("'", "")
        fe = fe.lower()
        fe_parameters = {"frame_name": frame_name, "fe": fe}
        existing_entities = graph.run(
            "MATCH (e:Entity {name: $fe}) RETURN e", {"fe": fe}
        ).data()
        if existing_entities:
            graph.run(
                "MATCH (f:Frame {name: $frame_name}), "
                "(e:Entity {name: $fe}) "
                "SET e:FrameElement "
                "MERGE (e)<-[:HAS_FRAME_ELEMENT]-(f)",
                fe_parameters,
            )
        else:
            # NOTE(review): these nodes get a `type` property, while the
            # lookups in the "Using framenet" section filter FrameElement
            # nodes on `name` - confirm which property is intended.
            graph.run(
                "MATCH (f:Frame {name: $frame_name}) "
                "MERGE (w:FrameElement {type: $fe})<-[:HAS_FRAME_ELEMENT]-(f)",
                fe_parameters,
            )

# Using framenet

In [None]:

def frames_with_frame_element(triple):
    """Return names of frames linked to both the triple's verb and its object.

    Looks for Frame nodes connected to a Verb whose ``word`` is the triple's
    verb and to a FrameElement whose ``name`` is the triple's object. When
    that finds nothing and the triple has a subject, it falls back to frames
    connected to the verb and to any FrameElement whose name contains
    'entity'.

    Values are sent as query parameters rather than concatenated into the
    Cypher text. Uses the module-level ``graph`` connection.

    Args:
        triple: an SOV-like object exposing the ``cypher_*`` accessors.

    Returns:
        A list of frame names (possibly empty).
    """
    verb_word = triple.cypher_verb()
    frame_element_results = graph.run(
        "MATCH (verb:Verb {word: $verb_word})-[]-(frame:Frame)-[]-"
        "(fe:FrameElement {name: $object_name}) "
        "RETURN distinct(frame.name) as frame_name",
        {"verb_word": verb_word, "object_name": triple.cypher_object()},
    ).data()
    if not frame_element_results and len(triple.cypher_subject()) > 0:
        frame_element_results = graph.run(
            "MATCH (verb:Verb {word: $verb_word})-[]-(frame:Frame)-[]-(fe:FrameElement) "
            "WHERE lower(fe.name) CONTAINS 'entity' "
            "RETURN distinct(frame.name) as frame_name",
            {"verb_word": verb_word},
        ).data()
    return [row['frame_name'] for row in frame_element_results]

def frames_without_frame_element(triple):
    """Return names of frames linked to the triple's verb and to any frame element.

    The verb is sent as a query parameter rather than concatenated into the
    Cypher text, and the result is materialised with ``.data()`` in the same
    way as ``frames_with_frame_element``. Uses the module-level ``graph``
    connection.

    Args:
        triple: an SOV-like object exposing ``cypher_verb()``.

    Returns:
        A list of frame names (possibly empty).
    """
    frame_results = graph.run(
        "MATCH (verb:Verb {word: $verb_word})-[]-(frame:Frame)-[]-(fe:FrameElement) "
        "RETURN distinct(frame.name) as frame_name",
        {"verb_word": triple.cypher_verb()},
    ).data()
    return [row['frame_name'] for row in frame_results]

def frames(triple):
    """Return frame names for ``triple``, preferring matches on its object.

    Falls back to frames matched on the verb alone when the frame-element
    lookup yields nothing usable.
    """
    with_element = frames_with_frame_element(triple)
    if not any(with_element):
        return frames_without_frame_element(triple)
    print("found frame element")
    return with_element

    
def create_links(page, triple):
    """Create a PageFrame node per matching frame and link it to ``page``.

    Frames that match both the triple's verb and its object are preferred;
    when there are none, frames that match the verb alone are used and no
    frame-element links are created.

    For each frame a new PageFrame node is CREATEd and connected to the
    frame, the verb and the page; in the frame-element case the matched
    element (and the subject Entity, when the triple has one) is connected
    as well. Everything is CREATEd, not MERGEd, so running this again adds
    further nodes and relationships.

    Values are sent as query parameters rather than concatenated into the
    Cypher text. Uses the module-level ``graph`` connection.

    Args:
        page: object exposing ``base_path()``.
        triple: an SOV-like object exposing ``subject`` and the ``cypher_*``
            accessors.
    """
    base_parameters = {
        "verb_word": triple.cypher_verb(),
        "page_name": page.base_path(),
    }
    frames = frames_with_frame_element(triple)
    if any(frames):
        for frame in frames:
            parameters = dict(
                base_parameters,
                frame_name=frame,
                object_name=triple.cypher_object(),
            )
            if triple.subject:
                print(f"frame element for frame with subject: {frame}")
                parameters["subject_name"] = triple.cypher_subject()
                graph.run(
                    "MATCH (verb:Verb {word: $verb_word}), "
                    "(page:Cid {name: $page_name}), "
                    "(frame:Frame {name: $frame_name}), "
                    "(frameElement {name: $object_name}), "
                    "(subject:Entity {name: $subject_name}) "
                    "CREATE (createdFrame:PageFrame {name: $frame_name}) "
                    "CREATE (createdFrame)<-[:IS_INSTANCE_OF]-(frame) "
                    "CREATE (verb)<-[:HAS_VERB]-(page) "
                    "CREATE (frameElement)<-[:HAS_FRAME_ELEMENT]-(page) "
                    "CREATE (verb)<-[:HAS_VERB]-(createdFrame) "
                    "CREATE (createdFrame)<-[:HAS_FRAME]-(page) "
                    "CREATE (frameElement)<-[:HAS_OBJECT]-(createdFrame) "
                    "CREATE (createdFrame)-[:SUBJECT]->(subject)",
                    parameters,
                )
            else:
                print(f"frame element for frame without subject: {frame}")
                graph.run(
                    "MATCH (verb:Verb {word: $verb_word}), "
                    "(page:Cid {name: $page_name}), "
                    "(frame:Frame {name: $frame_name}), "
                    "(frameElement {name: $object_name}) "
                    "CREATE (createdFrame:PageFrame {name: $frame_name}) "
                    "CREATE (createdFrame)<-[:IS_INSTANCE_OF]-(frame) "
                    "CREATE (verb)<-[:HAS_VERB]-(page) "
                    "CREATE (frameElement)<-[:HAS_FRAME_ELEMENT]-(page) "
                    "CREATE (verb)<-[:HAS_VERB]-(createdFrame) "
                    "CREATE (createdFrame)<-[:HAS_FRAME]-(page) "
                    "CREATE (frameElement)<-[:HAS_FRAME_ELEMENT]-(createdFrame)",
                    parameters,
                )
    else:
        print("no frame lements")
        frames = frames_without_frame_element(triple)
        for frame in frames:
            graph.run(
                "MATCH (verb:Verb {word: $verb_word}), "
                "(page:Cid {name: $page_name}), "
                "(frame:Frame {name: $frame_name}) "
                "CREATE (createdFrame:PageFrame {name: $frame_name}) "
                "CREATE (createdFrame)<-[:IS_INSTANCE_OF]-(frame) "
                "CREATE (verb)<-[:HAS_VERB]-(page) "
                "CREATE (verb)<-[:HAS_VERB]-(createdFrame) "
                "CREATE (createdFrame)<-[:HAS_FRAME]-(page)",
                dict(base_parameters, frame_name=frame),
            )


In [None]:
# Link pages to frames, once it has run, have a poke around some PageFrame objects to see how it works

host = os.environ.get('REMOTE_NEO4J_URL')
# NEO4J_USER used to be read and then ignored in favour of a hard-coded
# 'neo4j'; honour it when set and keep 'neo4j' as the fallback.
user = os.environ.get('NEO4J_USER') or 'neo4j'
password = os.environ.get('NEO4J_PASSWORD')
graph = Graph(host=host, user=user, password=password, secure=True)

# Unlike the action insert, every title of every page is linked here.
for page in pages:
    for title in page.titles():
        for triple in title.subject_object_triples():
            create_links(page, triple)
            