# spaCy Entity Extraction

In this notebook we will be looking at using spaCy (https://spacy.io/) to populate object models from patent claim data.

In [1]:
#Let's import spaCy
import spacy
from spacy.symbols import DET, NOUN, CCONJ, VERB, PUNCT

nlp = spacy.load('en') 

### Entity Extraction

For reference here are some common object POS patterns as extracted from a patent specification using the reference numeral as an end point.
```
[('<DET><NOUN><NUM>', 63),
 ('<DET><NOUN><NOUN><NUM>', 50),
 ('<DET><VERB><NOUN><NUM>', 48),
 ('<DET><ADJ><NOUN><NUM>', 39),
 ('<DET><NOUN><NOUN><NOUN><NUM>', 35),
 ('<DET><ADJ><ADJ><NOUN><NOUN><NUM>', 14),
 ('<DET><NOUN><PUNCT><VERB><NOUN><NUM>', 8),
 ('<DET><ADJ><NOUN><NOUN><NUM>', 6),
 ('<DET><ADJ><CCONJ><ADJ><ADJ><NOUN><NOUN><NUM>', 4),
 ('<DET><NOUN><NOUN><NOUN><NOUN><NUM>', 3),
 ('<DET><NOUN><ADP><NOUN><NOUN><NUM>', 3),
 ('<DET><ADJ><CCONJ><ADJ><NOUN><NUM>', 3),
 ('<DET><NOUN><ADP><NOUN><NUM>', 3),
 ('<DET><NOUN><VERB><NOUN><NUM>', 2),
 ('<DET><NOUN><ADV><CCONJ><ADJ><NOUN><NUM>', 1),
 ('<DET><ADJ><VERB><NUM><PUNCT><NUM><ADP><VERB><NOUN><NUM>', 1),
 ('<DET><NOUN><ADP><ADV><VERB><NOUN><NUM>', 1),
 ('<DET><ADV><VERB><NOUN><NUM>', 1),
 ('<DET><ADV><VERB><VERB><NOUN><NUM>', 1),
 ('<DET><VERB><NOUN><NOUN><NUM>', 1),
 ('<DET><NOUN><PUNCT><NOUN><VERB><NOUN><NUM>', 1),
 ('<DET><NOUN><VERB><ADP><VERB><NOUN><NUM>', 1),
 ('<DET><NOUN><ADP><ADJ><ADJ><ADJ><NOUN><NOUN><NUM>', 1),
 ('<DET><ADJ><NOUN><PUNCT><NOUN><PUNCT><VERB><NOUN><NUM>', 1),
 ('<DET><PUNCT><NOUN><PUNCT><NOUN><PUNCT><VERB><NOUN><NUM>', 1),
 ('<DET><VERB><NOUN><ADV><CCONJ><ADJ><NOUN><NUM>', 1),
 ('<DET><NOUN><ADP><ADJ><NOUN><NUM>', 1)]
 ```

Below are some initial functions.

In [2]:
def simple_spacy_entity_finder(doc):
    """ Find entities with reference numerals using POS data."""
    entity_list = list()
    record = False
    # Generate a list of tokens so we can iterate backwards through it
    enum_doc_list = list(enumerate(doc))
    last_end = 0
    # Add indices
    for i, word in enum_doc_list:
        if word.pos == DET and not record:
            # Start recording and record start index
            record = True
            start_index = i
        else:        
            if (word.pos == DET or word.pos == CCONJ or word.lemma_ == ";") and record:
                # Step back until last noun is found
                for j, bword in reversed(enum_doc_list[last_end:i]):
                    if bword.pos == NOUN:
                        # Add np_chunk to buffer
                        entity_list.append(doc[start_index:j+1])
                        last_end = j
                        break       
                if word.pos == DET:
                    # Set new start index
                    record = True
                    start_index = i
                else:
                    record = False
    
    entity_dict = dict()
    # Now group by unique
    for entity in entity_list:
        
        np_start = entity.start
        # Ignore the determinant 
        if doc[np_start].pos == DET:
            np_start += 1
        # Generate a string representation excluding the determinant
        np_string = doc[np_start:entity.end].text.lower()
                                
        if np_string not in entity_dict.keys():
            entity_dict[np_string] = list()          
        entity_dict[np_string].append(entity)
    
    return entity_list, entity_dict

In [3]:
def check_ant(doc, entity_dict):
    """ Check antecedence - attempt to merge entries with incorrect antecedence."""
    
    issue_keys_a = list()
    issue_keys_the = list()
    
    # Look for entries with antecedence issues
    for key in entity_dict:
        entities = entity_dict[key]
        # Check if first entry begins with "a" - flag if doesn't
        first_entry = entities[0]
        if first_entry[0].pos == DET and first_entry[0].lemma_ != "a" and first_entry[0].lemma_ != "an":
            issue_keys_a.append(key)
        
        # If more than one entry check subsequent entries start with "the" - flag if don't
        if len(entities) > 1:
            for entity in entities[1:]:
                if entity[0].pos == DET and entity[0].lemma_ != "the":
                    issue_keys_the.append(key)
    
    return issue_keys_a, issue_keys_the
        

In [4]:
def look_for_existing(doc, entity_dict):
    """ Look for previously existing versions of problem keys."""
    # If more than one entry check subsequent entries start with "the" - flag if don't
    issue_keys_a = list()
    for key in entity_dict:
        entities = entity_dict[key]
        # Check if first entry begins with "a" - flag if doesn't
        first_entry = entities[0]
        if first_entry[0].pos == DET and first_entry[0].lemma_ != "a" and first_entry[0].lemma_ != "an":
            issue_keys_a.append(key)
    
    for pkey in issue_keys_a:
        problem_entities = entity_dict[pkey]
        # i.e. list of two longer oblong spans
        # Can we just work with the key initially?
        for key in entity_dict.keys():
            if len(pkey) > len(key) and key in pkey:
                print(key, pkey)


In [5]:
# We now need to collate and create a set of entities
def get_entity_set(entity_list):
    """ Get a set of unique entity n-grams from a list of entities."""
    ngram_list = list()
    for entity in entity_list:
        ngram_list.append(" ".join([word for word, pos in entity if (pos != 'DET')]))
    return set(ngram_list)

In [15]:
def np_entity_finder(doc):
    """ Find entities using noun phrases/chunks."""
    entity_dict = dict()
    for entity in doc.noun_chunks:
        np_start = entity.start
        # Ignore the determinant 
        if doc[np_start].pos == DET:
            np_start += 1
        # Generate a string representation excluding the determinant
        np_string = doc[np_start:entity.end].text.lower()
                                
        if np_string not in entity_dict.keys():
            entity_dict[np_string] = list()          
        entity_dict[np_string].append(entity)
        
    return entity_dict

## Testing on Other Patent Data

Lets test on different patent claims.

In [6]:
# Generate or create some test claim sets for analysis

# (Looks like we can't pickle and load spaCy objects)
from patentdata.corpus import USPublications

pubs = USPublications("/media/SAMSUNG1/Patent_Downloads")
filegenerator = pubs.patentdoc_generator(['G', '06'], sample_size=10)
docs = list(filegenerator)
ent_from_claims = list()
nlp_docs = list()
for doc in docs:
    nlp_doc = nlp(doc.claimset.get_claim(1).text)
    entity_list, entity_dict = simple_spacy_entity_finder(nlp_doc)
    nlp_docs.append(nlp_doc)
    ent_from_claims.append(entity_dict) 

554570 records located.
10 records sampled.


In [7]:
ent_from_claims[0]

{'communication system': [a communication system],
 'experience': [the experience],
 'game system': [A game system],
 'interactive apparatus': [an interactive apparatus,
  the interactive apparatus],
 'interactive application': [an interactive application,
  the interactive application,
  the interactive application],
 'offline': [an offline],
 'online experience': [an online experience]}

In [8]:
ika, ikt = check_ant(nlp_docs[0], ent_from_claims[0])
print("These terms are not explicitly introduced using 'a/an X':\n", ika, "\n")

print("These terms do not use 'the' yet occur previously:\n", ikt, "\n") 

These terms are not explicitly introduced using 'a/an X':
 ['experience'] 

These terms do not use 'the' yet occur previously:
 [] 



In [9]:
nlp_docs[0]


1. A game system comprising:
an interactive apparatus having a communication system; and
an interactive application, the interactive application and the interactive apparatus are independently operable to provide an offline and an online experience, wherein at least one of the interactive application and interactive apparatus is configured modify its operation based on the experience of the other.


In [10]:
for d, e in zip(nlp_docs, ent_from_claims):
    print(d, "\n")
    print(e, "\n------\n")


1. A game system comprising:
an interactive apparatus having a communication system; and
an interactive application, the interactive application and the interactive apparatus are independently operable to provide an offline and an online experience, wherein at least one of the interactive application and interactive apparatus is configured modify its operation based on the experience of the other.

 

{'interactive apparatus': [an interactive apparatus, the interactive apparatus], 'game system': [A game system], 'communication system': [a communication system], 'offline': [an offline], 'interactive application': [an interactive application, the interactive application, the interactive application], 'experience': [the experience], 'online experience': [an online experience]} 
------


1. A server for providing affiliate store information, comprising:
a receiver for collecting transaction count information relating to a number of transactions made in each affiliate store and receiving s

Observations:
* Matching occurrences of "the X" with other entries looks generally useful (e.g. is needed across multiple claims). Phrases such as "the given X" or "the selected X" also appear.
* There are some long sections that appear not to meet the simple parse.
* Some have a blank entity?
* We could use the noun_chunks as a second test and merge for greater accuracy?
* Doesn't work so well on some method claims.
* Need to stop on punctuation as well, i.e. "," or ";"
* "said" needs to be a DET.
* Plurals cause an issue, e.g. "multimedia data"

In [11]:
for d, e in zip(nlp_docs, ent_from_claims):
    print("----\n", list(d.noun_chunks), "\n")
    print(list(e.keys()))
    print("\n-----\n")

----
 [A game system, an interactive apparatus, a communication system, an interactive application, the interactive application, the interactive apparatus, an offline, an online experience, the interactive application, interactive apparatus, its operation, the experience] 

['interactive apparatus', 'game system', 'communication system', 'offline', 'interactive application', 'experience', 'online experience']

-----

----
 [A server, affiliate store information, a receiver, transaction count information, a number, transactions, each affiliate store, search condition information, a mobile communication terminal, a database, affiliate store information, the collected transaction count information, a predetermined criteria, the affiliate store information, at least one affiliate store, the database, the search condition information, a search result, the extracted affiliate store information, a transmitter, the search result, the mobile communication terminal] 

['predetermined criteria', 

Can I define the problem using probabilities?  

Entities are latent variables of which the words are the visible / observable data.  

Problem is aligning groups of tokens with entities. Classification in a case where we don't know what the classes are or how many classes there are.  

P(entity | words)

What do we know for certain:
* It will have a form of DET ... NOUN or no DET but noun phrase ending in NNS

In [16]:
np_entity_finder(nlp_docs[1])

{'affiliate store': [each affiliate store],
 'affiliate store information': [affiliate store information,
  affiliate store information,
  the affiliate store information],
 'at least one affiliate store': [at least one affiliate store],
 'collected transaction count information': [the collected transaction count information],
 'database': [a database, the database],
 'extracted affiliate store information': [the extracted affiliate store information],
 'mobile communication terminal': [a mobile communication terminal,
  the mobile communication terminal],
 'number': [a number],
 'predetermined criteria': [a predetermined criteria],
 'receiver': [a receiver],
 'search condition information': [search condition information,
  the search condition information],
 'search result': [a search result, the search result],
 'server': [A server],
 'transaction count information': [transaction count information],
 'transactions': [transactions],
 'transmitter': [a transmitter]}

In [17]:
simple_spacy_entity_finder(nlp_docs[1])

([A server for providing affiliate store information,
  a receiver for collecting transaction count information,
  a number of transactions,
  each affiliate store,
  a mobile communication terminal,
  a database for storing affiliate store information,
  the collected transaction count information,
  a predetermined criteria,
  a generator,
  the affiliate store information of at least one affiliate store,
  the database,
  the search condition information,
  a search result,
  the extracted affiliate store information,
  a transmitter,
  the search result],
 {'affiliate store': [each affiliate store],
  'affiliate store information of at least one affiliate store': [the affiliate store information of at least one affiliate store],
  'collected transaction count information': [the collected transaction count information],
  'database': [the database],
  'database for storing affiliate store information': [a database for storing affiliate store information],
  'extracted affiliate stor

In [18]:
np_entity_finder(nlp_docs[2])

{'at least a portion': [at least a portion,
  at least a portion,
  at least a portion],
 'circuit': [the circuit],
 'communication module': [a communication module],
 'connector': [the connector, the connector],
 'contactless devices': [contactless devices],
 'cover': [A cover],
 'mobile device': [a mobile device,
  a mobile device,
  the mobile device,
  the mobile device,
  the mobile device,
  the mobile device],
 'port': [a port],
 'rear surface': [a rear surface],
 'rear surface form': [the rear surface form],
 'side surfaces': [the side surfaces, the side surfaces],
 'transactions': [transactions]}

In [19]:
simple_spacy_entity_finder(nlp_docs[2])

([A cover, a mobile device, comprising:
  side surfaces, a portion, a mobile device, a rear surface, a portion, a rear surface, the mobile device, the side surfaces, the side surfaces, the rear surface form, an opening, a portion, the mobile device, a connector, a port, the mobile device, a circuit, the connector, a communication module, the communication module configured to execute transactions with contactless devices, the mobile device, the circuit],
 {'circuit': [a circuit, the circuit],
  'communication module': [a communication module],
  'communication module configured to execute transactions with contactless devices': [the communication module configured to execute transactions with contactless devices],
  'connector': [a connector, the connector],
  'cover': [A cover],
  'mobile device': [a mobile device,
   the mobile device,
   the mobile device,
   the mobile device,
   the mobile device],
  'mobile device, comprising:\nside surfaces': [a mobile device, comprising:
   sid

## Improving the Algorithm

What do we know:
* A DET or a NOUN will always form part of an entity.
* A plural noun may not start with a DET.
* An entity will consist of consecutive tokens.
* The world following a DET will be part of the entity.
* Each determinant can only be linked to one of the nouns in front of it before the next determinant or [";", ":", "."] (and possibly ",").
* Entities with a "the" determinant should have occurred before.
* There are no overlaps.
* We can be more confident if a phrase is repeated.
* We can be more confident still if the phrase is repeated that initially starts with "a" and the next occurrence starts with "the" or "said".
* "said" should be taken as a DET.
* There will be between 1 and number of NOUNS entities.
* The boundary of an entity will be marked by NOUN NOTNOUN - however this pattern can also occur as part of the noun phrase for the entity.
* Entity text sequences will not cross a ":" or ";".
* Occurrences of an entity will have matching text including at least a matching noun.

Definite constraints for a well-formed claim:
* A NOUN will always form part of an entity;
* A singular noun will have a determinant;
* An entity will consist of consecutive tokens.
* There are no overlaps in occurrences - a word can only be linked to a single entity.
* There will be between 1 and number of NOUNS entities.
* Entity text sequences will not cross a ":" or ";" or "." (and possibly a ",").
* The boundary of an entity will be marked by NOUN NOTNOUN - however this pattern can also occur as part of the noun phrase for the entity.

We want to calculate the probability of a set of entities, $ \boldsymbol E $, given a claim as a sequence of words, $ \boldsymbol W $: $$ P(\boldsymbol E | \boldsymbol W) $$   

In fact we want to calculate: $$ \underset{\boldsymbol E}{\operatorname{argmax}} P(\boldsymbol E | \boldsymbol W) $$

Our claim has a length $ N $:$$\boldsymbol W = (\boldsymbol w_0, \boldsymbol w_1, ..., \boldsymbol w_{N})$$

$N$ may be calculated as the length of the claim in tokens.

Each word $\boldsymbol w_i$ has:
* text - $t_i$;
* a simple POS tag - $pos_i$;
* a more detailed POS tag - $posplus_i$;
* a lemma (i.e. a normalised word form) - $lemma_i$; and
* dependeny tree information - $dep_i$.

I.e. $$ \boldsymbol w_i = (t_i, pos_i, posplus_i, lemma_i, dep_i) $$

We have $ M $ entities: $$\boldsymbol E = (e_0, e_1, ..., e_{M})$$ 

where $\boldsymbol e_0 $ indicates "no related entity" or a "null" token. $M$ is not known but will be greater than 2 and less than a number of nouns.

An occurrence is a set of consecutive tokens: $$ \boldsymbol o_k = [\boldsymbol w_i, \boldsymbol w_{i+1}, ..., \boldsymbol w_{i+L_{k}}] $$ where $L_k$ is the length of occurrence $k$ which begins at word index $i$.

$$ \boldsymbol W = [o_1, o_2, ..., o_K] $$ where there are $K$ total occurrences in the claim. However, we don't know $K$ for sure. 

We do know the number of nouns $N_{noun}$. And we know $1 \leqslant K \leqslant N_{noun}$. Also $M \leqslant K$

An entity can have:
* a set of one or more occurrences;
* a string representation - possibly equal to common text across the set of occurrences;
* a number (e.g. be singular or plural).

An entity may be though of as a class label that is applied to a word: $$ \sum_{i=0}^M p(e_i | w) = 1 $$

We know that $ p(e_0 | pos = {DET}) = p(e_0 | pos = {NOUN}) = 0 $, i.e. that determinants and nouns will be assigned to some entity. We also know $ p(e_0 | t = ";") = p(e_0 | t = ":") = p(e_0 | t = ".") = 1$.

Entities are primarily just groupings of word spans, wherein the grouping creates a discrete entity?


$$ \sum_{i=0}^M P(\boldsymbol o_k | e_i) = 1$$

Decomposing using Bayes' Rule: 

$$ \underset{\boldsymbol e}{\operatorname{argmax}} P(\boldsymbol e | \boldsymbol w) = {P(\boldsymbol w | \boldsymbol e) P(\boldsymbol e)}/ P(\boldsymbol w)$$ 

where we can ignore the denominator as we are looking for argmax: $$ \underset{\boldsymbol e}{\operatorname{argmax}} P(\boldsymbol e | \boldsymbol w) = {P(\boldsymbol w | \boldsymbol e) P(\boldsymbol e)}$$

In other models $P(\boldsymbol w | \boldsymbol e)$ and $P(\boldsymbol e)$ may be approximated by a product of transitions (e.g. as per a hidden markov model). However, we have dependencies across sets of words.

Each determinant can only be linked to one of the nouns in front of it before the next determinant.

Start by setting each noun as a separate entity? And marking the tokens that are not an entity? Or look at confident selections e.g. DET NOUN [:;.,]

We can maybe start with a binary classification: $\boldsymbol e = [0,1]$? No, we can confidently apply a positive determination but our negative determination is unknown, i.e. a word that is not positively marked may still form part of an entity.

We can estimate $M$ by counting the number of "a"/"an" determinants + the number of multiple nouns.  

Issue multiple nouns are often introduced by "a X of Ys".  

Also we have "at least one X" and "one or more Ys" - these may not be introduced by "a" or "an" and "at least one" may be referred to again as "the at least one".  

Can we use an estimate of number of determinants as a lower bound?

This works fairly well for a lower bound / initial estimate.  

We can cross check later for missing plural nouns.

How do we model a sequential constraint? 

For each word $w_i$

In [None]:
# This is our good algorithm

# Start with all words relate to no entities
p_all_e_word = dict()

def check_start_phrase(token, doc):
    """ Check for start of phrases 'at least one' and 'one or more' as determinant.
    
    Return true if located."""
    i = token.i
    condition = (
        doc[i:i+3].text.lower() == "at least one" or
        doc[i:i+3].text.lower() == "one or more"
    )
    condition = condition and (doc[i-1].text.lower() != "the")
    return condition

def is_det(token, doc):
    """ Wrapper function for determinant check."""
    # Add 'said' as custom determination
    condition = (token.pos == DET or token.text == "said")
    # Alternatively we can have the start phrases as above
    condition = (condition or check_start_phrase(token, doc))
    # Add check for 'a)' and 'a.' - this is not a det
    condition = condition and (doc[token.i:token.i+2].text.lower() not in ['a)', 'a.'])
    return condition

Heuristics:
* "for" marks a non-entity [e_0=1]
* "DET X of ..." [e_0=0]
* "in X with" [e_0=1]
* "at least one" / "one or more" [e_0=0]
* lemma = \["comprise", "have", "be", "include"\] [e_0=1]
* "where" in token.text [e_0=1] (e.g. "where or wherein")
* "associated with" [e_0=1]
* "configured/adapted to" [e_0=0]

Also watch out for "each of the plurality of X" or "at least one of the plurality of X"

In [53]:
from spacy.symbols import NUM

# Could we change this to slice on a key? Probably
def sliceodict(d, i):
    """Slice an ordered dict based on a passed index.
    list[:i] for an ordered dict
    """
    temp_dict = {k:v for j, (k,v) in enumerate(d.items()) if j < i}
    return OrderedDict(sorted(temp_dict.items(), key=lambda t: t[1][0][0].i))

# We want to set these if they are not already set
def set_probability(token, p_all_e_word, entity, new_value):
    """ Set probability value if not set already"""
    if entity not in p_all_e_word[token].keys():
        if sum([v for k, v in p_all_e_word[token]] + new_value) <= 1: 
            p_all_e_word[token][entity] = new_value
    return p_all_e_word
            

def heuristics(token, doc, p_all_e_word):
    """ Apply heuristics to mark entity probabilities"""
    entity_stop_chars = ["\n",":",";",".", ","]
    # Set stop characters as non-entity
    if token.text in entity_stop_chars:
        p_all_e_word[token][0] = 1
    
    # Set noun as entity
    if token.pos == NOUN and p_all_e_word[token].get(0, None) != 1:
        p_all_e_word[token][0] = 0
    
    # 'for' is an entity boundary
    if token.lemma_ == "for":
        p_all_e_word[token][0] = 1
    
    # "comprise", "have", "be", "include" do not relate to an entity
    if token.lemma_ in ["comprise", "have", "be", "include"]:
        p_all_e_word[token][0] = 1
    
    # "where" and "wherein" do not relate to an entity
    if "where" in token.lemma_:
         p_all_e_word[token][0] = 1
    
    # Look ahead - check not at end
    if token.i < (len(doc)-1):
        
        # "configured/adapted to" do not relate to an entity
        if doc[token.i+1].lemma_ == "to" and token.lemma_ in ["configure", "adapt"]:
            p_all_e_word[token][0] = 1
            p_all_e_word[doc[token.i + 1]][0] = 1
    
    if token.i < (len(doc)-2):
        # Set DETs as entity
        if (
            token.pos == DET or token.text == "said"
        ) and (
            doc[token.i:token.i+2].text.lower() not in ['a)', 'a.']
        ):
            p_all_e_word[token][0] = 0
            p_all_e_word[doc[token.i+1]][0] = 0
            
        # DET X of .. relates to an entity
        if token.pos == DET and doc[token.i+2].lemma_ == "of":
            p_all_e_word[token][0] = 0
            p_all_e_word[doc[token.i + 1]][0] = 0
            # Set of
            p_all_e_word[doc[token.i + 2]][0] = 0
            # Set term after off
            p_all_e_word[doc[token.i + 3]][0] = 0
            
        # "in X with" does not relate to an entity
        if token.lemma_ == "in" and doc[token.i+2].lemma_ == "with":
            p_all_e_word[token][0] = 1
            p_all_e_word[doc[token.i + 1]][0] = 1
            p_all_e_word[doc[token.i + 2]][0] = 1
            
        # Associated with does not relate to an entity
        if doc[token.i:token.i+2].text.lower() == "associated with":
            p_all_e_word[token][0] = 1
            p_all_e_word[doc[token.i + 1]][0] = 1
    
    if token.i < (len(doc)-3):
        # "at least NUM" / "NUM or more" relates to an entity
        if doc[token.i:token.i + 2].text.lower() == "at least" and doc[token.i + 2].pos == NUM:
            p_all_e_word[token][0] = 0
            p_all_e_word[doc[token.i + 1]][0] = 0
            p_all_e_word[doc[token.i + 2]][0] = 0
        if doc[token.i+1:token.i + 3].text.lower() == "or more" and token.pos == NUM:
            p_all_e_word[token][0] = 0
            p_all_e_word[doc[token.i + 1]][0] = 0
            p_all_e_word[doc[token.i + 2]][0] = 0
    
    return p_all_e_word
    

The algorithm generally is:
* Mark as entity or not based on rules;
* Look back from DET or punct break [':',';',',','.'] - set as non-entity until noun is found;
* Look at noun phrase chunks 

In [107]:
doc = nlp_docs[5]

In [129]:
from collections import OrderedDict
from difflib import SequenceMatcher # alternative when looking at string differences for matches
# Try out with adding heuristics

def extract_entities(doc):
    """Extract entities from a spaCy doc object."""
    # Start with all words relate to no entities
    p_all_e_word = dict()
      
    for token in doc:
        # Initialise probabilities
        p_all_e_word[token] = dict()
        
    # This can be combined with first pass easily - similar checks
    print("First pass - entity label heuristics")
    for token in doc:
        p_all_e_word = heuristics(token, doc, p_all_e_word)
        print(token.text, "[{0}]".format(p_all_e_word[token]), end = '\n')
   
    print("Second pass - look for DET ... NOUN groupings") 
    # Second parse - take any DET ... NOUN <boundary> portions
    last_break = 0
    spans_to_match = list()
    for token in doc:
        # Look for hard end points or DET
        if (p_all_e_word[token].get(0, None) == 1) or (token.pos == DET):
            print("{0} is e_0=1 or DET - looking back".format(token))
            # Step back marking as e_0=1 until first NOUN      
            for j in range(token.i-1, last_break, -1):
                print("Step back token - {0} with pos - {1}".format(doc[j], doc[j].pos))
                if doc[j].pos != NOUN:
                    print("Setting non-Noun")
                    p_all_e_word[doc[j]][0] = 1
                else:
                    last_break = j
                    break
                    
        # Look at grouping from DET
        if is_det(token, doc):
            # Tweak for "at least X" and "X or more"
            if (
                doc[token.i:token.i + 2].text.lower() == "at least" and doc[token.i + 2].pos == NUM
            ) or (
                doc[token.i+1:token.i + 3].text.lower() == "or more" and token.pos == NUM
            ):
                #print("Head index set to {0}".format())
                head_index = doc[token.i+2].head.i
            else: 
                head_index = token.head.i
            possible_entity = True
            # Step through intermediate tokens between current and head
            for j in range(token.i, head_index):
                # If head is outside of DET ... end_NOUN sequence
                if doc[j].head.i < token.i and doc[j].head.i > head_index:
                    # Check for nested portions
                    possible_entity = False
            if possible_entity:
                for k in range(token.i, head_index + 1):
                    p_all_e_word[doc[k]][0] = 0 
        # Need to adapt the above for at least one ... X and one or more ... Xs - "at" > head > "least" > "one" > X
        
        # Look at plural nouns
        if token.tag_ == "NNS":
            print("Located plural noun: {0}".format(token))
            #Step back and mark as e_0=0 any preceding word that has the token as a head
            for j in range(token.i-1, 0, -1):
                print(doc[j], doc[j].head.i, p_all_e_word[doc[j]])
                if p_all_e_word[doc[j]]:
                    break
                elif (
                    doc[j].head.i == token.i
                ):
                    print("Setting {0} as e_0=0".format(doc[j]))
                    p_all_e_word[doc[j]][0] = 0
    
    for token in doc:
        print(token.text, "[{0}]".format(p_all_e_word[token]), end = '\n') 
    
    for token in doc:
        if not p_all_e_word[token]:
            print(token.text, "[{0}]".format(p_all_e_word[token]), end = '\n') 
    
    print("Extracted possible occurrences:\n")
    poss_occ = list()
    for token in doc[1:]:
        # If transition
        if p_all_e_word[token].get(0, 0) == 0 and p_all_e_word[doc[token.i-1]].get(0, 1) == 1:
            # Add consecutive e_0=0
            for j in range(token.i, len(doc)+1):
                if p_all_e_word[doc[j]].get(0, 1) != 0:
                    poss_occ.append(doc[token.i:j])
                    break

    print(poss_occ)

    # Matching occurrences
    entity_dict = dict()
    # Now group by unique
    for entity in poss_occ:
        np_start = entity.start
        # Ignore the determinant 
        if doc[np_start].pos == DET:
            np_start += 1
        # Generate a string representation excluding the determinant
        np_string = doc[np_start:entity.end].text.lower()                        
        if np_string:
            if np_string not in entity_dict.keys():
                entity_dict[np_string] = list()          
            entity_dict[np_string].append(entity)

    print(doc)
    # print(entity_dict)

    # Quick function to sort entities by occurrence
    # Need to sort the keys by the index of the first word in the first entry
    ordered_entities = OrderedDict(sorted(entity_dict.items(), key=lambda t: t[1][0][0].i))

    print(ordered_entities)

    # Look for duplict entities and merge
    new_o_e = ordered_entities.copy()
    for i, (entity_string, occurrences) in enumerate(ordered_entities.items()):
        # Check if first entry in occurrences begins with the
        current_occurrence = occurrences[0]
        if current_occurrence[0].lemma_ in ["the", "each"]:
            print("Found entity '{0}' with incorrect antecedence".format(current_occurrence))
            possible_matches = list()
            for previous_entity_string, previous_occurrences in sliceodict(ordered_entities, i).items():
                first_entry = previous_occurrences[0]
                # Check to see if head of occurrence with "the" agrees with head of previous occurrence
                # print(first_entry[-1].text.lower(), first_entry[-1].tag_, 
                  # current_occurrence[-1].text.lower(), current_occurrence[-1].tag_)
                if (
                    first_entry[-1].text.lower() == current_occurrence[-1].text.lower()
                ) and (
                    first_entry[-1].tag == current_occurrence[-1].tag
                ):
                    # print(first_entry[0].head, first_entry[-1], first_entry[-1].tag_)
                    print("Found possible match with {0}". format(previous_entity_string))
                
                    # Need to check here for multiple term matches 
                    possible_matches.append(previous_entity_string)
        
            print(possible_matches)
            if len(possible_matches) > 0:
                if len(possible_matches) > 1:
                    best_match = 0.0
                    best_match_string = ""
                    for match in possible_matches:
                        s = SequenceMatcher(a=entity_string, b=match).quick_ratio()
                        print(s)
                        if s > best_match:
                            best_match = s
                            print("Best match = {0}".format(best_match))
                            best_match_string = match
                    previous_entity_string = best_match_string
                else:
                    previous_entity_string = possible_matches[0]
                # Merge entries in copy of dict
                print("Selected previous entity = {0}".format(previous_entity_string))
                new_o_e[previous_entity_string] += occurrences
                new_o_e.pop(entity_string)
                
    print(new_o_e)
    return new_o_e

Error with:
"the side surfaces and the rear surface form an opening" > 'rear surface form an opening'

Test claims for specifio:
  
A method for a storage system, the storage system including a first controller, a second controller and a plurality of storage devices, each of the first and second controllers communicatively coupled to each one of the storage devices, the method comprising:
starting a timer that expires after a first time period; and
subsequent to starting the timer, transmitting a first message from the first controller to a memory element shared by the first and second controllers, the first message capable of notifying the second controller of an imminent failure of the first controller, wherein subsequent to transmitting the first message to the shared memory element and before or when the timer expires, the first controller becomes unavailable to facilitate access to the storage devices.

A dynamic voltage scaling scheduling method for resource-sharing and hard real-time tasks, applicable for scheduling tasks in a delayed task set, comprising:
determining a property of a task, and executing one of the following steps, when the task belongs to the delayed task set or the task does not belong to the task collection but a waiting time has exceeded a period of the task;
when one task in the delayed task set requires for being executed, increasing a working voltage required for executing the task, removing the task from the delayed task set, and returning to the step of determining the property of the task;
when one task in the delayed task set requires for sharing resources, setting the working voltage required by the task as a current working voltage or as a larger one in least upper bounds of all tasks requiring for sharing resources, and returning to the step of determining the property of the task; and
when one task not belonging to the delayed task set exists, and the waiting time of the task has exceeded the period of the task, reducing the working voltage required for executing the task, adding the task in the delayed task set, and returning to the step of determining the property of the task.

In [85]:
SequenceMatcher(a="one or more border areas", b="the border area").quick_ratio()
# Maybe remove determinant phrase then look at match score > ratio

0.6666666666666666

```
system system
apparatus apparatus
system system
application application
offline offline
experience experience
application apparatus
modify operation
```
A possible entity cross-check - first_entry[0].head = first_entry[-1]

This paper - http://cogprints.org/5025/1/nrc-48727.pdf - suggests a two-phase process:
* Generate a "gazetteer" (a list of named entities) - similar to our first stage of simple_entity_extraction method;
* Disambiguate names in "gazetteer" (this is similar to our second stage of simple_entity_extraction method).

### To do:
* Need to look for entities with different names to merge based on number agreement and head agreement and presence before use of the in claim. (e.g. "An elongate container....the container" or "a plurality of notches....the respective notches" - DONE
* Also look for unassigned words between det and noun - mark as e_0=1 look for head = noun (two image storage regions).
* Look an phrases such as "an offline and an online experience" - currently split as "an offline" and "an online experience" - need to merge to "an offline experience" and "an online experience".

Look for spans between e_0=1 - these must contain an occurrence. If there is only one DET-NOUN (check NP using head) or X NNS (check again using NP head) - that must be an entity. (This is the second parse?)

Can we look backwards from DET? Anything that is not a NOUN is e_0=1?

Plurals need looking at:
```
user [{0: 0}]
defined [{}]
rules [{0: 0}]
```

In [24]:
# Look at POS and head for each token
for token in doc:
    print(token.text, token.i, token.lemma_, token.pos_, token.head.text, token.head.i, token.tag_)


 0 
 SPACE 1 1 SP
1 1 1 NUM 1 1 CD
- 2 - SYM 10 3 SYM
10 3 10 NUM 1 1 CD
. 4 . PUNCT 1 1 .
( 5 ( PUNCT 1 1 -LRB-
canceled 6 cancel VERB 1 1 VBN
) 7 ) PUNCT canceled 6 -RRB-

 8 
 SPACE ) 7 SP


When matching what to do with 'the time scale' and 'the time scale display information' or 'a project' and:
```
A [{0: 0}]
project [{}]
information [{0: 0}]
display [{0: 0}]
device [{0: 0}]
, [{0: 1}]
comprising [{}]
: [{0: 1}]
```
Only look for e_0 stretches of same number with matching pos and text? (Are we now getting to look at transitions?)

There are issues with "(a)" and "a."

Also "response" from "in response".

Check det is not working for "at least one"

We can iterate back from where e_0 = 1 - tokens between a last noun and determinant will be part of an entity. We can then match those across the claim. This is the simple entity finder but stepping back at [:;,.] as well as DET.  
Pattern is:
* If next step back is e_0=0;
* If next e_0=0 is a check_det=True;
* Fill in inbetween as e_0=0.


Another pattern is "DET X FOR [phrase]" - this is one entity? But contains references to other entities
```
a [{0: 0}]
system [{0: 0}]
for [{}]
providing [{}]
a [{0: 0}]
plurality [{0: 0}]
of [{}]
football [{0: 0}]
player [{0: 0}]
types [{0: 0}]
from [{}]
which [{}]
a [{0: 0}]
football [{0: 0}]
player [{0: 0}]
type [{0: 0}]
is [{}]
selected 
```

This takes a for clause as the whole entity string - e.g. "A method for modeling electrical characteristics of cells having given circuit elements" and "a layout of cells having at least one cell".

## Method Claims - Extracting Steps

Let's try something similar for method steps. This may give us some clues for "comprising" X, Y, Z structure.

Naive algorithm:
* Look for a comprising relating to a method.
* Look for VERBs following at least one of ['\n', ';', ','] after the comprising colon.



In [96]:
for doc in nlp_docs:
    print(doc[0:10])
    for token in doc:
        if token.lemma_ == "comprise":
            print(token.head)


1. A game system comprising:
an
system

1. A server for providing affiliate store information
providing

1. A cover for a mobile device,
cover

1. A dynamic voltage scaling scheduling method for
task

1. A method of forming conductive traces on
forming

1. A method for a storage system,
method

1. A method comprising:
initiating a
method

1. A power-saving method for a
terminal

1-10. (canceled)


1. A system for automatically replacing problematic media
files


We can't really rely on a shallow naive use of the dependency pass.

In [146]:
# Look at POS and head for each token
for token in nlp_docs[7]:
    print(
        token.text, token.i, token.lemma_, token.pos_, 
        token.head.text, token.head.i, token.tag_, token.dep_,
        [(c, c.dep_) for c in token.children]
    )


 0 
 SPACE 1 1 SP  []
1 1 1 PUNCT 1 1 LS ROOT [(
, ''), (., 'punct')]
. 2 . PUNCT 1 1 . punct []
A 3 a DET method 7 DT det []
power 4 power NOUN saving 6 NN npadvmod []
- 5 - PUNCT saving 6 HYPH punct []
saving 6 save VERB method 7 VBG amod [(power, 'npadvmod'), (-, 'punct')]
method 7 method NOUN method 7 NN ROOT [(A, 'det'), (saving, 'amod'), (for, 'prep'), (., 'punct')]
for 8 for ADP method 7 IN prep [(terminal, 'pobj')]
a 9 a DET terminal 11 DT det []
mobile 10 mobile ADJ terminal 11 JJ amod []
terminal 11 terminal NOUN for 8 NN pobj [(a, 'det'), (mobile, 'amod'), (,, 'punct'), (comprising, 'acl')]
, 12 , PUNCT terminal 11 , punct []
comprising 13 comprise VERB terminal 11 VBG acl [(steps, 'dobj')]
the 14 the DET steps 15 DT det []
steps 15 step NOUN comprising 13 NNS dobj [(the, 'det'), (of, 'prep')]
of 16 of ADP steps 15 IN prep [(:, 'punct'), (monitoring, 'acl')]
: 17 : PUNCT of 16 : punct [(
, '')]

 18 
 SPACE : 17 SP  []
monitoring 19 monitoring NOUN of 16 NN acl [(,, 'punct'

Commas may be used to separate steps if spans have no internal commas? (But I sometimes write claims with comma-separated steps (e.g. substeps) that have clauses.)  

Look for colon that has comprising as head then look for (semi-colon) PUNCT that have colon as head (probably the same as just finding the semi-colons!

Sometimes errors in POS tagging - 
```
monitoring 19 monitoring NOUN of 16 NN acl
```

In [122]:
test_doc = nlp("""
A method comprising: forming; selecting; and having.
""")
for token in test_doc:
    print(
        token.text, token.i, token.lemma_, token.pos_, 
        token.head.text, token.head.i, token.tag_, list(token.children)
    )


 0 
 SPACE A 1 SP []
A 1 a DET method 2 DT [
]
method 2 method NOUN method 2 NN [A, comprising, .]
comprising 3 comprise VERB method 2 VBG [:, forming]
: 4 : PUNCT comprising 3 : []
forming 5 form VERB comprising 3 VBG [;, selecting]
; 6 ; PUNCT forming 5 : []
selecting 7 select VERB forming 5 VBG [;, and, having]
; 8 ; PUNCT selecting 7 : []
and 9 and CCONJ selecting 7 CC []
having 10 have VERB selecting 7 VBG []
. 11 . PUNCT method 2 . [
]

 12 
 SPACE . 11 SP []


In [126]:
doc


1. A method for a storage system, the storage system including a first controller, a second controller and a plurality of storage devices, each of the first and second controllers communicatively coupled to each one of the storage devices, the method comprising:
starting a timer that expires after a first time period; and
subsequent to starting the timer, transmitting a first message from the first controller to a memory element shared by the first and second controllers, the first message capable of notifying the second controller of an imminent failure of the first controller, wherein subsequent to transmitting the first message to the shared memory element and before or when the timer expires, the first controller becomes unavailable to facilitate access to the storage devices.


In [136]:
from spacy.symbols import PUNCT, VERB

def extract_steps(doc):
    """ Extract steps of a method claim from a spaCy doc object."""
    step_boundaries = list()
    
    # Alternative to below is to look at head of "method" token in claim
    for t1 in doc:
        if t1.lemma_ in ["comprise"]:
            # Scan ahead for colon 
            for t2 in doc[t1.i+1:]:
                if t2.lemma_ == ":":
                    print("Colon found at {0} (text='{1}')".format(t2.i, t2))
                    step_boundaries.append(t2)
                    # Scan ahead to find semi-colons associated with colon
                    for t3 in doc[t2.i+1:]:
                        if t3.pos == PUNCT and t3.tag_ in [":", "."]:
                            step_boundaries.append(t3)
                    break
            break
    
    print("Step boundaries are {0}".format(step_boundaries))
    
    # Find first verb after each in set [colon, semi-colon]
    step_verbs = list()
    for sb in step_boundaries[:-1]:
        for t1 in doc[sb.i+1:]:
            if t1.pos == VERB and t1.tag_ == "VBG":
                # sb is previous step boundary - we want next step boundary
                step_verbs.append(t1)
                break  
    
    print("Step verbs are {0}".format(step_verbs))
    
    steps = list()
    # Tada set of method steps
    for sv, sb in zip(step_verbs, step_boundaries[1:]):
        print("Step verb is {0} with lemma {1}".format(sv, sv.lemma_))
        print("Step text is {0}".format(doc[sv.i:sb.i].text))
        steps.append((sv, sb))
        
    return steps

In [148]:
doc = nlp_docs[9]

entity_dict = extract_entities(doc)

if "method" in list(entity_dict.keys())[0]:
    print("\nClaim is method claim")
    steps = extract_steps(doc)
    print("\n", steps)
else:
    print("\nClaim is not method claim")

First pass - entity label heuristics

 [{0: 1}]
1 [{}]
. [{0: 1}]
A [{0: 0}]
system [{0: 0}]
for [{0: 1}]
automatically [{}]
replacing [{}]
problematic [{}]
media [{0: 0}]
files [{0: 0}]
comprising [{0: 1}]
: [{0: 1}]

 [{0: 1}]
a [{0: 0}]
first [{0: 0}]
media [{0: 0}]
store [{0: 0}]
configured [{0: 1}]
to [{0: 1}]
store [{}]
a [{0: 0}]
plurality [{0: 0}]
of [{0: 0}]
digitally [{0: 0}]
encoded [{}]
local [{}]
media [{0: 0}]
files [{0: 0}]
; [{0: 1}]

 [{0: 1}]
a [{0: 0}]
second [{0: 0}]
media [{0: 0}]
store [{0: 0}]
configured [{0: 1}]
to [{0: 1}]
store [{}]
a [{0: 0}]
plurality [{0: 0}]
of [{0: 0}]
digitally [{0: 0}]
encoded [{}]
source [{0: 0}]
media [{0: 0}]
files [{0: 0}]
; [{0: 1}]

 [{0: 1}]
a [{0: 0}]
media [{0: 0}]
diagnostic [{}]
engine [{0: 0}]
configured [{0: 1}]
to [{0: 1}]
identify [{}]
whether [{}]
problems [{0: 0}]
exist [{}]
within [{}]
one [{}]
of [{}]
the [{0: 0}]
media [{0: 0}]
files [{0: 0}]
located [{}]
in [{}]
the [{0: 0}]
first [{0: 0}]
media [{0: 0}]
store [{0: 

To turn into a flow chart we can use:
* https://pygraphviz.github.io/examples.html
* https://www.sharelatex.com/blog/2013/08/29/tikz-series-pt3.html

Issues:
* Method claims that have a system comprising in the pre-amble (these are generally bad for US though).
        

## Using Extracted Entities

### Creating a Wrapper Class

It may be easier to deal with entities if we create a python class to model them.

In [163]:
class Entity:
    """ Abstract object for instantiating entities. 
    
    Attributes:
        ref_num - int representing associated reference number (maybe a list?)
        parent (? - or get from navigating children)
        children
        limitations
        essential - T or F (optional = F)
        number - (default = 1, >1 = plurality) 
        order - if in a set of children where it comes in the claim
    
    """
    def __init__(self, string_name, occurrences=[]):
        """ Initialise object. """
        self.name = string_name
        self.occurrences = occurrences
        self.children = list()
        self.limitations = list()
        
    def __repr__(self):
        return (
            "<Entity - name: {n}; " 
            "occurrences: {o}; " 
            "children: {c}; " 
            "limitations: {l}"
        ).format(
            n = self.name,
            o = self.occurrences,
            c = self.children,
            l = self.limitations
        )
        
    def add_occurrence(self, spacy_span):
        """ Add an occurrence in the form of a spaCy span"""
        self.occurrences.append(spacy_span)
        return self.occurrences
    
    @property
    def first_occurrence(self):
        """ Return starting index of first occurrence."""
        return min([o[0].i for o in self.occurrences])
        
    def add_child(self, child):
        """ Add a child entity.
        
        child: an object of the same class
        """
        # if string convert to ClaimFeature
        if isinstance(child, str):
            child = type(self)(child)
        assert isinstance(child, type(self))
        self.children.append(child)
        return self.children
    
    def remove_child(self, child):
        """ Remove a child entity.
        
        child: a child object to remove
        """
        self.children.remove(child)
        return self.children
    
    def add_limitation(self, limitation):
        """ Add a limitation.
        
        limitation: a Limitation object
        """
        if isinstance(limitation, str):
            limitation = Limitation(limitation)
        assert isinstance(limitation, Limitation)
        self.limitations.append(limitation)
        return self.limitations
    
    def remove_limitation(self, limitation):
        """ Remove a limitation.
        
       limitation: a Limitation object
        """
        self.limitations.remove(limitation)
        return self.limitations
    
    def prettyprint(self, object_str_single, object_str_plural, tabs=0):
        """ Pretty print a representation of feature with
        children and limitations.
        
        object_str_single: string to call feature instantiation 
        object_str_plural: string to call feature instantiation (plural)
        """
        tabtext = "\t"*tabs
        print("{0}{1}: {2}\n".format(tabtext, object_str_single, self.__repr__()))
        
        if self.limitations:
            tabs = tabs + 1
            tabtext = "\t"*tabs
            print("{0}Limitations:\n".format(tabtext))
            for i, limitation in enumerate(self.limitations):
                print("\t{0}{1} - {2}\n".format(tabtext, i, limitation.__repr__()))
        
        if self.children:
            tabs = tabs + 1
            print("{0}Child {1}:\n".format(tabtext, object_str_plural))
            for i, child in enumerate(self.children):
                child.prettyprint(tabs=tabs)
                
class Claim:
    """ Abstract object to represent a claim. 
    
    Attributes:
        entities
    """
    # Merge this later with our claim object
    def __init__(self, entities=[]):
        """ Initialise object.
        
        features - list of instantiated Entity objects
        """
        self.entities = entities
        
    def from_entity_dict(self, ed):
        """ Populate claim entities from a dictionary ed."""
        for k, v in ed.items():
            self.entities.append(
                Entity(
                    string_name=k,
                    occurrences=v
                )
            )
        return self.entities
    

In [164]:
claim1 = Claim()
claim1.from_entity_dict(entity_dict)

[<Entity - name: system; occurrences: [A system]; children: []; limitations: [],
 <Entity - name: media files; occurrences: [media files, media files, the media files]; children: []; limitations: [],
 <Entity - name: first media store; occurrences: [a first media store, the first media store, the first media store]; children: []; limitations: [],
 <Entity - name: plurality of digitally; occurrences: [a plurality of digitally, a plurality of digitally]; children: []; limitations: [],
 <Entity - name: second media store; occurrences: [a second media store, the second media store]; children: []; limitations: [],
 <Entity - name: source media files; occurrences: [source media files]; children: []; limitations: [],
 <Entity - name: media diagnostic engine; occurrences: [a media diagnostic engine, the media diagnostic engine]; children: []; limitations: [],
 <Entity - name: problems; occurrences: [problems, problems]; children: []; limitations: [],
 <Entity - name: problem; occurrences: [a p

May not be able to use dependency information from spaCy for looking at structure - link between "comprising" and subsequent features appear lost over long claim text.  

Head of verb will give you the subject.

In [165]:
for entity in claim1.entities:
    print(entity.name, entity.first_occurrence)

system 3
media files 9
first media store 14
plurality of digitally 21
second media store 31
source media files 43
media diagnostic engine 48
problems 56
problem 73
associated file 78
problematic file 91
user input 113
software based error detection algorithm determination 117
media replacement engine 127
copy 143
corresponding media file 146


In [153]:
for d in docs:
    claim1 = d.claimset.get_claim(1)
    print("\n------\n", claim1.split_into_features())


------
 [{'endindex': 30, 'text': '\n1. A game system comprising:\n', 'startindex': 0}, {'endindex': 89, 'text': 'an interactive apparatus having a communication system; and', 'startindex': 30}, {'endindex': 403, 'text': '\nan interactive application, the interactive application and the interactive apparatus are independently operable to provide an offline and an online experience, wherein at least one of the interactive application and interactive apparatus is configured modify its operation based on the experience of the other.\n\n', 'startindex': 89}]

------
 [{'endindex': 68, 'text': '\n1. A server for providing affiliate store information, comprising:\n', 'startindex': 0}, {'endindex': 271, 'text': 'a receiver for collecting transaction count information relating to a number of transactions made in each affiliate store and receiving search condition information from a mobile communication terminal;\n', 'startindex': 68}, {'endindex': 424, 'text': 'a database for storing affiliat

In [None]:
def get_relationships(doc):
    """ Extract relationships between claim entities from a spaCy doc object."""
    
    entity_dict = extract_entities(doc)
    
    entity_boundaries = list()
    
    # Alternative to below is to look at head of first entity in claim (but does not reliably point to comprising)
    for t1 in doc:
        if t1.lemma_ in ["comprise"]:
            # Scan ahead for colon 
            for t2 in doc[t1.i+1:]:
                if t2.lemma_ == ":":
                    print("Colon found at {0} (text='{1}')".format(t2.i, t2))
                    step_boundaries.append(t2)
                    # Scan ahead to find semi-colons associated with colon
                    for t3 in doc[t2.i+1:]:
                        if t3.pos == PUNCT and t3.tag_ in [":", "."]:
                            step_boundaries.append(t3)
                    break
            break
    
    print("Step boundaries are {0}".format(step_boundaries))
    
    # Find first entity with DET != the after each in set [colon, semi-colon]
    component = list()
    for sb in step_boundaries[:-1]:
        for t1 in doc[sb.i+1:]:
            if t1.pos == VERB and t1.tag_ == "VBG":
                # sb is previous step boundary - we want next step boundary
                step_verbs.append(t1)
                break  
    
    print("Step verbs are {0}".format(step_verbs))
    
    steps = list()
    # Tada set of method steps
    for sv, sb in zip(step_verbs, step_boundaries[1:]):
        print("Step verb is {0} with lemma {1}".format(sv, sv.lemma_))
        print("Step text is {0}".format(doc[sv.i:sb.i].text))
        steps.append((sv, sb))
        
    return steps

In [28]:
# Look for ["comprise", "have", "be", "include"]
for token in doc:
    if token.lemma_ in ["comprise", "have", "include"]:
        print("Entity {0} has relationship {1}".format(token.head.text, token.text))

Entity system has relationship comprising
Entity apparatus has relationship having
