In [2]:
claim = """1. A decoding system, comprising:

a decoding engine running on a mobile device, the decoding engine in operation decoding signals produced from a read of a buyer's financial transaction card, the decoding engine in operation accepting and initializing incoming signals from the read of the buyer's financial transaction card until the signals reach a steady state, detecting the read of the buyer's financial transaction card once the incoming signals are in a steady state, identifying peaks in the incoming signals and digitizing the identified peaks in the incoming signals into bits;

and

a transaction engine running on the mobile device and coupled to the decoding engine, the transaction engine in operation receiving as its input decoded buyer's financial transaction card information from the decoding engine and serving as an intermediary between the buyer and a merchant, so that the buyer does not have to share his/her financial transaction card information with the merchant."""

In [3]:
import re

In [4]:
p = re.compile('\d+\.')
located = p.search(claim)
print(located)

<_sre.SRE_Match object; span=(0, 2), match='1.'>


In [13]:
claim[located.end():].strip()
int(located.group()[:-1])

1

In [2]:
import nltk

In [3]:
text = nltk.word_tokenize(claim)

In [4]:
print(text, end="")

['A', 'decoding', 'system', ',', 'comprising', ':', 'a', 'decoding', 'engine', 'running', 'on', 'a', 'mobile', 'device', ',', 'the', 'decoding', 'engine', 'in', 'operation', 'decoding', 'signals', 'produced', 'from', 'a', 'read', 'of', 'a', 'buyer', "'s", 'financial', 'transaction', 'card', ',', 'the', 'decoding', 'engine', 'in', 'operation', 'accepting', 'and', 'initializing', 'incoming', 'signals', 'from', 'the', 'read', 'of', 'the', 'buyer', "'s", 'financial', 'transaction', 'card', 'until', 'the', 'signals', 'reach', 'a', 'steady', 'state', ',', 'detecting', 'the', 'read', 'of', 'the', 'buyer', "'s", 'financial', 'transaction', 'card', 'once', 'the', 'incoming', 'signals', 'are', 'in', 'a', 'steady', 'state', ',', 'identifying', 'peaks', 'in', 'the', 'incoming', 'signals', 'and', 'digitizing', 'the', 'identified', 'peaks', 'in', 'the', 'incoming', 'signals', 'into', 'bits', ';', 'and', 'a', 'transaction', 'engine', 'running', 'on', 'the', 'mobile', 'device', 'and', 'coupled', 'to',

In [5]:
# This needs you to download the perceptron pos tagger from Models via nltk.download()
pos = nltk.pos_tag(text)

In [6]:
print(pos, end="")

[('A', 'DT'), ('decoding', 'NN'), ('system', 'NN'), (',', ','), ('comprising', 'VBG'), (':', ':'), ('a', 'DT'), ('decoding', 'VBG'), ('engine', 'NN'), ('running', 'VBG'), ('on', 'IN'), ('a', 'DT'), ('mobile', 'JJ'), ('device', 'NN'), (',', ','), ('the', 'DT'), ('decoding', 'VBG'), ('engine', 'NN'), ('in', 'IN'), ('operation', 'NN'), ('decoding', 'NN'), ('signals', 'NNS'), ('produced', 'VBD'), ('from', 'IN'), ('a', 'DT'), ('read', 'NN'), ('of', 'IN'), ('a', 'DT'), ('buyer', 'NN'), ("'s", 'POS'), ('financial', 'JJ'), ('transaction', 'NN'), ('card', 'NN'), (',', ','), ('the', 'DT'), ('decoding', 'VBG'), ('engine', 'NN'), ('in', 'IN'), ('operation', 'NN'), ('accepting', 'NN'), ('and', 'CC'), ('initializing', 'VBG'), ('incoming', 'VBG'), ('signals', 'NNS'), ('from', 'IN'), ('the', 'DT'), ('read', 'NN'), ('of', 'IN'), ('the', 'DT'), ('buyer', 'NN'), ("'s", 'POS'), ('financial', 'JJ'), ('transaction', 'NN'), ('card', 'NN'), ('until', 'IN'), ('the', 'DT'), ('signals', 'NNS'), ('reach', 'VBP'),

In [7]:
# Print only the nouns
nouns = [word for word, part in pos if "NN" in part]

In [8]:
print(nouns, end="")

['decoding', 'system', 'engine', 'device', 'engine', 'operation', 'decoding', 'signals', 'read', 'buyer', 'transaction', 'card', 'engine', 'operation', 'accepting', 'signals', 'read', 'buyer', 'transaction', 'card', 'signals', 'state', 'read', 'buyer', 'transaction', 'card', 'incoming', 'signals', 'state', 'peaks', 'incoming', 'signals', 'peaks', 'incoming', 'signals', 'bits', 'transaction', 'engine', 'device', 'decoding', 'engine', 'transaction', 'engine', 'operation', 'receiving', 'input', 'buyer', 'transaction', 'card', 'information', 'engine', 'intermediary', 'buyer', 'merchant', 'buyer', 'share', 'transaction', 'card', 'information', 'merchant']

---
Playing with SpaCy to perform NLP

In [9]:
#import spacy

#from spacy.en import English
#parser = English()
#This causes the kernel to crash and be restarted

Back to the NLTK...
***

Now we try some chunking using Regular Expressions.

In [10]:
# Define Regex rules for Noun Phrases (NP), prepositions (P), Verbs (V), preposition phrase (PP), verb phrase (VP)
grammar = '''
    NP: {<DT>? <JJ>* <NN>*} # NP
    P: {<IN>}           # Preposition
    V: {<V.*>}          # Verb
    PP: {<P> <NP>}      # PP -> P NP
    VP: {<V> <NP|PP>*}  # VP -> V (NP|PP)*
'''
cp = nltk.RegexpParser(grammar)

In [13]:
result = cp.parse(pos)

In [14]:
result.draw()

Not so bad for out of the box. We need some tweaks to recognise some verbs used as adjectives. Maybe if we add a new rule for a noun-phrase ```{<DT>+ <JJ>* <V>+ <NN>+}```...

In [34]:
grammar = '''
    NP: {<DT>+ <VBG>? <NN.?>+} 
        {<DT>? <JJ>* <NN.*>+} # NP
'''

cp = nltk.RegexpParser(grammar)
result = cp.parse(pos)
result.draw()

This is working better and picks up "a decoding engine" as a noun phrase. We also need to add a rule to identify "the buyer's financial transaction card" as a noun phrase. Currently the presence of the possesive "s" is throwing things off.

In [74]:
grammar = '''
    NP: {<DT|PRP\$> <VBG> <NN.*>+} 
        {<DT|PRP\$> <NN.*> <POS> <JJ>* <NN.*>+}
        {<DT|PRP\$>? <JJ>* <NN.*>+ }
'''

cp = nltk.RegexpParser(grammar)
result = cp.parse(pos)
#result.draw()

In [75]:
print(result)

(S
  (NP A/DT decoding/NN system/NN)
  ,/,
  comprising/VBG
  :/:
  (NP a/DT decoding/VBG engine/NN)
  running/VBG
  on/IN
  (NP a/DT mobile/JJ device/NN)
  ,/,
  (NP the/DT decoding/VBG engine/NN)
  in/IN
  (NP operation/NN decoding/NN signals/NNS)
  produced/VBD
  from/IN
  (NP a/DT read/NN)
  of/IN
  (NP a/DT buyer/NN 's/POS financial/JJ transaction/NN card/NN)
  ,/,
  (NP the/DT decoding/VBG engine/NN)
  in/IN
  (NP operation/NN accepting/NN)
  and/CC
  initializing/VBG
  incoming/VBG
  (NP signals/NNS)
  from/IN
  (NP the/DT read/NN)
  of/IN
  (NP the/DT buyer/NN 's/POS financial/JJ transaction/NN card/NN)
  until/IN
  (NP the/DT signals/NNS)
  reach/VBP
  (NP a/DT steady/JJ state/NN)
  ,/,
  detecting/VBG
  (NP the/DT read/NN)
  of/IN
  (NP the/DT buyer/NN 's/POS financial/JJ transaction/NN card/NN)
  once/IN
  (NP the/DT incoming/NN signals/NNS)
  are/VBP
  in/IN
  (NP a/DT steady/JJ state/NN)
  ,/,
  identifying/VBG
  (NP peaks/NNS)
  in/IN
  (NP the/DT incoming/NN signals/NNS)

In [76]:
result.draw()

Now we define a tree traversal function to print all the noun phrases.

In [77]:
result.label()

'S'

In [139]:
def traverse(t, np=False):
    try:
        t.label()
    except AttributeError:
        # This is then a tuple e.g. ('A', 'DT') which is a leaf of the tree
        if np:
            print(t[0], end=" ")
    else:
        # Now we know that t.node is defined
        if "S" in t.label():
            for child in t:
                traverse(child)
        elif "NP" in t.label():
            for child in t:
                traverse(child, np=True)
            print("")

In [140]:
traverse(result)

A decoding system 
a decoding engine 
a mobile device 
the decoding engine 
operation decoding signals 
a read 
a buyer 's financial transaction card 
the decoding engine 
operation accepting 
signals 
the read 
the buyer 's financial transaction card 
the signals 
a steady state 
the read 
the buyer 's financial transaction card 
the incoming signals 
a steady state 
peaks 
the incoming signals 
the identified peaks 
the incoming signals 
bits 
a transaction engine 
the mobile device 
the decoding engine 
the transaction engine 
operation receiving 
its input 
buyer 
financial transaction card information 
the decoding engine 
an intermediary 
the buyer 
a merchant 
the buyer 
share 
his/her financial transaction card information 
the merchant 


It would be good to get the unique entities in these noun phrases. We can ignore the determinant (although this could be used to check antecedence). A good grammar parser should do this - if a noun phrase is repeated it could be used to avoid ambiguity.

But that is for the future.

## Using Patent Data

Expand upon some of the ideas above using our patentdata source.

In [12]:
# We'll start with our test XML file
from patentdata.corpus import USPublications

path = '/patentdata/tests/test_files'
ds = USPublications(path)

pdoc = next(ds.iter_xml()).to_patentdoc()
print(pdoc)

<Patent Document object for US20060085912A1, title: Siderail support mechanism - containing: description with 47 paragraphs and claimset with 39 claims; classifications: [['A', '47', 'C', '21', '08']]


In [13]:
print(pdoc.description.get_paragraph(26).sentences[2])
pos = pdoc.description.get_paragraph(26).sentences[2].pos
print(pos[0:10])

The mounting bracket 20 includes a first opening 25 adapted for receiving a first lower pivot shaft 30 of a first arm 35 and a second opening 40 adapted for receiving a second lower pivot shaft 45 of a second arm 50.
[('The', 'DT'), ('mounting', 'VBG'), ('bracket', 'NN'), ('20', 'CD'), ('includes', 'VBZ'), ('a', 'DT'), ('first', 'JJ'), ('opening', 'NN'), ('25', 'CD'), ('adapted', 'VBN')]


In [3]:
# Just print the POS elements
"".join(["<{0}>".format(p) for w, p in pos])

'<DT><VBG><NN><CD><VBZ><DT><JJ><NN><CD><VBN><IN><VBG><DT><JJ><JJR><NN><NN><CD><IN><DT><JJ><NN><CD><CC><DT><JJ><NN><CD><VBN><IN><VBG><DT><JJ><JJR><NN><NN><CD><IN><DT><JJ><NN><CD><.>'

In [4]:
# Our claims are here
claim1 = pdoc.claimset.get_claim(1)
# Look at some inbuilt features
claim1.split_into_features()

[{'endindex': 45,
  'startindex': 0,
  'text': '\n1. A siderail support mechanism comprising: '},
 {'endindex': 173,
  'startindex': 45,
  'text': 'a mounting bracket having a first lower pivot and a second lower pivot, the mounting bracket configured for mounting to a bed; \n'},
 {'endindex': 467,
  'startindex': 173,
  'text': 'a first support arm having a first upper pivot shaft and a first lower pivot shaft, the first upper pivot shaft configured to pivotally attach to a siderail at a first upper pivot and the first lower pivot shaft configured to pivotally attach to the first lower pivot of the mounting bracket; \n'},
 {'endindex': 770,
  'startindex': 467,
  'text': 'a second support arm having a second upper pivot shaft and a second lower pivot shaft, the second upper pivot shaft configured to pivotally attach to the siderail at a second upper pivot and the second lower pivot shaft configured to pivotally attach to the second lower pivot of the mounting bracket; \n'},
 {'endinde

In [9]:
# Look at our existing entity extraction routine
claim1.label_nounphrases()

([('1', 'CD', ''),
  ('.', '.', ''),
  ('A', 'DT', 1),
  ('siderail', 'JJ', 1),
  ('support', 'NN', 1),
  ('mechanism', 'NN', 1),
  ('comprising', 'NN', 1),
  (':', ':', ''),
  ('a', 'DT', 2),
  ('mounting', 'VBG', 2),
  ('bracket', 'NN', 2),
  ('having', 'VBG', ''),
  ('a', 'DT', ''),
  ('first', 'JJ', ''),
  ('lower', 'JJR', ''),
  ('pivot', 'NN', 3),
  ('and', 'CC', ''),
  ('a', 'DT', ''),
  ('second', 'JJ', ''),
  ('lower', 'JJR', ''),
  ('pivot', 'NN', 3),
  (',', ',', ''),
  ('the', 'DT', 2),
  ('mounting', 'VBG', 2),
  ('bracket', 'NN', 2),
  ('configured', 'VBD', ''),
  ('for', 'IN', ''),
  ('mounting', 'VBG', ''),
  ('to', 'TO', ''),
  ('a', 'DT', 4),
  ('bed', 'NN', 4),
  (';', ':', ''),
  ('a', 'DT', 5),
  ('first', 'JJ', 5),
  ('support', 'NN', 5),
  ('arm', 'NN', 5),
  ('having', 'VBG', ''),
  ('a', 'DT', 6),
  ('first', 'JJ', 6),
  ('upper', 'JJ', 6),
  ('pivot', 'NN', 6),
  ('shaft', 'NN', 6),
  ('and', 'CC', ''),
  ('a', 'DT', ''),
  ('first', 'JJ', ''),
  ('lower', 'JJ

Patterns for noun phrases - always end in a noun (singular or plural) - NN\*.  Actually, as per here - http://www.nltk.org/book/ch07.html - what we are looking for are noun phrase **chunks**, which may be smaller than noun phrases per se.   

Patterns to watch out for:
 ('a', 'DT', 10),
  ('plurality', 'NN', 10),
  ('of', 'IN', ''),
  ('circumferentially', 'RB', ''),
  ('spaced', 'VBN', ''),
  ('notches', 'NNS', 11)
  
  ('a', 'DT', 17),
  ('second', 'JJ', 17),
  ('locking', 'NN', 17),
  ('cog', 'NN', 17),
  
Plurals:
An apparatus comprising: first and second notches.  (But arguably poor claim drafting?)
An apparatus comprising: four portions.



In [None]:
# In our routine our grammar chunking Regexs are
grammar = '''
            NP: {<DT|PRP\$> <VBG> <NN.*>+}
                {<DT|PRP\$> <NN.*> <POS> <JJ>* <NN.*>+}
                {<DT|PRP\$>? <JJ>* <NN.*>+ }
            '''

These aren't perfect and could be improved.

See here for POS tags: https://stackoverflow.com/questions/15388831/what-are-all-possible-pos-tags-of-nltk .  

We can pretty much ignore pronouns - these do not really occur in patent specifications, especially in the claims.  

We always end in a noun - either singular or plural:  
```
{ ... <NN.*> }
```

A large proportion start with a determinant - DT - this will be the case for singular items:
```
{ <DT> ... <NN> }
```
But we want to match first chunks such as "a plurality of" or "a set of":  
```
{ <DT> <NN> <IN> ... <NNS> }
```
In the middle we can have any of: nouns, verbs, adjectives.  

In [None]:
revised_grammar = '''
            NP: {<DT> <NN> <IN> ... <NNS>}
                {<DT> ... <NN>}
            '''

### INSIGHT:  

**Can we use the reference numerals in specifications to train POS patterns for noun phrases in claims?**  

We can use the reference numeral CD as extra information. 

In [2]:
pdoc.description.entities

[[('a', 'DT'),
  ('siderail', 'JJ'),
  ('support', 'NN'),
  ('mechanism', 'NN'),
  ('10', 'CD')],
 [('a', 'DT'), ('siderail', 'NN'), ('15', 'CD')],
 [('The', 'DT'),
  ('siderail', 'JJ'),
  ('support', 'NN'),
  ('mechanism', 'NN'),
  ('10', 'CD')],
 [('a', 'DT'), ('mounting', 'VBG'), ('bracket', 'NN'), ('20', 'CD')],
 [('a', 'DT'),
  ('pair', 'NN'),
  ('of', 'IN'),
  ('fasteners', 'NNS'),
  ('22', 'CD')],
 [('The', 'DT'), ('mounting', 'VBG'), ('bracket', 'NN'), ('20', 'CD')],
 [('a', 'DT'), ('first', 'JJ'), ('opening', 'NN'), ('25', 'CD')],
 [('a', 'DT'),
  ('first', 'JJ'),
  ('lower', 'JJR'),
  ('pivot', 'NN'),
  ('shaft', 'NN'),
  ('30', 'CD')],
 [('a', 'DT'), ('first', 'JJ'), ('arm', 'NN'), ('35', 'CD')],
 [('a', 'DT'), ('second', 'JJ'), ('opening', 'NN'), ('40', 'CD')],
 [('a', 'DT'),
  ('second', 'JJ'),
  ('lower', 'JJR'),
  ('pivot', 'NN'),
  ('shaft', 'NN'),
  ('45', 'CD')],
 [('a', 'DT'), ('second', 'JJ'), ('arm', 'NN'), ('50', 'CD')],
 [('The', 'DT'), ('siderail', 'NN'), ('15',

In [7]:
from collections import Counter
c = Counter(["".join(["<{0}>".format(p) for w, p in entity]) for entity in pdoc.description.entities])

In [9]:
c.most_common()

[('<DT><JJ><NN><CD>', 39),
 ('<DT><VBG><NN><CD>', 37),
 ('<DT><NN><CD>', 36),
 ('<DT><NN><NN><CD>', 36),
 ('<DT><NNS><CD>', 23),
 ('<DT><JJ><NN><NN><CD>', 15),
 ('<DT><JJ><JJR><NN><NN><CD>', 12),
 ('<DT><VBG><NNS><CD>', 11),
 ('<DT><JJ><NNS><CD>', 9),
 ('<DT><NN><NN><NN><CD>', 9),
 ('<DT><NNP><NN><CD>', 7),
 ('<DT><NN><NNS><CD>', 6),
 ('<DT><JJR><NN><NNS><CD>', 4),
 ('<DT><JJR><NN><CD>', 4),
 ('<DT><JJ><CC><JJ><JJR><NN><NNS><CD>', 4),
 ('<DT><RB><VBG><NN><CD>', 3),
 ('<DT><JJ><CC><JJ><NNS><CD>', 3),
 ('<DT><NN><NN><JJ><NN><CD>', 3),
 ('<DT><JJ><JJ><NN><NN><CD>', 2),
 ('<DT><VBG><NN><NN><CD>', 2),
 ('<DT><NN><IN><NNS><CD>', 2),
 ('<DT><JJ><NN><NN><VBG><NN><CD>', 2),
 ('<DT><NNS><VBN><IN><VBG><NN><CD>', 1),
 ('<DT><JJR><NN><NN><CD>', 1),
 ('<DT><NN><IN><NN><NNS><CD>', 1),
 ('<DT><NN><IN><PRP$><JJ><JJR><NN><NN><CD>', 1),
 ('<DT><JJ><CD><CC><VBN><JJR><NN><NN><CD>', 1),
 ('<DT><NN><VBG><NN><CD>', 1),
 ('<DT><VBG><NN><RB><CC><JJ><NNS><CD>', 1),
 ('<DT><VBG><NN><NNS><CD>', 1),
 ('<DT><NN><RB>

In [10]:
print("There are {0} different POS tag sequences".format(len(c)))

There are 38 different POS tag sequences


Hence, it looks like searching for a general ```<DET> ... <NN.*>``` pattern may capture most. But issue is that we may have multiple repeated ```<N>``` blocks. We need to look for the last NN. Or look ahead to next DT and work back until an NN is found.

In [15]:
pos_list = list(enumerate(pos_list))
print(pos_list[5:10])

[(5, ('a', 'DT')), (6, ('first', 'JJ')), (7, ('opening', 'NN')), (8, ('25', 'CD')), (9, ('adapted', 'VBN'))]


In [17]:
list(reversed(pos_list))

[(42, ('.', '.')),
 (41, ('50', 'CD')),
 (40, ('arm', 'NN')),
 (39, ('second', 'JJ')),
 (38, ('a', 'DT')),
 (37, ('of', 'IN')),
 (36, ('45', 'CD')),
 (35, ('shaft', 'NN')),
 (34, ('pivot', 'NN')),
 (33, ('lower', 'JJR')),
 (32, ('second', 'JJ')),
 (31, ('a', 'DT')),
 (30, ('receiving', 'VBG')),
 (29, ('for', 'IN')),
 (28, ('adapted', 'VBN')),
 (27, ('40', 'CD')),
 (26, ('opening', 'NN')),
 (25, ('second', 'JJ')),
 (24, ('a', 'DT')),
 (23, ('and', 'CC')),
 (22, ('35', 'CD')),
 (21, ('arm', 'NN')),
 (20, ('first', 'JJ')),
 (19, ('a', 'DT')),
 (18, ('of', 'IN')),
 (17, ('30', 'CD')),
 (16, ('shaft', 'NN')),
 (15, ('pivot', 'NN')),
 (14, ('lower', 'JJR')),
 (13, ('first', 'JJ')),
 (12, ('a', 'DT')),
 (11, ('receiving', 'VBG')),
 (10, ('for', 'IN')),
 (9, ('adapted', 'VBN')),
 (8, ('25', 'CD')),
 (7, ('opening', 'NN')),
 (6, ('first', 'JJ')),
 (5, ('a', 'DT')),
 (4, ('includes', 'VBZ')),
 (3, ('20', 'CD')),
 (2, ('bracket', 'NN')),
 (1, ('mounting', 'VBG')),
 (0, ('The', 'DT'))]

In [22]:
def entity_finder(pos_list):
    """ Find entities with reference numerals using POS data."""
    entity_list = list()
    record = False
    # Add indices
    enum_pos_list = list(enumerate(pos_list))
    for i, (word, pos) in enum_pos_list:
        if pos == "DT" and not record:
            # Start recording and record start index
            record = True
            start_index = i
            
        if pos == "DT" and record:
            # Step back until last noun is found
            for j, (word, pos) in reversed(enum_pos_list[:i]):
                if "NN" in pos:
                    # Add np_chunk to buffer
                    entity_list.append(pos_list[start_index:j+1])
                    break       
            
            # Set new start index
            start_index = i
    
    return entity_list

In [23]:
entity_finder(pos)

[[('The', 'DT'), ('mounting', 'VBG'), ('bracket', 'NN')],
 [('a', 'DT'), ('first', 'JJ'), ('opening', 'NN')],
 [('a', 'DT'),
  ('first', 'JJ'),
  ('lower', 'JJR'),
  ('pivot', 'NN'),
  ('shaft', 'NN')],
 [('a', 'DT'), ('first', 'JJ'), ('arm', 'NN')],
 [('a', 'DT'), ('second', 'JJ'), ('opening', 'NN')],
 [('a', 'DT'),
  ('second', 'JJ'),
  ('lower', 'JJR'),
  ('pivot', 'NN'),
  ('shaft', 'NN')]]

In [25]:
entity_finder(pdoc.claimset.get_claim(1).pos)

[[('A', 'DT'),
  ('siderail', 'JJ'),
  ('support', 'NN'),
  ('mechanism', 'NN'),
  ('comprising', 'NN')],
 [('a', 'DT'), ('mounting', 'VBG'), ('bracket', 'NN')],
 [('a', 'DT'), ('first', 'JJ'), ('lower', 'JJR'), ('pivot', 'NN')],
 [('a', 'DT'), ('second', 'JJ'), ('lower', 'JJR'), ('pivot', 'NN')],
 [('the', 'DT'), ('mounting', 'VBG'), ('bracket', 'NN')],
 [('a', 'DT'), ('bed', 'NN')],
 [('a', 'DT'), ('first', 'JJ'), ('support', 'NN'), ('arm', 'NN')],
 [('a', 'DT'),
  ('first', 'JJ'),
  ('upper', 'JJ'),
  ('pivot', 'NN'),
  ('shaft', 'NN')],
 [('a', 'DT'),
  ('first', 'JJ'),
  ('lower', 'JJR'),
  ('pivot', 'NN'),
  ('shaft', 'NN')],
 [('the', 'DT'),
  ('first', 'JJ'),
  ('upper', 'JJ'),
  ('pivot', 'NN'),
  ('shaft', 'NN')],
 [('a', 'DT'), ('siderail', 'NN')],
 [('a', 'DT'), ('first', 'JJ'), ('upper', 'JJ'), ('pivot', 'NN')],
 [('the', 'DT'),
  ('first', 'JJ'),
  ('lower', 'JJR'),
  ('pivot', 'NN'),
  ('shaft', 'NN')],
 [('the', 'DT'), ('first', 'JJ'), ('lower', 'JJR'), ('pivot', 'NN')]

In [27]:
# How does this approach work on the description?
desc_entities = list()
for para in pdoc.description.paragraphs:
    for sentence in para.sentences:
        desc_entities += entity_finder(sentence.pos)

In [28]:
desc_entities

[[('A', 'DT'),
  ('siderail', 'JJ'),
  ('support', 'NN'),
  ('mechanism', 'NN'),
  ('with', 'IN'),
  ('multiple', 'JJ'),
  ('locks', 'NNS')],
 [('an', 'DT'), ('impact', 'NN'), ('release', 'NN'), ('feature', 'NN')],
 [('an', 'DT'),
  ('upright', 'JJ'),
  ('deployed', 'NN'),
  ('position', 'NN'),
  (',', ','),
  ('but', 'CC'),
  ('is', 'VBZ'),
  ('adapted', 'VBN'),
  ('to', 'TO'),
  ('release', 'VB'),
  ('upon', 'JJ'),
  ('imposition', 'NN')],
 [('a', 'DT'), ('longitudinal', 'JJ'), ('impact', 'NN'), ('load', 'NN')],
 [],
 [('This', 'DT'),
  ('application', 'NN'),
  ('claims', 'VBZ'),
  ('priority', 'NN'),
  ('under', 'IN'),
  ('35', 'CD'),
  ('U.S.C.', 'NNP'),
  ('§119', 'NNP'),
  ('(', '('),
  ('e', 'NN'),
  (')', ')'),
  ('of', 'IN'),
  ('copending', 'VBG'),
  ('provisional', 'JJ'),
  ('application', 'NN'),
  ('Ser.', 'NNP'),
  ('No.', 'NN'),
  ('60/622', 'CD'),
  ('503', 'CD'),
  ('filed', 'VBN'),
  ('Oct.', 'NNP')],
 [],
 [],
 [('the', 'DT'), ('invention', 'NN')],
 [('another', 'DT')

We still need to filter the FIG and priority phrases. It also looks like we need to stop on CC (but and and).

Using reference numerals looks to work better here, e.g. ```[('a', 'DT'),
  ('mounting', 'VBG'),
  ('bracket', 'NN'),
  ('20', 'CD'),
  ('for', 'IN'),
  ('attachment', 'NN')]```

In [29]:
# Back to the claims
entity_finder(pdoc.claimset.get_claim(2).pos)

[[('The', 'DT'),
  ('siderail', 'JJ'),
  ('support', 'NN'),
  ('mechanism', 'NN'),
  ('of', 'IN'),
  ('claim', 'NN')],
 [('a', 'DT'), ('collar', 'NN')],
 [('the', 'DT'),
  ('second', 'JJ'),
  ('lower', 'JJR'),
  ('pivot', 'NN'),
  ('shaft', 'NN')],
 [('the', 'DT'),
  ('circumferentially', 'RB'),
  ('spaced', 'JJ'),
  ('notches', 'NNS')]]

In [31]:
for claim in pdoc.claimset.claims:
    print(entity_finder(claim.pos))

[[('A', 'DT'), ('siderail', 'JJ'), ('support', 'NN'), ('mechanism', 'NN'), ('comprising', 'NN')], [('a', 'DT'), ('mounting', 'VBG'), ('bracket', 'NN')], [('a', 'DT'), ('first', 'JJ'), ('lower', 'JJR'), ('pivot', 'NN')], [('a', 'DT'), ('second', 'JJ'), ('lower', 'JJR'), ('pivot', 'NN')], [('the', 'DT'), ('mounting', 'VBG'), ('bracket', 'NN')], [('a', 'DT'), ('bed', 'NN')], [('a', 'DT'), ('first', 'JJ'), ('support', 'NN'), ('arm', 'NN')], [('a', 'DT'), ('first', 'JJ'), ('upper', 'JJ'), ('pivot', 'NN'), ('shaft', 'NN')], [('a', 'DT'), ('first', 'JJ'), ('lower', 'JJR'), ('pivot', 'NN'), ('shaft', 'NN')], [('the', 'DT'), ('first', 'JJ'), ('upper', 'JJ'), ('pivot', 'NN'), ('shaft', 'NN')], [('a', 'DT'), ('siderail', 'NN')], [('a', 'DT'), ('first', 'JJ'), ('upper', 'JJ'), ('pivot', 'NN')], [('the', 'DT'), ('first', 'JJ'), ('lower', 'JJR'), ('pivot', 'NN'), ('shaft', 'NN')], [('the', 'DT'), ('first', 'JJ'), ('lower', 'JJR'), ('pivot', 'NN')], [('the', 'DT'), ('mounting', 'VBG'), ('bracket', 'N