# Reference Numeral Clarity Checks

The aim of this notebook is to achieve functionality similar to that provided by TurboPatent in their report generation. This looks for misuse of reference numerals, e.g. if 100 is used as "telephone 100" and "smartphone 100".

In [1]:
# We'll start with our test XML file
from patentdata.corpus import USPublications

# NOTE(review): hard-coded absolute path; it only resolves in an environment
# where the patentdata test files live at this location. Adjust before re-running.
path = '/patentdata/tests/test_files'
ds = USPublications(path)

# Take the first XML document the corpus iterator yields and convert it to a
# patent document object; `pdoc` is reused by the cells below.
pdoc = next(ds.iter_xml()).to_patentdoc()
print(pdoc)

<Patent Document object for US20060085912A1, title: Siderail support mechanism - containing: description with 47 paragraphs and claimset with 39 claims; classifications: [['A', '47', 'C', '21', '08']]


In [3]:
# Working example for the rest of the notebook: a single sentence that packs
# several "<determiner> ... <noun> <numeral>" phrases together.
pdoc.description.get_paragraph(26).sentences[2]

The mounting bracket 20 includes a first opening 25 adapted for receiving a first lower pivot shaft 30 of a first arm 35 and a second opening 40 adapted for receiving a second lower pivot shaft 45 of a second arm 50.

In [6]:
# (word, POS tag) pairs for the example sentence; `pos` is reused by later cells.
pos = pdoc.description.get_paragraph(26).sentences[2].pos
# Only the first ten pairs are printed to keep the output short.
print(pos[0:10])

[('The', 'DT'), ('mounting', 'VBG'), ('bracket', 'NN'), ('20', 'CD'), ('includes', 'VBZ'), ('a', 'DT'), ('first', 'JJ'), ('opening', 'NN'), ('25', 'CD'), ('adapted', 'VBN')]


Our POS pattern is DT X+ CD.  

Can we use NLTK's `RegexpParser` to extract our noun phrases followed by a number?

In [7]:
from nltk import RegexpParser

# Chunk rule: a determiner, then one or more tokens of any tag, then a cardinal
# number. NOTE: `<.*>+` also matches the DT and CD tags in between, so a single
# chunk can run from the first determiner to the last number in the sentence
# (which is what the parse output below shows).
grammar = '''
            NP: {<DT> <.*>+ <CD>}
          '''
cp = RegexpParser(grammar)
# Parse the POS-tagged example sentence into a chunk tree.
parsed = cp.parse(pos)

In [8]:
# Display the parse tree. NOTE(review): the ImportError recorded in the output
# appears to come from the graphical tree display needing tkinter; the plain
# Tree repr is still shown after it.
parsed



ImportError: No module named '_tkinter', please install the python3-tk package

Tree('S', [Tree('NP', [('The', 'DT'), ('mounting', 'VBG'), ('bracket', 'NN'), ('20', 'CD'), ('includes', 'VBZ'), ('a', 'DT'), ('first', 'JJ'), ('opening', 'NN'), ('25', 'CD'), ('adapted', 'VBN'), ('for', 'IN'), ('receiving', 'VBG'), ('a', 'DT'), ('first', 'JJ'), ('lower', 'JJR'), ('pivot', 'NN'), ('shaft', 'NN'), ('30', 'CD'), ('of', 'IN'), ('a', 'DT'), ('first', 'JJ'), ('arm', 'NN'), ('35', 'CD'), ('and', 'CC'), ('a', 'DT'), ('second', 'JJ'), ('opening', 'NN'), ('40', 'CD'), ('adapted', 'VBN'), ('for', 'IN'), ('receiving', 'VBG'), ('a', 'DT'), ('second', 'JJ'), ('lower', 'JJR'), ('pivot', 'NN'), ('shaft', 'NN'), ('45', 'CD'), ('of', 'IN'), ('a', 'DT'), ('second', 'JJ'), ('arm', 'NN'), ('50', 'CD')]), ('.', '.')])

This is just matching the whole sentence. We might need to go simpler and just create a function that parses the POS.

In [65]:
def entity_finder(pos_list):
    """Collect "determiner ... noun numeral" phrases from POS-tagged tokens.

    Args:
        pos_list: indexable sequence of (word, tag) pairs for one sentence.

    Returns:
        A list of entities. Each entity is a list of (word, tag) tuples that
        starts at a determiner (tag "DT") and ends at the first cardinal
        number (tag "CD") whose preceding token has a tag containing "NN".
    """
    found = []
    current = []
    recording = False
    for idx, (token, tag) in enumerate(pos_list):
        if tag == "DT":
            # Every determiner opens a fresh candidate, discarding any
            # phrase that was still in progress.
            recording, current = True, []

        if recording:
            current.append((token, tag))

        if "FIG" in token:
            # Phrases that mention figures ("FIG. 1", "FIGS. 1-13") are not
            # reference-numeral entities, so drop the candidate.
            recording, current = False, []
        elif recording and tag == "CD" and "NN" in pos_list[idx - 1][1]:
            # A number directly after a noun closes the entity. Recording can
            # only be on after a DT token, so idx is at least 1 here.
            found.append(current)
            recording = False

    return found

In [66]:
# Try the finder on the example sentence's POS tags.
entity_finder(pos)

[[('The', 'DT'), ('mounting', 'VBG'), ('bracket', 'NN'), ('20', 'CD')],
 [('a', 'DT'), ('first', 'JJ'), ('opening', 'NN'), ('25', 'CD')],
 [('a', 'DT'),
  ('first', 'JJ'),
  ('lower', 'JJR'),
  ('pivot', 'NN'),
  ('shaft', 'NN'),
  ('30', 'CD')],
 [('a', 'DT'), ('first', 'JJ'), ('arm', 'NN'), ('35', 'CD')],
 [('a', 'DT'), ('second', 'JJ'), ('opening', 'NN'), ('40', 'CD')],
 [('a', 'DT'),
  ('second', 'JJ'),
  ('lower', 'JJR'),
  ('pivot', 'NN'),
  ('shaft', 'NN'),
  ('45', 'CD')],
 [('a', 'DT'), ('second', 'JJ'), ('arm', 'NN'), ('50', 'CD')]]

In [49]:
def filter_entity_list(entity_list):
    """Drop entities that come from a priority-claim sentence.

    An entity is discarded when its words include all three of "claims",
    "priority" and "under" (in any order or position).

    Args:
        entity_list: list of entities, each a list of (word, tag) pairs.

    Returns:
        A new list holding the entities that were kept, in their original order.
    """
    priority_markers = {"claims", "priority", "under"}
    return [
        entity
        for entity in entity_list
        if not priority_markers.issubset(word for word, _ in entity)
    ]

In [50]:
# Same example, now passed through the priority-claim filter.
filter_entity_list(entity_finder(pos))

[[('The', 'DT'), ('mounting', 'VBG'), ('bracket', 'NN'), ('20', 'CD')],
 [('a', 'DT'), ('first', 'JJ'), ('opening', 'NN'), ('25', 'CD')],
 [('a', 'DT'),
  ('first', 'JJ'),
  ('lower', 'JJR'),
  ('pivot', 'NN'),
  ('shaft', 'NN'),
  ('30', 'CD')],
 [('a', 'DT'), ('first', 'JJ'), ('arm', 'NN'), ('35', 'CD')],
 [('a', 'DT'), ('second', 'JJ'), ('opening', 'NN'), ('40', 'CD')],
 [('a', 'DT'),
  ('second', 'JJ'),
  ('lower', 'JJR'),
  ('pivot', 'NN'),
  ('shaft', 'NN'),
  ('45', 'CD')],
 [('a', 'DT'), ('second', 'JJ'), ('arm', 'NN'), ('50', 'CD')]]

In [33]:
def print_entity_list(entity_list):
    """Print the entities as a list of plain, space-joined phrases.

    Args:
        entity_list: list of entities, each a list of (word, tag) pairs.
    """
    phrases = []
    for entity in entity_list:
        # Keep only the words; the POS tags are not shown.
        phrases.append(" ".join(word for word, _ in entity))
    print(phrases)

In [34]:
# Readable view of the entities found in the example sentence.
print_entity_list(entity_finder(pos))

['The mounting bracket 20', 'a first opening 25', 'a first lower pivot shaft 30', 'a first arm 35', 'a second opening 40', 'a second lower pivot shaft 45', 'a second arm 50']


In [None]:
def list_entities(pdoc):
    """Find, filter and print every numbered entity in a document's description.

    Args:
        pdoc: patent document object; this function reads
            `pdoc.description.paragraphs`, each paragraph's `sentences`, and
            each sentence's `pos`.

    Returns:
        The filtered list of entities (lists of (word, tag) tuples). The same
        list is also printed as plain phrases.
    """
    candidates = [
        entity
        for para in pdoc.description.paragraphs
        for sentence in para.sentences
        for entity in entity_finder(sentence.pos)
    ]
    entities = filter_entity_list(candidates)
    print_entity_list(entities)
    return entities

In [67]:
# Collect entities over the whole description (this also prints them);
# `entities` is reused by the cells below.
entities = list_entities(pdoc)

['a siderail support mechanism 10', 'a siderail 15', 'The siderail support mechanism 10', 'a mounting bracket 20', 'a pair of fasteners 22', 'The mounting bracket 20', 'a first opening 25', 'a first lower pivot shaft 30', 'a first arm 35', 'a second opening 40', 'a second lower pivot shaft 45', 'a second arm 50', 'The siderail 15', 'a first opening 55', 'a first upper pivot shaft 60', 'the first arm 35', 'a second opening 65', 'a second upper pivot shaft 70', 'the second arm 50', 'The siderail first and second openings 55', 'the mounting bracket first and second openings 25', 'the first and second arms 35', 'the siderail 15', 'the mounting bracket 20', 'the first and second arms 35', 'the first and second lower pivot shafts 30', 'The first and second lower pivot shafts 30', 'a synchronization link 85', 'the toggles 75', 'The synchronization link 85', 'the toggles 75', 'the synchronization link 85 and through apertures 110', 'the toggles 75', 'the first and second arms 35', 'the siderai

In my first run there were some anomalies here. These include "This application claims priority under 35", "the embodiments of FIGS. 1-13" and "The siderail support mechanism 10".

I can tweak the parsing function to reset if "FIG" is in the word. I can also filter out entities with the phrase "claims priority under".  

There will likely be more occurrences to filter. (We may also want to check for the presence of a noun (NN\*) before the number.)

Once we have a list of entities - we need to compress them into a set and check that the reference numerals are always consistently used.  

First create a set of entity N-grams. Then reverse and create a dictionary with the reference numeral as an index. Compare.

In [54]:
# Inspect the raw (word, tag) form of the collected entities.
entities

[[('a', 'DT'),
  ('siderail', 'JJ'),
  ('support', 'NN'),
  ('mechanism', 'NN'),
  ('10', 'CD')],
 [('a', 'DT'), ('siderail', 'NN'), ('15', 'CD')],
 [('The', 'DT'),
  ('siderail', 'JJ'),
  ('support', 'NN'),
  ('mechanism', 'NN'),
  ('10', 'CD')],
 [('a', 'DT'), ('mounting', 'VBG'), ('bracket', 'NN'), ('20', 'CD')],
 [('a', 'DT'),
  ('pair', 'NN'),
  ('of', 'IN'),
  ('fasteners', 'NNS'),
  ('22', 'CD')],
 [('The', 'DT'), ('mounting', 'VBG'), ('bracket', 'NN'), ('20', 'CD')],
 [('a', 'DT'), ('first', 'JJ'), ('opening', 'NN'), ('25', 'CD')],
 [('a', 'DT'),
  ('first', 'JJ'),
  ('lower', 'JJR'),
  ('pivot', 'NN'),
  ('shaft', 'NN'),
  ('30', 'CD')],
 [('a', 'DT'), ('first', 'JJ'), ('arm', 'NN'), ('35', 'CD')],
 [('a', 'DT'), ('second', 'JJ'), ('opening', 'NN'), ('40', 'CD')],
 [('a', 'DT'),
  ('second', 'JJ'),
  ('lower', 'JJR'),
  ('pivot', 'NN'),
  ('shaft', 'NN'),
  ('45', 'CD')],
 [('a', 'DT'), ('second', 'JJ'), ('arm', 'NN'), ('50', 'CD')],
 [('The', 'DT'), ('siderail', 'NN'), ('15',

In [61]:
def get_entity_set(entity_list):
    """Return the distinct entity names found in a list of entities.

    Each entity is reduced to its words joined by single spaces, leaving out
    every token tagged as a determiner ("DT") or a cardinal number ("CD"),
    wherever it occurs in the entity.

    Args:
        entity_list: list of entities, each a list of (word, tag) pairs.

    Returns:
        A set of name strings.
    """
    skipped_tags = ('DT', 'CD')
    return {
        " ".join(word for word, tag in entity if tag not in skipped_tags)
        for entity in entity_list
    }

In [68]:
# Distinct entity names across the whole description.
get_entity_set(entities)

{'L-shaped slot',
 'angle',
 'angles',
 'anterior face',
 'aperture',
 'arcuate indexing slot',
 'arms',
 'axis of its respective lower pivot shaft',
 'base',
 'bias of spring',
 'bushing',
 'bypass plate',
 'catch',
 'catches',
 'central shaft aperture',
 'circumferential portion',
 'collar',
 'collar or notched lower pivot shaft',
 'collars',
 'cutout section',
 'cutout sections',
 'detent',
 'differences found in locking plate',
 'dog-bone locking plate',
 'face',
 'first and second arms',
 'first and second lower pivot shafts',
 'first arm',
 'first end',
 'first lower pivot shaft',
 'first notch',
 'first notches',
 'first opening',
 'first upper pivot shaft',
 'indexing aperture',
 'indexing ball bearing',
 'indexing detent',
 'indexing detents',
 'indexing pin',
 'indexing slot',
 'inwardly projecting locking cog',
 'lateral faces , of locking cog',
 'lock release pin aperture',
 'locking cog',
 'locking cogs',
 'locking plate',
 'lockout pin',
 'lockout pin aperture',
 'lower e

In [75]:
def get_entity_dict(entity_list):
    """Group entity names by the reference numeral that closes each entity.

    The numeral is the word of the entity's last token. The name is the words
    between the first and last token joined by spaces, so the leading token
    (expected to be the determiner) and the trailing numeral are dropped,
    while any numeral in the middle of the entity is kept.

    Args:
        entity_list: list of non-empty entities, each a list of (word, tag) pairs.

    Returns:
        A dict mapping each numeral string to the list of distinct names seen
        with it, in order of first appearance.
    """
    names_by_numeral = {}
    for entity in entity_list:
        numeral = entity[-1][0]
        name = " ".join(word for word, _ in entity[1:-1])
        variants = names_by_numeral.setdefault(numeral, [])
        # Record each wording only once per numeral.
        if name not in variants:
            variants.append(name)
    return names_by_numeral

In [76]:
# Names used with each reference numeral across the description.
get_entity_dict(entities)

{'10': ['siderail support mechanism'],
 '100': ['siderail support mechanism'],
 '110': ['synchronization link 85 and through apertures'],
 '120': ['series of circumferentially spaced notches',
  'notches',
  'respective notches'],
 '125': ['notches', 'respective notches'],
 '140': ['collar'],
 '145': ['pair of notches'],
 '15': ['siderail'],
 '155': ['“ dog-bone ” locking plate',
  'dog-bone locking plate',
  'locking plate'],
 '160': ['first end'],
 '165': ['oblong aperture', 'oblong apertures'],
 '170': ['second end'],
 '175': ['oblong aperture'],
 '180': ['inwardly projecting locking cog', 'locking cogs'],
 '190': ['lockout pin aperture'],
 '195': ['lock release pin aperture'],
 '20': ['mounting bracket'],
 '200': ['lockout pin'],
 '205': ['washer'],
 '210': ['bushing'],
 '215': ['bypass plate'],
 '22': ['pair of fasteners'],
 '220': ['central shaft aperture', 'shaft aperture'],
 '225': ['face'],
 '230': ['L-shaped slot'],
 '235': ['arcuate indexing slot', 'indexing slot'],
 '240': 

In [77]:
def highlight_multiple(entity_dict):
    """Print every reference numeral that is used with more than one name.

    Args:
        entity_dict: mapping of reference numeral to a list of names, as
            built by get_entity_dict.
    """
    ambiguous = (
        (numeral, names)
        for numeral, names in entity_dict.items()
        if len(names) > 1
    )
    for numeral, names in ambiguous:
        print(numeral, names)

In [78]:
# Report numerals that appear with more than one distinct wording.
highlight_multiple(get_entity_dict(entities))

165 ['oblong aperture', 'oblong apertures']
240 ['pair of indexing detents', 'indexing detents', 'detent']
125 ['notches', 'respective notches']
345 ['locking cog', 'locking cogs']
120 ['series of circumferentially spaced notches', 'notches', 'respective notches']
155 ['“ dog-bone ” locking plate', 'dog-bone locking plate', 'locking plate']
330 ['cutout section', 'cutout sections']
335 ['pair of oblong apertures', 'oblong apertures', 'oblong aperture']
302 ['“ dog-bone ” locking plate', 'differences found in locking plate', 'locking plate']
235 ['arcuate indexing slot', 'indexing slot']
410 ['respective angle', 'angle']
405 ['respective angle', 'angles', 'angle']
325 ['second notch', 'second notches']
30 ['first lower pivot shaft', 'first and second lower pivot shafts', 'axis of its respective lower pivot shaft', 'lower pivot shafts', 'collar 140 or notched lower pivot shaft']
350 ['catch', 'catches']
25 ['first opening', 'mounting bracket first and second openings']
285 ['spring', 'bi