# Testing Description Entity Extraction Functions in PatentData

And experimenting with new functions.

In [1]:
# Initialise logging and get it to print to notebook
# Based on answer here - 
# https://stackoverflow.com/questions/35936086/jupyter-notebook-does-not-print-logs-to-the-output-cell
import logging
import sys
import os
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logger = logging.getLogger()
logger.handlers[0].stream = sys.stdout
logger.level = logging.INFO
logger.info("Test logging info > output")

INFO : Test logging info > output


In [2]:
# Check for saved files
from patentdata.corpus import USPublications
from patentdata.models.patentcorpus import PatentCorpus

filename = "g06_100.patcorp.zip"

if os.path.isfile(filename):
    pcorp = PatentCorpus.load(filename)
else:
    pubs = USPublications("/media/SAMSUNG1/Patent_Downloads")
    pcorp = PatentCorpus.init_by_classification(pubs, ['G', '06'], sample_size=100)
    pcorp.save(filename)

INFO : Loading Patent Corpus
INFO : Loaded Patent Corpus with 100 documents


In [3]:
# Double check loaded corpus
print("There are {0} documents in the corpus".format(len(pcorp.documents)))
print("The title of the first document is: {0}".format(pcorp.documents[0].title))


There are 100 documents in the corpus
The title of the first document is: METHOD, APPARATUS AND SYSTEM FOR ANSWERING REQUESTS ON PEER-TO-PEER OVERLAY NETWORK


In [4]:
# Check for saved files
from patentdata.corpus import USGrants

filename = "g06_100_grants.patcorp.zip"

if os.path.isfile(filename):
    pcorp2 = PatentCorpus.load(filename)
else:
    grants = USGrants("/media/SAMSUNG1/US_Grants")
    pcorp2 = PatentCorpus.init_by_classification(grants, ['G', '06'], sample_size=100)
    pcorp2.save(filename)

INFO : Loading Patent Corpus
INFO : Loaded Patent Corpus with 200 documents


Observation:
* Maybe add each patent document to zip file as we go along. Especially useful if sampling 1000 documents or more.
* Still takes a minute or so to load 100 documents.

In [5]:
# Check why above is loading with 200 documents even though there are only 100 in the zip file
pcorp2.documents = pcorp2.documents[100:]

In [6]:
# Double check loaded corpus
print("There are {0} documents in the corpus".format(len(pcorp2.documents)))
print("The title of the first document is: {0}".format(pcorp2.documents[0].title))

There are 100 documents in the corpus
The title of the first document is: Pistol grip for a portable terminal with an internal receptacle for a stylus


In [7]:
pcorp.documents[4].claimset.get_claim(1)

1 
1. A detachable assembly for installing and detaching a memory module having a printed circuit board in a memory slot, the printed circuit board defining a terminal portion and a free portion arranged opposite the terminal portion, comprising:
a plurality of metal parts disposed on the free portion of the printed circuit board.


In [8]:
pcorp.documents[4].claimset.get_claim(1).entities

INFO : First pass - entity label heuristics
INFO : 
 - [{0: 1}]
INFO : 1 - [{}]
INFO : . - [{0: 1}]
INFO : A - [{0: 0}]
INFO : detachable - [{0: 0}]
INFO : assembly - [{0: 0}]
INFO : for - [{0: 1}]
INFO : installing - [{}]
INFO : and - [{}]
INFO : detaching - [{}]
INFO : a - [{0: 0}]
INFO : memory - [{0: 0}]
INFO : module - [{0: 0}]
INFO : having - [{0: 1}]
INFO : a - [{0: 0}]
INFO : printed - [{0: 0}]
INFO : circuit - [{0: 0}]
INFO : board - [{0: 0}]
INFO : in - [{}]
INFO : a - [{0: 0}]
INFO : memory - [{0: 0}]
INFO : slot - [{0: 0}]
INFO : , - [{0: 1}]
INFO : the - [{0: 0}]
INFO : printed - [{0: 0}]
INFO : circuit - [{0: 0}]
INFO : board - [{0: 0}]
INFO : defining - [{}]
INFO : a - [{0: 0}]
INFO : terminal - [{0: 0}]
INFO : portion - [{0: 0}]
INFO : and - [{}]
INFO : a - [{0: 0}]
INFO : free - [{0: 0}]
INFO : portion - [{0: 0}]
INFO : arranged - [{}]
INFO : opposite - [{}]
INFO : the - [{0: 0}]
INFO : terminal - [{0: 0}]
INFO : portion - [{0: 0}]
INFO : , - [{0: 1}]
INFO : comprising

--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.5/logging/__init__.py", line 980, in emit
    msg = self.format(record)
  File "/usr/lib/python3.5/logging/__init__.py", line 830, in format
    return fmt.format(record)
  File "/usr/lib/python3.5/logging/__init__.py", line 567, in format
    record.message = record.getMessage()
  File "/usr/lib/python3.5/logging/__init__.py", line 330, in getMessage
    msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
  File "/usr/lib/python3.5/runpy.py", line 184, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.5/dist-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/usr/local/lib/python

INFO : Probs: module - [{0: 0}]
INFO : Probs: having - [{0: 1}]
INFO : Probs: a - [{0: 0}]
INFO : Probs: printed - [{0: 0}]
INFO : Probs: circuit - [{0: 0}]
INFO : Probs: board - [{0: 0}]
INFO : Probs: in - [{0: 1}]
INFO : Probs: a - [{0: 0}]
INFO : Probs: memory - [{0: 0}]
INFO : Probs: slot - [{0: 0}]
INFO : Probs: , - [{0: 1}]
INFO : Probs: the - [{0: 0}]
INFO : Probs: printed - [{0: 0}]
INFO : Probs: circuit - [{0: 0}]
INFO : Probs: board - [{0: 0}]
INFO : Probs: defining - [{0: 1}]
INFO : Probs: a - [{0: 0}]
INFO : Probs: terminal - [{0: 0}]
INFO : Probs: portion - [{0: 0}]
INFO : Probs: and - [{0: 1}]
INFO : Probs: a - [{0: 0}]
INFO : Probs: free - [{0: 0}]
INFO : Probs: portion - [{0: 0}]
INFO : Probs: arranged - [{0: 1}]
INFO : Probs: opposite - [{0: 1}]
INFO : Probs: the - [{0: 0}]
INFO : Probs: terminal - [{0: 0}]
INFO : Probs: portion - [{0: 0}]
INFO : Probs: , - [{0: 1}]
INFO : Probs: comprising - [{0: 1}]
INFO : Probs: : - [{0: 1}]
INFO : Probs: 
 - [{0: 1}]
INFO : Probs: 

[<Entity - name: detachable assembly; occurrences: [A detachable assembly]; children: []; limitations: [],
 <Entity - name: memory module; occurrences: [a memory module]; children: []; limitations: [],
 <Entity - name: printed circuit board; occurrences: [a printed circuit board, the printed circuit board, the printed circuit board]; children: []; limitations: [],
 <Entity - name: memory slot; occurrences: [a memory slot]; children: []; limitations: [],
 <Entity - name: terminal portion; occurrences: [a terminal portion, the terminal portion]; children: []; limitations: [],
 <Entity - name: free portion; occurrences: [a free portion, the free portion]; children: []; limitations: [],
 <Entity - name: plurality of metal parts; occurrences: [a plurality of metal parts]; children: []; limitations: []]

If we can model parent-child relationships we can assign reference numerals based on these. Assign numerals to things explicitly present first.  

If object may not need reference numeral.  

Reference numerals may be denominated based on entity number: e.g. < 10, 110, 120, etc; > 10 < 20 105, 110, 115; > 20 < 50 102, 104 etc.

## Looking at Entities in Description

In [9]:
pcorp.documents[4].description.entities

{'1': [memory module,
  memory module,
  memory module,
  memory module,
  memory module,
  memory module,
  memory module,
  memory module,
  memory module,
  memory module,
  memory module],
 '10': [printed circuit board,
  printed circuit board,
  printed circuit board,
  printed circuit board,
  printed circuit board,
  printed circuit board,
  printed circuit board,
  printed circuit board,
  printed circuit board,
  printed circuit board,
  printed circuit board,
  printed circuit board,
  printed circuit board,
  printed circuit board],
 '11': [plurality of packaged integrated circuits,
  packaged integrated circuits,
  circuits,
  circuits],
 '12': [module card, module card],
 '13': [terminal portion,
  terminal portion,
  terminal portion,
  terminal portion,
  terminal portion,
  terminal portion],
 '130': [metal contact region, metal contact region],
 '131': [plurality of ground terminals,
  ground terminal,
  ground terminal,
  ground terminal,
  ground terminal,
  ground t

*** Maybe supply this data structure as an ordered dict based on int(key)... ***

We can then attempt to match description and claim entities.  

And develop an entity list over the complete claimset.  

In [10]:
pcorp.documents[4].description.get_paragraph(5)

5 As the volume of electronic devices continues to miniaturize, the processing speed and operational functions are gradually becoming faster and more powerful, respectively. As a result, mainboards and electronic components in electronic devices are not only required to be miniaturized, but are also required to provide even higher operational speed in order to meet users' demand. As the data processing power increases, the demand for higher capacity and speed from memory modules of computers increases.

In [11]:
pcorp.documents[4].description.get_paragraph(5).sentences

[As the volume of electronic devices continues to miniaturize, the processing speed and operational functions are gradually becoming faster and more powerful, respectively.,
 As a result, mainboards and electronic components in electronic devices are not only required to be miniaturized, but are also required to provide even higher operational speed in order to meet users' demand.,
 As the data processing power increases, the demand for higher capacity and speed from memory modules of computers increases.]

In [12]:
for word in pcorp.documents[4].description.get_paragraph(5).sentences[0]:
    print(word.text, word.pos_)

As ADP
the DET
volume NOUN
of ADP
electronic ADJ
devices NOUN
continues VERB
to PART
miniaturize VERB
, PUNCT
the DET
processing NOUN
speed NOUN
and CCONJ
operational ADJ
functions NOUN
are VERB
gradually ADV
becoming VERB
faster ADV
and CCONJ
more ADV
powerful ADJ
, PUNCT
respectively ADV
. PUNCT


Using the spaCy pos data a reference numeral is indicated by "NUM" as opposed to "CD".

In [25]:
def entity_finder(pos_list):
    """ Find entities with reference numerals using POS data."""
    entity_list = list()
    entity = []
    record = False
    for i, (word, pos) in enumerate(pos_list):
        if pos == "DT":
            record = True
            entity = []

        if record:
            entity.append((word, pos))

        if "FIG" in word:
            # reset entity to ignore phrases that refer to Figures
            record = False
            entity = []

        if pos == "CD" and entity and record and ('NN' in pos_list[i-1][1]):
            record = False
            entity_list.append(entity)

    return entity_list

In [66]:
from spacy.symbols import NUM, DET, NOUN

def ref_num_entity_finder(doc):
    """ Find entities with reference numerals a sentence 
    in the form of a spaCy span."""
    entity_list = list()
    start_index = 0
    record = False
    for token in doc:
        if token.pos == DET:
            record = True
            start_index = token.i
            
        if "FIG" in token.text:
            record = False
            
        if token.pos == NUM and doc[token.i-1].pos == NOUN:
             # Hack for plural nouns that may lack a determinant
            if not record and doc[token.i-1].tag_ == "NNS":
                # Follow tree for plural noun phrase
                children = [c for c in doc[token.i-1].children]
                if children:
                    start_index = children[0].i
                entity_list.append(doc[start_index:token.i+1])
            # Add 
            if record:
                record = False
                entity_list.append(doc[start_index:token.i+1])

    return entity_list

In [67]:
entity_finder(pos_list)

[[('the', 'DT'), ('memory', 'NN'), ('module', 'NN'), ('1', 'CD')]]

In [68]:
ref_num_entity_finder(pcorp.documents[4].description.get_paragraph(33).sentences[1].doc)

[the memory module 1,
 the memory module 1,
 integrated circuits 11,
 the ground terminal 131,
 metal foils 21A.]

In [49]:
for t in pcorp.documents[4].description.get_paragraph(33).sentences[1]:
    print(t.text, t.i)

Specifically 37
, 38
during 39
the 40
installation 41
or 42
detachment 43
of 44
the 45
memory 46
module 47
1 48
, 49
integrated 50
circuits 51
11 52
can 53
obviate 54
damages 55
from 56
static 57
electricity 58
that 59
are 60
stored 61
in 62
users 63
' 64
hands 65
by 66
transmitting 67
the 68
static 69
electricity 70
to 71
the 72
ground 73
terminal 74
131 75
via 76
metal 77
foils 78
21A. 79


In [53]:
# Watch out each paragraph is a doc - sentences are just a span in the doc 
# - indexes are relative to the doc not the sentence
pcorp.documents[4].description.get_paragraph(33).doc[51]

circuits

In [62]:
pcorp.documents[4].description.get_paragraph(29).text

"The detachable assembly 20 includes a plurality of metal parts 21 disposed on the free portion 14 of the printed circuit board 10. In the instant embodiment, the metal parts 21 are metal foils 21A (such as copper foil), of which the quantity is set as four, and are disposed in pairs on the free portion 14 of the printed circuit board 10. The metal parts 21 are disposed in pairs on the free portion 14 of the printed circuit board 10. Furthermore, the two pairs of metal foils 21A are arranged with a predetermined distance D therebetween, in which each metal foil 21A has a substantially square shape and an area to facilitate users' fingers to exert force thereon."

This basic algorithm misses the plurals as these do not have a determinant.

In [28]:
from patentdata.models.lib.utils import (
    filter_entity_list
)
entities = list()
for para in self.paragraphs:
    for sentence in para.sentences:
        e = entity_finder(
                        [(w.text, w.tag_) for w in sentence]
                    )
        entities += e
    _entities = filter_entity_list(entities)
print(entities)
print(_entities)

[[('a', 'DT'), ('memory', 'NN'), ('module', 'NN'), ('1', 'CD')], [('The', 'DT'), ('memory', 'NN'), ('module', 'NN'), ('1', 'CD')], [('a', 'DT'), ('printed', 'VBN'), ('circuit', 'NN'), ('board', 'NN'), ('10', 'CD')], [('a', 'DT'), ('detachable', 'JJ'), ('assembly', 'NN'), ('20', 'CD')], [('the', 'DT'), ('printed', 'VBN'), ('circuit', 'NN'), ('board', 'NN'), ('10', 'CD')], [('a', 'DT'), ('plurality', 'NN'), ('of', 'IN'), ('packaged', 'VBN'), ('integrated', 'JJ'), ('circuits', 'NNS'), ('11', 'CD')], [('a', 'DT'), ('module', 'NN'), ('card', 'NN'), ('12', 'CD')], [('the', 'DT'), ('packaged', 'VBN'), ('integrated', 'VBN'), ('circuits', 'NNS'), ('11', 'CD')], [('the', 'DT'), ('module', 'NN'), ('card', 'NN'), ('12', 'CD')], [('the', 'DT'), ('printed', 'VBN'), ('circuit', 'NN'), ('board', 'NN'), ('10', 'CD')], [('a', 'DT'), ('terminal', 'NN'), ('portion', 'NN'), ('13', 'CD')], [('a', 'DT'), ('free', 'JJ'), ('portion', 'NN'), ('14', 'CD')], [('the', 'DT'), ('terminal', 'NN'), ('portion', 'NN'), 

In [30]:
def get_refnum_dict(entity_list):
    """ Get a dictionary of reference numerals indexed by entity ngram. """
    ngram_list = list()
    for entity in entity_list:
        # Add tuple of ngram without determinant and ref num to list
        ngram_list.append(
            (
                " ".join(
                            [
                                word for word, pos in entity if (pos != 'DT' and pos != 'CD')
                            ]
                        ),
                entity[-1][0]
            )
        )
    # Sort through list and generate dictionary
    entity_dict = dict()
    for ngram, ref_num in ngram_list:
        if ngram not in entity_dict.keys():
            entity_dict[ngram] = list()
        # Check if a variation already exists
        exists = False
        if ref_num not in entity_dict[ngram]:
            entity_dict[ngram].append(ref_num)
    return entity_dict

In [31]:
get_refnum_dict(_entities)

{'annular side wall': ['212B.'],
 'base wall': ['211B'],
 'detachable assembly': ['20'],
 'first and second surface': ['16'],
 'first and second surfaces': ['16,17', '16'],
 'first corner': ['161'],
 'first corners': ['161'],
 'first coupling structure': ['165'],
 'first coupling structures': ['165'],
 'first retaining structure': ['213B'],
 'first surface': ['16'],
 'fourth corner': ['164', '174'],
 'fourth corners': ['164'],
 'free portion': ['14'],
 'free portions': ['14'],
 'ground terminal': ['131'],
 'long side': ['18'],
 'memory module': ['1'],
 'metal contact region': ['130'],
 'metal foil': ['21A'],
 'metal foils': ['21A'],
 'metal parts': ['21'],
 'metal sleeve': ['21B'],
 'metal sleeves': ['21B'],
 'metal trace': ['15'],
 'module card': ['12'],
 'opposing second surface': ['17'],
 'packaged integrated circuits': ['11'],
 'pairs of metal foils': ['21A'],
 'plurality of first coupling structures': ['165'],
 'plurality of ground terminals': ['131'],
 'plurality of metal parts':

In [32]:
def get_entity_dict(entity_list):
    """ Get a dictionary of entities indexed by reference numeral."""
    entity_dict = {}
    for entity in entity_list:
        ref_num = entity[-1][0]
        if ref_num not in entity_dict.keys():
            entity_dict[ref_num] = list()
        # Check if a variation already exists
        exists = False
        n_gram = " ".join([w for w, _ in entity[1:-1]])
        for existing in entity_dict[ref_num]:
            if n_gram == existing:
                exists = True
        if not exists:
            entity_dict[ref_num].append(n_gram)
    return entity_dict

In [33]:
get_entity_dict(_entities)

{'1': ['memory module'],
 '10': ['printed circuit board'],
 '11': ['plurality of packaged integrated circuits',
  'packaged integrated circuits'],
 '12': ['module card'],
 '13': ['terminal portion'],
 '130': ['metal contact region'],
 '131': ['plurality of ground terminals', 'ground terminal'],
 '14': ['free portion', 'free portions'],
 '15': ['plurality of metal traces', 'metal trace'],
 '16': ['first surface',
  'first and second surface',
  'first and second surfaces'],
 '16,17': ['first and second surfaces'],
 '161': ['first corner', 'second corner', 'first corners'],
 '163': ['third corner', 'third corners'],
 '164': ['fourth corner', 'fourth corners'],
 '165': ['plurality of first coupling structures',
  'first coupling structure',
  'first coupling structures'],
 '17': ['second surface', 'opposing second surface'],
 '173': ['third corner'],
 '174': ['fourth corner'],
 '175': ['plurality of second coupling structures',
  'second coupling structure',
  'second coupling structures'

This is picking up some plural entities in the specification. Maybe once we develop this we can implement another parse looking for string matches to dictionary entries.  

Also:
```
'214B': ['second retaining structure'],
'214B.': ['second retaining structure'],
```

In [34]:
ed_from_desc = get_entity_dict(_entities)

In [35]:
type(ed_from_desc['21B'][0])

str

We need to adapt these functions to use the entity objects so we can store references to particular tokens rather than strings.

In [41]:
from patentdata.models.lib.utils_entities import (
    extract_entities
)

ents = extract_entities(pcorp.documents[4].description.get_paragraph(33).doc)
print(ents)

INFO : First pass - entity label heuristics
INFO : As - [{}]
INFO : a - [{0: 0}]
INFO : result - [{0: 0}]
INFO : , - [{0: 1}]
INFO : users - [{0: 0}]
INFO : can - [{}]
INFO : evenly - [{}]
INFO : exert - [{}]
INFO : force - [{0: 0}]
INFO : with - [{}]
INFO : two - [{}]
INFO : hands - [{0: 0}]
INFO : to - [{}]
INFO : ensure - [{}]
INFO : the - [{0: 0}]
INFO : installation - [{0: 0}]
INFO : and - [{}]
INFO : the - [{0: 0}]
INFO : detachment - [{0: 0}]
INFO : of - [{0: 0}]
INFO : the - [{0: 0}]
INFO : memory - [{0: 0}]
INFO : module - [{0: 0}]
INFO : 1 - [{}]
INFO : respectively - [{}]
INFO : on - [{}]
INFO : and - [{}]
INFO : from - [{}]
INFO : the - [{0: 0}]
INFO : memory - [{0: 0}]
INFO : slot - [{0: 0}]
INFO : S - [{0: 0}]
INFO : of - [{}]
INFO : the - [{0: 0}]
INFO : computer - [{0: 0}]
INFO : motherboard - [{0: 0}]
INFO : . - [{0: 1}]
INFO : Specifically - [{}]
INFO : , - [{0: 1}]
INFO : during - [{}]
INFO : the - [{0: 0}]
INFO : installation - [{0: 0}]
INFO : or - [{}]
INFO : detac

--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.5/logging/__init__.py", line 980, in emit
    msg = self.format(record)
  File "/usr/lib/python3.5/logging/__init__.py", line 830, in format
    return fmt.format(record)
  File "/usr/lib/python3.5/logging/__init__.py", line 567, in format
    record.message = record.getMessage()
  File "/usr/lib/python3.5/logging/__init__.py", line 330, in getMessage
    msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
  File "/usr/lib/python3.5/runpy.py", line 184, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.5/dist-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/usr/local/lib/python

INFO : Step back token - module with pos - 90
INFO : the is e_0=1 or DET - looking back
INFO : Step back token - of with pos - 83
INFO : Setting non-Noun
INFO : Step back token - S with pos - 90
INFO : . is e_0=1 or DET - looking back
INFO : Step back token - motherboard with pos - 90
INFO : , is e_0=1 or DET - looking back
INFO : Step back token - Specifically with pos - 84
INFO : Setting non-Noun
INFO : Step back token - . with pos - 95
INFO : Setting non-Noun
INFO : the is e_0=1 or DET - looking back
INFO : Step back token - during with pos - 83
INFO : Setting non-Noun
INFO : Step back token - , with pos - 95
INFO : Setting non-Noun
INFO : Step back token - Specifically with pos - 84
INFO : Setting non-Noun
INFO : Step back token - . with pos - 95
INFO : Setting non-Noun
INFO : the is e_0=1 or DET - looking back
INFO : Step back token - of with pos - 83
INFO : Setting non-Noun
INFO : Step back token - detachment with pos - 90
INFO : , is e_0=1 or DET - looking back
INFO : Step back 

--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.5/logging/__init__.py", line 980, in emit
    msg = self.format(record)
  File "/usr/lib/python3.5/logging/__init__.py", line 830, in format
    return fmt.format(record)
  File "/usr/lib/python3.5/logging/__init__.py", line 567, in format
    record.message = record.getMessage()
  File "/usr/lib/python3.5/logging/__init__.py", line 330, in getMessage
    msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
  File "/usr/lib/python3.5/runpy.py", line 184, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.5/dist-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/usr/local/lib/python

INFO : Probs: module - [{0: 0}]
INFO : Probs: 1 - [{0: 1}]
INFO : Probs: respectively - [{0: 1}]
INFO : Probs: on - [{0: 1}]
INFO : Probs: and - [{0: 1}]
INFO : Probs: from - [{0: 1}]
INFO : Probs: the - [{0: 0}]
INFO : Probs: memory - [{0: 0}]
INFO : Probs: slot - [{0: 0}]
INFO : Probs: S - [{0: 0}]
INFO : Probs: of - [{0: 1}]
INFO : Probs: the - [{0: 0}]
INFO : Probs: computer - [{0: 0}]
INFO : Probs: motherboard - [{0: 0}]
INFO : Probs: . - [{0: 1}]
INFO : Probs: Specifically - [{0: 1}]
INFO : Probs: , - [{0: 1}]
INFO : Probs: during - [{0: 1}]
INFO : Probs: the - [{0: 0}]
INFO : Probs: installation - [{0: 0}]
INFO : Probs: or - [{}]
INFO : Probs: detachment - [{0: 0}]
INFO : Probs: of - [{0: 1}]
INFO : Probs: the - [{0: 0}]
INFO : Probs: memory - [{0: 0}]
INFO : Probs: module - [{0: 0}]
INFO : Probs: 1 - [{0: 1}]
INFO : Probs: , - [{0: 1}]
INFO : Probs: integrated - [{0: 0}]
INFO : Probs: circuits - [{0: 0}]
INFO : Probs: 11 - [{}]
INFO : Probs: can - [{}]
INFO : Probs: obviate - [

In [69]:
from patentdata.models.lib.utils import nlp

entities2 = list()
entities2 = ref_num_entity_finder(nlp(pcorp.documents[4].description.text))
print(entities2)

[a memory module 1, The memory module 1, a printed circuit board 10, a detachable assembly 20, the printed circuit board 10, a plurality of packaged integrated circuits 11, a module card 12, the packaged integrated circuits 11, the module card 12, the printed circuit board 10, a terminal portion 13, a free portion 14, the terminal portion 13, The terminal portion 13, a metal contact region 130, the memory module 1, The detachable assembly 20, a plurality of metal parts 21, the free portion 14, the printed circuit board 10, the metal parts 21, metal foils 21A, the free portion 14, the printed circuit board 10, The metal parts 21, the free portion 14, the printed circuit board 10, the two pairs of metal foils 21A, the printed circuit board 10, the two pairs of metal foils 21A., the metal contact region 130, the terminal portion 13, a plurality of ground terminals 131, The first surface 16, the second surface 17, the printed circuit board 10, a plurality of metal traces 15, the metal trac

In [71]:
entities2[1][-1]

1

In [79]:
def get_entity_dict2(entity_list):
    """ Get a dictionary of entities indexed by reference numeral."""
    entity_dict = {}
    for entity in entity_list:
        ref_num = entity[-1].text
        # Clean fullstops
        if ref_num[-1] == ".":
            ref_num = ref_num[:-1]
        if ref_num not in entity_dict.keys():
            entity_dict[ref_num] = list()
        # Check if a variation already exists
        exists = False
        n_gram = entity[1:-1]
        for existing in entity_dict[ref_num]:
            if n_gram == existing:
                exists = True
        if not exists:
            entity_dict[ref_num].append(n_gram)
    return entity_dict

In [80]:
get_entity_dict2(entities2)

{'1': [memory module,
  memory module,
  memory module,
  memory module,
  memory module,
  memory module,
  memory module,
  memory module,
  memory module,
  memory module,
  memory module],
 '10': [printed circuit board,
  printed circuit board,
  printed circuit board,
  printed circuit board,
  printed circuit board,
  printed circuit board,
  printed circuit board,
  printed circuit board,
  printed circuit board,
  printed circuit board,
  printed circuit board,
  printed circuit board,
  printed circuit board,
  printed circuit board],
 '11': [plurality of packaged integrated circuits,
  packaged integrated circuits,
  circuits,
  circuits],
 '12': [module card, module card],
 '13': [terminal portion,
  terminal portion,
  terminal portion,
  terminal portion,
  terminal portion,
  terminal portion],
 '130': [metal contact region, metal contact region],
 '131': [plurality of ground terminals,
  ground terminal,
  ground terminal,
  ground terminal,
  ground terminal,
  ground t

In [81]:
# spacy adapted version of function
def get_refnum_dict2(entity_list):
    """ Get a dictionary of reference numerals indexed by entity ngram. """
    ngram_list = list()
    for entity in entity_list:
        # Add tuple of ngram without determinant and ref num to list
        ngram_list.append(
            (
                " ".join(
                            [
                                word.text for word in entity if (word.pos != DET and word.pos != NUM)
                            ]
                        ),
                entity[-1].text
            )
        )
    # Sort through list and generate dictionary
    entity_dict = dict()
    for ngram, ref_num in ngram_list:
        if ngram not in entity_dict.keys():
            entity_dict[ngram] = list()
        # Check if a variation already exists
        exists = False
        if ref_num not in entity_dict[ngram]:
            entity_dict[ngram].append(ref_num)
    return entity_dict

In [82]:
get_refnum_dict2(entities2)

{'annular side wall': ['212B.'],
 'base wall': ['211B'],
 'detachable assembly': ['20'],
 'first and second surface': ['16'],
 'first and second surfaces': ['16,17', '16'],
 'first corner': ['161'],
 'first corners': ['161'],
 'first coupling structure': ['165'],
 'first coupling structures': ['165'],
 'first retaining structure': ['213B'],
 'first surface': ['16'],
 'fourth corner': ['164', '174'],
 'fourth corners': ['164'],
 'free portion': ['14'],
 'free portions': ['14'],
 'ground terminal': ['131'],
 'integrated circuits': ['11'],
 'long side': ['18'],
 'long sides': ['18'],
 'memory module': ['1'],
 'metal contact region': ['130'],
 'metal foil': ['21A'],
 'metal foils': ['21A', '21A.'],
 'metal parts': ['21'],
 'metal sleeve': ['21B'],
 'metal sleeves': ['21B'],
 'metal trace': ['15'],
 'module card': ['12'],
 'opposing second surface': ['17'],
 'packaged integrated circuits': ['11'],
 'pairs of metal foils': ['21A', '21A.'],
 'plurality of first coupling structures': ['165'],
