# Testing Entity Functions Added to PatentData

And experimenting with new functions.

In [1]:
# Initialise logging and get it to print to notebook
# Based on answer here - 
# https://stackoverflow.com/questions/35936086/jupyter-notebook-does-not-print-logs-to-the-output-cell
import logging
import sys
import os
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logger = logging.getLogger()
logger.handlers[0].stream = sys.stdout
logger.level = logging.INFO
logger.info("Test logging info > output")

INFO : Test logging info > output


In [2]:
# Check for saved files
from patentdata.corpus import USPublications
from patentdata.models.patentcorpus import PatentCorpus

filename = "g06_100.patcorp.zip"

if os.path.isfile(filename):
    pcorp = PatentCorpus.load(filename)
else:
    pubs = USPublications("/media/SAMSUNG1/Patent_Downloads")
    pcorp = PatentCorpus.init_by_classification(pubs, ['G', '06'], sample_size=100)
    pcorp.save(filename)

INFO : Loading Patent Corpus
INFO : Loaded Patent Corpus with 100 documents


In [3]:
# Double check loaded corpus
print("There are {0} documents in the corpus".format(len(pcorp.documents)))
print("The title of the first document is: {0}".format(pcorp.documents[0].title))


There are 100 documents in the corpus
The title of the first document is: METHOD, APPARATUS AND SYSTEM FOR ANSWERING REQUESTS ON PEER-TO-PEER OVERLAY NETWORK


In [4]:
# Check for saved files
from patentdata.corpus import USGrants

filename = "g06_100_grants.patcorp.zip"

if os.path.isfile(filename):
    pcorp2 = PatentCorpus.load(filename)
else:
    grants = USGrants("/media/SAMSUNG1/US_Grants")
    pcorp2 = PatentCorpus.init_by_classification(grants, ['G', '06'], sample_size=100)
    pcorp2.save(filename)

INFO : Loading Patent Corpus
INFO : Loaded Patent Corpus with 200 documents


Observation:
* Maybe add each patent document to zip file as we go along. Especially useful if sampling 1000 documents or more.
* Still takes a minute or so to load 100 documents.

In [5]:
# Check why above is loading with 200 documents even though there are only 100 in the zip file
pcorp2.documents = pcorp2.documents[100:]

In [12]:
# Double check loaded corpus
print("There are {0} documents in the corpus".format(len(pcorp2.documents)))
print("The title of the first document is: {0}".format(pcorp2.documents[0].title))

There are 100 documents in the corpus
The title of the first document is: Pistol grip for a portable terminal with an internal receptacle for a stylus


In [13]:
pcorp.documents[4].claimset.get_claim(1)

1 
1. A detachable assembly for installing and detaching a memory module having a printed circuit board in a memory slot, the printed circuit board defining a terminal portion and a free portion arranged opposite the terminal portion, comprising:
a plurality of metal parts disposed on the free portion of the printed circuit board.


In [14]:
pcorp.documents[4].claimset.get_claim(1).entities

[<Entity - name: detachable assembly; occurrences: [A detachable assembly]; children: []; limitations: [],
 <Entity - name: memory module; occurrences: [a memory module]; children: []; limitations: [],
 <Entity - name: printed circuit board; occurrences: [a printed circuit board, the printed circuit board, the printed circuit board]; children: []; limitations: [],
 <Entity - name: memory slot; occurrences: [a memory slot]; children: []; limitations: [],
 <Entity - name: terminal portion; occurrences: [a terminal portion, the terminal portion]; children: []; limitations: [],
 <Entity - name: free portion; occurrences: [a free portion, the free portion]; children: []; limitations: [],
 <Entity - name: plurality of metal parts; occurrences: [a plurality of metal parts]; children: []; limitations: []]

If we can model parent-child relationships we can assign reference numerals based on these. Assign numerals to things explicitly present first.  

If object may not need reference numeral.  

Reference numerals may be denominated based on entity number: e.g. < 10, 110, 120, etc; > 10 < 20 105, 110, 115; > 20 < 50 102, 104 etc.

## Looking at Entities in Description

In [15]:
pcorp.documents[4].description.entities

{'1': [memory module,
  memory module,
  memory module,
  memory module,
  memory module,
  memory module,
  memory module,
  memory module,
  memory module,
  memory module,
  memory module],
 '10': [printed circuit board,
  printed circuit board,
  printed circuit board,
  printed circuit board,
  printed circuit board,
  printed circuit board,
  printed circuit board,
  printed circuit board,
  printed circuit board,
  printed circuit board,
  printed circuit board,
  printed circuit board,
  printed circuit board,
  printed circuit board],
 '11': [plurality of packaged integrated circuits,
  packaged integrated circuits,
  circuits,
  circuits],
 '12': [module card, module card],
 '13': [terminal portion,
  terminal portion,
  terminal portion,
  terminal portion,
  terminal portion,
  terminal portion],
 '130': [metal contact region, metal contact region],
 '131': [plurality of ground terminals,
  ground terminal,
  ground terminal,
  ground terminal,
  ground terminal,
  ground t

*** Maybe supply this data structure as an ordered dict based on int(key)... ***

In [16]:
from collections import OrderedDict

ordered_entities = OrderedDict(
        sorted(
            pcorp.documents[4].description.entities.items(), 
            key=lambda t: int("".join([n for n in t[0] if n.isnumeric()]))
        )
)

In [17]:
ordered_entities['10']

[printed circuit board,
 printed circuit board,
 printed circuit board,
 printed circuit board,
 printed circuit board,
 printed circuit board,
 printed circuit board,
 printed circuit board,
 printed circuit board,
 printed circuit board,
 printed circuit board,
 printed circuit board,
 printed circuit board,
 printed circuit board]

We can then attempt to match description and claim entities.  

And develop an entity list over the complete claimset.  

We have a hierarchical organisation:
* An entity may be an abstraction of several different example entities.  
* An entity may have one or more parts.  
* An entity may have relationships with one or more other entities.

In [21]:
big_entity_list = []
for claim in pcorp.documents[4].claimset:
    big_entity_list += claim.entities

In [22]:
for e in big_entity_list:
    print(e.name)

detachable assembly
memory module
printed circuit board
memory slot
terminal portion
free portion
plurality of metal parts
detachable assembly
claim
metal parts
predetermined distance therebetween
force thereon
detachable assembly
claim
printed circuit board
first surface
oppositely arranged second surface
first corner
second corner
terminal portion
third corner
fourth corner
metal parts
detachable assembly
claim
terminal portion
printed circuit board
plurality of ground terminals
first surface
second surfaces
plurality of metal traces
thereon
end
metal trace couples
copper foil
ground terminal
detachable assembly
claim
metal parts
plurality of metal sleeves
free portion
printed circuit board
third corners
first and second surfaces
detachable assembly
claim
third and fourth corners
first surface
first coupling structure
second coupling structure
metal sleeves
first retaining structure
second retaining structure
detachable assembly
claim
metal sleeves
base wall
annular side wall
first r

***Observations:***
* There are many repetitions of common entities.
* "claim" occurs due to the "according to claim X" dependency.
* Indexes will be based on claims rather than the claim set.
* Can we use a similar matching algorithm to that used for the entity extraction?  
* Probabilistically each claim is new information that can iteratively update $P(\boldsymbol E)$.

In [23]:
print(big_entity_list[0])

<Entity - name: detachable assembly; occurrences: [A detachable assembly]; children: []; limitations: []


In [29]:
 # Matching occurrences
entity_dict = dict()
# Now group by unique
for entity in big_entity_list:
    if entity.name not in entity_dict.keys():
        entity_dict[entity.name] = list()
    entity_dict[entity.name].append(entity)
print(list(entity_dict.keys()))

['second corner', 'predetermined distance therebetween', 'inner portion', 'first and second surfaces', 'plurality of metal sleeves', 'free portion', 'copper foil', 'force thereon', 'fourth corner', 'first coupling structure', 'plurality of metal traces', 'second coupling structure', 'metal sleeves', 'third corners', 'metal trace couples', 'first corner', 'metal parts', 'third and fourth corners', 'detachable assembly', 'claim', 'oppositely arranged second surface', 'plurality of metal parts', 'base wall', 'terminal portion', 'memory module', 'memory slot', 'second surfaces', 'first surface', 'plurality of ground terminals', 'annular side wall', 'printed circuit board', 'first retaining structure', 'third corner', 'ground terminal', 'thereon', 'end', 'second retaining structure']


In [26]:
print("There are {0} entities extracted from the claimset; with {1} unique entities".format(len(big_entity_list), len(list(entity_dict.keys()))))

There are 119 entities extracted from the claimset; with 37 unique entities


In [28]:
# Order this second dictionary by occurrence
ordered_claimset_entities = OrderedDict(
        sorted(
            entity_dict.items(), 
            key=lambda t: t[1].occurrences[0][0].i 
        )
)

AttributeError: 'list' object has no attribute 'occurrences'

In [None]:
pcorp.documents[4].claimset.claims[14]

In [None]:
# Look at entities across claimset
for claim in pcorp.documents[4].claimset.claims

In [None]:
pcorp.documents[4].description.get_paragraph(5)

In [None]:
pcorp.documents[4].description.get_paragraph(5).sentences

In [None]:
for word in pcorp.documents[4].description.get_paragraph(5).sentences[0]:
    print(word.text, word.pos_)

Using the spaCy pos data a reference numeral is indicated by "NUM" as opposed to "CD".

In [None]:
def entity_finder(pos_list):
    """ Find entities with reference numerals using POS data."""
    entity_list = list()
    entity = []
    record = False
    for i, (word, pos) in enumerate(pos_list):
        if pos == "DT":
            record = True
            entity = []

        if record:
            entity.append((word, pos))

        if "FIG" in word:
            # reset entity to ignore phrases that refer to Figures
            record = False
            entity = []

        if pos == "CD" and entity and record and ('NN' in pos_list[i-1][1]):
            record = False
            entity_list.append(entity)

    return entity_list

In [None]:
from spacy.symbols import NUM, DET, NOUN

def ref_num_entity_finder(doc):
    """ Find entities with reference numerals a sentence 
    in the form of a spaCy span."""
    entity_list = list()
    start_index = 0
    record = False
    for token in doc:
        if token.pos == DET:
            record = True
            start_index = token.i
            
        if "FIG" in token.text:
            record = False
            
        if token.pos == NUM and doc[token.i-1].pos == NOUN:
             # Hack for plural nouns that may lack a determinant
            if not record and doc[token.i-1].tag_ == "NNS":
                # Follow tree for plural noun phrase
                children = [c for c in doc[token.i-1].children]
                if children:
                    start_index = children[0].i
                entity_list.append(doc[start_index:token.i+1])
            # Add 
            if record:
                record = False
                entity_list.append(doc[start_index:token.i+1])

    return entity_list

In [None]:
entity_finder(pos_list)

In [None]:
ref_num_entity_finder(pcorp.documents[4].description.get_paragraph(33).sentences[1].doc)

In [None]:
for t in pcorp.documents[4].description.get_paragraph(33).sentences[1]:
    print(t.text, t.i)

In [None]:
# Watch out each paragraph is a doc - sentences are just a span in the doc 
# - indexes are relative to the doc not the sentence
pcorp.documents[4].description.get_paragraph(33).doc[51]

In [None]:
pcorp.documents[4].description.get_paragraph(29).text

This basic algorithm misses the plurals as these do not have a determinant.

In [None]:
from patentdata.models.lib.utils import (
    filter_entity_list
)
entities = list()
for para in self.paragraphs:
    for sentence in para.sentences:
        e = entity_finder(
                        [(w.text, w.tag_) for w in sentence]
                    )
        entities += e
    _entities = filter_entity_list(entities)
print(entities)
print(_entities)

In [None]:
def get_refnum_dict(entity_list):
    """ Get a dictionary of reference numerals indexed by entity ngram. """
    ngram_list = list()
    for entity in entity_list:
        # Add tuple of ngram without determinant and ref num to list
        ngram_list.append(
            (
                " ".join(
                            [
                                word for word, pos in entity if (pos != 'DT' and pos != 'CD')
                            ]
                        ),
                entity[-1][0]
            )
        )
    # Sort through list and generate dictionary
    entity_dict = dict()
    for ngram, ref_num in ngram_list:
        if ngram not in entity_dict.keys():
            entity_dict[ngram] = list()
        # Check if a variation already exists
        exists = False
        if ref_num not in entity_dict[ngram]:
            entity_dict[ngram].append(ref_num)
    return entity_dict

In [None]:
get_refnum_dict(_entities)

In [None]:
def get_entity_dict(entity_list):
    """ Get a dictionary of entities indexed by reference numeral."""
    entity_dict = {}
    for entity in entity_list:
        ref_num = entity[-1][0]
        if ref_num not in entity_dict.keys():
            entity_dict[ref_num] = list()
        # Check if a variation already exists
        exists = False
        n_gram = " ".join([w for w, _ in entity[1:-1]])
        for existing in entity_dict[ref_num]:
            if n_gram == existing:
                exists = True
        if not exists:
            entity_dict[ref_num].append(n_gram)
    return entity_dict

In [None]:
get_entity_dict(_entities)

This is picking up some plural entities in the specification. Maybe once we develop this we can implement another parse looking for string matches to dictionary entries.  

Also:
```
'214B': ['second retaining structure'],
'214B.': ['second retaining structure'],
```

In [None]:
ed_from_desc = get_entity_dict(_entities)

In [None]:
type(ed_from_desc['21B'][0])

We need to adapt these functions to use the entity objects so we can store references to particular tokens rather than strings.

In [None]:
from patentdata.models.lib.utils_entities import (
    extract_entities
)

ents = extract_entities(pcorp.documents[4].description.get_paragraph(33).doc)
print(ents)

In [None]:
from patentdata.models.lib.utils import nlp

entities2 = list()
entities2 = ref_num_entity_finder(nlp(pcorp.documents[4].description.text))
print(entities2)

In [None]:
entities2[1][-1]

In [None]:
def get_entity_dict2(entity_list):
    """ Get a dictionary of entities indexed by reference numeral."""
    entity_dict = {}
    for entity in entity_list:
        ref_num = entity[-1].text
        # Clean fullstops
        if ref_num[-1] == ".":
            ref_num = ref_num[:-1]
        if ref_num not in entity_dict.keys():
            entity_dict[ref_num] = list()
        # Check if a variation already exists
        exists = False
        n_gram = entity[1:-1]
        for existing in entity_dict[ref_num]:
            if n_gram == existing:
                exists = True
        if not exists:
            entity_dict[ref_num].append(n_gram)
    return entity_dict

In [None]:
get_entity_dict2(entities2)

In [None]:
# spacy adapted version of function
def get_refnum_dict2(entity_list):
    """ Get a dictionary of reference numerals indexed by entity ngram. """
    ngram_list = list()
    for entity in entity_list:
        # Add tuple of ngram without determinant and ref num to list
        ngram_list.append(
            (
                " ".join(
                            [
                                word.text for word in entity if (word.pos != DET and word.pos != NUM)
                            ]
                        ),
                entity[-1].text
            )
        )
    # Sort through list and generate dictionary
    entity_dict = dict()
    for ngram, ref_num in ngram_list:
        if ngram not in entity_dict.keys():
            entity_dict[ngram] = list()
        # Check if a variation already exists
        exists = False
        if ref_num not in entity_dict[ngram]:
            entity_dict[ngram].append(ref_num)
    return entity_dict

In [None]:
get_refnum_dict2(entities2)