<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Dataset" data-toc-modified-id="Dataset-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Dataset</a></span></li><li><span><a href="#Rule-based-Matcher" data-toc-modified-id="Rule-based-Matcher-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Rule-based Matcher</a></span></li><li><span><a href="#Evaluation" data-toc-modified-id="Evaluation-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Evaluation</a></span></li></ul></div>

# Dataset

We use patents from three domains — electricity (G06F), computer science (G06T), and medicine (A61M) — to build and test our rule-based matcher.

In [None]:
import re 
# sentsplit = re.compile('[\n.;]')

def _read_patent_file(path, separator='\n\n\n'):
    """Read a UTF-8 text file and return its content split on `separator`."""
    with open(path, encoding='utf-8') as patent_file:
        return patent_file.read().split(separator)


# One list of text records per patent class; records are separated by two
# blank lines ('\n\n\n') in the source files.
G06F = _read_patent_file('../data/G06F0011160000.txt')

G06T = _read_patent_file('../data/G06T0003000000.txt')

A61M = _read_patent_file('../data/A61M0009000000.txt')

In [1]:
# Default demo text: the claims file with its <p>, </p> and <br/> tags removed.
with open('../03_spaCy_ner/claims.txt', mode='r', encoding='utf-8') as f:
    claims = f.read()
for markup in ('<p>', '</p>', '<br/>'):
    claims = claims.replace(markup, '')

In [None]:
len(G06F)

In [None]:
len(G06T)

In [None]:
len(A61M)

In [2]:
# matching list of terms
import pandas as pd 
# The file is tab-separated. na_filter=False switches off pandas' missing-value
# detection, so empty cells and markers such as 'NA' are read as text rather
# than being converted to NaN.
term_list = pd.read_csv('../01_make_matching_list/matching_list.csv', delimiter='\t', na_filter= False)

# # trigger list 
# with open('../01_make_matching_list/wordsFVE.txt', encoding = 'utf-8', mode='r') as f:
#     wordsFVE = f.read().replace('-','').replace(' ','')
# trigger_words = list(set([w for w in wordsFVE.split('\n') if w]))

In [3]:
term_list.head()

Unnamed: 0,term,annotation,df,wiki_title,wiki_summary
0,aperture z-scan experiments,Process,scienceie,Aperture,"In optics, an aperture is a hole or an opening..."
1,1560nm femtosecond laser pulses,Material,scienceie,Liquid crystal on silicon,Liquid crystal on silicon (LCoS or LCOS) is a ...
2,optical-chopper,Material,scienceie,Optical chopper,An optical chopper is a device which periodica...
3,vibrational combination states,Process,scienceie,Molecular vibration,A molecular vibration is a periodic motion of ...
4,non-radiative processes,Process,scienceie,Carrier generation and recombination,"In the solid-state physics of semiconductors, ..."


# Rule-based Matcher

In [5]:
import json 
from tqdm import tqdm
import spacy 
from spacy import displacy
from spacy.matcher import Matcher
from spacy.tokenizer import Tokenizer


nlp = spacy.load("en_core_web_lg", disable=['ner', 'lemmatizer', 'textcat'])

def custom_tokenizer(nlp):
    """
    Build a tokenizer whose infix rules no longer split on hyphens.

    Starting from the pipeline's default infix rules, this
      * replaces the rule that splits on + - * ^ after a digit with two
        narrower rules, so a hyphen between two digits is no longer an
        infix split point, and
      * drops the rule that splits on hyphens/dashes between letters.

    Prefix, suffix, token_match and url_match handling are taken over from
    the pipeline's current tokenizer; special cases come from the language
    defaults.

    Parameters
    ----------
    nlp : loaded spaCy pipeline whose defaults and tokenizer are reused.

    Returns
    -------
    spacy.tokenizer.Tokenizer
    """
    numeric_op_rule = r"(?<=[0-9])[+\-\*^](?=[0-9-])"   # generic op between numbers or between a number and a -
    hyphen_alternation = '-|–|—|--|---|——|~'             # marker of the "- between letters" rule

    # Filtering (rather than list.remove) does not raise when a spaCy version
    # ships default infixes without the exact rule string.
    infixes = [rule for rule in nlp.Defaults.infixes
               if rule != numeric_op_rule and hyphen_alternation not in rule]
    # Re-add the removed numeric rule minus the (?<=[0-9])-(?=[0-9]) case.
    infixes += [r"(?<=[0-9])[+*^](?=[0-9-])", r"(?<=[0-9])-(?=-)"]
    infix_re = spacy.util.compile_infix_regex(infixes)

    return Tokenizer(nlp.vocab, prefix_search=nlp.tokenizer.prefix_search,
                                suffix_search=nlp.tokenizer.suffix_search,
                                infix_finditer=infix_re.finditer,
                                token_match=nlp.tokenizer.token_match,
                                # keep the default URL handling, which is lost
                                # when url_match is not passed on
                                url_match=nlp.tokenizer.url_match,
                                rules=nlp.Defaults.tokenizer_exceptions)

nlp.tokenizer = custom_tokenizer(nlp)

In [6]:
# add custom stop words
# Words that must not be absorbed into a term as a leading/trailing modifier.
CUSTOM_STOP_WORDS = {'K',
                     'absolutely',
                     'advantageous',
                     'all',
                     'always',
                     'appropriate',
                     'chief',
                     'classic',
                     'clear',
                     'common',
                     'concise',
                     'considerable',
                     'convenient',
                     'correct',
                     'critical',
                     'desirable',
                     'different',
                     'difficult',
                     'essential',
                     'ever',
                     'every',
                     'exact',
                     'example',
                     'exclusive',
                     'expected',
                     'few',
                     'fewer',
                     'forth',
                     'fourth',
                     'frequent',
                     'full',
                     'fundamental',
                     'general',
                     'important',
                     'key',
                     'laborious',
                     'less',
                     'limited',
                     'main',
                     'majority',
                     'mandatory',
                     'more',
                     'most',
                     'must',
                     'necessary',
                     'needed',
                     'new',
                     'none',
                     'old',
                     'only',
                     'partly',
                     'peculiar',
                     'permanent',
                     'poor',
                     'preferable',
                     'primary',
                     'principal',
                     'rare',
                     'required',
                     'said',
                     'second',
                     'secondary',
                     'significant',
                     'solely',
                     'special',
                     'such',
                     'superior',
                     'third',
                     'traditional',
                     'typical',
                     'uncommon',
                     'useful',
                     'usual',
                     'vital',
                     'present',
                     'corresponding',
                     'i-th',
                     'particular',
                     'fifth',
                     'sixth',
                     'plural',
                     'available',
                     'received',
                     'transmitted',
                     'yet',
                     'above-mentioned',
                     'FIELD',
                     'BACKGROUND',
                     'actual',
                     'comprising',
                     'successful',
                     'smaller',
                     'large'}

nlp.Defaults.stop_words |= CUSTOM_STOP_WORDS

# spaCy's default is_stop getter looks up the *lower-cased* token text in the
# stop-word set, so upper-case entries ('K', 'FIELD', 'BACKGROUND') would never
# take effect through the set alone. Setting the lexeme flag marks each exact
# string as a stop word (and also covers lexemes that were already created).
for stop_word in CUSTOM_STOP_WORDS:
    nlp.vocab[stop_word].is_stop = True

# build matcher
matcher = Matcher(nlp.vocab, validate=True)

# build patterns
# Every term from the matching list becomes one (multi-word) or two
# (single-word) token patterns. Each pattern may be extended to the left by
# non-stop ADJ/NOUN/PROPN tokens and to the right by non-stop NOUN/PROPN tokens.
patterns = []
seen_terms = set()
for term in tqdm(term_list.term.values):
    # split() (instead of split(' ')) drops the empty strings produced by
    # doubled, leading or trailing whitespace; an empty TEXT can never match.
    term_split = term.split()
    term_key = tuple(term_split)
    if not term_split or term_key in seen_terms:
        continue  # blank term, or a term whose patterns were already built
    seen_terms.add(term_key)

    if len(term_split) > 1: # if it is MWE
        patterns.append([{"POS": {"IN":["ADJ", "NOUN", "PROPN"]}, "OP": "*", "IS_STOP": False}]
                        + [{"TEXT": token} for token in term_split]
                        + [{"POS": {"IN":["PROPN", "NOUN"]}, "OP": "*", "IS_STOP": False}])

    else: # if it is single word
        # "<modifiers> term <nouns>"
        patterns.append([{"POS": {"IN":["ADJ", "NOUN", "PROPN"]}, "OP": "*", "IS_STOP": False},
                         {"TEXT": term_split[0], "POS": {"IN":["PROPN", "NOUN"]}},
                         {"POS": {"IN":["PROPN", "NOUN"]}, "OP": "*", "IS_STOP": False, "IS_DIGIT": False}])

        # "<modifiers> term of <nouns>"
        patterns.append([{"POS": {"IN":["ADJ", "NOUN", "PROPN"]}, "OP": "*", "IS_STOP": False},
                         {"TEXT": term_split[0], "POS": {"IN":["PROPN", "NOUN"]}},
                         {"TEXT": "of"},
                         {"POS": {"IN":["PROPN", "NOUN"]}, "OP": "+", "IS_STOP": False}])


# Runs of at least three title-cased NOUN/PROPN tokens.
patterns.append([{"POS": {"IN":["PROPN", "NOUN"]}, "IS_TITLE": True, "OP": '+'},
                 {"POS": {"IN":["PROPN", "NOUN"]}, "IS_TITLE": True, "OP": '+'},
                 {"POS": {"IN":["PROPN", "NOUN"]}, "IS_TITLE": True, "OP": '+'}])

# Short (<= 4 characters) tokens that start with two or more capitals, minus a
# few frequent false positives.
# NOTE(review): the regex has no `$` anchor, so the trailing `[s]?` has no
# effect and any token of up to 4 characters that starts with two capitals
# matches (e.g. "MP3") — confirm whether an anchor was intended.
patterns.append([{"POS": {"IN":["PROPN", "NOUN"]},
                  "LENGTH": {"<=": 4},
                  "IS_STOP": False,
                  'TEXT': {'REGEX': '^[A-Z]{2,}[s]?',
                           "NOT_IN": ["FIG", "FIGS", "CODE", "CORE", "TIME", "ART", "LIST"]}}])

100%|██████████| 752070/752070 [00:10<00:00, 73218.83it/s] 


In [7]:
# warning = [[{"LOWER": war}] for war in trigger_words]

In [8]:
matcher.add('TERM', patterns) # add patterns to the matcher(this takes a quite long time)
# matcher.add('WARNING', warning)

In [9]:
from collections import defaultdict
from spacy.tokens import Span
import re 

def collect_sents(doc, matches):
    """
    Collect the sentences containing matched spans, in displaCy's manual
    "ent" format.

    Overlapping matches are resolved with ``spacy.util.filter_spans``
    (if overlapping then pick up the longest, else pick up the 1st).
    Each kept span is then cleaned up:
      * trailing punctuation/whitespace tokens are dropped, unless the
        matched text is all upper-case and longer than 4 characters;
      * a dangling "(xyz" at the end of the text is cut off.
    Spans that end up empty are skipped.

    Parameters
    ----------
    doc : spaCy Doc the matches refer to; sentence boundaries must be set
        because ``Span.sent`` is used.
    matches : iterable of (match_id, start, end) token-index triples, as
        returned by ``matcher(doc)``.

    Returns
    -------
    list of dict
        One ``{"text": sentence_text, "ents": [...]}`` entry per distinct
        sentence text; every ent is ``{"start", "end", "label"}`` with
        character offsets relative to the start of the sentence.
    """
    trailing_junk = {'.', '_', '\n', ';', ',', ' ', '>', '/', '<'}
    dangling_paren = re.compile(r'\([a-zA-Z0-9_]*$')

    dict_sents = defaultdict(list)
    doc_text = doc.text

    spans = [Span(doc, start, end, label=match_id) for match_id, start, end in matches]
    for span in spacy.util.filter_spans(spans):
        if len(span) == 0:
            continue
        sent = span.sent  # Sentence containing matched span

        # Trim by token index. Adjusting only the character offset would leave
        # the last token unchanged, so a loop testing that token could never
        # terminate.
        span_text = span.text
        may_trim = not span_text.isupper() or len(span_text) <= 4
        first_tok, end_tok = span.start, span.end
        while may_trim and end_tok > first_tok and doc[end_tok - 1].text in trailing_junk:
            end_tok -= 1
        if end_tok == first_tok:
            continue  # nothing but punctuation was matched

        # Absolute character offsets of the trimmed span inside the doc.
        last_token = doc[end_tok - 1]
        start_char = doc[first_tok].idx
        end_char = last_token.idx + len(last_token.text)

        # Cut a dangling "(xyz" tail. The regex offset is relative to the span
        # text, so it has to be shifted by the span's start offset.
        term_text = doc_text[start_char:end_char]
        paren = dangling_paren.search(term_text)
        if paren:
            term_text = term_text[:paren.start()].rstrip()
            if not term_text:
                continue
            end_char = start_char + len(term_text)

        # Mock entity in displaCy style: offsets are made relative to the
        # sentence by subtracting the sentence's start offset in the doc.
        dict_sents[sent.text].append({
            "start": start_char - sent.start_char,
            "end": end_char - sent.start_char,
            "label": 'TERM',
        })

    return [{"text": key, "ents": value} for key, value in dict_sents.items()]

# Evaluation

In [10]:
def visualize(patent):
    """
    Run the pipeline and the term matcher on `patent` (a text string) and
    render every sentence that contains a match with displaCy.
    """
    parsed = nlp(patent)
    # Sentences with at least one match, already in displaCy's dictionary format.
    sentences_with_terms = collect_sents(parsed, matcher(parsed))

    # manual=True makes displaCy render straight from the dictionaries.
    # (Outside a Jupyter environment, displacy.serve can be used instead.)
    displacy.render(sentences_with_terms, style="ent", manual=True)

In [11]:
visualize(claims)

In [None]:
visualize(G06F[0])

In [None]:
visualize(G06F[5])

In [None]:
visualize(G06F[10])

In [None]:
visualize(G06T[0])

In [None]:
visualize(G06T[10])

In [None]:
visualize(G06T[20])

In [None]:
visualize(G06T[15])

In [None]:
visualize(A61M[0])

In [None]:
visualize(A61M[1])

In [None]:
visualize(G06T[2])

In [None]:
# save matcher as entity ruler
# The TERM patterns test POS tags, so the text has to go through the full
# pipeline: nlp.make_doc() only tokenizes, and the Matcher rejects POS patterns
# on a Doc that carries no POS annotation.
doc = nlp(claims)
matches = matcher(doc)

# Reuse the component when this cell is re-run; add_pipe raises if a component
# with the same name is already in the pipeline.
if "entity_ruler" in nlp.pipe_names:
    ruler = nlp.get_pipe("entity_ruler")
else:
    ruler = nlp.add_pipe("entity_ruler", config={"validate": True})

In [None]:
# Surface strings of the (non-overlapping) matches, later used as
# entity-ruler phrase patterns.
rulers = []
spans = [doc[start:end] for _, start, end in matches]
for span in spacy.util.filter_spans(spans):
    # Drop trailing '.', '_' and newline tokens. Trimming by index with a lower
    # bound avoids the IndexError that span[-1] raises once a span is empty.
    span_end = span.end
    while span_end > span.start and doc[span_end - 1].text in ['.','_','\n']:
        span_end -= 1
    if span_end == span.start:
        continue  # the match consisted only of punctuation tokens

    term = doc[span.start: span_end]
    rulers.append(term.text)

In [None]:
# De-duplicate the matched strings; sorting keeps the saved pattern file
# identical from run to run (plain set order is not stable across sessions).
rulers = sorted(set(rulers))

# Use a separate name for the ruler patterns: rebinding `patterns` would
# replace the matcher's token patterns, and re-running the matcher.add cell
# afterwards would then feed these dictionaries to the Matcher.
ruler_patterns = [{"label": "TERM", "pattern": rule} for rule in rulers]

ruler.add_patterns(ruler_patterns)

# Persist the whole pipeline (custom tokenizer and entity ruler included).
nlp.to_disk(r"./rule-based matcher")