<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Dataset" data-toc-modified-id="Dataset-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Dataset</a></span></li><li><span><a href="#Rule-based-Matcher" data-toc-modified-id="Rule-based-Matcher-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Rule-based Matcher</a></span></li><li><span><a href="#Evaluation" data-toc-modified-id="Evaluation-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Evaluation</a></span></li></ul></div>

# Dataset

We use patents from three domains — electricity (G06F), computer science (G06T), and medical (A61M) — to develop and test our rule-based matcher.

In [1]:
import re

# Character class: newline, full stop, semicolon.
# NOTE(review): presumably meant for sentence splitting; it is not referenced
# in the cells visible here.
sentsplit = re.compile('[\n.;]')


def _read_patents(path):
    """Read one UTF-8 text file and return its records as a list of strings.

    Records are delimited by three consecutive newline characters; the file
    content is split on that delimiter without any further cleaning, so empty
    records (if any) are kept.
    """
    with open(path, encoding='utf-8') as handle:
        return handle.read().split('\n\n\n')


# One list of records per patent class used in this notebook.
G06F = _read_patents('../data/G06F0011160000.txt')
G06T = _read_patents('../data/G06T0007254000.txt')
A61M = _read_patents('../data/A61M0009000000.txt')

In [2]:
# Number of records obtained by splitting the G06F file on '\n\n\n'
len(G06F)

641

In [3]:
# Number of records obtained by splitting the G06T file on '\n\n\n'
len(G06T)

24

In [4]:
# Number of records obtained by splitting the A61M file on '\n\n\n'
len(A61M)

3

In [5]:
# Term list the matcher is built from (tab-separated file).
import pandas as pd

# na_filter=False switches off missing-value detection, so cells are kept as
# read from the file instead of being converted to NaN.
term_list = pd.read_csv(
    '../01_make_matching_list/matching_list.csv',
    sep='\t',
    na_filter=False,
)

In [6]:
# Preview the first rows of the term list
term_list.head()

Unnamed: 0,term,annotation,df,wiki_title,wiki_summary
0,aperture z-scan experiments,Process,scienceie,Aperture,
1,1560nm femtosecond laser pulses,Material,scienceie,Liquid crystal on silicon,
2,optical-chopper,Material,scienceie,Optical chopper,An optical chopper is a device which periodica...
3,vibrational combination states,Process,scienceie,Molecular vibration,
4,non-radiative processes,Process,scienceie,Carrier generation and recombination,


# Rule-based Matcher

In [7]:
import spacy 
from spacy import displacy
from spacy.matcher import Matcher
from tqdm import tqdm
import json 

# Load the medium English pipeline with the listed components switched off.
# The patterns built later test POS tags and collect_sents reads Span.sent,
# so the components providing those are deliberately not in the disable list.
nlp = spacy.load(
    "en_core_web_md",
    disable=["ner", "lemmatizer", "textcat"],
)

In [8]:
# add custom stop words
# (the optional context slots in the patterns below require IS_STOP == False,
#  so these generic modifiers are meant to be kept out of a matched term)
nlp.Defaults.stop_words |= {"secondary", "primary", "second", "third", "forth", "fourth", "useful", "fewer", "more", "less"}

# build matcher
matcher = Matcher(nlp.vocab, validate=True)

# build patterns
patterns = []
for term in tqdm(term_list.term.values):
    # Split the term with the pipeline's own tokenizer rather than on blanks.
    # Each {"TEXT": ...} entry is compared against exactly one token of the
    # document, so a blank-separated chunk that the tokenizer breaks up further
    # (hyphenated or otherwise punctuated terms such as "optical-chopper" may be
    # split into several tokens) could never match as a single TEXT entry.
    term_tokens = [tok.text for tok in nlp.make_doc(term) if not tok.is_space]
    if not term_tokens:  # empty or whitespace-only entry: nothing to match
        continue

    if len(term_tokens) > 1: # if it is MWE
        # optional ADJ/NOUN/PROPN modifiers + the term's tokens verbatim
        # + optional NOUN/PROPN tokens following the term
        patterns.append([{"POS": {"IN":["ADJ", "NOUN", "PROPN"]}, "OP": "*", "IS_STOP": False}]
                        + [{"TEXT": token} for token in term_tokens]
                        + [{"POS": {"IN":["PROPN", "NOUN"]}, "OP": "*", "IS_STOP": False}])

    else: # if it is single word
        # the term (tagged NOUN/PROPN) on its own or inside a noun compound;
        # trailing tokens must additionally not be digits
        patterns.append([{"POS": {"IN":["ADJ", "NOUN", "PROPN"]}, "OP": "*", "IS_STOP": False},
                         {"TEXT": term_tokens[0], "POS": {"IN":["PROPN", "NOUN"]}},
                         {"POS": {"IN":["PROPN", "NOUN"]}, "OP": "*", "IS_STOP": False, "IS_DIGIT": False}])

        # "<term> of <one or more nouns>"
        patterns.append([{"POS": {"IN":["ADJ", "NOUN", "PROPN"]}, "OP": "*", "IS_STOP": False},
                         {"TEXT": term_tokens[0], "POS": {"IN":["PROPN", "NOUN"]}},
                         {"TEXT": "of"},
                         {"POS": {"IN":["PROPN", "NOUN"]}, "OP": "+", "IS_STOP": False}])


# list-independent rule: a run of at least three title-cased NOUN/PROPN tokens
patterns.append([{"POS": {"IN":["PROPN", "NOUN"]}, "IS_TITLE": True, "OP": '+'},
                 {"POS": {"IN":["PROPN", "NOUN"]}, "IS_TITLE": True, "OP": '+'},
                 {"POS": {"IN":["PROPN", "NOUN"]}, "IS_TITLE": True, "OP": '+'}])

# list-independent rule: a single non-stop NOUN/PROPN token of at most four
# characters that starts with two or more capitals, minus a few frequent
# all-caps words
# NOTE(review): the regex has no end anchor, so the optional `[s]?` does not
# constrain anything and tokens such as "AB1" also pass; confirm whether a
# trailing `$` was intended before tightening it.
patterns.append([{"POS": {"IN":["PROPN", "NOUN"]},
                  "LENGTH": {"<=": 4},
                  "IS_STOP": False,
                  'TEXT': {'REGEX': '^[A-Z]{2,}[s]?',
                           "NOT_IN": ["FIG", "FIGS", "CODE", "CORE", "TIME", "ART", "LIST"]}}])

100%|██████████| 752287/752287 [00:16<00:00, 46008.39it/s] 


In [9]:
# Register all patterns under the single key "TERM" (this call takes quite a long time).
matcher.add("TERM", patterns)

In [10]:
from collections import defaultdict

def collect_sents(doc, matches):
    """
    Collect the sentences that contain matched spans, in displaCy "manual" format.

    Overlapping matches are resolved with ``spacy.util.filter_spans``: the
    longest span is kept, and among equally long ones the first.

    Sentences are grouped by their position in ``doc`` rather than by their
    text, so two distinct sentences that happen to read the same each get
    their own entry instead of having their entity offsets mixed together.

    Parameters
    ----------
    doc : the parsed document the matcher was run on; ``Span.sent`` is used,
        so the document needs sentence boundaries.
    matches : iterable of ``(match_id, start, end)`` triples with token
        indices into ``doc``, as produced by calling the matcher on ``doc``.

    Returns
    -------
    list of ``{"text": <sentence text>, "ents": [...]}`` dicts, one per
    sentence with at least one kept span; each ent is a dict with
    ``"start"``/``"end"`` character offsets relative to the sentence and the
    label ``"TERM"``.
    """
    sents_by_start = {}

    spans = [doc[start:end] for _, start, end in matches]
    for span in spacy.util.filter_spans(spans):
        sent = span.sent  # Sentence containing matched span
        entry = sents_by_start.setdefault(sent.start, {"text": sent.text, "ents": []})
        # Mock entity in displaCy style: offset the span's character positions
        # by the sentence start so they index into the sentence text.
        entry["ents"].append({
            "start": span.start_char - sent.start_char,
            "end": span.end_char - sent.start_char,
            "label": "TERM",
        })

    return list(sents_by_start.values())

# Evaluation

In [11]:
def visualize(patent):
    """
    Run the pipeline and the matcher on one patent text and render the
    sentences that contain a match, with the matched terms highlighted.

    The sentence data is passed to displaCy as plain dictionaries, hence
    manual=True. Outside a Jupyter environment displacy.serve can be used
    instead of displacy.render.
    """
    parsed = nlp(patent)
    # Sentences with at least one match, already in displaCy's manual format
    sentence_dicts = collect_sents(parsed, matcher(parsed))
    displacy.render(sentence_dicts, manual=True, style="ent")

In [12]:
# Render matched terms for the G06F record at index 0
visualize(G06F[0])

In [13]:
# Render matched terms for the G06F record at index 5
visualize(G06F[5])

In [14]:
# Render matched terms for the G06F record at index 10
visualize(G06F[10])

In [15]:
# Render matched terms for the G06T record at index 0
visualize(G06T[0])

In [16]:
# Render matched terms for the G06T record at index 10
visualize(G06T[10])

In [17]:
# Render matched terms for the G06T record at index 20
visualize(G06T[20])

In [18]:
# Render matched terms for the G06T record at index 15
visualize(G06T[15])

In [19]:
# Render matched terms for the A61M record at index 0
visualize(A61M[0])

In [20]:
# Render matched terms for the A61M record at index 1
visualize(A61M[1])

In [21]:
# Render matched terms for the G06T record at index 2
visualize(G06T[2])