## [Rule-based Matcher Explorer](https://explosion.ai/demos/matcher)

In [78]:
import ntpath
import requests
from os import path, makedirs, listdir
from tqdm import tqdm_notebook as tqdm
from lxml import etree
import standoffconverter
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

In [2]:
from IPython.display import IFrame
# Embed the explosion.ai rule-based matcher demo page as the cell's output.
IFrame('https://explosion.ai/demos/matcher', width=1000, height=1600)

In [3]:
#https://spacy.io/usage/rule-based-matching#matcher
import spacy
from spacy.matcher import Matcher

# Load the small English pipeline and build a token matcher on its vocabulary.
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# Add match ID "HelloWorld" with no callback and one pattern.
# The pattern describes three consecutive tokens: "hello" (any casing),
# one punctuation token, then "world" (any casing).
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
# NOTE(review): add(key, on_match, pattern) looks like the spaCy v2 call form;
# newer spaCy releases presumably expect add(key, [pattern]) - confirm against
# the installed version.
matcher.add("HelloWorld", None, pattern)

doc = nlp(u"Hello, world! Hello world!")
matches = matcher(doc)
# Each match is unpacked as (match id, start token index, end token index).
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # Get string representation
    span = doc[start:end]  # The matched span
    print(match_id, string_id, start, end, span.text)

15578876784678163569 HelloWorld 0 3 Hello, world


In [4]:
#https://realpython.com/natural-language-processing-spacy-python/

from spacy.matcher import Matcher
# Rebind `matcher` to a new, empty Matcher so rules added to the earlier
# matcher object are not part of this example.
matcher = Matcher(nlp.vocab)
# Sample text for the phone-number example.  Adjacent string literals are
# concatenated without any separator, so every continuation piece must carry
# its own leading space (the second piece previously lacked one, which fused
# the words into "conferencehappening").
conference_org_text = ('There is a developer conference'
    ' happening on 21 July 2019 in London. It is titled'
    ' "Applications of Natural Language Processing".'
    ' There is a helpline number available'
    ' at (123) 456-789')

def extract_phone_number(nlp_doc):
    """Return the text of the first match the shared matcher finds in *nlp_doc*.

    The 'PHONE_NUMBER' rule is registered on the module-level ``matcher`` the
    first time this function runs.  The rule describes the token sequence
    "(", three digits, ")", three digits, an optional "-", three digits.

    Args:
        nlp_doc: a document produced by the ``nlp`` pipeline.

    Returns:
        The matched text, or ``None`` when the matcher reports no match.
        Matches from any other rule registered on ``matcher`` are not
        filtered out; the first reported match is returned as-is.
    """
    # Register the rule only once: adding it on every call would keep
    # mutating the shared matcher with duplicate copies of the same pattern.
    if 'PHONE_NUMBER' not in matcher:
        pattern = [{'ORTH': '('}, {'SHAPE': 'ddd'},
                   {'ORTH': ')'}, {'SHAPE': 'ddd'},
                   {'ORTH': '-', 'OP': '?'},
                   {'SHAPE': 'ddd'}]
        matcher.add('PHONE_NUMBER', None, pattern)

    matches = matcher(nlp_doc)
    if not matches:
        return None
    _, start, end = matches[0]
    return nlp_doc[start:end].text

# Run the pipeline on the sample text and extract the helpline number from it.
conference_org_doc = nlp(conference_org_text)
extract_phone_number(conference_org_doc)

'(123) 456-789'

# Phrase Matcher for stage directions

First, download Shakespeare plays that have annotations for stage directions.

In [17]:
# Directory the play XML files are downloaded into and later listed from.
cache_dir = "downloads"


def get_urllist():
    """Build the download index for the First Folio play files.

    Returns:
        A DataFrame with one row per play and two columns: ``play_id`` (the
        short play code) and ``url`` (the XML download address, which embeds
        that code).
    """
    url_template = "http://firstfolio.bodleian.ox.ac.uk/download/xml/F-{}.xml"
    play_ids = (
        "jn", "r2", "1h4", "2h4", "h5", "1h6", "2h6", "3h6", "r3", "h8",
        "rom", "mnd", "jc", "tn", "tem", "ham", "mv", "ayl", "shr", "ado",
        "lll", "cor", "err", "tgv", "wiv", "wt", "tit", "ant", "mm", "tim",
        "lr", "tro", "aww", "oth", "mac", "cym",
    )
    rows = [
        {"play_id": play_id, "url": url_template.format(play_id)}
        for play_id in play_ids
    ]
    return pd.DataFrame(rows)

def download(url, location, filename):
    """Fetch *url* and save the response body as ``<location>/<filename>.xml``.

    Args:
        url: address to fetch with an HTTP GET.
        location: target directory; created (with parents) when missing.
        filename: file name without extension; ".xml" is appended.

    Raises:
        requests.HTTPError: when the server answers with an error status.
        requests.RequestException: on connection problems or timeout.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    makedirs(location, exist_ok=True)

    # Without a timeout a stalled server would block the notebook forever.
    r = requests.get(url, timeout=60)
    # Fail loudly instead of saving an error page under an .xml name, which
    # would only surface later as an XML parse error.
    r.raise_for_status()

    with open(path.join(location, filename + ".xml"), "wb") as fout:
        fout.write(r.content)


# Fetch every play listed in the index into the cache directory.
index = get_urllist()

for _, row in tqdm(index.iterrows(), total=len(index), desc="downloading shakespeare edition"):
    download(row.url, cache_dir, row.play_id)

# Parse each cached XML file into a Standoff object.
standoffs = []

# listdir() returns entries in arbitrary, filesystem-dependent order; sorting
# fixes the order of `standoffs` so a later seeded split is reproducible.
xml_files = sorted(fn for fn in listdir(cache_dir) if fn.endswith(".xml"))

for fn in tqdm(xml_files, desc="extracting xml"):

    # The context manager closes the file handle as soon as it has been read.
    with open(path.join(cache_dir, fn), "rb") as fin:
        tree = etree.fromstring(fin.read())

    so = standoffconverter.Standoff()
    so.from_lxml_tree(tree)

    standoffs.append(so)
    

HBox(children=(IntProgress(value=0, description='downloading shakespeare edition', max=36, style=ProgressStyle…




HBox(children=(IntProgress(value=0, description='extracting xml', max=36, style=ProgressStyle(description_widt…




## Preparing the data

In [20]:
# Split the parsed plays into a train and a test list; random_state is fixed
# so the same input order yields the same split.
train, test = train_test_split(standoffs, random_state=4123)

'\n    \n        \n            \n                The Life of Tymon of Athens from Mr. William Shakespeare'

In [80]:
def extract_true_stage_directions(standoffs):
    """Collect the annotated stage directions of every document.

    Each element of *standoffs* must expose ``plain`` (the document text) and
    ``standoffs`` (an iterable of annotation dicts with "tag", "begin" and
    "end" keys).  Only annotations tagged as TEI ``stage`` are kept.

    Returns:
        A pair ``(texts, offsets)``.  ``texts`` is one flat list with the
        text of every stage direction across all documents.  ``offsets``
        has one list per document holding the ``(begin, end)`` pair of each
        stage direction in that document.
    """
    stage_tag = "{http://www.tei-c.org/ns/1.0}stage"
    texts = []
    offsets = []
    for document in standoffs:
        spans = [
            (entry["begin"], entry["end"])
            for entry in document.standoffs
            if entry["tag"] == stage_tag
        ]
        texts.extend(document.plain[lo:hi] for lo, hi in spans)
        offsets.append(spans)
    return texts, offsets
                
# Keep only the stage-direction texts of the training plays; the per-document
# offsets returned as second value are discarded here.
train_sd,_ = extract_true_stage_directions(train)

## Creating some example matchers and validating them
The validation function also shows you examples of false matches (false negatives and false positives).

In [68]:
from spacy.matcher import Matcher
# New, empty matcher that will hold the two example stage-direction rules.
matcher = Matcher(nlp.vocab)

# Rule 'EXIT': the token "exit" (any casing) directly followed by a
# punctuation token.
pattern = [{'LOWER': 'exit'},
           {'IS_PUNCT': True}]
matcher.add('EXIT', None, pattern)

# Rule 'Enter': the token "enter" (any casing) directly followed by a token
# that is not punctuation.
pattern = [{'LOWER': 'enter'},
           {'IS_PUNCT': False}]
matcher.add('Enter', None, pattern)


def _sample_examples(examples, k=3):
    """Return up to *k* distinct random entries of *examples*, one per line."""
    if not examples:
        # np.random.choice raises on an empty sequence.
        return "(none)"
    picked = np.random.choice(examples, min(k, len(examples)), replace=False)
    return "\n".join(picked)


def validate_stage_directions(matcher, data):
    """Score *matcher* against the annotated stage directions in *data*.

    Every document is run through the ``nlp`` pipeline and the matcher.  Both
    the annotated stage directions and the matched spans are marked on
    per-character boolean masks, so precision, recall and F1 are computed
    over characters.  The scores are printed together with a few random
    false negatives (annotated directions no match touched) and false
    positives (matches that overlap no annotated direction).

    Args:
        matcher: a spaCy matcher that is called on each parsed document.
        data: iterable of standoff objects exposing ``plain`` and
            ``standoffs`` (as consumed by ``extract_true_stage_directions``).

    Returns:
        None; all results are printed.
    """
    docs = [nlp(doc.plain) for doc in data]

    _, data_indices = extract_true_stage_directions(data)

    tp = 0
    tn = 0
    fp = 0
    fn = 0

    fn_examples = []
    fp_examples = []

    for idoc, doc in tqdm(enumerate(docs), total=len(docs)):

        text = doc.text
        ground_truth = np.zeros(len(text), dtype=bool)
        prediction = np.zeros(len(text), dtype=bool)
        for begin, end in data_indices[idoc]:
            ground_truth[begin:end] = True

        for match_id, begin, end in matcher(doc):
            # `end` is an exclusive token index, so doc[end] is the token
            # AFTER the match (and does not exist when the match ends the
            # document).  The span's own character offsets give the exact
            # matched range.
            span = doc[begin:end]
            char_range = slice(span.start_char, span.end_char)
            prediction[char_range] = True
            if not ground_truth[char_range].any():
                fp_examples.append(span.text)

        for begin, end in data_indices[idoc]:
            if not prediction[begin:end].any():
                fn_examples.append(text[begin:end])

        tp += np.logical_and(ground_truth, prediction).sum()
        fn += np.logical_and(ground_truth, ~prediction).sum()
        fp += np.logical_and(~ground_truth, prediction).sum()
        tn += np.logical_and(~ground_truth, ~prediction).sum()

    # Guard every ratio against an empty denominator.
    precision = tp / (tp + fp) if tp + fp > 0 else 0
    recall = tp / (tp + fn) if tp + fn > 0 else 0
    f1_score = 2 * tp / (2 * tp + fp + fn) if tp + fp + fn > 0 else 0
    print("precision: {:.2f}, recall: {:.2f}, f1 {:.2f}".format(precision, recall, f1_score))

    print("## What has not been identified? For example:")
    print(_sample_examples(fn_examples))

    print("\n## What been identified. although it is not a stage direection? For example:")
    print(_sample_examples(fp_examples))
    
    
# Try the example rules on the first two training plays only.
validate_stage_directions(matcher, train[:2])  

