In [2]:
import spacy
from spacy import displacy
from spacy.matcher import Matcher
import pandas as pd
nlp = spacy.load("en_core_web_sm")

### Let's check our rule on a larger corpus

In [14]:
# Load the paired active/passive sentence corpus and preview its first two rows.
active_passive = pd.read_csv('../Dataset/active_passive.csv')
active_passive.head(2)

Unnamed: 0,Active,Passive
0,He reads a novel.,A novel is read.
1,He does not cook food.,Food is not cooked by him.


In [4]:
# Dimensions of the corpus as (rows, columns).
active_passive.shape

(40, 2)

In [5]:
# Split the corpus into its two parallel columns of sentences.
active, passive = active_passive['Active'], active_passive['Passive']

### Create the rule

In [6]:
# First attempt: a sentence counts as passive when any token carries the
# `nsubjpass` dependency label.
passive_rule = [{"DEP": "nsubjpass"}]

matcher = Matcher(nlp.vocab)
matcher.add("Rule", [passive_rule])

In [7]:
def is_passive(doc, matcher):
    """Report whether `matcher` finds at least one match in `doc`.

    Args:
        doc: The object handed to `matcher` (a parsed spaCy document in this notebook).
        matcher: A callable that returns a sized collection of matches for `doc`.

    Returns:
        True when the collection of matches is non-empty, otherwise False.
    """
    return len(matcher(doc)) > 0

### Check rule on active voice sentences

In [8]:
# Count the active sentences that the rule leaves unflagged.
cnt = 0
for sent in active:
    doc = nlp(sent)
    cnt += int(not is_passive(doc, matcher))
print(cnt)

40


### Check rule on passive voice sentences

In [9]:
# Count the passive sentences that the rule flags.
cnt = 0
for sent in passive:
    doc = nlp(sent)
    cnt += int(is_passive(doc, matcher))
print(cnt)

39


### Let's troubleshoot

In [15]:
# Same count as before, but keep every passive sentence the rule fails to flag
# so it can be inspected afterwards.
cnt = 0
missed = []
for sent in passive:
    doc = nlp(sent)
    if not is_passive(doc, matcher):
        missed.append(doc)
        continue
    cnt += 1
print(cnt)

39


In [16]:
# Inspect the first passive sentence the rule failed to flag.
missed[0]

Is a table being bought by Ritika?

In [17]:
# Left disabled: the run above matched 39 of the 40 passive sentences, so
# `missed` should hold a single entry and index 1 would not exist.
#missed[1]

### Let's visualize their dependency trees

In [18]:
# Draw the dependency parse of every sentence the rule failed to flag.
for doc in missed:
    displacy.render(doc, style="dep")

In [19]:
# Ask spaCy for the human-readable meaning of the "auxpass" dependency label.
spacy.explain("auxpass")

'auxiliary (passive)'

[Dependencies](https://universaldependencies.org/docs/en/dep/)

### Update our rule
[Reference](https://spacy.io/usage/rule-based-matching)

In [20]:
# Broadened rule: a token labelled `nsubjpass` OR `auxpass` marks the sentence
# as passive.
passive_rule = [{"DEP": {"IN": ["nsubjpass", "auxpass"]}}]

# Build a fresh matcher so that only the updated rule is registered.
matcher = Matcher(nlp.vocab)
matcher.add("Rule", [passive_rule])

In [21]:
# Re-check the active sentences: the current rule should still leave them unflagged.
cnt = 0
for sent in active:
    doc = nlp(sent)
    cnt += int(not is_passive(doc, matcher))
print(cnt)

40


In [22]:
# Re-run the passive check, again keeping any sentence the rule does not flag.
cnt = 0
missed = []
for sent in passive:
    doc = nlp(sent)
    if not is_passive(doc, matcher):
        missed.append(doc)
        continue
    cnt += 1
print(cnt)

40


## Summary
 - Always test your rules and heuristics on a larger corpus to see how effective they are.
 - One can write intricate matching rules using spaCy's `Matcher` object.

In [26]:
# Spot-check the current rule on a sentence that is not part of the CSV corpus.
doc = nlp("Women are said to live longer than men")
is_passive(doc,matcher)

True

In [35]:
import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")

# Parse a short passive sentence and draw its dependency tree inline.
doc = nlp("Dole was defeated by Clinton")
displacy.render(doc, style="dep", jupyter=True)

In [33]:
import spacy
from spacy.matcher import Matcher
nlp = spacy.load("en_core_web_sm")
# NOTE: this rebinds the notebook-level `matcher` that the passive-voice cells
# also assign; re-run the rule cell before calling is_passive again.
matcher = Matcher(nlp.vocab)
# Single-token pattern: any token whose dependency label is one of the listed subject labels.
pattern = [{"DEP":{"IN":["nsubj","nsubjpass","csubj","csubjpass"]}}]
# Matcher.add takes a list of patterns, hence the extra list around `pattern`.
matcher.add("subject", [pattern])

In [34]:
import spacy
from spacy.matcher import Matcher
nlp = spacy.load("en_core_web_sm")
# Deliberate counter-example: `pattern` is a bare dict rather than a list of
# token dicts, so Matcher.add rejects it with ValueError E178.
# NOTE: this also rebinds the notebook-level `matcher`.
matcher = Matcher(nlp.vocab)
pattern = {"DEP":{"IN":["nsubj","nsubjpass","csubj","csubjpass"]}}
try:
    matcher.add("subject", [pattern])
except ValueError as err:
    # Show the message instead of raising so "Restart & Run All" still completes.
    print(f"{type(err).__name__}: {err}")

ValueError: [E178] Each pattern should be a list of dicts, but got: {'DEP': {'IN': ['nsubj', 'nsubjpass', 'csubj', 'csubjpass']}}. Maybe you accidentally passed a single pattern to Matcher.add instead of a list of patterns? If you only want to add one pattern, make sure to wrap it in a list. For example: `matcher.add('subject', [pattern])`