In [1]:
import spacy

nlp = spacy.load('en_core_web_sm')

In [2]:
from spacy.matcher import Matcher

In [3]:
matcher = Matcher(nlp.vocab)

In [5]:
# SolarPower
pattern1 = [{'LOWER':'solarpower'}]
# Solar-power
pattern2 = [{'LOWER':'solar'}, {'IS_PUNCT':True},{'LOWER':'power'}]
# Solar power
pattern3 = [{'LOWER':'solar'},{'LOWER':'power'}]

In [6]:
matcher.add('SolarPower', None, pattern1, pattern2, pattern3)

In [7]:
doc = nlp("The Solar Power industry continues to grow a solarpower increases. Solar-power is amazing.")

In [8]:
found_matches = matcher(doc)

In [9]:
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 8, 9), (8656102463236116519, 11, 14)]


In [10]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]         # get string representation
    span = doc[start:end]                           # get the matched span
    print(match_id, string_id, start, end, span.text)

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 8 9 solarpower
8656102463236116519 SolarPower 11 14 Solar-power


In [11]:
matcher.remove('SolarPower')

In [13]:
# solarpower SolarPower
pattern1 = [{'LOWER': 'solarpower'}]
# Solar.power
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT':True,'OP':'*'},{'LOWER':'power'}]

In [15]:
matcher.add('SolarPower', None, pattern1, pattern2)

In [16]:
doc2 = nlp("Solar--power is solarpower yay!")

In [17]:
found_matches = matcher(doc2)

In [18]:
print(found_matches)

[(8656102463236116519, 0, 3), (8656102463236116519, 4, 5)]


In [19]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]         # get string representation
    span = doc[start:end]                           # get the matched span
    print(match_id, string_id, start, end, span.text)        

8656102463236116519 SolarPower 0 3 The Solar Power
8656102463236116519 SolarPower 4 5 continues


For additional information: https://spacy.io/usage/rule-based-matching

In [20]:
from spacy.matcher import PhraseMatcher

In [21]:
matcher = PhraseMatcher(nlp.vocab)

In [23]:
with open('reaganomics.txt') as f:
    doc3 = nlp(f.read())

In [24]:
phrase_list = ['voodoo economics', 'supply-side economics', 'trickle-down economics', 'free-market economics']

In [25]:
phrase_patterns = [nlp(text) for text in phrase_list]

In [26]:
phrase_patterns

[voodoo economics,
 supply-side economics,
 trickle-down economics,
 free-market economics]

In [27]:
matcher.add('EconMatcher', None,*phrase_patterns)

In [28]:
found_matches = matcher(doc3)

In [29]:
found_matches

[(3680293220734633682, 41, 45),
 (3680293220734633682, 46, 50),
 (3680293220734633682, 53, 55),
 (3680293220734633682, 72, 76),
 (3680293220734633682, 753, 757)]

In [31]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]         # get string representation
    span = doc3[start:end]                           # get the matched span
    print(match_id, string_id, start, end, span.text)        

3680293220734633682 EconMatcher 41 45 supply-side economics
3680293220734633682 EconMatcher 46 50 trickle-down economics
3680293220734633682 EconMatcher 53 55 voodoo economics
3680293220734633682 EconMatcher 72 76 free-market economics
3680293220734633682 EconMatcher 753 757 supply-side economics


## NLP Basics Assessment

In [33]:
# RUN THIS CELL to perform standard imports
import spacy
nlp = spacy.load('en_core_web_sm')

### 1. Create a Doc object from the file `owlcreek.txt`

In [40]:
# Enter your code here:
with open("owlcreek.txt") as f:
    doc = nlp(f.read())


# Run this cell to verify it worked:

doc[:36]

UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 1665: character maps to <undefined>

### 2. How many tokens are contained in the file?

### 3. How many sentences are contained in the file?

### 4. Print th second sentence in the document

HINT: Indexing starts at zero, and the title counts as the first sentence.

### 5. For each token in the sentence above, print its `text`, `POS` tag, `dep` tag and `lemma`

CHALLENGE: Have values line up in columns in the print output.

In [35]:
# NORMAL SOLUTION:



### 6. Write a matcher called `Swimming` that finds both occurrences of the phrase "swimming vigorously" in the text

In [37]:
# Import the Matcher library:

from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

In [38]:
# Create a pattern and add it to matcher:



In [None]:
# Create a list of matches called "found_matches" and print the list:

### 7. Print the text surrounding each found match

### EXTRA CREDIT:

Print the sentence that contains each found match