In [1]:
import spacy
from spacy.language import Language
from spacy.lang.en import English

In [2]:
# The blank `English()` pipeline has no tagger or morphologizer, so a token pattern that
# requires a part-of-speech (POS) value, such as the one below, raises the ValueError shown.
# patterns = [{'label': 'COMMODITY', 'pattern': [{'LOWER': 'gold', 'POS': 'NOUN'}]}]
# ValueError: [E155] The pipeline needs to include a morphologizer or tagger+attribute_ruler in order to use Matcher or PhraseMatcher with the attribute POS. Try using `nlp()` instead of `nlp.make_doc()` or `list(nlp.pipe())` instead of `list(nlp.tokenizer.pipe())`.

# A less-specific pattern must therefore be used, one that cannot constrain POS.
# `LOWER` is a lexical attribute, so it works without a tagger; a single token pattern on
# the lowercased form matches 'Gold', 'gold' and any other casing (e.g. 'GOLD'), instead of
# listing each casing as a separate exact-string pattern.
patterns = [{'label': 'COMMODITY', 'pattern': [{'LOWER': 'gold'}]}]

nlp = English()

# The entity ruler is the only component in the pipeline (see the printed pipe names).
ruler = nlp.add_pipe("entity_ruler")

ruler.add_patterns(patterns)

print(nlp.pipe_names)

doc = nlp("Gold is great and this is text about gold the commodity and also about Yamana Gold the company, of which Daniel Racine is CEO, and the other companies called Gold Fields, Auminium Mining Corp, and Goldcorp. It is also about switching out of the gold ETF [exchange traded fund] into gold equities and gold equity funds.")

['entity_ruler']


In [3]:
# Show the verbatim text held by the processed document.
print(doc.text)

Gold is great and this is text about gold the commodity and also about Yamana Gold the company, of which Daniel Racine is CEO, and the other companies called Gold Fields, Auminium Mining Corp, and Goldcorp. It is also about switching out of the gold ETF [exchange traded fund] into gold equities and gold equity funds.


In [4]:
# Pair every token with its coarse part-of-speech tag. The tags come back as empty
# strings here because the blank English pipeline contains no tagger to assign them.
[(token, token.pos_) for token in doc]

[(Gold, ''),
 (is, ''),
 (great, ''),
 (and, ''),
 (this, ''),
 (is, ''),
 (text, ''),
 (about, ''),
 (gold, ''),
 (the, ''),
 (commodity, ''),
 (and, ''),
 (also, ''),
 (about, ''),
 (Yamana, ''),
 (Gold, ''),
 (the, ''),
 (company, ''),
 (,, ''),
 (of, ''),
 (which, ''),
 (Daniel, ''),
 (Racine, ''),
 (is, ''),
 (CEO, ''),
 (,, ''),
 (and, ''),
 (the, ''),
 (other, ''),
 (companies, ''),
 (called, ''),
 (Gold, ''),
 (Fields, ''),
 (,, ''),
 (Auminium, ''),
 (Mining, ''),
 (Corp, ''),
 (,, ''),
 (and, ''),
 (Goldcorp, ''),
 (., ''),
 (It, ''),
 (is, ''),
 (also, ''),
 (about, ''),
 (switching, ''),
 (out, ''),
 (of, ''),
 (the, ''),
 (gold, ''),
 (ETF, ''),
 ([, ''),
 (exchange, ''),
 (traded, ''),
 (fund, ''),
 (], ''),
 (into, ''),
 (gold, ''),
 (equities, ''),
 (and, ''),
 (gold, ''),
 (equity, ''),
 (funds, ''),
 (., '')]

In [5]:
# The entities include every standalone token 'gold' / 'Gold', including the ones
# that are part of a company name ('Yamana Gold', 'Gold Fields'), which is unwanted.
# 'Goldcorp' does not appear in the results; it is a single token (see the token list above).
print(doc.ents)

(Gold, gold, Gold, Gold, gold, gold, gold)


In [6]:
# For each matched entity, print its text, character offsets, and label,
# the latter both as the integer hash (`label`) and as the readable string (`label_`).
for ent in doc.ents:
    print(f"{ent.text} {ent.start_char} {ent.end_char} {ent.label} {ent.label_}")

Gold 0 4 2181413665215533003 COMMODITY
gold 37 41 2181413665215533003 COMMODITY
Gold 78 82 2181413665215533003 COMMODITY
Gold 158 162 2181413665215533003 COMMODITY
gold 245 249 2181413665215533003 COMMODITY
gold 282 286 2181413665215533003 COMMODITY
gold 300 304 2181413665215533003 COMMODITY
