In [38]:
import spacy

In [54]:
# Token-level rule for the entity ruler: label a single token as COMMODITY when
# its lowercase form is "gold" and its part-of-speech tag is NOUN.
patterns = [
    {
        "label": "COMMODITY",
        "pattern": [{"LOWER": "gold", "POS": "NOUN"}],
    },
]

# Loading the pretrained model yields a pipeline that already contains the "ner"
# pipe, so the results will also include every entity that pipe recognises,
# regardless of whether it adheres to the patterns above.
nlp = spacy.load("en_core_web_sm")

# Uncomment the line below to see that the "ner" pipe is what is responsible for
# including the other named entities in the results.
# nlp.remove_pipe("ner")

# Register the rule-based component and hand it the patterns.
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(patterns)

# Show the resulting component order of the pipeline.
print(nlp.pipe_names)

# Sample sentence mixing the commodity sense of "gold" with company names that
# contain the word.
doc = nlp(
    "Gold is great and this is text about gold the commodity and also about Yamana Gold the company, of which Daniel Racine is CEO, and the other companies called Gold Fields, Auminium Mining Corp, and Goldcorp."
)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner', 'entity_ruler']


In [55]:
# Echo the processed document back as text to confirm what went through the pipeline.
print(doc)

Gold is great and this is text about gold the commodity and also about Yamana Gold the company, of which Daniel Racine is CEO, and the other companies called Gold Fields, Auminium Mining Corp, and Goldcorp.


In [56]:
# Pair every token with its coarse part-of-speech tag to see which occurrences of
# "gold" were tagged NOUN and which were tagged PROPN.
[(token, token.pos_) for token in doc]

[(Gold, 'NOUN'),
 (is, 'AUX'),
 (great, 'ADJ'),
 (and, 'CCONJ'),
 (this, 'PRON'),
 (is, 'AUX'),
 (text, 'NOUN'),
 (about, 'ADP'),
 (gold, 'NOUN'),
 (the, 'DET'),
 (commodity, 'NOUN'),
 (and, 'CCONJ'),
 (also, 'ADV'),
 (about, 'ADP'),
 (Yamana, 'PROPN'),
 (Gold, 'PROPN'),
 (the, 'DET'),
 (company, 'NOUN'),
 (,, 'PUNCT'),
 (of, 'ADP'),
 (which, 'PRON'),
 (Daniel, 'PROPN'),
 (Racine, 'PROPN'),
 (is, 'AUX'),
 (CEO, 'PROPN'),
 (,, 'PUNCT'),
 (and, 'CCONJ'),
 (the, 'DET'),
 (other, 'ADJ'),
 (companies, 'NOUN'),
 (called, 'VERB'),
 (Gold, 'PROPN'),
 (Fields, 'PROPN'),
 (,, 'PUNCT'),
 (Auminium, 'PROPN'),
 (Mining, 'PROPN'),
 (Corp, 'PROPN'),
 (,, 'PUNCT'),
 (and, 'CCONJ'),
 (Goldcorp, 'PROPN'),
 (., 'PUNCT')]

In [57]:
# The entities found in the document include proper nouns in which the word
# 'gold' is merely part of the name (e.g. company names such as "Yamana Gold").
# That is unwanted here: only the commodity sense of the word is of interest.
print(doc.ents)

(Gold, gold, Yamana Gold, Daniel Racine, Gold Fields, Auminium Mining Corp, Goldcorp)


In [58]:
# One line per entity: text, start/end character offsets, the integer form of the
# label, and the label's string name.
for ent in doc.ents:
    print(f"{ent.text} {ent.start_char} {ent.end_char} {ent.label} {ent.label_}")

Gold 0 4 2181413665215533003 COMMODITY
gold 37 41 2181413665215533003 COMMODITY
Yamana Gold 71 82 383 ORG
Daniel Racine 105 118 380 PERSON
Gold Fields 158 169 383 ORG
Auminium Mining Corp 171 191 383 ORG
Goldcorp 197 205 383 ORG
