In [None]:
import spacy
# Load the en_core_web_sm pipeline and parse a two-sentence sample so the
# following cells have a Doc whose entities can be inspected.
sample_sentences = "Britain is a place. Mary is a doctor."
nlp = spacy.load("en_core_web_sm")
doc = nlp(sample_sentences)

In [None]:
# One line per recognised entity: its surface text followed by its label.
for entity in doc.ents:
    print(entity.text, entity.label_)

Britain GPE
Mary PERSON


In [None]:
from spacy.language import Language
@Language.component("remove_gpe")
def remove_gpe(doc):
    """Pipeline component that strips every ``GPE`` entity from a Doc.

    Registered with spaCy under the name "remove_gpe".

    Args:
        doc: The Doc being processed; its ``ents`` are read and reassigned.

    Returns:
        The same Doc, with ``doc.ents`` holding only the entities whose
        label is not "GPE", in their original order.
    """
    # Keep everything except GPE entities.
    doc.ents = [entity for entity in doc.ents if entity.label_ != "GPE"]
    return doc

In [None]:
# Add the custom component to the pipeline. The membership check keeps this
# cell re-runnable: asking add_pipe for a component name that is already in
# the pipeline raises an error.
if "remove_gpe" not in nlp.pipe_names:
    nlp.add_pipe("remove_gpe")
# Left as the last expression so the notebook displays the pipeline summary.
nlp.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False},
  'remove_gpe': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False}},
 'problems': {'tok2vec': [],
  

In [None]:
# Parse the sample again, this time through the pipeline that includes
# "remove_gpe", and list whatever entities remain.
doc = nlp("Britain is a place. Mary is a doctor.")
for surface, label in ((e.text, e.label_) for e in doc.ents):
    print(surface, label)

Mary PERSON


In [None]:
import re
# Sample sentence containing one "Month D" date and one "D Month" date.
text = "This is a date February 2. Another date would be 14 August."

# Month names are listed once and joined into a single regex alternation,
# instead of being spelled out separately for each word order.
months = (
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
)
month_alt = "|".join(months)

# Matches "D Month" or "Month D" where D is a one- or two-digit day.
# The \b anchors stop a match from starting or ending inside a longer token:
# without them "February 2021" would yield "February 20" and "114 August"
# would yield "14 August".
pattern = r"\b(?:\d{1,2} (?:" + month_alt + r")|(?:" + month_alt + r") \d{1,2})\b"

iter_matches = re.finditer(pattern, text)
for hit in iter_matches:
    # group() is the matched text; no need to slice text by start/end.
    print(hit.group())

February 2
14 August


In [None]:
# Sentence in which "Paul" is followed by a capitalised word twice and
# appears once on its own.
text = "Paul Newman was an American actor, but Paul Hollywood is a British TV Host. The name Paul is quite common."

# "Paul", a single space, then an uppercase ASCII letter followed by at
# least one more word character.
pattern = r"Paul [A-Z]\w+"

# Printing the match objects shows both the character span and the text.
for found in re.finditer(pattern, text):
    print(found)

<re.Match object; span=(0, 11), match='Paul Newman'>
<re.Match object; span=(39, 53), match='Paul Hollywood'>


In [None]:
# Switch to a blank English pipeline and parse the "Paul" sentence with it.
# NOTE: this rebinds the notebook-wide names `nlp` and `doc`.
nlp = spacy.blank("en")
doc = nlp(text)

# Entities already on the Doc, kept so the regex matches can be added to them.
original_ent = list(doc.ents)

# One (token_start, token_end, text) triple per regex match that maps onto
# whole tokens.
mwt_ents = []  # multi-word entity list
for match in re.finditer(pattern, doc.text):
    start, end = match.span()
    # char_span converts character offsets into a token-level Span; the
    # result is checked for None so matches that cannot be converted are
    # skipped.
    span = doc.char_span(start, end)
    if span is not None:
        mwt_ents.append((span.start, span.end, span.text))
print(mwt_ents)

[(0, 2, 'Paul Newman'), (8, 10, 'Paul Hollywood')]


In [None]:
from spacy.tokens import Span
# Turn every collected (token_start, token_end, text) triple into a PERSON
# span. The entity list is built fresh on each run instead of appending to
# `original_ent`, so re-running this cell does not pile up duplicate spans
# (which doc.ents would reject as overlapping).
person_spans = [Span(doc, start, end, "PERSON") for start, end, _name in mwt_ents]
doc.ents = list(original_ent) + person_spans

In [None]:
# Confirm the regex-derived spans are now entities on the Doc.
for entity in doc.ents:
    print(entity.text, entity.label_)

Paul Newman PERSON
Paul Hollywood PERSON


In [None]:
@Language.component("paul_ner")
def paul_ner(doc):
    """Pipeline component that tags "Paul <Surname>" matches as PERSON.

    Registered with spaCy under the name "paul_ner". The Doc text is scanned
    with the regex ``Paul [A-Z]\\w+``; every match that lines up with token
    boundaries becomes a PERSON span and is merged with the entities already
    on the Doc.

    Overlaps between the new spans and existing entities are resolved with
    ``spacy.util.filter_spans`` before assignment, so the component can also
    run after another component that has already tagged part of a match
    (assigning overlapping spans to ``doc.ents`` directly raises an error).

    Args:
        doc: The Doc being processed.

    Returns:
        The same Doc with ``doc.ents`` updated.
    """
    # Imported here because the notebook only imports filter_spans in a cell
    # that comes after this definition.
    from spacy.util import filter_spans

    pattern = r"Paul [A-Z]\w+"
    person_spans = []
    for match in re.finditer(pattern, doc.text):
        start, end = match.span()
        # char_span yields None for offsets that do not map onto whole
        # tokens; those matches are skipped.
        span = doc.char_span(start, end, label="PERSON")
        if span is not None:
            person_spans.append(span)

    # Existing entities are listed first, then the regex matches.
    doc.ents = filter_spans(list(doc.ents) + person_spans)
    return doc

In [None]:
# Separate blank English pipeline for trying out the registered "paul_ner"
# component on its own.
nlp2 = spacy.blank("en")
nlp2.add_pipe("paul_ner")

In [None]:
# Run the "paul_ner"-only pipeline over the sample sentence and show the
# entities it set on the Doc.
doc2 = nlp2(text)
print(doc2.ents)

(Paul Newman, Paul Hollywood)


In [None]:
from spacy.util import filter_spans
@Language.component("cinema_ner")
def cinema_ner(doc):
    """Pipeline component that proposes every "Hollywood" match as CINEMA.

    Registered with spaCy under the name "cinema_ner". Candidate spans are
    appended after the entities already on the Doc and the combined list is
    passed through ``filter_spans`` before being assigned to ``doc.ents``.
    When a candidate overlaps an existing entity, ``filter_spans`` decides
    which one survives (spaCy documents it as preferring the longer span),
    so a CINEMA candidate is not guaranteed to end up in the result.

    Args:
        doc: The Doc being processed.

    Returns:
        The same Doc with ``doc.ents`` replaced by the filtered span list.
    """
    existing = list(doc.ents)

    # Token-level (start, end) bounds for each match that maps onto whole
    # tokens; char_span returns None otherwise and the match is skipped.
    token_bounds = []
    for hit in re.finditer(r"Hollywood", doc.text):
        aligned = doc.char_span(*hit.span())
        if aligned is None:
            continue
        token_bounds.append((aligned.start, aligned.end))

    cinema_spans = [Span(doc, first, last, "CINEMA") for first, last in token_bounds]

    # Existing entities stay ahead of the new candidates in the input list.
    doc.ents = filter_spans(existing + cinema_spans)
    return doc

In [None]:
# Load a fresh en_core_web_sm pipeline, add "cinema_ner" to it, and list the
# entities found in the "Paul" sentence.
nlp3 = spacy.load("en_core_web_sm")
nlp3.add_pipe("cinema_ner")
doc3 = nlp3(text)
for entity in doc3.ents:
    print(entity.text, entity.label_)

Paul Newman PERSON
American NORP
Paul Hollywood PERSON
British NORP
Paul PERSON
