# Tools

In [1]:
import spacy
from spacy import displacy
from spacy.tokens import Span

# Load spaCy's small English pipeline; every later cell reuses this `nlp` object.
nlp = spacy.load("en_core_web_sm")

In [2]:
# List the pipeline components; 'ner' is included in the pipeline by default.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

# NER

In [7]:
# Run the pipeline on a sample sentence, then list each detected entity
# on its own line as: text | label | human-readable label description.

doc = nlp('Samsung and LG are the best companies in korea')

for ent in doc.ents:
    print(ent.text, ent.label_, spacy.explain(ent.label_), sep=" | ")

Samsung | ORG | Companies, agencies, institutions, etc.
LG | GPE | Countries, cities, states
korea | GPE | Countries, cities, states


The model labels LG as a GPE (a country, city, or state)! We can add a custom rule to fix that.

In [8]:
# Better visualization: render the document with displaCy's entity
# visualizer (style='ent') instead of printing the entities by hand.

displacy.render(doc, style='ent')

In [9]:
# Which entity labels can the 'ner' component assign?

nlp.pipe_labels['ner']

['CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART']

In [10]:
# Second example: process another sentence and visualize its entities.

doc2 = nlp("Asma Soula worked so hard, hope she get what she deserve")

displacy.render(doc2, style="ent")

In [11]:
# One line per entity of the second example: text | label | label description.
for ent in doc2.ents:
    print(ent.text, ent.label_, spacy.explain(ent.label_), sep=" | ")

Asma Soula | PERSON | People, including fictional


For a more accurate NER model, you can try: https://huggingface.co/dslim/bert-base-NER

In [15]:
# A Span is a slice of consecutive tokens in a Doc; for example, doc[0:2] is a Span.
# Here we correct the label of "LG" from the first example.

# Re-create the Doc so this cell starts from the model's own predictions
# every time it is run.
doc = nlp('Samsung and LG are the best companies in korea')

# Span(doc, start, end) takes token indices with an exclusive end:
# Samsung(0) and(1) LG(2) -> tokens [2, 3) cover exactly "LG".
s1 = Span(doc, 2, 3, label='ORG')
assert s1.text == 'LG', f"expected the span to cover 'LG', got {s1.text!r}"

# default='unmodified' keeps the existing entity annotations for all tokens
# outside the spans passed in, so only "LG" is relabelled.
doc.set_ents([s1], default='unmodified')

In [13]:
# One line per entity of the corrected Doc: text | label | label description.
for ent in doc.ents:
    print(ent.text, ent.label_, spacy.explain(ent.label_), sep=" | ")

Samsung | ORG | Companies, agencies, institutions, etc.
LG | ORG | Companies, agencies, institutions, etc.
korea | GPE | Countries, cities, states


In [14]:
# Now we have LG as ORG

You can relabel several entities at once. To do that, write:
- `doc.set_ents([s1, sx], default='unmodified')`
- where `sx` stands for each additional span you want to change