# NER Tutorial with Spacy



In this tutorial, we will retrieve text from Wikipedia and perform named entity recognition (NER) on it with spaCy.

In [None]:
import spacy, requests
import pandas as pd
from bs4 import BeautifulSoup

In [None]:
url = 'https://en.wikipedia.org/wiki/Leonardo_DiCaprio'

# Retrieve the article HTML from Wikipedia.
# - Wikimedia asks automated clients to send a descriptive User-Agent; requests
#   using a generic library default may be rejected.
# - The timeout keeps the cell from hanging indefinitely on a stalled connection.
# - raise_for_status() surfaces HTTP errors here instead of silently parsing
#   an error page further down.
response = requests.get(
    url,
    headers={'User-Agent': 'spacy-ner-tutorial-notebook/1.0 (educational example)'},
    timeout=30,
)
response.raise_for_status()

# Clean text: keep only the contents of the paragraph (<p>) tags, drop empty
# paragraphs, and join the rest into ONE string, because a spaCy pipeline is
# called on a single string rather than on a list of strings.
soup = BeautifulSoup(response.text, 'html.parser')
paragraphs = [p.get_text().strip() for p in soup.find_all('p')]
text = '\n'.join(p for p in paragraphs if p)


## Create a dataframe from spacy-extracted entities

spaCy provides four pipelines for its English NLP models: `en_core_web_sm`, `en_core_web_md`, `en_core_web_lg`, and `en_core_web_trf`. We will use all four to compare results.

In [None]:
# create a function to extract entities from text
def get_entities(text, pipeline=None):
    """Run an NLP pipeline over ``text`` and return the processed result.

    Parameters
    ----------
    text : str, or list/tuple of str
        The text to process. A list or tuple of strings (for example, one
        string per paragraph) is joined with newlines into a single string
        first. Any other value is passed to the pipeline unchanged.
    pipeline : callable, optional
        The callable applied to the text. When omitted, the notebook-level
        ``nlp`` object is used; it is looked up at call time, so whichever
        model was loaded most recently is the one that runs.

    Returns
    -------
    The value returned by calling the pipeline on the text. In this notebook
    that is the object whose ``.ents`` attribute is iterated to list entities.

    Raises
    ------
    NameError
        If ``pipeline`` is omitted and no global ``nlp`` has been defined yet.
    """
    # Only list/tuple inputs are joined; everything else goes through as-is so
    # existing callers that already pass a single string keep working.
    if isinstance(text, (list, tuple)):
        text = '\n'.join(text)
    if pipeline is None:
        pipeline = nlp
    return pipeline(text)

### en_core_web_sm

In [None]:
# Load the small English NLP model
nlp = spacy.load('en_core_web_sm')

# Create a doc object
doc = get_entities(text)

#  Optional manual review of entities and labels
for ent in doc.ents:
    print(ent.text, ent.label_)

# create a table of the named entities and their labels
sm_df = pd.DataFrame([(ent.text, ent.label_) for ent in doc.ents], columns=['Name Entity', 'Label'])

# Preview the first rows of the small-model table built above.
sm_df.head()





### en_core_web_md

In [None]:
# Switch the shared `nlp` pipeline to the medium English model
nlp = spacy.load('en_core_web_md')

# Process the text with the medium model
doc = get_entities(text)

# One row per extracted entity: its surface text and its predicted label
md_df = pd.DataFrame(
    {
        'Name Entity': [ent.text for ent in doc.ents],
        'Label': [ent.label_ for ent in doc.ents],
    }
)

### en_core_web_lg

In [None]:
# Switch the shared `nlp` pipeline to the large English model
nlp = spacy.load('en_core_web_lg')

# Process the text with the large model
doc = get_entities(text)

# One row per extracted entity: its surface text and its predicted label
lg_df = pd.DataFrame(
    {
        'Name Entity': [ent.text for ent in doc.ents],
        'Label': [ent.label_ for ent in doc.ents],
    }
)


### en_core_web_trf

In [None]:
# Switch the shared `nlp` pipeline to the transformer-based English model
nlp = spacy.load('en_core_web_trf')

# Process the text with the transformer model
doc = get_entities(text)

# One row per extracted entity: its surface text and its predicted label
trf_df = pd.DataFrame(
    {
        'Name Entity': [ent.text for ent in doc.ents],
        'Label': [ent.label_ for ent in doc.ents],
    }
)

## Comparing Tables

In [None]:
# Entity counts per model: each table holds one row per extracted entity,
# so the row count is the number of entities that model found.
print('Small model entities: ', sm_df.shape[0])
print('Medium model entities: ', md_df.shape[0])
print('Large model entities: ', lg_df.shape[0])
print('Transformer model entities: ', trf_df.shape[0])

# Distinct label types per model (dropna=False counts every distinct value,
# matching a plain count of the unique labels).
print('Small model labels: ', sm_df['Label'].nunique(dropna=False))
print('Medium model labels: ', md_df['Label'].nunique(dropna=False))
print('Large model labels: ', lg_df['Label'].nunique(dropna=False))
print('Transformer model labels: ', trf_df['Label'].nunique(dropna=False))

In [None]:
# Transformer-model rows whose entity text does not appear anywhere in the
# small model's table (i.e. entities the small model did not extract).
trf_df.loc[lambda frame: ~frame['Name Entity'].isin(sm_df['Name Entity'])]

In [None]:
# Transformer-model rows whose entity text does not appear anywhere in the
# medium model's table (i.e. entities the medium model did not extract).
trf_df.loc[lambda frame: ~frame['Name Entity'].isin(md_df['Name Entity'])]

In [None]:
# Transformer-model rows whose entity text does not appear anywhere in the
# large model's table (i.e. entities the large model did not extract).
trf_df.loc[lambda frame: ~frame['Name Entity'].isin(lg_df['Name Entity'])]

In [None]:
# Transformer-model rows whose label type never appears in the small model's
# 'Label' column.
# NOTE(review): this compares the sets of label types used by each model; it
# does not compare the labels the two models gave to the same entity.
trf_df.loc[lambda frame: ~frame['Label'].isin(sm_df['Label'])]

In [None]:
# Transformer-model rows whose label type never appears in the medium model's
# 'Label' column.
# NOTE(review): this compares the sets of label types used by each model; it
# does not compare the labels the two models gave to the same entity.
trf_df.loc[lambda frame: ~frame['Label'].isin(md_df['Label'])]

In [None]:
# Transformer-model rows whose label type never appears in the large model's
# 'Label' column.
# NOTE(review): this compares the sets of label types used by each model; it
# does not compare the labels the two models gave to the same entity.
trf_df.loc[lambda frame: ~frame['Label'].isin(lg_df['Label'])]