## Looking for entities using simple methods

In [1]:
table_data = "country, capital\
    Abkhazia, Sukhumi\
    Afghanistan, Kabul\
    Chad, N'Djamena\
    Chile, Santiago\
    China, Beijing"

### Try using regex - information extraction

In [2]:

# Import package
import re 

In [3]:
m = re.findall(r'(\w+), (\w+)', table_data)

In [4]:
print (m)

[('country', 'capital'), ('Abkhazia', 'Sukhumi'), ('Afghanistan', 'Kabul'), ('Chad', 'N'), ('Chile', 'Santiago'), ('China', 'Beijing')]


### Try using spacy

In [5]:
# Do imports
import spacy
nlp = spacy.load('en_core_web_sm')

In [6]:
doc = nlp(table_data)

In [7]:
for ent in doc.ents:
    print(ent.text, ent.label_, ent.start_char, ent.end_char)

Abkhazia GPE 20 28
Sukhumi PERSON 30 37
Afghanistan GPE 41 52
Kabul GPE 54 59
Chad GPE 63 67
N'Djamena GPE 69 78
Chile GPE 82 87
Santiago GPE 89 97
China GPE 101 106
Beijing GPE 108 115


In [8]:
# Note: Sukhumi was not correctly identified

### Idenitfying for unstructured data

In [9]:
small_data = "The Nobel Prize is a set of annual international awards bestowed in \
several categories by Swedish and Norwegian institutions in recognition of academic, \
cultural, or scientific advances. The will of the Swedish chemist, engineer and industrialist \
Alfred Nobel established the five Nobel prizes in 1895."

In [10]:
doc = nlp(small_data)
for ent in doc.ents:
    print(ent.text, ent.label_, ent.start_char, ent.end_char)

The Nobel Prize WORK_OF_ART 0 15
Swedish NORP 90 97
Norwegian NORP 102 111
Swedish NORP 203 210
Alfred Nobel PERSON 247 259
five CARDINAL 276 280
Nobel WORK_OF_ART 281 286
1895 DATE 297 301


# Now focused text from a domain

In [12]:
domain_data = """
Myeloid derived suppressor cells (MDSC) are immature 
myeloid cells with immunosuppressive activity. 
They accumulate in tumor-bearing mice and humans 
with different types of cancer, including hepatocellular 
carcinoma (HCC).
"""

In [13]:
doc = nlp(domain_data)
for ent in doc.ents:
    print(ent.text, ent.label_, ent.start_char, ent.end_char)

HCC ORG 222 225


In [14]:
# Notice the large number of unrecognized entites

In [15]:
# See how to work on this example with Flair at
# - https://github.com/biplav-s/course-nl/blob/master/l13-entityextraction/BioMedicalExtraction.ipynb
# - https://github.com/flairNLP/flair