In [40]:
import spacy

nlp = spacy.load("en_core_web_sm")

In [41]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [42]:
doc = nlp("Justin Bieber has gained 5 billion streams on Spotify in 2022 (till August).")

for ent in doc.ents:
    print(ent.text, "|", ent.label_, "|", spacy.explain(ent.label_))    

Justin Bieber | PERSON | People, including fictional
5 billion | CARDINAL | Numerals that do not fall under another type
Spotify | GPE | Countries, cities, states
2022 | DATE | Absolute or relative dates or periods
August | DATE | Absolute or relative dates or periods


In [43]:
from spacy import displacy

displacy.render(para, style="ent")

In [44]:
nlp.pipe_labels["ner"]

['CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART']

In [45]:
doc[2:5]

has gained 5

In [46]:
type(doc[2:5])

spacy.tokens.span.Span

In [47]:
sent = nlp("Tesla will acquire Twitter for $45 billion.")
for ent in sent.ents:
    print(ent.text, "|", ent.label_)

Twitter | ORG
$45 billion | MONEY


In [48]:
from spacy.tokens import Span

s1 = Span(sent, 0, 1, label="ORG")

sent.set_ents([s1], default="unmodified")

for ent in sent.ents:
    print(ent.text, "|", ent.label_)

Tesla | ORG
Twitter | ORG
$45 billion | MONEY


### exercise 1: Extract all the Geographical (cities, Countries, states) names from a given text

In [32]:
text = """Kiran wants to know the famous foods in each state of India. So, he opened Google and search for this question. Google showed that
in Delhi it is Chaat, in Gujarat it is Dal Dhokli, in Tamilnadu it is Pongal, in Andhrapradesh it is Biryani, in Assam it is Papaya Khar,
in Bihar it is Litti Chowkha and so on for all other states"""

doc2 = nlp(text)

for ent in doc2.ents:
    print(ent.text, "|", ent.label_)

Kiran | GPE
India | GPE
Delhi | GPE
Chaat | ORG
Gujarat | GPE
Dal Dhokli | ORG
Tamilnadu | ORG
Pongal | GPE
Andhrapradesh | GPE
Biryani | PERSON
Papaya Khar | PERSON
Bihar | GPE
Litti Chowkha | PERSON


In [38]:
from spacy.tokens import Span

s2 = Span(doc2, 0, 5, label="PERSON")
doc2.set_ents([s2], default="unmodified")

for ent in doc2.ents:
    print(ent.text, "|", ent.label_)

Kiran want to know the | PERSON
India | GPE
Delhi | GPE
Chaat | ORG
Gujarat | GPE
Dal Dhokli | ORG
Tamilnadu | ORG
Pongal | GPE
Andhrapradesh | GPE
Biryani | PERSON
Papaya Khar | PERSON
Bihar | GPE
Litti Chowkha | PERSON


In [39]:
gpe_list = []

for ent in doc2.ents:
    if ent.label_ == "GPE":
        gpe_list.append(ent.text)
        
print("Geo-political Locations:", gpe_list)
print("Count:", len(gpe_list))

Geo-political Locations: ['India', 'Delhi', 'Gujarat', 'Pongal', 'Andhrapradesh', 'Bihar']
Count: 6
