# Wordnet and Named entity recognition (NER)

- NLTK    
- Spacy
- Stanza
    

In [19]:
import nltk

#from nltk.tokenize import word_tokenize
#from nltk.tokenize import sent_tokenize
#from nltk.tag import pos_tag
#from nltk import conlltags2tree, tree2conlltags

from nltk.corpus import wordnet as wn

# Download the NLTK data packages used in this notebook.
# Packages that are already installed are simply reported as up-to-date.
nltk.download('wordnet')  # WordNet corpus, accessed below through `wn`
nltk.download('punkt')  # presumably the model behind sent_tokenize / word_tokenize
nltk.download('averaged_perceptron_tagger')  # presumably the model behind pos_tag
nltk.download('maxent_ne_chunker')  # NER chunker model
nltk.download('words')  # word list; presumably required by the NE chunker
nltk.download('maxent_ne_chunker_tab')  # presumably the chunker data format used by newer NLTK releases - TODO confirm


from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from nltk import conlltags2tree, tree2conlltags
from nltk.corpus import wordnet as wn
from nltk.chunk import ne_chunk 

[nltk_data] Downloading package wordnet to
[nltk_data]     /users/kent/asingh68/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /users/kent/asingh68/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /users/kent/asingh68/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /users/kent/asingh68/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /users/kent/asingh68/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /users/kent/asingh68/nltk_data...
[nltk_data]   Package maxent_ne_chunker_tab is already up-to-date!


# Wordnet

In [20]:
# get all synsets for 'table'

wn.synsets('table')


[Synset('table.n.01'),
 Synset('table.n.02'),
 Synset('table.n.03'),
 Synset('mesa.n.01'),
 Synset('table.n.05'),
 Synset('board.n.04'),
 Synset('postpone.v.01'),
 Synset('table.v.02')]

In [21]:
wn.langs()

['eng']

In [22]:
wn.synset('table.n.01').definition()

'a set of data arranged in rows and columns'

In [23]:
wn.synset('table.n.01').hypernyms()

[Synset('array.n.01')]

In [24]:
wn.synset('table.n.02').hyponyms()

[Synset('kitchen_table.n.01'),
 Synset('operating_table.n.01'),
 Synset('parsons_table.n.01'),
 Synset('trestle_table.n.01'),
 Synset('altar.n.01'),
 Synset('card_table.n.01'),
 Synset('booth.n.01'),
 Synset('coffee_table.n.01'),
 Synset('gaming_table.n.01'),
 Synset('table-tennis_table.n.01'),
 Synset('breakfast_table.n.01'),
 Synset('worktable.n.01'),
 Synset('platen.n.01'),
 Synset('dressing_table.n.01'),
 Synset('drop-leaf_table.n.01'),
 Synset('conference_table.n.01'),
 Synset('console_table.n.01'),
 Synset('pedestal_table.n.01'),
 Synset('counter.n.01'),
 Synset('gueridon.n.01'),
 Synset('desk.n.01'),
 Synset('pier_table.n.01'),
 Synset('pool_table.n.01'),
 Synset('tea_table.n.01'),
 Synset('card_table.n.02'),
 Synset('stand.n.04')]

In [25]:
wn.synset('table.n.02').hypernyms()[0].hypernyms()[0].hypernyms()[0].definition()

'an artifact (or system of artifacts) that is instrumental in accomplishing some end'

In [26]:
#w1 = wordnet.synset('ship.n.01')
#w2 = wordnet.synset('boat.n.01')
#print(w1.wup_similarity(w2))


#w1 = wordnet.synset('ship.n.01')
#w2 = wordnet.synset('car.n.01')
#print(w1.wup_similarity(w2))

# Wu-Palmer similarity of 'ship' against 'boat', then against 'car'
w1 = wn.synset('ship.n.01')
for other_name in ('boat.n.01', 'car.n.01'):
    w2 = wn.synset(other_name)
    print(w1.wup_similarity(w2))

0.9090909090909091
0.6956521739130435


In [27]:
# Collect the lemma names of every synset of "cat" (duplicates included)
synonyms = []
for synset in wn.synsets("cat"):
    synonyms.extend(lemma.name() for lemma in synset.lemmas())

# Show each distinct name once
print(set(synonyms))

{'CT', 'barf', 'African_tea', 'regorge', 'computerized_tomography', "cat-o'-nine-tails", 'CAT', 'regurgitate', 'throw_up', 'spue', 'bozo', 'qat', 'computerized_axial_tomography', 'chuck', 'cast', 'vomit', 'true_cat', 'computed_tomography', 'guy', 'cat', 'spew', 'honk', 'purge', 'sick', 'hombre', 'be_sick', 'computed_axial_tomography', 'disgorge', 'retch', 'upchuck', 'kat', 'khat', 'puke', 'Arabian_tea', 'big_cat', 'quat', 'vomit_up', 'Caterpillar'}


In [28]:
wn.synset('dog.n.01').lemmas()[1].name()

'domestic_dog'

In [29]:
print(wn.synset('dog.n.01').examples())

['the dog barked all night']


# Named Entity Recognition (NER)
- NLTK
- Spacy
- Stanza

In [30]:

def preprocessText(text, flag=1):
    """Split *text* into sentences and tokenize each sentence.

    The form of each sentence in the returned list depends on *flag*:

    - ``flag == 1`` (default): a list of POS-tagged tokens, as produced by
      ``nltk.pos_tag``;
    - ``flag == 2``: a single string, the tokens joined by single spaces;
    - any other value: the plain list of tokens.
    """
    # Sentence segmentation first, then word tokenization per sentence
    tokenized = [nltk.word_tokenize(sentence) for sentence in sent_tokenize(text)]

    if flag == 1:
        return [nltk.pos_tag(tokens) for tokens in tokenized]
    if flag == 2:
        return [" ".join(tokens) for tokens in tokenized]
    return tokenized


In [31]:
Michelle = """Hi! I'm Michelle and I'm 22.
I really,really like this guy. He's 27 and everything  I like in a guy. We have so much in common.
We met around three and a half months ago. A week after we met, he texted me and we didn't stop talking for a whole month and a half. We talked day and night, sometimes 'til four in the morning.
Then, he started ignoring me. When that started to  happen, a red flag went up in my head, so I started ignoring him, too. Except I started missing him.
Before I started a new semester, I asked him what was the point of saving my number if he wasn't going to ask me out. (Yes, we haven't gone out on a date yet. We've talked about it, but he doesn't make it happen.)
I told him I wasn't going to have enough time for him, and if he really wanted to go out with me, he should make it happen soon rather than later.
I just don't understand why he hasn't asked me out yet. He gives me the money excuse, or the "every time I want to, something else comes up" excuse.
If he wants to see me he should've done so already... right?"""


In [32]:
# generate POS-tagged list of sentences

MichelleSentenceTagged = preprocessText(Michelle)
for s in MichelleSentenceTagged:
    print(s)


[('Hi', 'NN'), ('!', '.')]
[('I', 'PRP'), ("'m", 'VBP'), ('Michelle', 'NNP'), ('and', 'CC'), ('I', 'PRP'), ("'m", 'VBP'), ('22', 'CD'), ('.', '.')]
[('I', 'PRP'), ('really', 'RB'), (',', ','), ('really', 'RB'), ('like', 'IN'), ('this', 'DT'), ('guy', 'NN'), ('.', '.')]
[('He', 'PRP'), ("'s", 'VBZ'), ('27', 'CD'), ('and', 'CC'), ('everything', 'NN'), ('I', 'PRP'), ('like', 'VBP'), ('in', 'IN'), ('a', 'DT'), ('guy', 'NN'), ('.', '.')]
[('We', 'PRP'), ('have', 'VBP'), ('so', 'RB'), ('much', 'JJ'), ('in', 'IN'), ('common', 'JJ'), ('.', '.')]
[('We', 'PRP'), ('met', 'VBD'), ('around', 'IN'), ('three', 'CD'), ('and', 'CC'), ('a', 'DT'), ('half', 'JJ'), ('months', 'NNS'), ('ago', 'RB'), ('.', '.')]
[('A', 'DT'), ('week', 'NN'), ('after', 'IN'), ('we', 'PRP'), ('met', 'VBD'), (',', ','), ('he', 'PRP'), ('texted', 'VBD'), ('me', 'PRP'), ('and', 'CC'), ('we', 'PRP'), ('did', 'VBD'), ("n't", 'RB'), ('stop', 'VB'), ('talking', 'VBG'), ('for', 'IN'), ('a', 'DT'), ('whole', 'JJ'), ('month', 'NN'), (

# NLTK  Named Entity Recognition
https://www.nltk.org/book_1ed/ch07.html

In [33]:
# NER ne_chunk
#MichelleChunked = [nltk.ne_chunk(sent) for sent in MichelleSentenceTagged]
#MichelleChunked[:2]


# Process text
michelle_sentences_tagged = preprocessText(Michelle)

# Named Entity Recognition (NER)
michelle_chunked = [ne_chunk(sent) for sent in michelle_sentences_tagged]

# Print first two NER chunks
for tree in michelle_chunked[:2]:
    print(tree)


(S (GPE Hi/NN) !/.)
(S I/PRP 'm/VBP (PERSON Michelle/NNP) and/CC I/PRP 'm/VBP 22/CD ./.)


In [34]:
michelle_chunked = [ne_chunk(sent) for sent in michelle_sentences_tagged]

In [35]:
#michelle_sentences_tagged 
#ne_chunk(michelle_sentences_tagged[1])
# List the named-entity subtrees found in the first two chunked sentences
from nltk.tree import Tree
for i, tree in enumerate(michelle_chunked[:2], start=1):
    print(f"\nNamed Entities in Sentence {i}:")
    for subtree in tree:
        if not isinstance(subtree, Tree):
            continue  # plain (word, tag) leaf, not an entity chunk
        print("  ", subtree)


Named Entities in Sentence 1:
   (GPE Hi/NN)

Named Entities in Sentence 2:
   (PERSON Michelle/NNP)


In [36]:
#for tree in michelle_chunked:
    #print(tree)


In [37]:
#for tree in michelle_chunked:
    #for chunk in tree:
        #if hasattr(chunk, "label"):
            #print(chunk.label(), str(chunk)) 
        #else : print("\t", chunk)
    #print("--------")
            

### NLTK IOB tags
- B: begin
- I: inside
- O: outside


In [38]:

# Second Michelle sentence (index 1) as an NE-chunked tree
tree = michelle_chunked[1]
print("Tree:", tree)

# Flatten the tree into (word, POS tag, IOB tag) triples
iob_tags = tree2conlltags(tree)
print("\nIOB:", iob_tags)

# Reverse operation: rebuild the chunk tree from the IOB triples
conll = conlltags2tree(iob_tags)
print("Tree:", conll)


Tree: (S I/PRP 'm/VBP (PERSON Michelle/NNP) and/CC I/PRP 'm/VBP 22/CD ./.)

IOB: [('I', 'PRP', 'O'), ("'m", 'VBP', 'O'), ('Michelle', 'NNP', 'B-PERSON'), ('and', 'CC', 'O'), ('I', 'PRP', 'O'), ("'m", 'VBP', 'O'), ('22', 'CD', 'O'), ('.', '.', 'O')]
Tree: (S I/PRP 'm/VBP (PERSON Michelle/NNP) and/CC I/PRP 'm/VBP 22/CD ./.)


## Load Books

In [39]:
# import books from NLTK (text1 ... text9)
import re
import nltk
from nltk.book import *


*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [40]:
# Join the tokens of nltk.book.text1 into one space-separated string
text = ' '.join(nltk.book.text1)

# Sentence segmentation, tokenization and POS tagging
text1Sents = preprocessText(text)

# Show the first two tagged sentences, each followed by a blank line
for s in range(2):
    print(text1Sents[s], "\n")

[('[', 'JJ'), ('Moby', 'NNP'), ('Dick', 'NNP'), ('by', 'IN'), ('Herman', 'NNP'), ('Melville', 'NNP'), ('1851', 'CD'), (']', 'NNP'), ('ETYMOLOGY', 'NNP'), ('.', '.')] 

[('(', '('), ('Supplied', 'VBN'), ('by', 'IN'), ('a', 'DT'), ('Late', 'JJ'), ('Consumptive', 'NNP'), ('Usher', 'NNP'), ('to', 'TO'), ('a', 'DT'), ('Grammar', 'NNP'), ('School', 'NNP'), (')', ')'), ('The', 'DT'), ('pale', 'NN'), ('Usher', 'NNP'), ('--', ':'), ('threadbare', 'NN'), ('in', 'IN'), ('coat', 'NN'), (',', ','), ('heart', 'NN'), (',', ','), ('body', 'NN'), (',', ','), ('and', 'CC'), ('brain', 'NN'), (';', ':'), ('I', 'PRP'), ('see', 'VBP'), ('him', 'PRP'), ('now', 'RB'), ('.', '.')] 



In [41]:

# Create the named entity chunks: chunked_sentences
#text1Chunked = [nltk.ne_chunk(s) for s in text1Sents]
#print("chunked")

# dictionary to store NEs
#NE_NLTK_text1 = {}

# extract NEs tags
#for sent in text1Chunked:
    #for chunk in sent:
        #if hasattr(chunk, "label"):
            #z = str(chunk) # to make the chunk idexable
            #NE_NLTK_text1.setdefault(z, 0)
            #NE_NLTK_text1[z] += 1
# Run the NE chunker over every POS-tagged sentence of text1
text1Chunked = [ne_chunk(sent) for sent in text1Sents]
print("Chunked Sentences Processed")

# Frequency table keyed by "<entity words> (<label>)"
NE_NLTK_text1 = {}

for sent in text1Chunked:
    for chunk in sent:
        # Plain (word, tag) leaves are not entities; only subtrees are
        if not isinstance(chunk, nltk.Tree):
            continue
        entity = " ".join(leaf[0] for leaf in chunk)
        label = chunk.label()
        entity_label = f"{entity} ({label})"
        NE_NLTK_text1.setdefault(entity_label, 0)
        NE_NLTK_text1[entity_label] += 1

# Dump the whole table
print("Extracted Named Entities:")
print(NE_NLTK_text1)

Chunked Sentences Processed
Extracted Named Entities:
{'Herman Melville (PERSON)': 1, 'Late Consumptive (ORGANIZATION)': 1, 'Grammar School (ORGANIZATION)': 1, 'HACKLUYT (ORGANIZATION)': 1, 'Dan (PERSON)': 1, 'Dan (GPE)': 1, 'HVALT (GPE)': 2, 'Dut (ORGANIZATION)': 1, 'Ger (GPE)': 1, 'WALLEN (GPE)': 1, 'WALW (ORGANIZATION)': 1, 'IAN (GPE)': 1, 'RICHARDSON (ORGANIZATION)': 1, 'GREEK (GPE)': 1, 'CETUS (GPE)': 1, 'LATIN (ORGANIZATION)': 1, 'WHOEL (GPE)': 1, 'ANGLO (ORGANIZATION)': 1, 'SAXON (ORGANIZATION)': 1, 'DANISH (GPE)': 2, 'DUTCH (GPE)': 2, 'HWAL (GPE)': 1, 'SWEDISH (GPE)': 1, 'WHALE (GPE)': 2, 'ICELANDIC (ORGANIZATION)': 1, 'ENGLISH (GPE)': 3, 'BALEINE (GPE)': 1, 'FRENCH (GPE)': 4, 'BALLENA (GPE)': 1, 'SPANISH (GPE)': 4, 'PEKEE (GPE)': 2, 'NUEE (ORGANIZATION)': 4, 'FEGEE (ORGANIZATION)': 1, 'ERROMANGOAN (ORGANIZATION)': 1, 'EXTRACTS (GPE)': 2, 'Sub (ORGANIZATION)': 3, 'Sub (GPE)': 1, 'Librarian (GPE)': 1, 'Sub (PERSON)': 3, 'Leviathan (GPE)': 6, 'Pale Sherry (PERSON)': 1, 'Subs (PER

In [None]:
# Print the entities from most to least frequent: count, tab, entity
for w, count in sorted(NE_NLTK_text1.items(), key=lambda item: item[1], reverse=True):
    print(count, "\t", w)

1 	 (PERSON Amittai/NNP)
1 	 (ORGANIZATION Captains/NNP)
1 	 (PERSON Cadiz/NNP)
1 	 (GPE Cadiz/NNP)
1 	 (ORGANIZATION Gibraltar/NNP)
1 	 (GPE Miserable/JJ)
1 	 (GPE Tarshish/JJ)
1 	 (GPE Strong/JJ)
1 	 (PERSON Jack/NNP)
1 	 (PERSON Joe/NNP)
1 	 (GPE Sodom/NNP)
1 	 (ORGANIZATION Customs/NNP)
1 	 (GPE Tarshish/VB)
1 	 (PERSON Captain/NNP)
1 	 (GPE Point/NN)
1 	 (GPE Wave/NNP)
1 	 (GPE Whence/NNP)
1 	 (GPE Hebrew/NNP)
1 	 (PERSON O/NNP Jonah/NNP)
1 	 (ORGANIZATION Lord/NNP God/NNP)
1 	 (PERSON Sin/NNP)
1 	 (ORGANIZATION Book/NNP)
1 	 (ORGANIZATION Joppa/NNP)
1 	 (ORGANIZATION Almighty/NNP)
1 	 (GPE Falsehood/NN)
1 	 (GPE Gospel/NNP)
1 	 (PERSON Pilot/NN Paul/NNP)
1 	 (ORGANIZATION Senators/NNS)
1 	 (GPE Judges/NNP)
1 	 (ORGANIZATION Ages/NNP)
1 	 (PERSON Thy/NNP)
1 	 (PERSON Thine/NNP)
1 	 (GPE Thee/NNP)
1 	 (GPE Savage/NN)
1 	 (ORGANIZATION General/NNP Washington/NNP)
1 	 (PERSON George/NNP Washington/NNP)
1 	 (ORGANIZATION Socratic/JJ)
1 	 (GPE Cape/NNP Horn/NNP)
1 	 (ORGANIZATION Tomah

1 	 (PERSON O/NNP Timor/NNP Tom/NNP)
1 	 (GPE Ombay/NNP)
1 	 (PERSON O/NNP New/NNP Zealand/NNP Jack/NNP)
1 	 (ORGANIZATION Tattoo/NNP Land/NNP)
1 	 (PERSON O/NNP Morquan/NNP)
1 	 (PERSON O/NNP Don/NNP Miguel/NNP)
1 	 (ORGANIZATION Cetacean/NNP History/NNP)
1 	 (PERSON Marius/NNP)
1 	 (PERSON Sylla/NNP)
1 	 (ORGANIZATION Zealand/NNP Tom/NNP)
1 	 (PERSON Don/NNP Miguel/NNP)
1 	 (ORGANIZATION Narragansett/NNP Woods/NNP)
1 	 (PERSON Captain/NNP Butler/NNP)
1 	 (PERSON Annawon/NNP)
1 	 (PERSON Essex/NNP)
1 	 (ORGANIZATION Pacific/NNP Ocean/NNP)
1 	 (PERSON Owen/NNP Chace/NNP)
1 	 (GPE Again/NN)
1 	 (ORGANIZATION HORRID/NNP)
1 	 (ORGANIZATION MYSTERIOUS/NNP)
1 	 (ORGANIZATION MORTAL/NNP)
1 	 (ORGANIZATION ANIMAL/NNP)
1 	 (PERSON Union/NNP)
1 	 (GPE Oahu/NNP)
1 	 (PERSON Sandwich/NNP Islands/NNP)
1 	 (ORGANIZATION Valparaiso/NNP)
1 	 (PERSON Saul/NNP)
1 	 (PERSON Tarsus/NNP)
1 	 (PERSON Langsdorff/NNP)
1 	 (ORGANIZATION Admiral/NNP Krusenstern/NNP)
1 	 (ORGANIZATION Langsdorff/NNP)
1 	 (GPE O

1 	 (PERSON Niagara/NNP)
1 	 (PERSON Table/NNP)
1 	 (GPE Rock/NN)
1 	 (GPE Drawn/NNP)
1 	 (GPE Midwifery/NN)
1 	 (ORGANIZATION Prairie/NNP)
1 	 (ORGANIZATION Phrenologist/NN)
1 	 (ORGANIZATION Rock/NNP)
1 	 (GPE Gibraltar/NNP)
1 	 (ORGANIZATION Gall/NNP)
1 	 (ORGANIZATION Pantheon/NNP)
1 	 (PERSON Gall/NNP)
1 	 (PERSON Spurzheim/NNP)
1 	 (PERSON Phidias/NNP)
1 	 (PERSON Shakespeare/NNP)
1 	 (PERSON Melancthon/NNP)
1 	 (ORGANIZATION Deity/NNP)
1 	 (GPE Genius/NNP)
1 	 (ORGANIZATION Egypt/NNP)
1 	 (GPE Physiognomy/NNP)
1 	 (PERSON Sir/NNP William/NNP Jones/NNP)
1 	 (ORGANIZATION Nut/NNP)
1 	 (GPE Sphinx/NNP)
1 	 (GPE Quebec/NNP)
1 	 (ORGANIZATION Pequod/NNP Meets/NNPS The/DT Virgin/NNP)
1 	 (PERSON Derick/NNP De/NNP Deer/NNP)
1 	 (GPE Yarman/NNP)
1 	 (ORGANIZATION Yarman/NNP)
1 	 (GPE Newcastle/NNP)
1 	 (PERSON Captain/NNP Derick/NNP De/NNP Deer/NNP)
1 	 (GPE Full/NN)
1 	 (GPE Adverse/JJ)
1 	 (GPE Hindostan/NNP)
1 	 (GPE Dog/NN)
1 	 (GPE Are/NNP)
1 	 (GPE Halloo/NNP)
1 	 (ORGANIZATION DO

1 	 (PERSON Invisible/JJ)
1 	 (PERSON Miriam/NNP)
1 	 (GPE Guinea/NNP)
1 	 (GPE Behold/NNP)
1 	 (GPE Wife/NNP)
1 	 (GPE Sleep/NN)
1 	 (ORGANIZATION Mate/NNP)
1 	 (GPE Luff/NNP)
1 	 (GPE Turkish/NNP)
1 	 (GPE Europa/NNP)
1 	 (GPE Crete/NNP)
1 	 (ORGANIZATION Natural/NNP Bridge/NNP)
1 	 (ORGANIZATION Channel/NNP)
1 	 (PERSON Antiochus/NNP)
1 	 (ORGANIZATION Maccabees/NNP)
1 	 (GPE Moby/NNP)
1 	 (GPE Man/NN)
1 	 (GPE Groan/NNP)
1 	 (GPE Cold/NNP)
1 	 (GPE Caught/NNP)
1 	 (GPE Accursed/JJ)
1 	 (ORGANIZATION Parsee/NN)
1 	 (GPE Gone/NN)
1 	 (PERSON Quick/JJ)
1 	 (GPE Jesus/NNP)
1 	 (PERSON Impiety/NNP)
1 	 (GPE Twas/NNP)
1 	 (GPE Believe/NNP)
1 	 (PERSON How/NNP)
1 	 (GPE Vesuvius/NNP)
1 	 (PERSON Trade/NNP Winds/NNP)
1 	 (PERSON Leeward/NN)
1 	 (PERSON O/NNP Parsee/NNP)
1 	 (GPE Future/JJ)
1 	 (GPE Strangest/NNP)
1 	 (GPE Feel/NNP)
1 	 (GPE Crushed/NNP)
1 	 (GPE Pull/NN)
1 	 (PERSON Monadnock/NNP)
1 	 (GPE Burst/NNP)
1 	 (GPE Slope/NN)
1 	 (GPE Dash/NNP)
1 	 (PERSON O/NNP Ahab/NNP)
1 	 (GP

# Named entity recognition with spacy

In [68]:
!python3 -m spacy download en_core_web_lg -t .
#import en_core_web_sm

#nlp = en_core_web_sm.load()

[33mDEPRECATION: Loading egg at /usr/local/lib/python3.12/dist-packages/copkmeans-1.5-py3.12.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [69]:
import spacy
from spacy.lang.en.examples import sentences 
from spacy import displacy

nlp_spacy = spacy.load("en_core_web_lg")

In [None]:
# pre-process by converting each sentence into a tokenized string
preprocessText(Michelle, 2)


In [None]:
# Run the spaCy pipeline once over the whole Michelle text
doc_spacy = nlp_spacy(Michelle)

print("Sentence:", doc_spacy.text, "\n\nSpacy Analysis:")
# One line per token: text, POS tag, dependency label
for token in doc_spacy:
    print("\t" + "\t".join((token.text, token.pos_, token.dep_)))

print("Entities\n")
# One line per entity: text (padded to 15 columns), character span, label
for ent in doc_spacy.ents:
    print(f"\t{ent.text:<15}\t{ent.start_char}\t{ent.end_char}\t{ent.label_}")

In [None]:
# load data
sentence = "Apple is looking at buying U.K. startup for $1 billion"
doc_spacy2 = nlp_spacy(sentence)

# print entities
for ent in doc_spacy2.ents:
    print(f"\t{ent.text:<15}\t{ent.start_char}\t{ent.end_char}\t{ent.label_}")

In [None]:
# Process the entire Michelle text in one go
doc_spacy3 = nlp_spacy(Michelle)

# Nested tally: entity label -> entity text -> number of occurrences
NEs = {}
for ent in doc_spacy3.ents:
    per_label = NEs.setdefault(ent.label_, {})
    per_label[ent.text] = per_label.get(ent.text, 0) + 1

# One line per (label, text) pair: count, label, text
for typ, per_label in NEs.items():
    for txt, n in per_label.items():
        print(f'\t{n}\t{typ:<10}\t{txt}')

In [None]:
# task:
# prepare text1 with preprocessText(' '.join(text1), 2)
# loop through the list of sentences
# create a doc object per sentence
# extract the entities
# count entities and plot by order of frequency



# Named entity recognition with stanza

In [None]:
import stanza

nlp_stanza = stanza.Pipeline(lang='en', processors='tokenize,ner')

In [None]:

# Run the stanza pipeline on the second Michelle sentence (index 1),
# passed as a single space-joined token string (flag 2)
doc_stanza1 = nlp_stanza(preprocessText(Michelle, 2)[1])

# One line per recognized entity: its text and type
print(*[f'entity: {ent.text}\ttype: {ent.type}' for ent in doc_stanza1.ents], sep='\n')

# List the tokens of each sentence in the stanza document
for i, sentence in enumerate(doc_stanza1.sentences):
    print(f'====== Sentence {i+1} tokens =======')
    print(*[f'id: {token.id}\ttext: {token.text}' for token in sentence.tokens], sep='\n')


In [None]:
# Slow: the stanza pipeline is run once per sentence,
# so only the first 50 sentences are processed

# text1 as a list of space-joined sentence strings
text1string = preprocessText(' '.join(text1), 2)

# Nested tally: entity type -> entity text -> number of occurrences
H = {}
for s in text1string[:50]:
    doc_stanza2 = nlp_stanza(s)
    for ent in doc_stanza2.ents:
        per_type = H.setdefault(str(ent.type), {})
        ent_text = str(ent.text)
        per_type[ent_text] = per_type.get(ent_text, 0) + 1

# One line per (type, text) pair: count, type, text
for typ, per_type in H.items():
    for txt, n in per_type.items():
        print(f'\t{n}\t{typ}\t{txt}')