Find out how many different persons are in the **Hamlet corpus**.  
How many if you use the 3, 4 and 7-classes tagger?

In [1]:
import nltk, re
hamlet = nltk.corpus.gutenberg.words('shakespeare-hamlet.txt')

In [2]:
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

jar_location = r"C:\Users\flexi\Documents\stanford-ner-2020-11-17\stanford-ner.jar"
model_location_3classes = r"C:\Users\flexi\Documents\stanford-ner-2020-11-17\classifiers\english.all.3class.distsim.crf.ser.gz"
model_location_4classes = r"C:\Users\flexi\Documents\stanford-ner-2020-11-17\classifiers\english.conll.4class.distsim.crf.ser.gz"
model_location_7classes = r"C:\Users\flexi\Documents\stanford-ner-2020-11-17\classifiers\english.muc.7class.distsim.crf.ser.gz"
st3 = StanfordNERTagger(model_location_3classes,jar_location,encoding='utf-8')
st4 = StanfordNERTagger(model_location_4classes,jar_location,encoding='utf-8')
st7 = StanfordNERTagger(model_location_7classes,jar_location,encoding='utf-8')

print(st3)
print(st4)
print(st7)

<nltk.tag.stanford.StanfordNERTagger object at 0x000001FF2E451D50>
<nltk.tag.stanford.StanfordNERTagger object at 0x000001FF2A6AD990>
<nltk.tag.stanford.StanfordNERTagger object at 0x000001FF2E4505D0>


In [3]:
import os

# Manually set JAVAHOME inside Jupyter
os.environ["JAVAHOME"] = r"C:\Program Files\Eclipse Adoptium\jdk-11.0.25.9-hotspot"

# Verify if JAVAHOME is set correctly
print("JAVAHOME:", os.environ.get("JAVAHOME"))

# Check if Java is accessible
!java -version

JAVAHOME: C:\Program Files\Eclipse Adoptium\jdk-11.0.25.9-hotspot


openjdk version "11.0.25" 2024-10-15
OpenJDK Runtime Environment Temurin-11.0.25+9 (build 11.0.25+9)
OpenJDK 64-Bit Server VM Temurin-11.0.25+9 (build 11.0.25+9, mixed mode)


In [4]:
hamlet_ner3 = st3.tag(hamlet)
hamlet_ner4 = st4.tag(hamlet)
hamlet_ner7 = st7.tag(hamlet)

print(hamlet_ner7[:50])

[('[', 'O'), ('The', 'O'), ('Tragedie', 'O'), ('of', 'O'), ('Hamlet', 'O'), ('by', 'O'), ('William', 'PERSON'), ('Shakespeare', 'PERSON'), ('1599', 'O'), (']', 'O'), ('Actus', 'O'), ('Primus', 'O'), ('.', 'O'), ('Scoena', 'O'), ('Prima', 'O'), ('.', 'O'), ('Enter', 'O'), ('Barnardo', 'PERSON'), ('and', 'O'), ('Francisco', 'O'), ('two', 'O'), ('Centinels', 'O'), ('.', 'O'), ('Barnardo', 'O'), ('.', 'O'), ('Who', 'O'), ("'", 'O'), ('s', 'O'), ('there', 'O'), ('?', 'O'), ('Fran', 'O'), ('.', 'O'), ('Nay', 'O'), ('answer', 'O'), ('me', 'O'), (':', 'O'), ('Stand', 'O'), ('&', 'O'), ('vnfold', 'O'), ('your', 'O'), ('selfe', 'O'), ('Bar', 'O'), ('.', 'O'), ('Long', 'O'), ('liue', 'O'), ('the', 'O'), ('King', 'ORGANIZATION'), ('Fran', 'ORGANIZATION'), ('.', 'O'), ('Barnardo', 'O')]


### Calculate non-unique persons per class tagger

In [5]:
persons_ner3 = sum(1 for word, label in hamlet_ner3 if label == 'PERSON')
persons_ner4 = sum(1 for word, label in hamlet_ner4 if label == 'PERSON')
persons_ner7 = sum(1 for word, label in hamlet_ner7 if label == 'PERSON')

In [6]:
print(f"Number of persons appear in hamlet text using 3-class tagger NER are: {persons_ner3}")
print(f"Number of persons appear in hamlet text using 4-class tagger NER are: {persons_ner4}")
print(f"Number of persons appear in hamlet text using 7-class tagger NER are: {persons_ner7}")

Number of persons appear in hamlet text using 3-class tagger NER are: 846
Number of persons appear in hamlet text using 4-class tagger NER are: 1063
Number of persons appear in hamlet text using 7-class tagger NER are: 375


### Calculate unique persons per class tagger

In [7]:
unique_persons_ner3 = {word for word, label in hamlet_ner3 if label == 'PERSON'}
unique_persons_ner4 = {word for word, label in hamlet_ner4 if label == 'PERSON'}
unique_persons_ner7 = {word for word, label in hamlet_ner7 if label == 'PERSON'}

In [9]:
print(f"Number of unique persons appear in hamlet text using 3-class tagger NER are: {len(unique_persons_ner3)}")
print(f"Number of unique persons appear in hamlet text using 4-class tagger NER are: {len(unique_persons_ner4)}")
print(f"Number of unique persons appear in hamlet text using 7-class tagger NER are: {len(unique_persons_ner7)}")

Number of unique persons appear in hamlet text using 3-class tagger NER are: 309
Number of unique persons appear in hamlet text using 4-class tagger NER are: 498
Number of unique persons appear in hamlet text using 7-class tagger NER are: 226
