## Part-of-Speech (POS) Tagging and Named Entity Recognition (NER)

## Practice lab exercises

### 1. POS tagging using NLTK

In [10]:
import nltk

# Fetch the tokenizer and tagger models this cell needs.
# NLTK >= 3.9 (the release line that introduced the
# "averaged_perceptron_tagger_eng" name used below) loads the Punkt sentence
# tokenizer from "punkt_tab", so "punkt" alone is not enough on a fresh
# machine. "punkt" is still downloaded for older NLTK releases.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

from nltk.tokenize import word_tokenize

text = "Natural Language Processing is very interesting."

# Split the sentence into word/punctuation tokens, then tag each token.
tokens = word_tokenize(text)
pos_tags = nltk.pos_tag(tokens)

print("Tokens with POS tags:")
for word, tag in pos_tags:
    print(f"{word}: {tag}")

Tokens with POS tags:
Natural: JJ
Language: NNP
Processing: NNP
is: VBZ
very: RB
interesting: JJ
.: .


[nltk_data] Downloading package punkt to /home/aryaniyaps/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/aryaniyaps/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


### 2. POS tagging using SpaCy

In [11]:
import spacy

# Load the `en_core_web_sm` pipeline once; calling it on a string returns a
# Doc whose tokens carry a `pos_` label.
nlp = spacy.load("en_core_web_sm")

text = "Natural Language Processing is very interesting."
doc = nlp(text)

print("Tokens with POS tags: ")
for token in doc:
    # Same layout as print(a, ": ", b): one space before the colon, two after.
    print(f"{token.text} :  {token.pos_}")

Tokens with POS tags: 
Natural :  PROPN
Language :  PROPN
Processing :  NOUN
is :  AUX
very :  ADV
interesting :  ADJ
. :  PUNCT


### 3. NER program using NLTK

In [12]:
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk

# Download every resource this cell uses so it does not depend on an earlier
# cell having fetched them. NLTK >= 3.9 looks for the "punkt_tab",
# "averaged_perceptron_tagger_eng" and "maxent_ne_chunker_tab" packages; the
# legacy names are kept so the cell also works on older NLTK releases.
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("averaged_perceptron_tagger")
nltk.download("averaged_perceptron_tagger_eng")
nltk.download("maxent_ne_chunker")
nltk.download("maxent_ne_chunker_tab")
nltk.download("words")

# Named `text` (not `input`) so the built-in input() is not shadowed for the
# rest of the kernel session.
text = "A P J Abdul Kalam was born in Rameswaram and worked at ISRO."

# Pipeline: tokenize -> POS-tag -> chunk named entities into a tree.
tokens = word_tokenize(text)
pos_tags = pos_tag(tokens)
ner_tree = ne_chunk(pos_tags)

print(ner_tree)

[nltk_data] Downloading package punkt to /home/aryaniyaps/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/aryaniyaps/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/aryaniyaps/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /home/aryaniyaps/nltk_data...
[nltk_data]   Package maxent_ne_chunker_tab is already up-to-date!
[nltk_data] Downloading package words to /home/aryaniyaps/nltk_data...
[nltk_data]   Package words is already up-to-date!


(S
  A/DT
  P/NNP
  J/NNP
  (PERSON Abdul/NNP Kalam/NNP)
  was/VBD
  born/VBN
  in/IN
  (GPE Rameswaram/NNP)
  and/CC
  worked/VBD
  at/IN
  (ORGANIZATION ISRO/NNP)
  ./.)


### 4. NER using SpaCy

In [13]:
import spacy

# Load the `en_core_web_sm` pipeline; entities found in a processed text are
# exposed through `doc.ents`.
nlp = spacy.load("en_core_web_sm")

text = "Dr. A P J Abdul Kalam was born in Rameswaram and worked at ISRO."

doc = nlp(text)

print("Named entities in the text:")
for ent in doc.ents:
    # One line per entity: surface text, an arrow, then the entity label.
    print(f"{ent.text} -> {ent.label_}")

Named entities in the text:
Rameswaram -> GPE
ISRO -> ORG


### 5. NER using sklearn-crfsuite

In [14]:
import sklearn_crfsuite

# Training corpus: a list of sentences, each a list of (word, POS tag, BIO label)
# triples.
#
# "A" is the first initial of the person name "A P J Abdul Kalam", so it opens
# the PER span (B-PER) and "P" continues it (I-PER). Its POS tag stays "DT"
# because that is what the NLTK tagger assigns to this token in the NER cell
# above, i.e. what the model would see as a feature at prediction time.
train_data = [
    [
        ("A", "DT", "B-PER"),
        ("P", "NNP", "I-PER"),
        ("J", "NNP", "I-PER"),
        ("Abdul", "NNP", "I-PER"),
        ("Kalam", "NNP", "I-PER"),
        ("was", "VBD", "O"),
        ("born", "VBN", "O"),
        ("in", "IN", "O"),
        ("Rameswaram", "NNP", "B-LOC"),
    ]
]

def word2features(sent, i):
    """Build the CRF feature dict for the token at position ``i`` of ``sent``.

    Args:
        sent: Sequence of token tuples whose first two items are the word
            and its POS tag (any further items, such as the label, are ignored).
        i: Index of the token to featurize.

    Returns:
        A dict with a constant ``"bias"``, the token's ``"pos"`` tag, and an
        ``"is_capitalized"`` flag that is True when the word starts with an
        uppercase character.
    """
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        "bias": 1.0,
        "pos": postag,
        # Slice instead of indexing so an empty token yields False rather
        # than raising IndexError.
        "is_capitalized": word[:1].isupper(),
    }
    return features

# One feature-dict sequence per training sentence, in token order.
X_train = [
    [word2features(sentence_tokens, position) for position in range(len(sentence_tokens))]
    for sentence_tokens in train_data
]

# The matching label sequence for each sentence (third item of every triple).
y_train = [
    [bio_label for (_, _, bio_label) in sentence_tokens]
    for sentence_tokens in train_data
]

# Default-configured CRF model.
crf = sklearn_crfsuite.CRF()

# Kept as the last expression so the notebook displays the fitted estimator.
crf.fit(X_train, y_train)

0,1,2
,algorithm,
,min_freq,
,all_possible_states,
,all_possible_transitions,
,c1,
,c2,
,max_iterations,
,num_memories,
,epsilon,
,period,


## Part B

### Problem 1: Sentence-Level POS Tagging using NLTK
Write a Python program using NLTK to perform Part-of-Speech tagging on a set of user-defined sentences.

In [15]:
import nltk
from nltk.tokenize import word_tokenize

# NLTK >= 3.9 (implied by the "averaged_perceptron_tagger_eng" name) loads the
# tokenizer from "punkt_tab"; "punkt" is kept for older releases.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)

# Define sentences with different structures
sentences = [
    "Natural Language Processing is fascinating.",
    "What is the capital of France?",
    "I went to the store, and I bought apples, oranges, and bananas.",
    "She didn't know that he had already left!",
    "The quick brown fox jumps over the lazy dog."
]

# Print one two-column table (token, tag) per sentence, numbered from 1.
for i, sentence in enumerate(sentences, 1):
    print(f"\nSentence {i}: {sentence}")
    tokens = word_tokenize(sentence)
    pos_tags = nltk.pos_tag(tokens)
    print(f"{'Token':<20} {'Tag':<10}")
    for word, tag in pos_tags:
        print(f"{word:<20} {tag:<10}")


Sentence 1: Natural Language Processing is fascinating.
Token                Tag       
Natural              JJ        
Language             NNP       
Processing           NNP       
is                   VBZ       
fascinating          VBG       
.                    .         

Sentence 2: What is the capital of France?
Token                Tag       
What                 WP        
is                   VBZ       
the                  DT        
capital              NN        
of                   IN        
France               NNP       
?                    .         

Sentence 3: I went to the store, and I bought apples, oranges, and bananas.
Token                Tag       
I                    PRP       
went                 VBD       
to                   TO        
the                  DT        
store                NN        
,                    ,         
and                  CC        
I                    PRP       
bought               VBD       
apples               N

### Problem 2: Comparative POS Tagging using NLTK and spaCy
Develop a Python program that performs POS tagging on the same input sentence using both NLTK and spaCy.

In [None]:
from itertools import zip_longest

import nltk
import spacy
from nltk.tokenize import word_tokenize

# NLTK >= 3.9 (implied by the "averaged_perceptron_tagger_eng" name) loads the
# tokenizer from "punkt_tab"; "punkt" is kept for older releases.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)

nlp = spacy.load("en_core_web_sm")
text = "Dr. Smith hasn't been to the beautiful university since last Monday."

print(f"Text: {text}\n")

# NLTK tagging
tokens_nltk = word_tokenize(text)
pos_tags_nltk = nltk.pos_tag(tokens_nltk)

# spaCy tagging
doc_spacy = nlp(text)
pos_tags_spacy = [(token.text, token.pos_, token.tag_) for token in doc_spacy]

# Placeholder shown when one tokenizer produced fewer tokens than the other.
MISSING = "-"

print(f"{'Token':<20} {'NLTK':<12} {'spaCy':<12}")
# Rows are paired by position. zip_longest keeps the surplus tokens of the
# longer sequence instead of silently dropping them, so a tokenization
# mismatch is visible in the table.
for nltk_item, spacy_item in zip_longest(pos_tags_nltk, pos_tags_spacy):
    word_nltk, tag_nltk = nltk_item if nltk_item is not None else (None, MISSING)
    word_spacy, pos_spacy, _ = (
        spacy_item if spacy_item is not None else (None, MISSING, MISSING)
    )
    # Prefer the NLTK token for the first column; fall back to spaCy's when
    # NLTK has no token at this position.
    word = word_nltk if word_nltk is not None else word_spacy
    print(f"{word:<20} {tag_nltk:<12} {pos_spacy:<12}")

Text: Dr. Smith hasn't been to the beautiful university since last Monday.

Token                NLTK         spaCy       
Dr.                  NNP          PROPN       
Smith                NNP          PROPN       
has                  VBZ          AUX         
n't                  RB           PART        
been                 VBN          AUX         
to                   TO           ADP         
the                  DT           DET         
beautiful            JJ           ADJ         
university           NN           NOUN        
since                IN           SCONJ       
last                 JJ           ADJ         
Monday               NNP          PROPN       
.                    .            PUNCT       

Key differences:
- NLTK uses Penn Treebank tags (NNP, VBD, JJ)
- spaCy uses Universal Dependencies tags (PROPN, VERB, ADJ)
- spaCy handles contractions and tokenization differently


### Problem 3: POS Tag Distribution Analysis
Write a program to perform POS tagging on a paragraph of text using NLTK and compute the frequency distribution of different POS tags.

In [17]:
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter

# NLTK >= 3.9 (implied by the "averaged_perceptron_tagger_eng" name) loads the
# tokenizer from "punkt_tab"; "punkt" is kept for older releases.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)


def count_tags_with_prefix(tag_counts, prefix):
    """Return the summed count of every tag in ``tag_counts`` starting with ``prefix``.

    Args:
        tag_counts: Mapping of POS tag -> number of occurrences.
        prefix: Tag prefix that selects a category (e.g. ``'NN'`` matches
            ``NN``, ``NNS`` and ``NNP``).
    """
    return sum(count for tag, count in tag_counts.items() if tag.startswith(prefix))


paragraph = """
Natural Language Processing (NLP) is a fascinating field of artificial intelligence. 
It enables computers to understand, interpret, and generate human language in a valuable way. 
NLP combines computational linguistics with statistical, machine learning, and deep learning models. 
These technologies enable computers to process human language in the form of text or voice data. 
Applications of NLP include machine translation, sentiment analysis, chatbots, and speech recognition.
The field has grown tremendously in recent years due to advances in neural networks and transformers.
"""

tokens = word_tokenize(paragraph)
pos_tags = nltk.pos_tag(tokens)
tags = [tag for word, tag in pos_tags]
tag_freq = Counter(tags)
total_tokens = len(tokens)

# Per-tag table, most frequent tag first; percentages are relative to all
# tokens (punctuation included).
print(f"Total tokens: {total_tokens}\n")
print(f"{'Tag':<10} {'Count':<8} {'%':<8}")
for tag, count in tag_freq.most_common():
    percentage = (count / total_tokens) * 100
    print(f"{tag:<10} {count:<8} {percentage:>5.1f}%")

# Coarse categories: every tag sharing the prefix is folded together.
nouns = count_tags_with_prefix(tag_freq, 'NN')
verbs = count_tags_with_prefix(tag_freq, 'VB')
adjectives = count_tags_with_prefix(tag_freq, 'JJ')
adverbs = count_tags_with_prefix(tag_freq, 'RB')

print("\nCategory Distribution:")
for category_name, category_count in (
    ("Nouns", nouns),
    ("Verbs", verbs),
    ("Adjectives", adjectives),
    ("Adverbs", adverbs),
):
    print(f"{category_name}: {category_count} ({(category_count / total_tokens * 100):.1f}%)")

Total tokens: 96

Tag        Count    %       
NN         17        17.7%
JJ         13        13.5%
NNS        12        12.5%
IN         8          8.3%
,          7          7.3%
.          6          6.2%
NNP        5          5.2%
DT         5          5.2%
CC         5          5.2%
VBZ        4          4.2%
VB         4          4.2%
TO         3          3.1%
VBP        2          2.1%
(          1          1.0%
)          1          1.0%
PRP        1          1.0%
VBN        1          1.0%
RB         1          1.0%

Category Distribution:
Nouns: 34 (35.4%)
Verbs: 11 (11.5%)
Adjectives: 13 (13.5%)
Adverbs: 1 (1.0%)


### Problem 4: Comparison of NER using NLTK and spaCy
Create a comparative study by applying NER using both NLTK and spaCy on the same dataset.

In [None]:
import nltk
import spacy
from nltk import word_tokenize, pos_tag, ne_chunk

# Download every resource this cell uses so it does not depend on an earlier
# cell having fetched them. NLTK >= 3.9 looks for the "punkt_tab",
# "averaged_perceptron_tagger_eng" and "maxent_ne_chunker_tab" packages; the
# legacy names are kept so the cell also works on older NLTK releases.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)
nltk.download('maxent_ne_chunker', quiet=True)
nltk.download('maxent_ne_chunker_tab', quiet=True)
nltk.download('words', quiet=True)

nlp = spacy.load("en_core_web_sm")

text = """
Apple Inc. was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne in Cupertino, California.
The company released the iPhone in 2007, revolutionizing the mobile phone industry.
Tim Cook became the CEO in August 2011. Apple's headquarters is located at One Apple Park Way.
The company generated over $365 billion in revenue in 2021, making it one of the world's 
most valuable companies alongside Microsoft, Amazon, and Google.
"""

# NLTK NER
tokens = word_tokenize(text)
pos_tags = pos_tag(tokens)
ner_tree = ne_chunk(pos_tags)

print("NLTK Entities:")
nltk_entities = []
for chunk in ner_tree:
    # Entity chunks are subtrees carrying a label; plain tokens are
    # (word, tag) tuples without one.
    if hasattr(chunk, 'label'):
        entity = ' '.join(c[0] for c in chunk)
        entity_type = chunk.label()
        nltk_entities.append((entity, entity_type))
        print(f"  {entity} ({entity_type})")

# spaCy NER
doc = nlp(text)
print("\nspaCy Entities:")
for ent in doc.ents:
    print(f"  {ent.text} ({ent.label_})")

NLTK Entities:
  Apple (PERSON)
  Inc. (ORGANIZATION)
  Steve Jobs (PERSON)
  Steve Wozniak (PERSON)
  Ronald Wayne (PERSON)
  Cupertino (GPE)
  California (GPE)
  iPhone (ORGANIZATION)
  Tim Cook (PERSON)
  CEO (ORGANIZATION)
  Apple (PERSON)
  One Apple Park Way (FACILITY)
  Microsoft (PERSON)
  Amazon (GPE)
  Google (GPE)

spaCy Entities:
  Apple Inc. (ORG)
  Steve Jobs (PERSON)
  Steve Wozniak (PERSON)
  Ronald Wayne (PERSON)
  Cupertino (GPE)
  California (GPE)
  iPhone (ORG)
  2007 (DATE)
  Tim Cook (PERSON)
  August 2011 (DATE)
  Apple (ORG)
  One (CARDINAL)
  $365 billion (MONEY)
  2021 (DATE)
  Microsoft (ORG)
  Amazon (ORG)
  Google (ORG)

Comparison:
NLTK found 15 entities (types: PERSON, ORGANIZATION, GPE)
spaCy found 17 entities (types: PERSON, ORG, GPE, DATE, MONEY, etc.)

Key differences:
- spaCy has better boundary detection for multi-word entities
- spaCy recognizes more entity types (dates, money, products)
- NLTK is faster and more interpretable
- spaCy is more accur

### Problem 5: BIO Tagging for Named Entity Recognition
Manually annotate a given sentence using the BIO tagging scheme for named entities.

In [19]:
# BIO Tagging: B (Begin), I (Inside), O (Outside)
# Format: B-TYPE, I-TYPE, O


def extract_entities(tagged_tokens):
    """Group BIO-tagged tokens into ``(entity_text, entity_type)`` pairs.

    Args:
        tagged_tokens: Iterable of ``(word, tag)`` pairs where ``tag`` is
            ``"B-TYPE"``, ``"I-TYPE"`` or anything else (treated as outside).

    Returns:
        List of ``(text, type)`` tuples in order of appearance; the words of
        a multi-token entity are joined with single spaces.

    Notes:
        * The type is everything after the first hyphen, so hyphenated types
          such as ``WORK-OF-ART`` are kept whole.
        * An ``I-`` tag that has no open entity, or whose type differs from
          the open entity's, starts a new entity instead of being dropped or
          merged into the wrong one.
    """
    found = []
    words = []
    kind = None

    for word, tag in tagged_tokens:
        prefix, sep, tag_type = tag.partition("-")
        is_begin = bool(sep) and prefix == "B"
        is_inside = bool(sep) and prefix == "I"

        # A well-formed continuation just extends the open entity.
        if is_inside and words and tag_type == kind:
            words.append(word)
            continue

        # Anything else ends whatever entity is currently open.
        if words:
            found.append((" ".join(words), kind))
            words, kind = [], None

        # B- always opens an entity; an orphan or mismatched I- does too.
        if is_begin or is_inside:
            words, kind = [word], tag_type

    # Flush an entity that runs to the end of the sequence.
    if words:
        found.append((" ".join(words), kind))

    return found


sentence = "Dr. A P J Abdul Kalam was born in Rameswaram and worked at ISRO and NASA."

annotated_data = [
    ("Dr.", "O"),
    ("A", "B-PER"),
    ("P", "I-PER"),
    ("J", "I-PER"),
    ("Abdul", "I-PER"),
    ("Kalam", "I-PER"),
    ("was", "O"),
    ("born", "O"),
    ("in", "O"),
    ("Rameswaram", "B-LOC"),
    ("and", "O"),
    ("worked", "O"),
    ("at", "O"),
    ("ISRO", "B-ORG"),
    ("and", "O"),
    ("NASA", "B-ORG"),
    (".", "O")
]

print(f"Sentence: {sentence}\n")
print(f"{'Token':<15} {'BIO Tag':<10}")
for word, tag in annotated_data:
    print(f"{word:<15} {tag:<10}")

# Extract entities
entities = extract_entities(annotated_data)

print("\nExtracted entities:")
for entity, entity_type in entities:
    print(f"  {entity} ({entity_type})")

Sentence: Dr. A P J Abdul Kalam was born in Rameswaram and worked at ISRO and NASA.

Token           BIO Tag   
Dr.             O         
A               B-PER     
P               I-PER     
J               I-PER     
Abdul           I-PER     
Kalam           I-PER     
was             O         
born            O         
in              O         
Rameswaram      B-LOC     
and             O         
worked          O         
at              O         
ISRO            B-ORG     
and             O         
NASA            B-ORG     
.               O         

Extracted entities:
  A P J Abdul Kalam (PER)
  Rameswaram (LOC)
  ISRO (ORG)
  NASA (ORG)
