In [1]:
import nltk

In [2]:
# Tokenizer and POS-tagger resources used by sent_tokenize / word_tokenize / pos_tag.
# NLTK 3.9 renamed its resources ('punkt' -> 'punkt_tab',
# 'averaged_perceptron_tagger' -> 'averaged_perceptron_tagger_eng'), so both
# generations are fetched to keep the notebook working on either side of that release.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [32]:
# Tag legends: print the description and example words for every tag in the
# Penn Treebank (upenn) tagset, as a reference for the tags shown in later cells.
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [3]:
# Read the whole training corpus into memory as a single string.
# The encoding is pinned so the text decodes the same way on every platform instead
# of depending on the locale default (e.g. cp1252 on Windows).
# NOTE(review): assumes data/corpus.txt is UTF-8 — confirm against the file.
with open("data/corpus.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [4]:
# Split the corpus into sentences, then word-tokenize and POS-tag each sentence.
# Layout follows the sample unit 6 code: one list per sentence, each holding
# (word, tag) pairs.
sentences = nltk.sent_tokenize(text)
tagged_text = [
    nltk.pos_tag(nltk.word_tokenize(raw_sentence))
    for raw_sentence in sentences
]

In [7]:
# Preview only the first few tagged sentences; printing the whole corpus floods
# the cell output and bloats the saved notebook.
print(f"{len(tagged_text)} tagged sentences; first 3:")
for tagged_sentence in tagged_text[:3]:
    print(tagged_sentence)



In [5]:
# Organize files: add ./models to the module search path.
import sys

# Guarded so that re-running this cell does not keep appending duplicate entries.
if './models' not in sys.path:
    sys.path.append('./models')

### Traditional Model: **Multinomial Naive Bayes**

In [10]:
# Create the Multinomial Naive Bayes tagger and train it on the tagged corpus.
# The labels in tagged_text were produced by nltk.pos_tag, not by hand annotation.
from models.NaiveBayes import MultinomialNaiveBayes
NaiveBayes = MultinomialNaiveBayes()
NaiveBayes.train(tagged_text)

In [11]:
# Naive Bayes demo: tag one sample sentence and print each (word, tag) pair.
# `re` is imported here because this is the first cell, in file order, that uses it;
# the notebook's only other import of it sits in a later cell, so a top-to-bottom
# run would otherwise fail with NameError.
import regex as re

sentence = "We live on a placid island of ignorance."
# Tokens are whole words, or single characters that are neither word nor whitespace.
tokens = re.findall(r'\b\w+\b|[^\w\s]', sentence)
tags = NaiveBayes.tag(tokens)

for word, tag in tags:
    print(f"{word:12} → {tag}")

We           → PRP
live         → NN
on           → IN
a            → DT
placid       → JJ
island       → NN
of           → IN
ignorance    → NN
.            → .


### NN Model: **LSTM**

In [12]:
from models.LSTM import LSTMPOSTagger, POSTagDataset, train_model, predict

In [13]:
# Build the word and tag index tables for the LSTM from the tagged corpus.
# 'PAD' (and 'UNK' for words) are pre-assigned the lowest indices.
word2idx = {'PAD': 0, 'UNK': 1}
tag2idx = {'PAD': 0}

for tagged_sentence in tagged_text:
    for token, pos in tagged_sentence:
        # setdefault only inserts unseen keys; the next free index is the current size.
        word2idx.setdefault(token.lower(), len(word2idx))
        tag2idx.setdefault(pos, len(tag2idx))

# Reverse lookup (index -> tag), later passed to predict().
idx2tag = {index: label for label, index in tag2idx.items()}

In [14]:
# Wrap the tagged corpus in the dataset class and create a fresh LSTM tagger
# sized to the vocabulary and tag set built above.
dataset = POSTagDataset(tagged_text, word2idx, tag2idx)

LSTM = LSTMPOSTagger(
    vocab_size=len(word2idx),
    tagset_size=len(tag2idx),
)

In [15]:
# First training run, using train_model's default number of epochs.
train_model(LSTM, dataset, tag2idx)

Epoch 1/5 - Loss: 92.2720
Epoch 2/5 - Loss: 73.1153
Epoch 3/5 - Loss: 61.0066
Epoch 4/5 - Loss: 52.1296
Epoch 5/5 - Loss: 45.5011


In [7]:
import regex as re

In [31]:
# LSTM demo: tag the sample sentence with the current LSTM and print each pair.
sentence = "We live on a placid island of ignorance."
# Tokens are whole words, or single characters that are neither word nor whitespace.
tokens = re.findall(r'\b\w+\b|[^\w\s]', sentence)
tags = predict(LSTM, tokens, word2idx, idx2tag)

for word, tag in tags:
    print(f"{word:12} → {tag}")

We           → TO
live         → DT
on           → IN
a            → DT
placid       → NN
island       → NN
of           → IN
ignorance    → NN
.            → .


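The loss is still high after the default 5 epochs, and several tags in the sample above are wrong (e.g. "We" → TO, "live" → DT).
The LSTM needs more training epochs than the default of 5.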

In [16]:
# Rebuild the dataset and replace LSTM with a new, untrained instance so the longer
# training run below starts from scratch rather than continuing the 5-epoch model.
dataset = POSTagDataset(tagged_text, word2idx, tag2idx)

LSTM = LSTMPOSTagger(
    vocab_size=len(word2idx),
    tagset_size=len(tag2idx),
)

In [17]:
# Train for 100 epochs; the model trained here is the one passed to the benchmark below.
train_model(LSTM, dataset, tag2idx, epochs=100)

Epoch 1/100 - Loss: 93.9123
Epoch 2/100 - Loss: 74.3758
Epoch 3/100 - Loss: 61.7300
Epoch 4/100 - Loss: 53.5244
Epoch 5/100 - Loss: 47.1608
Epoch 6/100 - Loss: 42.1142
Epoch 7/100 - Loss: 37.9757
Epoch 8/100 - Loss: 34.4990
Epoch 9/100 - Loss: 31.7590
Epoch 10/100 - Loss: 29.4518
Epoch 11/100 - Loss: 27.4079
Epoch 12/100 - Loss: 25.4233
Epoch 13/100 - Loss: 23.4867
Epoch 14/100 - Loss: 21.9691
Epoch 15/100 - Loss: 20.6957
Epoch 16/100 - Loss: 19.3036
Epoch 17/100 - Loss: 18.1253
Epoch 18/100 - Loss: 16.9352
Epoch 19/100 - Loss: 15.9601
Epoch 20/100 - Loss: 14.9288
Epoch 21/100 - Loss: 13.9365
Epoch 22/100 - Loss: 13.0727
Epoch 23/100 - Loss: 12.3079
Epoch 24/100 - Loss: 11.5146
Epoch 25/100 - Loss: 10.7062
Epoch 26/100 - Loss: 9.9062
Epoch 27/100 - Loss: 9.4138
Epoch 28/100 - Loss: 8.7981
Epoch 29/100 - Loss: 8.2889
Epoch 30/100 - Loss: 7.7139
Epoch 31/100 - Loss: 7.1195
Epoch 32/100 - Loss: 6.7003
Epoch 33/100 - Loss: 6.3286
Epoch 34/100 - Loss: 5.9070
Epoch 35/100 - Loss: 5.5318
Epoc

In [130]:
# Repeat the sample-sentence check with the current (retrained) LSTM.
sentence = "We live on a placid island of ignorance."
# Tokens are whole words, or single characters that are neither word nor whitespace.
tokens = re.findall(r'\b\w+\b|[^\w\s]', sentence)
tags = predict(LSTM, tokens, word2idx, idx2tag)

for word, tag in tags:
    print(f"{word:12} → {tag}")

We           → PRP
live         → VBP
on           → IN
a            → DT
placid       → JJ
island       → NN
of           → IN
ignorance    → NN
.            → .


Better: after 100 epochs the sample sentence is tagged sensibly ("We" → PRP, "live" → VBP).

### Benchmarking

In [51]:
# Reference tags for the sample sentence, taken from NLTK's own tagger.
from nltk import pos_tag
sentence = "We live on a placid island of ignorance."
tokens = re.findall(r'\b\w+\b|[^\w\s]', sentence)
tags = pos_tag(tokens)

print("Correct tags:")
for word, tag in tags:
    print(f"{word:12} → {tag}")

# Tag the same tokens with each trained model and print the results for comparison.
models = [NaiveBayes, LSTM]
model_names = ["Naive Bayes", "LSTM"]

print("\nSample tag predictions:")
for model_name, model in zip(model_names, models):
    print(f"\n{model_name} tag predictions:")
    # Naive Bayes exposes its own tag() method; the LSTM goes through predict().
    tags = model.tag(tokens) if model == NaiveBayes else predict(model, tokens, word2idx, idx2tag)
    for word, tag in tags:
        print(f"{word:12} → {tag}")

Correct tags:
We           → PRP
live         → VBP
on           → IN
a            → DT
placid       → JJ
island       → NN
of           → IN
ignorance    → NN
.            → .

Sample tag predictions:

Naive Bayes tag predictions:
We           → PRP
live         → NN
on           → IN
a            → DT
placid       → JJ
island       → NN
of           → IN
ignorance    → NN
.            → .

LSTM tag predictions:
We           → PRP
live         → VBP
on           → IN
a            → DT
placid       → JJ
island       → NN
of           → IN
ignorance    → NN
.            → .


In [169]:
from sklearn.metrics import accuracy_score

def evaluate_model_accuracy(model, model_name, tagged_text):
    """Compute and print the token-level tagging accuracy of one model.

    Args:
        model: The tagger to evaluate. For 'NaiveBayes' it must provide
            ``tag(words)``; for 'LSTM' it is passed to ``predict``.
        model_name: Either 'NaiveBayes' or 'LSTM'; selects how predictions
            are obtained and is used in the printed message.
        tagged_text: List of sentences, each a list of (word, tag) pairs.
            The tags are used as the reference labels.

    Returns:
        The value returned by ``accuracy_score`` over all tokens of all sentences.

    Raises:
        ValueError: If ``model_name`` is not one of the two supported names.

    Note:
        The LSTM branch reads ``word2idx`` and ``idx2tag`` from the notebook's
        global namespace.
    """
    correct_tags = []
    pred_tags = []

    for sentence in tagged_text:
        words = [word for word, _ in sentence]
        tags = [tag for _, tag in sentence]

        if model_name == 'NaiveBayes':
            predicted = model.tag(words)  # returns [(word, tag)]
        elif model_name == 'LSTM':
            # Evaluate the model that was passed in; the original used the global
            # LSTM here, which silently ignored the `model` argument.
            predicted = predict(model, words, word2idx, idx2tag)  # returns [(word, tag)]
        else:
            raise ValueError("Unknown model name.")

        correct_tags.extend(tags)
        pred_tags.extend(tag for _, tag in predicted)

    acc = accuracy_score(correct_tags, pred_tags)
    print(f"Accuracy of {model_name}: {acc:.4f}")
    return acc

# Run benchmark comparisons.
# NOTE: tagged_text is the same data both models were trained on, and its labels come
# from nltk.pos_tag, so these numbers measure agreement with NLTK on the training
# set rather than accuracy on held-out data.
acc_nb = evaluate_model_accuracy(NaiveBayes, 'NaiveBayes', tagged_text)
acc_lstm = evaluate_model_accuracy(LSTM, 'LSTM', tagged_text)

Accuracy of NaiveBayes: 0.8450
Accuracy of LSTM: 0.9995
