In [1]:
import nltk

In [2]:
# punkt dependencies. taggers.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [3]:
# Tag legends
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [97]:
with open("corpus.txt", "r") as f:
    text = f.read()

In [98]:
# Tokenize into sentences, then tokenize + tag words
# Following example from sample unit 6 code: [(word, tag)], [sentence], [sentence], ...
sentences = nltk.sent_tokenize(text)
tagged_text = []

for sentence in sentences:
    tokens = nltk.word_tokenize(sentence)
    tagged = nltk.pos_tag(tokens)
    tagged_text.append(tagged)

In [99]:
print(tagged_text)

[[('The', 'DT'), ('most', 'RBS'), ('merciful', 'JJ'), ('thing', 'NN'), ('in', 'IN'), ('the', 'DT'), ('world', 'NN'), (',', ','), ('I', 'PRP'), ('think', 'VBP'), (',', ','), ('is', 'VBZ'), ('the', 'DT'), ('inability', 'NN'), ('of', 'IN'), ('the', 'DT'), ('human', 'JJ'), ('mind', 'NN'), ('to', 'TO'), ('correlate', 'VB'), ('all', 'DT'), ('its', 'PRP$'), ('contents', 'NNS'), ('.', '.')], [('We', 'PRP'), ('live', 'VBP'), ('on', 'IN'), ('a', 'DT'), ('placid', 'JJ'), ('island', 'NN'), ('of', 'IN'), ('ignorance', 'NN'), ('in', 'IN'), ('the', 'DT'), ('midst', 'NN'), ('of', 'IN'), ('black', 'JJ'), ('seas', 'NNS'), ('of', 'IN'), ('infinity', 'NN'), (',', ','), ('and', 'CC'), ('it', 'PRP'), ('was', 'VBD'), ('not', 'RB'), ('meant', 'JJ'), ('that', 'IN'), ('we', 'PRP'), ('should', 'MD'), ('voyage', 'VB'), ('far', 'RB'), ('.', '.')], [('The', 'DT'), ('sciences', 'NNS'), (',', ','), ('each', 'DT'), ('straining', 'NN'), ('in', 'IN'), ('its', 'PRP$'), ('own', 'JJ'), ('direction', 'NN'), (',', ','), ('ha

### Trad Model: **HMM**
###### *from unit 6 sample*

In [7]:
import sys
sys.path.append('./models')
from models.HMM import HiddenMarkovModel

tagger = HiddenMarkovModel()
tagger.train(tagged_text)

In [8]:
# HMM implementation
import re
sentence = "We live on a placid island of ignorance."
tokens = re.findall(r'\b\w+\b|[^\w\s]', sentence)
predicted_tags = tagger.viterbi(tokens)
for word, tag in zip(tokens, predicted_tags):
    print(f"{word:10} → {tag}")

We         → PRP
live       → VBP
on         → IN
a          → DT
placid     → JJ
island     → NN
of         → IN
ignorance  → NN
.          → .


### Trad Model: **Multinomial Naive Bayes**

In [9]:
from models.NaiveBayes import MultinomialNaiveBayes
nb_tagger = MultinomialNaiveBayes()
nb_tagger.train(tagged_text)

In [10]:
# Naive Bayes implementation
sentence = "We live on a placid island of ignorance."
tokens = re.findall(r'\b\w+\b|[^\w\s]', sentence)
tags = nb_tagger.tag(tokens)

for word, tag in tags:
    print(f"{word:12} → {tag}")

We           → PRP
live         → NN
on           → IN
a            → DT
placid       → JJ
island       → NN
of           → IN
ignorance    → NN
.            → .


Naive Bayes performs worse but that is expected

### NN Model: *TBA*

In [100]:
from models.LSTM import LSTMPOSTagger, POSTagDataset, train_model, predict

In [85]:
word2idx = {'PAD': 0, 'UNK': 1}
tag2idx = {'PAD': 0}

for sent in tagged_text:
    for word, tag in sent:
        word_lower = word.lower()
        if word_lower not in word2idx:
            word2idx[word_lower] = len(word2idx)
        if tag not in tag2idx:
            tag2idx[tag] = len(tag2idx)

idx2tag = {v: k for k, v in tag2idx.items()}

In [86]:
dataset = POSTagDataset(tagged_text, word2idx, tag2idx)

LSTM = LSTMPOSTagger(
    vocab_size=len(word2idx),
    tagset_size=len(tag2idx),
    embedding_dim=64,
    hidden_dim=64,
    pad_idx=word2idx['PAD']
)

In [87]:
train_model(LSTM, dataset, tag2idx)

Epoch 1/5 - Loss: 3.2810
Epoch 2/5 - Loss: 3.2640
Epoch 3/5 - Loss: 3.2471
Epoch 4/5 - Loss: 3.2303
Epoch 5/5 - Loss: 3.2136


In [88]:
sentence = "We live on a placid island of ignorance."
tokens = re.findall(r'\b\w+\b|[^\w\s]', sentence)
tags = predict(LSTM, tokens, word2idx, idx2tag)

for word, tag in tags:
    print(f"{word:12} → {tag}")

We           → PRP$
live         → NNS
on           → VBP
a            → IN
placid       → IN
island       → MD
of           → MD
ignorance    → MD
.            → WDT


Output is absolute garbage.
LSTM needs either: 1. Bigger dataset or 2. More epochs instead of default 5

We will do both.

In [90]:
with open("corpus_big.txt", "r") as f:
    text_big = f.read()

In [112]:
# Tokenize into sentences, then tokenize + tag words
# Following example from sample unit 6 code: [(word, tag)], [sentence], [sentence], ...
sentences = nltk.sent_tokenize(text_big)
tagged_text_big = []

for sentence in sentences:
    tokens = nltk.word_tokenize(sentence)
    tagged = nltk.pos_tag(tokens)
    tagged_text_big.append(tagged)

In [113]:
word2idx = {'PAD': 0, 'UNK': 1}
tag2idx = {'PAD': 0}

for sent in tagged_text_big:
    for word, tag in sent:
        word_lower = word.lower()
        if word_lower not in word2idx:
            word2idx[word_lower] = len(word2idx)
        if tag not in tag2idx:
            tag2idx[tag] = len(tag2idx)

idx2tag = {v: k for k, v in tag2idx.items()}

In [117]:
dataset = POSTagDataset(tagged_text_big, word2idx, tag2idx)

LSTM = LSTMPOSTagger(
    vocab_size=len(word2idx),
    tagset_size=len(tag2idx),
    embedding_dim=64,
    hidden_dim=64,
    pad_idx=word2idx['PAD']
)

In [118]:
train_model(LSTM, dataset, tag2idx, epochs=100)

Epoch 1/100 - Loss: 94.1476
Epoch 2/100 - Loss: 73.4120
Epoch 3/100 - Loss: 59.2749
Epoch 4/100 - Loss: 50.7676
Epoch 5/100 - Loss: 44.5667
Epoch 6/100 - Loss: 40.1255
Epoch 7/100 - Loss: 36.1998
Epoch 8/100 - Loss: 33.0339
Epoch 9/100 - Loss: 30.8593
Epoch 10/100 - Loss: 28.6322
Epoch 11/100 - Loss: 26.7187
Epoch 12/100 - Loss: 24.9698
Epoch 13/100 - Loss: 23.2871
Epoch 14/100 - Loss: 21.6985
Epoch 15/100 - Loss: 20.3494
Epoch 16/100 - Loss: 19.2059
Epoch 17/100 - Loss: 17.9854
Epoch 18/100 - Loss: 16.6814
Epoch 19/100 - Loss: 15.6241
Epoch 20/100 - Loss: 14.8344
Epoch 21/100 - Loss: 13.6511
Epoch 22/100 - Loss: 12.8000
Epoch 23/100 - Loss: 12.2071
Epoch 24/100 - Loss: 11.2776
Epoch 25/100 - Loss: 10.5439
Epoch 26/100 - Loss: 9.8990
Epoch 27/100 - Loss: 9.3099
Epoch 28/100 - Loss: 8.8406
Epoch 29/100 - Loss: 8.1425
Epoch 30/100 - Loss: 7.5875
Epoch 31/100 - Loss: 7.1387
Epoch 32/100 - Loss: 6.6061
Epoch 33/100 - Loss: 6.2655
Epoch 34/100 - Loss: 5.8852
Epoch 35/100 - Loss: 5.5580
Epoc

In [119]:
sentence = "We live on a placid island of ignorance."
tokens = re.findall(r'\b\w+\b|[^\w\s]', sentence)
tags = predict(LSTM, tokens, word2idx, idx2tag)

for word, tag in tags:
    print(f"{word:12} → {tag}")

We           → PRP
live         → VBP
on           → IN
a            → DT
placid       → JJ
island       → NN
of           → IN
ignorance    → NN
.            → .


better