<b>Simple script to train Hidden Markov Model for Part of Speech tagging using NLTK</b>

In [1]:
# Import the required modules
import nltk
from nltk import HiddenMarkovModelTagger as hmm # do not use nltk.tag.hmm
from nltk.tokenize import word_tokenize
from nltk.corpus import treebank
import warnings
import dill


# Suppress all warnings for the rest of the session: no category is given, so the
# 'ignore' action applies to every warning.
# NOTE(review): this also hides deprecation notices raised by the libraries used
# below — consider narrowing the filter if that matters.
warnings.filterwarnings('ignore')

<b>Download the data. Run only once</b>

In [2]:
# Ensure the treebank corpus and the punkt tokenizer data are downloaded.
# Uncomment and run once:
#nltk.download('treebank')
#nltk.download('punkt')

<b>Prepare the data. We'll use the Penn Treebank, an English corpus annotated with part-of-speech (POS) tags. For information on the tagset: https://www.sketchengine.eu/penn-treebank-tagset/
We split the data into training and test sets. Try changing the size of the training set and observe how the accuracy changes.</b>


In [11]:
# Load the tagged corpus once and reuse it, rather than calling
# treebank.tagged_sents() again for every slice.
tagged_sents = treebank.tagged_sents()
print(f'The number of tagged examples in the dataset is: {len(tagged_sents)}')

# Split sizes: change these to experiment with how the amount of data affects accuracy.
TRAIN_SIZE = 2000
TEST_SIZE = 500

# The training slice is taken from the start of the corpus and the test slice
# from the end, so they are disjoint only while the corpus can hold both.
assert TRAIN_SIZE + TEST_SIZE <= len(tagged_sents), 'train and test slices would overlap'

train_data = tagged_sents[:TRAIN_SIZE]
test_data = tagged_sents[-TEST_SIZE:]

# Show one training example: a sentence as a list of (word, tag) pairs
print(train_data[0])

# Extracting unique tags from train_data
unique_tags = {tag for sent in train_data for _, tag in sent}

print(unique_tags)


The number of tagged examples in the dataset is: 3914
[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]
{'RP', 'EX', 'CC', 'VBD', 'NNP', 'JJ', 'RBR', 'VB', 'SYM', 'RBS', '-RRB-', 'MD', 'RB', 'JJR', '-LRB-', '#', 'VBN', 'WP$', 'DT', 'LS', 'VBP', 'POS', 'PDT', 'NN', 'WP', 'PRP', 'VBG', '-NONE-', 'NNS', ',', 'JJS', 'NNPS', 'CD', 'IN', 'VBZ', 'WRB', 'TO', 'PRP$', '$', 'WDT', '.', "''", ':', 'UH', '``', 'FW'}


<b>Define the trainer and train the model</b>

In [12]:
# Train the HMM tagger (nltk's HiddenMarkovModelTagger, imported as `hmm`)
# on the tagged training sentences.
tagger = hmm.train(train_data, verbose=True)

In [13]:
# Evaluate the model's accuracy on the test data.
# test_data holds the gold (word, tag) pairs from the end of the corpus, which
# the training slice does not cover; the score is printed with two decimals.
accuracy = tagger.accuracy(test_data)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.89


<b>Generate the list of true tags and the model's predictions to get more detailed statistics on where the model performed well and where it did not</b>

In [14]:
# Generate Predictions
# 1) Strip the gold tags so the tagger only sees the words of each test sentence.
test_sentences = [[word for word, _ in sent] for sent in test_data]

# 2) Let the trained tagger label every test sentence.
predicted_sents = tagger.tag_sents(test_sentences)

# 3) Flatten gold and predicted tags into two token-aligned lists.
true_tags = [gold for sent in test_data for _, gold in sent]
predicted_tags = [guess for sent in predicted_sents for _, guess in sent]

In [15]:
# Compute accuracy for each label
# Count how often each gold tag occurs and how often it was predicted correctly
# in one pass each, instead of rescanning the full tag lists once per label.
total_by_label = {}
for gold_tag in true_tags:
    total_by_label[gold_tag] = total_by_label.get(gold_tag, 0) + 1

correct_by_label = {}
for gold_tag, predicted_tag in zip(true_tags, predicted_tags):
    if gold_tag == predicted_tag:
        correct_by_label[gold_tag] = correct_by_label.get(gold_tag, 0) + 1

# Sort the labels so the report comes out in the same order on every run;
# iterating a set of strings does not guarantee a stable order between runs.
labels = sorted(total_by_label)
for label in labels:
    correct_predictions = correct_by_label.get(label, 0)
    # Number of tokens whose true tag is `label`
    total_predictions = total_by_label[label]
    wrong_predictions = total_predictions - correct_predictions
    # Every label comes from true_tags, so total_predictions is at least 1
    # and the division is always defined.
    label_accuracy = correct_predictions / total_predictions

    print(f"Label: {label}")
    print(f"Correct Predictions: {correct_predictions}")
    print(f"Wrong Predictions: {wrong_predictions}")
    print(f"Accuracy: {label_accuracy:.2f}\n")

Label: RP
Correct Predictions: 15
Wrong Predictions: 9
Accuracy: 0.62

Label: EX
Correct Predictions: 3
Wrong Predictions: 0
Accuracy: 1.00

Label: CC
Correct Predictions: 275
Wrong Predictions: 1
Accuracy: 1.00

Label: VBD
Correct Predictions: 394
Wrong Predictions: 70
Accuracy: 0.85

Label: NNP
Correct Predictions: 927
Wrong Predictions: 247
Accuracy: 0.79

Label: JJ
Correct Predictions: 559
Wrong Predictions: 148
Accuracy: 0.79

Label: RBR
Correct Predictions: 3
Wrong Predictions: 10
Accuracy: 0.23

Label: VB
Correct Predictions: 289
Wrong Predictions: 19
Accuracy: 0.94

Label: RBS
Correct Predictions: 1
Wrong Predictions: 0
Accuracy: 1.00

Label: -RRB-
Correct Predictions: 14
Wrong Predictions: 2
Accuracy: 0.88

Label: MD
Correct Predictions: 132
Wrong Predictions: 0
Accuracy: 1.00

Label: RB
Correct Predictions: 235
Wrong Predictions: 56
Accuracy: 0.81

Label: JJR
Correct Predictions: 38
Wrong Predictions: 6
Accuracy: 0.86

Label: -LRB-
Correct Predictions: 15
Wrong Predictions: 1

<b>If I'm happy with the model, I can save it for later use</b>

In [8]:
# Serialize the trained tagger to 'hmm_tagger.pkl' with dill
with open('hmm_tagger.pkl', 'wb') as model_file:
    dill.dump(tagger, model_file)

<b>You can load the model at any time to use it for tagging sentences</b>

In [9]:
# Restore the tagger from 'hmm_tagger.pkl'.
# NOTE(review): dill, like pickle, can run arbitrary code while loading —
# only load model files that come from a trusted source.
with open('hmm_tagger.pkl', 'rb') as model_file:
    loaded_tagger = dill.load(model_file)

<b>Try the model on new text</b>

In [10]:
sentence = 'I took the train from Zurich to Italy last night'

# Split the raw text into tokens, then tag them with the tagger restored from disk
tokens = word_tokenize(sentence)
tagged_sentence = loaded_tagger.tag(tokens)

print(tagged_sentence)

[('I', 'PRP'), ('took', 'VBD'), ('the', 'DT'), ('train', 'NN'), ('from', 'IN'), ('Zurich', '-NONE-'), ('to', 'TO'), ('Italy', 'VB'), ('last', 'JJ'), ('night', 'NN')]
