In [1]:
import nltk
from nltk.corpus import treebank
from nltk.tag import UnigramTagger, BigramTagger
from nltk.tokenize import word_tokenize

In [2]:
nltk.download('punkt')      # For tokenizing text into words
nltk.download('treebank')   # For accessing the Treebank tagged corpus
nltk.download('averaged_perceptron_tagger')  # For POS tagging models

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ahmed\anaconda3\envs\neuralnetwork\nltk_data.
[nltk_data]     ..
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\ahmed\anaconda3\envs\neuralnetwork\nltk_data.
[nltk_data]     ..
[nltk_data]   Unzipping corpora\treebank.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ahmed\anaconda3\envs\neuralnetwork\nltk_data.
[nltk_data]     ..
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [3]:
# Load a portion of the treebank corpus for training and testing
train_sents = treebank.tagged_sents()[:3000]
test_sents = treebank.tagged_sents()[3000:3500]

# Alternatively, we can also tokenize text to make use of it later
example_sentence = "This is an example sentence."
tokenized_sentence = word_tokenize(example_sentence)

In [8]:
from nltk.tag import DefaultTagger

# Create a DefaultTagger for unseen words (e.g., tag everything as 'NN' - noun)
default_tagger = DefaultTagger('NN')

# Train Unigram and Bigram taggers with a backoff to the default tagger
unigram_tagger = UnigramTagger(train_sents, backoff=default_tagger)
bigram_tagger = BigramTagger(train_sents, backoff=unigram_tagger)

# Combining the taggers (if needed)
combined_tagger = BigramTagger(train_sents, backoff=unigram_tagger)

In [13]:
# Test the POS tagger on a tokenized sentence
test_sentence = "NLTK is a great toolkit for NLP tasks.".split()
tagged_sentence = combined_tagger.tag(test_sentence)
print(tagged_sentence)

[('NLTK', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('great', 'JJ'), ('toolkit', 'NN'), ('for', 'IN'), ('NLP', 'NN'), ('tasks.', 'NN')]


In [14]:
# Evaluate the tagger on the test set
accuracy = combined_tagger.accuracy(test_sents)
print(f"Tagger Accuracy: {accuracy * 100:.2f}%")

Tagger Accuracy: 88.31%
