# Categorizing and Tagging Words

In [1]:
import nltk

## Using a Tagger

In [2]:
''' 
The process of classifying words into their parts-of-speech and labeling them accordingly
is known as part-of-speech tagging, POS tagging, or simply tagging. Partsof-
speech are also known as word classes or lexical categories. The collection of tags
used for a particular task is known as a tagset. Our emphasis in this chapter is on
exploiting tags, and tagging text automatically.
'''

text = nltk.word_tokenize("And now for something completely different")
nltk.pos_tag(text)

[('And', 'CC'),
 ('now', 'RB'),
 ('for', 'IN'),
 ('something', 'NN'),
 ('completely', 'RB'),
 ('different', 'JJ')]

In [3]:
''' 
Here we see that and is CC, a coordinating conjunction; now and completely are RB, or
adverbs; for is IN, a preposition; something is NN, a noun; and different is JJ, an adjective.
'''

''' 
The text.similar() method takes a word w, finds all contexts w1w w2,
then finds all words w' that appear in the same context, i.e. w1w'w2.
'''

text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())
text.similar('woman')


man time day year car moment world house family child country boy
state job place way war girl work word


### A Simplified Part-of-Speech Tagset

![POS-Table](images/1.png)

In [11]:
from nltk.corpus import brown
brown_news_tagged = brown.tagged_words(categories='news')
tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged)
tag_fd.keys()



dict_keys(['AT', 'NP-TL', 'NN-TL', 'JJ-TL', 'VBD', 'NR', 'NN', 'IN', 'NP$', 'JJ', '``', "''", 'CS', 'DTI', 'NNS', '.', 'RBR', ',', 'WDT', 'HVD', 'VBZ', 'CC', 'IN-TL', 'BEDZ', 'VBN', 'NP', 'BEN', 'TO', 'VB', 'RB', 'DT', 'PPS', 'DOD', 'AP', 'BER', 'HV', 'DTS', 'VBG', 'PPO', 'QL', 'JJT', 'ABX', 'NN-HL', 'VBN-HL', 'WRB', 'CD', 'MD', 'BE', 'JJR', 'VBG-TL', 'BEZ', 'NN$-TL', 'HVZ', 'ABN', 'PN', 'PPSS', 'PP$', 'DO', 'NN$', 'NNS-HL', 'WPS', '*', 'EX', 'VB-HL', ':', '(', ')', 'NNS-TL', 'NPS', 'JJS', 'RP', '--', 'BED', 'OD', 'BEG', 'AT-HL', 'VBG-HL', 'AT-TL', 'PPL', 'DOZ', 'NP-HL', 'NR$', 'DOD*', 'BEDZ*', ',-HL', 'CC-TL', 'MD*', 'NNS$', 'PPSS+BER', "'", 'PPSS+BEM', 'CD-TL', 'RBT', '(-HL', ')-HL', 'MD-HL', 'VBZ-HL', 'IN-HL', 'JJ-HL', 'PPLS', 'CD-HL', 'WPO', 'JJS-TL', 'ABL', 'BER-HL', 'PPS+HVZ', 'VBD-HL', 'RP-HL', 'MD*-HL', 'AP-HL', 'CS-HL', 'DT$', 'HVN', 'FW-IN', 'FW-DT', 'VBN-TL', 'NR-TL', 'NNS$-TL', 'FW-NN', 'HVG', 'DTX', 'OD-TL', 'BEM', 'RB-HL', 'PPSS+MD', 'NPS-HL', 'NPS$', 'WP$', 'NN-TL-HL', '

## Defining Dictionaries

In [12]:
''' 
If we try to access a key that is not in a dictionary, we get an error. However, it’s often
useful if a dictionary can automatically create an entry for this new key and give it a
default value, such as zero or the empty list. Since Python 2.5, a special kind of dictionary
called a defaultdict has been available. (It is provided as nltk.defaultdict for
the benefit of readers who are using Python 2.4.) In order to use it, we have to supply
a parameter which can be used to create the default value, e.g., int, float, str, list,
dict, tuple.
'''

frequency = nltk.defaultdict(int)
frequency['colorless'] = 5
frequency['ideas'] 

0

In [13]:
''' 
We need to create a default dictionary that maps each word to its replacement. The
most frequent n words will be mapped to themselves. Everything else will be mapped
to UNK.
'''

alice = nltk.corpus.gutenberg.words('carroll-alice.txt')
vocab = nltk.FreqDist(alice)
v1000 = list(vocab)[:1000]
mapping = nltk.defaultdict(lambda: 'UNK')
for v in v1000:
    mapping[v] = v
alice2 = [mapping[v] for v in alice]
alice2[:100]

['[',
 'Alice',
 "'",
 's',
 'Adventures',
 'in',
 'Wonderland',
 'by',
 'UNK',
 'UNK',
 'UNK',
 'UNK',
 'CHAPTER',
 'I',
 '.',
 'Down',
 'the',
 'Rabbit',
 '-',
 'UNK',
 'Alice',
 'was',
 'beginning',
 'to',
 'get',
 'very',
 'tired',
 'of',
 'sitting',
 'by',
 'her',
 'sister',
 'on',
 'the',
 'bank',
 ',',
 'and',
 'of',
 'having',
 'nothing',
 'to',
 'do',
 ':',
 'once',
 'or',
 'twice',
 'she',
 'had',
 'peeped',
 'into',
 'the',
 'book',
 'her',
 'sister',
 'was',
 'reading',
 ',',
 'but',
 'it',
 'had',
 'no',
 'pictures',
 'or',
 'UNK',
 'in',
 'it',
 ',',
 "'",
 'and',
 'what',
 'is',
 'the',
 'use',
 'of',
 'a',
 'book',
 ",'",
 'thought',
 'Alice',
 "'",
 'without',
 'pictures',
 'or',
 'conversation',
 "?'",
 'So',
 'she',
 'was',
 'considering',
 'in',
 'her',
 'own',
 'mind',
 '(',
 'as',
 'well',
 'as',
 'she',
 'could',
 ',']

### Incrementally Updating a Dictionary

In [17]:
'''We can employ dictionaries to count occurrences, emulating the method for tallying
words shown in Figure 1-3. We begin by initializing an empty defaultdict, then process
each part-of-speech tag in the text. If the tag hasn’t been seen before, it will have a zero
count by default. Each time we encounter a tag, we increment its count using the +=
operator
'''

counts = nltk.defaultdict(int)
from nltk.corpus import brown

for(word, tag) in brown.tagged_words(categories='news'):
    counts[tag] += 1
    
counts['NN']

    

13162

In [18]:
list(counts)

['AT',
 'NP-TL',
 'NN-TL',
 'JJ-TL',
 'VBD',
 'NR',
 'NN',
 'IN',
 'NP$',
 'JJ',
 '``',
 "''",
 'CS',
 'DTI',
 'NNS',
 '.',
 'RBR',
 ',',
 'WDT',
 'HVD',
 'VBZ',
 'CC',
 'IN-TL',
 'BEDZ',
 'VBN',
 'NP',
 'BEN',
 'TO',
 'VB',
 'RB',
 'DT',
 'PPS',
 'DOD',
 'AP',
 'BER',
 'HV',
 'DTS',
 'VBG',
 'PPO',
 'QL',
 'JJT',
 'ABX',
 'NN-HL',
 'VBN-HL',
 'WRB',
 'CD',
 'MD',
 'BE',
 'JJR',
 'VBG-TL',
 'BEZ',
 'NN$-TL',
 'HVZ',
 'ABN',
 'PN',
 'PPSS',
 'PP$',
 'DO',
 'NN$',
 'NNS-HL',
 'WPS',
 '*',
 'EX',
 'VB-HL',
 ':',
 '(',
 ')',
 'NNS-TL',
 'NPS',
 'JJS',
 'RP',
 '--',
 'BED',
 'OD',
 'BEG',
 'AT-HL',
 'VBG-HL',
 'AT-TL',
 'PPL',
 'DOZ',
 'NP-HL',
 'NR$',
 'DOD*',
 'BEDZ*',
 ',-HL',
 'CC-TL',
 'MD*',
 'NNS$',
 'PPSS+BER',
 "'",
 'PPSS+BEM',
 'CD-TL',
 'RBT',
 '(-HL',
 ')-HL',
 'MD-HL',
 'VBZ-HL',
 'IN-HL',
 'JJ-HL',
 'PPLS',
 'CD-HL',
 'WPO',
 'JJS-TL',
 'ABL',
 'BER-HL',
 'PPS+HVZ',
 'VBD-HL',
 'RP-HL',
 'MD*-HL',
 'AP-HL',
 'CS-HL',
 'DT$',
 'HVN',
 'FW-IN',
 'FW-DT',
 'VBN-TL',
 'NR-TL',
 '

In [19]:
from operator import itemgetter
sorted(counts.items(), key=itemgetter(1), reverse=True)

[('NN', 13162),
 ('IN', 10616),
 ('AT', 8893),
 ('NP', 6866),
 (',', 5133),
 ('NNS', 5066),
 ('.', 4452),
 ('JJ', 4392),
 ('CC', 2664),
 ('VBD', 2524),
 ('NN-TL', 2486),
 ('VB', 2440),
 ('VBN', 2269),
 ('RB', 2166),
 ('CD', 2020),
 ('CS', 1509),
 ('VBG', 1398),
 ('TO', 1237),
 ('PPS', 1056),
 ('PP$', 1051),
 ('MD', 1031),
 ('AP', 923),
 ('NP-TL', 741),
 ('``', 732),
 ('BEZ', 730),
 ('BEDZ', 716),
 ("''", 702),
 ('JJ-TL', 689),
 ('PPSS', 602),
 ('DT', 589),
 ('BE', 525),
 ('VBZ', 519),
 ('NR', 495),
 ('RP', 482),
 ('QL', 468),
 ('PPO', 412),
 ('WPS', 395),
 ('NNS-TL', 344),
 ('WDT', 343),
 ('BER', 328),
 ('WRB', 328),
 ('OD', 309),
 ('HVZ', 301),
 ('--', 300),
 ('NP$', 279),
 ('HV', 265),
 ('HVD', 262),
 ('*', 256),
 ('BED', 252),
 ('NPS', 215),
 ('BEN', 212),
 ('NN$', 210),
 ('DTI', 205),
 ('NP-HL', 186),
 ('ABN', 183),
 ('NN-HL', 171),
 ('IN-TL', 164),
 ('EX', 161),
 (')', 151),
 ('(', 148),
 ('JJR', 145),
 (':', 137),
 ('DTS', 136),
 ('JJT', 100),
 ('CD-TL', 96),
 ('NNS-HL', 92),
 ('

## Automatic Tagging

### The Default Tagger

In [24]:
'''
The simplest possible tagger assigns the same tag to each token. This may seem to be
a rather banal step, but it establishes an important baseline for tagger performance. In
order to get the best result, we tag each word with the most likely tag.
''' 
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
tags = [tag for(word, tag) in brown.tagged_words(categories='news')]
nltk.FreqDist(tags).max()

'NN'

In [25]:
# Now we can create a tagger that tags everything as NN.

raw = 'I do not like green eggs and ham, I do not like them Sam I am!'
tokens = nltk.word_tokenize(raw)
default_tagger = nltk.DefaultTagger('NN')
default_tagger.tag(tokens)

[('I', 'NN'),
 ('do', 'NN'),
 ('not', 'NN'),
 ('like', 'NN'),
 ('green', 'NN'),
 ('eggs', 'NN'),
 ('and', 'NN'),
 ('ham', 'NN'),
 (',', 'NN'),
 ('I', 'NN'),
 ('do', 'NN'),
 ('not', 'NN'),
 ('like', 'NN'),
 ('them', 'NN'),
 ('Sam', 'NN'),
 ('I', 'NN'),
 ('am', 'NN'),
 ('!', 'NN')]

In [28]:
### The Regular Expression Tagger
patterns = [
    (r'.*ing$', 'VBG'), # gerunds
    (r'.*ed$', 'VBD'), # simple past
    (r'.*es$', 'VBZ'), # 3rd singular present
    (r'.*ould$', 'MD'), # modals
    (r'.*\'s$', 'NN$'), # possessive nouns
    (r'.*s$', 'NNS'), # plural nouns
    (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
    (r'.*', 'NN') # nouns (default)
]

regex_tagger = nltk.RegexpTagger(patterns)
regex_tagger.evaluate(brown_tagged_sents)

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  regex_tagger.evaluate(brown_tagged_sents)


0.20326391789486245

# N-Gram Tagging

### Unigram Tagging

In [30]:
''' 
Unigram taggers are based on a simple statistical algorithm: for each token, assign the
tag that is most likely for that particular token. For example, it will assign the tag JJ to
any occurrence of the word frequent, since frequent is used as an adjective (e.g., a frequent
word) more often than it is used as a verb (e.g., I frequent this cafe).
'''

# A unigram tagger behaves just like a lookup tagger, except there is a more convenient technique for setting it up, called training.

brown_sents = brown.sents(categories='news')
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
unigram_tagger.tag(brown_sents[2007])


[('Various', 'JJ'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('apartments', 'NNS'),
 ('are', 'BER'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('terrace', 'NN'),
 ('type', 'NN'),
 (',', ','),
 ('being', 'BEG'),
 ('on', 'IN'),
 ('the', 'AT'),
 ('ground', 'NN'),
 ('floor', 'NN'),
 ('so', 'QL'),
 ('that', 'CS'),
 ('entrance', 'NN'),
 ('is', 'BEZ'),
 ('direct', 'JJ'),
 ('.', '.')]

In [31]:
unigram_tagger.evaluate(brown_tagged_sents)

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  unigram_tagger.evaluate(brown_tagged_sents)


0.9349006503968017

### General N-Gram Tagging

![tagger context](images/2.png)

In [32]:
''' 
An n-gram tagger is a generalization of a unigram tagger whose context is the current
word together with the part-of-speech tags of the n-1 preceding tokens
'''
'''
The NgramTagger class uses a tagged training corpus to determine which part-of-speech
tag is most likely for each context. Here we see a special case of an n-gram tagger,
namely a bigram tagger.
'''
size = int(len(brown_tagged_sents) * 0.9)
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]

bigram_tagger = nltk.BigramTagger(train_sents)
bigram_tagger.tag(brown_sents[2007])


[('Various', 'JJ'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('apartments', 'NNS'),
 ('are', 'BER'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('terrace', 'NN'),
 ('type', 'NN'),
 (',', ','),
 ('being', 'BEG'),
 ('on', 'IN'),
 ('the', 'AT'),
 ('ground', 'NN'),
 ('floor', 'NN'),
 ('so', 'CS'),
 ('that', 'CS'),
 ('entrance', 'NN'),
 ('is', 'BEZ'),
 ('direct', 'JJ'),
 ('.', '.')]

In [33]:
unseen_sent = brown_sents[4203]
bigram_tagger.tag(unseen_sent)

[('The', 'AT'),
 ('population', 'NN'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('Congo', 'NP'),
 ('is', 'BEZ'),
 ('13.5', None),
 ('million', None),
 (',', None),
 ('divided', None),
 ('into', None),
 ('at', None),
 ('least', None),
 ('seven', None),
 ('major', None),
 ('``', None),
 ('culture', None),
 ('clusters', None),
 ("''", None),
 ('and', None),
 ('innumerable', None),
 ('tribes', None),
 ('speaking', None),
 ('400', None),
 ('separate', None),
 ('dialects', None),
 ('.', None)]

In [39]:
nltk.download('treebank')  # Ensure the corpus is available
nltk.download('averaged_perceptron_tagger')

from nltk.tag import brill, brill_trainer

# Load sample data
train_data = nltk.corpus.treebank.tagged_sents()[:3000]
test_data = nltk.corpus.treebank.tagged_sents()[3000:]

# Use a basic tagger as the initial tagger
initial_tagger = nltk.tag.UnigramTagger(train_data)

# Define a simple Brill tagging template
templates = brill.brill24()

# Train a Brill tagger using BrillTaggerTrainer
trainer = brill_trainer.BrillTaggerTrainer(initial_tagger, templates)
brill_tagger = trainer.train(train_data)

# Test the Brill tagger
print(brill_tagger.evaluate(test_data))

[nltk_data] Downloading package treebank to /home/anson/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/anson/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  print(brill_tagger.evaluate(test_data))


0.8755018346643644
