# Tokenization, Tagging, Chunking - Part of Speech Tagging

In [17]:
import nltk

A part of speech tagger will identify the part of speech for a sequence of words.

In [18]:
text = "I walked to the cafe to buy coffee before work."

In [19]:
tokens = nltk.word_tokenize(text)

In [20]:
nltk.pos_tag(tokens)

[('I', 'PRP'),
 ('walked', 'VBD'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('cafe', 'NN'),
 ('to', 'TO'),
 ('buy', 'VB'),
 ('coffee', 'NN'),
 ('before', 'IN'),
 ('work', 'NN'),
 ('.', '.')]

For an extensive list of part-of-speech tags visit:
https://en.wikipedia.org/w/index.php?title=Brown_Corpus

In [21]:
nltk.pos_tag(nltk.word_tokenize("I will have desert."))

[('I', 'PRP'), ('will', 'MD'), ('have', 'VB'), ('desert', 'NN'), ('.', '.')]

In [22]:
nltk.pos_tag(nltk.word_tokenize("They will desert us."))

[('They', 'PRP'), ('will', 'MD'), ('desert', 'VB'), ('us', 'PRP'), ('.', '.')]

Create a list of all nouns.

In [23]:
md = nltk.corpus.gutenberg.words("melville-moby_dick.txt")

In [24]:
md_norm = [word.lower() for word in md if word.isalpha()]

In [25]:
md_tags = nltk.pos_tag(md_norm,tagset="universal")

In [26]:
md_tags[:5]

[('moby', 'NOUN'),
 ('dick', 'NOUN'),
 ('by', 'ADP'),
 ('herman', 'NOUN'),
 ('melville', 'NOUN')]

In [27]:
md_nouns = [word for word in md_tags if word[1] == "NOUN"]

In [28]:
nouns_fd = nltk.FreqDist(md_nouns)

In [29]:
nouns_fd.most_common()[:10]  

[(('i', 'NOUN'), 1182),
 (('whale', 'NOUN'), 909),
 (('s', 'NOUN'), 774),
 (('man', 'NOUN'), 527),
 (('ship', 'NOUN'), 498),
 (('sea', 'NOUN'), 435),
 (('head', 'NOUN'), 337),
 (('time', 'NOUN'), 334),
 (('boat', 'NOUN'), 332),
 (('ahab', 'NOUN'), 278)]