'''
 Part-of-speech (POS) tagging using NLTK
 Author : Amruta Abhyankar
 Date : 04/28/2021
'''

In [1]:
import nltk

In [2]:
# Sample sentence used for the tokenizing and tagging cells that follow.
text = "I walked to the cafe to buy the coffee before the work."

In [3]:
# Split the sentence into word and punctuation tokens.
tokens = nltk.word_tokenize(text)

In [4]:
# Tag each token; the result is a list of (token, tag) pairs.
nltk.pos_tag(tokens)

[('I', 'PRP'),
 ('walked', 'VBD'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('cafe', 'NN'),
 ('to', 'TO'),
 ('buy', 'VB'),
 ('the', 'DT'),
 ('coffee', 'NN'),
 ('before', 'IN'),
 ('the', 'DT'),
 ('work', 'NN'),
 ('.', '.')]

In [5]:
# Print the reference for the tags used above: each tag with a short
# description and example words.
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [6]:
# Tag a sentence in which "desert" is used as a noun (tagged NN in the output).
noun_example = nltk.word_tokenize("I will have desert.")
nltk.pos_tag(noun_example)

[('I', 'PRP'), ('will', 'MD'), ('have', 'VB'), ('desert', 'NN'), ('.', '.')]

In [7]:
# Tag a sentence in which "desert" is used as a verb (tagged VB in the output).
verb_example = nltk.word_tokenize("They will desert us.")
nltk.pos_tag(verb_example)

[('They', 'PRP'), ('will', 'MD'), ('desert', 'VB'), ('us', 'PRP'), ('.', '.')]

In [8]:
# Load the word tokens of "melville-moby_dick.txt" from NLTK's Gutenberg corpus.
md = nltk.corpus.gutenberg.words("melville-moby_dick.txt")

In [9]:
# Normalize the corpus: keep only purely alphabetic tokens, lowercased.
md_norm = [
    token.lower()
    for token in md
    if token.isalpha()
]

In [10]:
# Inspect the normalized tokens.
md_norm

['moby',
 'dick',
 'by',
 'herman',
 'melville',
 'etymology',
 'supplied',
 'by',
 'a',
 'late',
 'consumptive',
 'usher',
 'to',
 'a',
 'grammar',
 'school',
 'the',
 'pale',
 'usher',
 'threadbare',
 'in',
 'coat',
 'heart',
 'body',
 'and',
 'brain',
 'i',
 'see',
 'him',
 'now',
 'he',
 'was',
 'ever',
 'dusting',
 'his',
 'old',
 'lexicons',
 'and',
 'grammars',
 'with',
 'a',
 'queer',
 'handkerchief',
 'mockingly',
 'embellished',
 'with',
 'all',
 'the',
 'gay',
 'flags',
 'of',
 'all',
 'the',
 'known',
 'nations',
 'of',
 'the',
 'world',
 'he',
 'loved',
 'to',
 'dust',
 'his',
 'old',
 'grammars',
 'it',
 'somehow',
 'mildly',
 'reminded',
 'him',
 'of',
 'his',
 'mortality',
 'while',
 'you',
 'take',
 'in',
 'hand',
 'to',
 'school',
 'others',
 'and',
 'to',
 'teach',
 'them',
 'by',
 'what',
 'name',
 'a',
 'whale',
 'fish',
 'is',
 'to',
 'be',
 'called',
 'in',
 'our',
 'tongue',
 'leaving',
 'out',
 'through',
 'ignorance',
 'the',
 'letter',
 'h',
 'which',
 'almos

In [11]:
# Tag every normalized token using the coarse "universal" tagset
# (NOUN, ADP, ...).
# NOTE(review): md_norm is lowercased and has punctuation removed, so the
# tagger sees no capitalization or sentence boundaries; the results below
# label 'i' as NOUN. Consider tagging the raw tokens and normalizing after.
md_tags = nltk.pos_tag(md_norm, tagset="universal")

In [12]:
# Preview the first five (word, tag) pairs.
md_tags[:5]

[('moby', 'NOUN'),
 ('dick', 'NOUN'),
 ('by', 'ADP'),
 ('herman', 'NOUN'),
 ('melville', 'NOUN')]

In [13]:
# Keep only the (word, tag) pairs whose tag is NOUN.
md_nouns = [(word, tag) for word, tag in md_tags if tag == "NOUN"]

In [14]:
# Inspect the noun-tagged pairs.
md_nouns

[('moby', 'NOUN'),
 ('dick', 'NOUN'),
 ('herman', 'NOUN'),
 ('melville', 'NOUN'),
 ('etymology', 'NOUN'),
 ('consumptive', 'NOUN'),
 ('usher', 'NOUN'),
 ('grammar', 'NOUN'),
 ('school', 'NOUN'),
 ('pale', 'NOUN'),
 ('usher', 'NOUN'),
 ('threadbare', 'NOUN'),
 ('heart', 'NOUN'),
 ('body', 'NOUN'),
 ('brain', 'NOUN'),
 ('i', 'NOUN'),
 ('lexicons', 'NOUN'),
 ('grammars', 'NOUN'),
 ('queer', 'NOUN'),
 ('handkerchief', 'NOUN'),
 ('gay', 'NOUN'),
 ('flags', 'NOUN'),
 ('nations', 'NOUN'),
 ('world', 'NOUN'),
 ('grammars', 'NOUN'),
 ('mortality', 'NOUN'),
 ('hand', 'NOUN'),
 ('school', 'NOUN'),
 ('others', 'NOUN'),
 ('name', 'NOUN'),
 ('fish', 'NOUN'),
 ('tongue', 'NOUN'),
 ('ignorance', 'NOUN'),
 ('letter', 'NOUN'),
 ('h', 'NOUN'),
 ('signification', 'NOUN'),
 ('word', 'NOUN'),
 ('hackluyt', 'NOUN'),
 ('sw', 'NOUN'),
 ('dan', 'NOUN'),
 ('hval', 'NOUN'),
 ('animal', 'NOUN'),
 ('roundness', 'NOUN'),
 ('dan', 'NOUN'),
 ('hvalt', 'NOUN'),
 ('webster', 'NOUN'),
 ('s', 'NOUN'),
 ('whale', 'NOUN'),


In [15]:
# Count occurrences of each noun. The keys are the (word, 'NOUN') tuples
# from md_nouns, not bare words.
nouns_fd = nltk.FreqDist(md_nouns)

In [16]:
# Inspect the frequency distribution.
nouns_fd

FreqDist({('i', 'NOUN'): 1182, ('whale', 'NOUN'): 909, ('s', 'NOUN'): 774, ('man', 'NOUN'): 527, ('ship', 'NOUN'): 498, ('sea', 'NOUN'): 435, ('head', 'NOUN'): 337, ('time', 'NOUN'): 334, ('boat', 'NOUN'): 332, ('ahab', 'NOUN'): 278, ...})

In [17]:
# List the ten most frequent noun entries with their counts.
nouns_fd.most_common(10)

[(('i', 'NOUN'), 1182),
 (('whale', 'NOUN'), 909),
 (('s', 'NOUN'), 774),
 (('man', 'NOUN'), 527),
 (('ship', 'NOUN'), 498),
 (('sea', 'NOUN'), 435),
 (('head', 'NOUN'), 337),
 (('time', 'NOUN'), 334),
 (('boat', 'NOUN'), 332),
 (('ahab', 'NOUN'), 278)]