# Import Packages

In [1]:
import nltk
import os

In [2]:
import nltk.corpus

In [3]:
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer

In [4]:
from nltk.data import load

# Accessing NLTK corpora

In [6]:
# Resolve the NLTK "corpora" data directory, then print its directory listing.
corpora_dir = nltk.data.find("corpora")
print(os.listdir(corpora_dir))

['words.zip', 'movie_reviews', 'wordnet.zip', 'gutenberg.zip', 'stopwords.zip', 'stopwords', 'movie_reviews.zip', 'gutenberg', 'wordnet', 'words']


In [7]:
# One-time setup: uncomment the next line to fetch the Gutenberg corpus if it
# is not already present in the NLTK data directory.
# nltk.download('gutenberg')
from nltk.corpus import gutenberg
# Show the file ids of the texts shipped in the corpus.
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [8]:
gutenberg.words('austen-emma.txt')

['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', ...]

### Text Mining Shakespeare's Hamlet from the Gutenberg Corpus

In [9]:
# Load Hamlet from the Gutenberg corpus as a sequence of word tokens.
hamlet_fileid = 'shakespeare-hamlet.txt'
hamlet = nltk.corpus.gutenberg.words(hamlet_fileid)

In [10]:
len(hamlet)

37360

In [12]:
hamlet_pos = []

In [13]:
# Tag the first 2000 tokens one at a time and append each one-item result to
# hamlet_pos. Every token is handed to the tagger alone, i.e. without its
# neighbouring words.
hamlet_pos.extend(nltk.pos_tag([token]) for token in hamlet[:2000])

In [14]:
hamlet_pos

[[('[', 'NN')],
 [('The', 'DT')],
 [('Tragedie', 'NN')],
 [('of', 'IN')],
 [('Hamlet', 'NN')],
 [('by', 'IN')],
 [('William', 'NNP')],
 [('Shakespeare', 'NN')],
 [('1599', 'CD')],
 [(']', 'NN')],
 [('Actus', 'NN')],
 [('Primus', 'NN')],
 [('.', '.')],
 [('Scoena', 'NN')],
 [('Prima', 'NN')],
 [('.', '.')],
 [('Enter', 'NN')],
 [('Barnardo', 'NN')],
 [('and', 'CC')],
 [('Francisco', 'NNP')],
 [('two', 'CD')],
 [('Centinels', 'NNS')],
 [('.', '.')],
 [('Barnardo', 'NN')],
 [('.', '.')],
 [('Who', 'WP')],
 [("'", "''")],
 [('s', 'NN')],
 [('there', 'RB')],
 [('?', '.')],
 [('Fran', 'NN')],
 [('.', '.')],
 [('Nay', 'NN')],
 [('answer', 'NN')],
 [('me', 'PRP')],
 [(':', ':')],
 [('Stand', 'NN')],
 [('&', 'CC')],
 [('vnfold', 'NN')],
 [('your', 'PRP$')],
 [('selfe', 'NN')],
 [('Bar', 'NN')],
 [('.', '.')],
 [('Long', 'RB')],
 [('liue', 'NN')],
 [('the', 'DT')],
 [('King', 'VBG')],
 [('Fran', 'NN')],
 [('.', '.')],
 [('Barnardo', 'NN')],
 [('?', '.')],
 [('Bar', 'NN')],
 [('.', '.')],
 [('He'

### Getting words with a specific POS tag from the corpus

In [15]:
# Keep only the tokens whose tag is 'NNP' (proper noun). Each entry of
# hamlet_pos is a one-item list holding a (token, tag) pair.
hamlet_nnp = [
    tagged[0][0]
    for tagged in hamlet_pos
    if tagged[0][1] == 'NNP'
]

In [16]:
hamlet_nnp

['William',
 'Francisco',
 'Francisco',
 'Westward',
 'Bell',
 'God',
 'Mart',
 'Sunday',
 'Doth',
 'Doth',
 'God',
 'Day',
 'Sea',
 'Wherein',
 'Kingdome',
 'Bedrid']

### Only nouns beginning with an upper-case letter are tagged as proper nouns (NNP)

In [17]:
same_names = ["William","william","Reading","reading"]

In [18]:
# Tag each spelling on its own so the capitalised and lower-case forms can be
# compared line by line.
for name in same_names:
    tagged_name = nltk.pos_tag([name])
    print(tagged_name)

[('William', 'NNP')]
[('william', 'NN')]
[('Reading', 'VBG')]
[('reading', 'NN')]


# POS Tagging extras

In [19]:
sent = "Mary is driving a big car."

In [20]:
sent_tokens = word_tokenize(sent)

In [21]:
# Tag every token of `sent` in isolation and print each one-item result.
for tagged in map(nltk.pos_tag, ([tok] for tok in sent_tokens)):
    print(tagged)

[('Mary', 'NNP')]
[('is', 'VBZ')]
[('driving', 'VBG')]
[('a', 'DT')]
[('big', 'JJ')]
[('car', 'NN')]
[('.', '.')]


In [22]:
sent2 = "John is eating a delicious cake"

In [23]:
sent2_tokens = word_tokenize(sent2)

In [24]:
# Same per-token tagging as above, applied to `sent2`.
for piece in sent2_tokens:
    single_token = [piece]
    print(nltk.pos_tag(single_token))

[('John', 'NNP')]
[('is', 'VBZ')]
[('eating', 'VBG')]
[('a', 'DT')]
[('delicious', 'JJ')]
[('cake', 'NN')]


In [25]:
sent3= "Jim eats a banana"

In [26]:
sent3_tokens = word_tokenize(sent3)

In [27]:
# Tag each token of `sent3` by itself, printing one line per token.
# NOTE(review): in the recorded outputs 'eats' is tagged NNS here, but VBZ
# further down where the whole token list is tagged in one call.
for token in sent3_tokens:
    tagged_token = nltk.pos_tag([token])
    print(tagged_token)

[('Jim', 'NNP')]
[('eats', 'NNS')]
[('a', 'DT')]
[('banana', 'NN')]


### Regex Tokenizer

In [28]:
# Raw string, so the regex escapes (\W, \$, \d, \., \S) reach the regex engine
# unchanged instead of being parsed as invalid Python string escapes, which
# trigger a DeprecationWarning/SyntaxWarning on current Python versions.
# The pattern itself is unchanged.
# NOTE(review): `\W+` matches runs of non-word characters, so whitespace is
# returned as tokens; if only word tokens were intended, `\w+` is the usual
# first alternative -- confirm.
reg_tokenizer = RegexpTokenizer(r'(?u)\W+|\$[\d\.]+|\S+')

In [29]:
regex_tokenize = reg_tokenizer.tokenize(sent3)

In [30]:
regex_tokenize

['Jim', ' ', 'eats', ' ', 'a', ' ', 'banana']

In [31]:
# Tag the whole token list in a single call.
# NOTE(review): regex_tokenize still contains the ' ' separator tokens, so
# those are passed to the tagger and receive tags as well.
regex_tag = nltk.pos_tag(regex_tokenize)

In [32]:
regex_tag

[('Jim', 'NNP'),
 (' ', 'NNP'),
 ('eats', 'VBZ'),
 (' ', 'VBP'),
 ('a', 'DT'),
 (' ', 'NN'),
 ('banana', 'NN')]

# NER Tagging Extras

In [33]:
from nltk import ne_chunk

In [34]:
NE_sent = "The US President stays in the White House"

In [35]:
NE_tokens = word_tokenize(NE_sent)

In [36]:
NE_tags = nltk.pos_tag(NE_tokens)

In [37]:
NE_NER = ne_chunk(NE_tags)

In [38]:
print(NE_NER)

(S
  The/DT
  (ORGANIZATION US/NNP)
  President/NNP
  stays/VBZ
  in/IN
  the/DT
  (FACILITY White/NNP House/NNP))


In [39]:
NE_sent2 = "The state of New York touches the Atlantic Ocean"

In [40]:
# Tokenize, POS-tag, then chunk named entities, and print the resulting tree.
NE_tokens2 = word_tokenize(NE_sent2)
NE_tags2 = nltk.pos_tag(NE_tokens2)
print(ne_chunk(NE_tags2))

(S
  The/DT
  state/NN
  of/IN
  (GPE New/NNP York/NNP)
  touches/VBZ
  the/DT
  (ORGANIZATION Atlantic/NNP Ocean/NNP))


In [1]:
NE_sent3 = "Apple is a fruit and Apple is a Company's name"

In [None]:
# Tokenize, POS-tag and NE-chunk NE_sent3, then print the tree.
# NOTE(review): no output is recorded for this cell, and the execution counter
# restarted at 1 on the cell above -- presumably the kernel was restarted, in
# which case the import cells must be re-run before this line works. Confirm.
print(ne_chunk(nltk.pos_tag(word_tokenize(NE_sent3))))