In [22]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('words')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


True

## Parts of Speech (POS) Tagging
**Note: All solution notebooks can be found by clicking on the Jupyter icon on the top left of this workspace.**

In [23]:
# import statements
from nltk import pos_tag,ne_chunk
import re


In [24]:
text = "I always lie down to tell a lie."

In [25]:
# tokenize text
sentence = word_tokenize(re.sub(re.compile(r'[^a-zA-Z0-9]'),' ',text.lower()))
print(sentence)
# tag each word with part of speech
print(pos_tag(sentence))

['i', 'always', 'lie', 'down', 'to', 'tell', 'a', 'lie']
[('i', 'NN'), ('always', 'RB'), ('lie', 'VBD'), ('down', 'RB'), ('to', 'TO'), ('tell', 'VB'), ('a', 'DT'), ('lie', 'NN')]


## Named Entity Recognition (NER)

In [26]:
text = "Antonio joined Udacity Inc. in California."
sentence = word_tokenize(text)
print(sentence)  #re.sub(re.compile(r'[^a-zA-Z0-9]'),' ',text.lower())

['Antonio', 'joined', 'Udacity', 'Inc.', 'in', 'California', '.']


In [29]:
print(pos_tag(sentence))

[('Antonio', 'NNP'), ('joined', 'VBD'), ('Udacity', 'NNP'), ('Inc.', 'NNP'), ('in', 'IN'), ('California', 'NNP'), ('.', '.')]


In [28]:
# tokenize, pos tag, then recognize named entities in text
tree = ne_chunk(pos_tag(sentence))
print(tree)

(S
  (PERSON Antonio/NNP)
  joined/VBD
  (ORGANIZATION Udacity/NNP Inc./NNP)
  in/IN
  (GPE California/NNP)
  ./.)


### Sentence Parsing

In [30]:
# Define a custom grammar
my_grammar = nltk.CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | 'I'
VP -> V NP | VP PP
Det -> 'an' | 'my'
N -> 'elephant' | 'pajamas'
V -> 'shot'
P -> 'in'
""")
parser = nltk.ChartParser(my_grammar)

In [31]:
# Parse a sentence
sentence = word_tokenize("I shot an elephant in my pajamas")
for tree in parser.parse(sentence):
    print(tree)

(S
  (NP I)
  (VP
    (VP (V shot) (NP (Det an) (N elephant)))
    (PP (P in) (NP (Det my) (N pajamas)))))
(S
  (NP I)
  (VP
    (V shot)
    (NP (Det an) (N elephant) (PP (P in) (NP (Det my) (N pajamas))))))
