In [75]:
import nltk

In [76]:
# NLTK data packages fetched before anything else in the notebook runs.
# The order matches the original one-call-per-line version.
NLTK_PACKAGES = (
    'gutenberg',
    'genesis',
    'inaugural',
    'nps_chat',
    'webtext',
    'treebank',
    'udhr',
    'wordnet',
    'omw-1.4',
    'punkt',
    'tagsets',
    'averaged_perceptron_tagger',
)
for package_name in NLTK_PACKAGES:
    nltk.download(package_name)

# The star import is kept on purpose: later cells use text1, text7, sent1 and
# FreqDist, none of which are imported anywhere else in the visible cells.
from nltk.book import *

[nltk_data] Downloading package gutenberg to /home/jovyan/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package genesis to /home/jovyan/nltk_data...
[nltk_data]   Package genesis is already up-to-date!
[nltk_data] Downloading package inaugural to /home/jovyan/nltk_data...
[nltk_data]   Package inaugural is already up-to-date!
[nltk_data] Downloading package nps_chat to /home/jovyan/nltk_data...
[nltk_data]   Package nps_chat is already up-to-date!
[nltk_data] Downloading package webtext to /home/jovyan/nltk_data...
[nltk_data]   Package webtext is already up-to-date!
[nltk_data] Downloading package treebank to /home/jovyan/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package udhr to /home/jovyan/nltk_data...
[nltk_data]   Package udhr is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downlo

In [77]:
# Show the repr of text1 (not defined in any visible cell; presumably bound by the nltk.book star import)
text1

<Text: Moby Dick by Herman Melville 1851>

In [78]:
# One sentence from text1, stored as a list of tokens
sent1 

['Call', 'me', 'Ishmael', '.']

In [79]:
# Total number of tokens in Moby Dick; punctuation marks count as tokens too
len(text1) 

260819

In [80]:
# Number of tokens in sent1 (the final '.' is a token of its own)
len(sent1)

4

In [81]:
# Number of distinct tokens (vocabulary size) in Moby Dick -- tokens, not characters
len(set(text1)) 

19317

In [82]:
# Five distinct tokens from text7. A set has no defined order, so these are
# NOT the first five tokens of the text and may differ from run to run.
list(set(text7))[:5] 

['denying', 'initiated', 'Salvador', 'requests', 'cracks']

In [83]:
# Frequency distribution over the tokens of text7; its length is the number of distinct tokens
dist = FreqDist(text7)
len(dist)

12408

In [84]:
# The keys of the distribution are the distinct tokens; show the first five keys
vocab1 = dist.keys()
list(vocab1)[:5]

['Pierre', 'Vinken', ',', '61', 'years']

In [85]:
# Number of occurrences of the token 'four' in text7
dist['four'] 

20

In [86]:
# Keep the tokens in vocab1 that are longer than five characters
# and occur more than 100 times in text7.
freqwords = [
    token
    for token in vocab1
    if len(token) > 5 and dist[token] > 100
]
freqwords[:5]

['billion', 'company', 'president', 'because', 'market']

### Normalization and Stemming

In [87]:
# Normalization: split on whitespace and lower-case each word so that
# case variants of the same word compare equal.
input1 = "List listed lists listing listings"
words1 = [word.lower() for word in input1.split()]
words1

['list', 'listed', 'lists', 'listing', 'listings']

In [88]:
# Stemming: run the Porter stemmer over each normalized word.
porter = nltk.PorterStemmer()
list(map(porter.stem, words1))

['list', 'list', 'list', 'list', 'list']

### Lemmatization

In [89]:
# Stemming strips suffixes, so the result is not always a real word
porter.stem('universal')

'univers'

In [90]:
# The WordNet lemmatizer, by contrast, leaves 'universal' unchanged here
# (presumably it only returns forms that are valid words -- confirm against the NLTK docs)
wnl = nltk.WordNetLemmatizer()
wnl.lemmatize('universal')

'universal'

### Tokenization

In [91]:
# Naive tokenization: str.split() only breaks on whitespace, so punctuation
# stays attached to words (e.g. 'bed.') and contractions are not separated
text11 = "Children shouldn't drink a sugary drink before bed."
text11.split()

['Children', "shouldn't", 'drink', 'a', 'sugary', 'drink', 'before', 'bed.']

In [96]:
# NLTK's tokenizer splits off the contraction "n't" and the final period as separate tokens
nltk.word_tokenize(text11)

['Children',
 'should',
 "n't",
 'drink',
 'a',
 'sugary',
 'drink',
 'before',
 'bed',
 '.']

### Sentence Splitting

In [93]:
# Sentence splitting: the expected count is 4, i.e. the periods inside
# "U.S." and "$2.99" should not be treated as sentence boundaries.
# NOTE(review): 'sentnece' in the sample text is a typo for 'sentence'; it is
# left untouched here because it is runtime data, not a comment.
text12 = 'This is the first sentnece. A gallon of milk in U.S. is $2.99. Is this the third sentence? Yes, it is!'
sentences = nltk.sent_tokenize(text12)
len(sentences)

4

### Part of Speech Tagging (adjective, noun, pronoun, etc.)

In [99]:
# Returns a list of (word, part-of-speech tag) tuples
nltk.pos_tag(nltk.word_tokenize(text11))

[('Children', 'NNP'),
 ('should', 'MD'),
 ("n't", 'RB'),
 ('drink', 'VB'),
 ('a', 'DT'),
 ('sugary', 'JJ'),
 ('drink', 'NN'),
 ('before', 'IN'),
 ('bed', 'NN'),
 ('.', '.')]

In [101]:
# Print the documentation for a tag: its definition followed by example words
nltk.help.upenn_tagset("NNP")

NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...


In [102]:
# Ambiguous example: "Visiting" can be read as a verb (the act of visiting aunts)
# or as an adjective (aunts who are visiting)
text14 = nltk.word_tokenize("Visiting aunts can be a nuisance")
nltk.pos_tag(text14) 
# The tagger labels 'Visiting' as a verb (VBG) rather than an adjective, presumably because that reading is more likely

[('Visiting', 'VBG'),
 ('aunts', 'NNS'),
 ('can', 'MD'),
 ('be', 'VB'),
 ('a', 'DT'),
 ('nuisance', 'NN')]

### Parsing Sentence Structure

In [115]:
# Parse a sentence with a small hand-written context-free grammar
text15 = nltk.word_tokenize("Alice loves Bob")
# Grammar rules: S expands to NP VP, VP expands to V NP; the quoted strings
# are the only words this grammar accepts
grammar = nltk.CFG.fromstring("""
S -> NP VP
VP -> V NP
NP -> 'Alice' | 'Bob'
V -> 'loves'
""")

# Build a chart parser for the grammar and collect the parse trees for the token list
parser = nltk.ChartParser(grammar)
trees = parser.parse_all(text15)
# Print each tree in bracketed form
for tree in trees:
    print(tree)

(S (NP Alice) (VP (V loves) (NP Bob)))


In [119]:
# Read an already-parsed sentence from the treebank corpus instead of writing a grammar by hand:
# take the first parsed sentence of file 'wsj_0001.mrg' and print its tree
from nltk.corpus import treebank
text17 = treebank.parsed_sents('wsj_0001.mrg')[0]
print(text17)

(S
  (NP-SBJ
    (NP (NNP Pierre) (NNP Vinken))
    (, ,)
    (ADJP (NP (CD 61) (NNS years)) (JJ old))
    (, ,))
  (VP
    (MD will)
    (VP
      (VB join)
      (NP (DT the) (NN board))
      (PP-CLR (IN as) (NP (DT a) (JJ nonexecutive) (NN director)))
      (NP-TMP (NNP Nov.) (CD 29))))
  (. .))
