In [1]:
import nltk

In [2]:
# Display the installed NLTK version.
nltk.__version__

'3.2.2'

### Tokenizing text into sentences

In [8]:
from nltk.tokenize import sent_tokenize
# sent_tokenize uses an instance of PunktSentenceTokenizer 
#    from the nltk.tokenize.punkt

In [9]:
# Sample paragraph: one English sentence followed by two Portuguese ones; the
# last sentence contains the abbreviation "Mr." before its final period.
para = "Hello World. É bom ver você. Obrigado por comprar este livro, Mr. Abelardo."

In [10]:
# No language argument is passed, so the default language (English) is used.
sent_tokenize(para)

['Hello World.',
 'É bom ver você.',
 'Obrigado por comprar este livro, Mr. Abelardo.']

Cada chamada a `sent_tokenize` carrega um tokenizer; para processar vários textos, é melhor instanciar o tokenizer uma única vez e reutilizá-lo.

In [13]:
import nltk.data

# Load the Portuguese Punkt sentence tokenizer once so the same instance can
# be reused for several texts.
tokenizer = nltk.data.load("tokenizers/punkt/portuguese.pickle")

# Split the sample paragraph into sentences with the loaded tokenizer.
tokenizer.tokenize(para)

['Hello World.',
 'É bom ver você.',
 'Obrigado por comprar este livro, Mr. Abelardo.']

### Tokenizing sentences into words

In [21]:
from nltk.tokenize import word_tokenize

# No language argument is passed, so the default language (English) is used.
# The final period comes back as a separate token.
word_tokenize("Hello World.")

['Hello', 'World', '.']

#### TreebankWordTokenizer

In [22]:
from nltk.tokenize import TreebankWordTokenizer

# Instantiate the tokenizer once; the same object is reused in the next cell.
treebank_tokenizer = TreebankWordTokenizer()

# The exclamation mark comes back as its own token.
treebank_tokenizer.tokenize("Hello World!")

['Hello', 'World', '!']

In [23]:
# The contraction is split into two tokens: "ca" and "n't".
treebank_tokenizer.tokenize("can't")

['ca', "n't"]

#### PunktWordTokenizer

Removido na versão 3 do NLTK. Ver: https://github.com/nltk/nltk/pull/746

#### WordPunctTokenizer

In [25]:
from nltk.tokenize import WordPunctTokenizer

word_punct_tokenizer = WordPunctTokenizer()

# The apostrophe becomes a token of its own, so "can't" yields three pieces.
word_punct_tokenizer.tokenize("can't")

['can', "'", 't']

### Tokenizing sentences using regular expressions

In [28]:
from nltk.tokenize import regexp_tokenize

# The pattern is written as a raw string because "\w" is not a valid Python
# string escape; a plain literal triggers an invalid-escape warning on newer
# Python versions. The regex itself is unchanged: each run of word characters
# and apostrophes is one token, which keeps "can't" in a single piece.
regexp_tokenize("can't do anything", r"[\w']+")

["can't", 'do', 'anything']

In [41]:
from nltk.tokenize import RegexpTokenizer

# Reusable tokenizer whose pattern describes the tokens themselves: runs of
# word characters and apostrophes. A raw string is used because "\w" is not a
# valid Python string escape and a plain literal triggers an invalid-escape
# warning on newer Python versions; the compiled regex is the same.
regexp_tokenizer_1 = RegexpTokenizer(r"[\w']+")

# Punctuation is not matched by the pattern, so "!" is absent from the result.
regexp_tokenizer_1.tokenize("Não posso mais te ver! Não insista!")

['Não', 'posso', 'mais', 'te', 'ver', 'Não', 'insista']

In [42]:
# With gaps=True the pattern describes the separators between tokens rather
# than the tokens. A raw string is used because "\s" is not a valid Python
# string escape and a plain literal triggers an invalid-escape warning on newer
# Python versions; the compiled regex is the same.
regexp_tokenizer_2 = RegexpTokenizer(r"\s+", gaps=True)

# Splitting on whitespace only leaves punctuation attached ("ver!", "insista!").
regexp_tokenizer_2.tokenize("Não posso mais te ver! Não insista!")

['Não', 'posso', 'mais', 'te', 'ver!', 'Não', 'insista!']

In [37]:
from nltk.tokenize import BlanklineTokenizer

blankline_tokenizer = BlanklineTokenizer()

# The text is split at the blank line ("\n\n"); the whitespace around the
# break does not appear in the returned pieces.
blankline_tokenizer.tokenize("can't do anything \n\n anything")

["can't do anything", 'anything']

### Filtering stopwords in a tokenized sentence

In [40]:
from nltk.corpus import stopwords

# Keep the Portuguese stopwords in a set for membership tests.
pt_stops = set(stopwords.words("portuguese"))
# Called without an argument, stopwords.words() returns the stopwords of
# every available language.

words = regexp_tokenizer_1.tokenize("Não posso mais te ver! Não insista!")

# NOTE(review): the membership test is case-sensitive, and the capitalised
# "Não" survives the filter. If the stopword list only holds lowercase forms
# (verify), compare word.lower() instead so that it is dropped as well.
[word for word in words if word not in pt_stops]

['Não', 'posso', 'ver', 'Não', 'insista']

In [50]:
# Size of the stopword set together with a ten-item sample.
# NOTE(review): a set has no defined order, so this slice can show different
# words on another run; sorted(pt_stops)[10:20] would give a stable sample.
len(pt_stops), list(pt_stops)[10:20]

(203,
 ['nossos',
  'esse',
  'tu',
  'houvéramos',
  'for',
  'terá',
  'tua',
  'pelas',
  'essas',
  'houverá'])

#### linguagens disponíveis

In [47]:
# List the file ids of the stopwords corpus; each one is a language name such
# as the "portuguese" passed to stopwords.words() earlier.
stopwords.fileids()

['danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'hungarian',
 'italian',
 'norwegian',
 'portuguese',
 'russian',
 'spanish',
 'swedish',
 'turkish']

### Looking up synsets for a word in WordNet

In [55]:
from nltk.corpus import wordnet

# Take the first synset returned for "cookbook".
syn = wordnet.synsets("cookbook")[0]

# Show the synset's name together with its definition.
syn.name(), syn.definition()

('cookbook.n.01', 'a book of recipes and cooking directions')

In [63]:
# Fetch the same synset directly by its name.
wordnet.synset(syn.name())

Synset('cookbook.n.01')

In [64]:
# Example sentences attached to the first synset of the word.
wordnet.synsets("sex")[0].examples()

['they had sex in the back seat']

#### hypernym

In [65]:
# Hypernyms: the less specific (more general) synsets.
wordnet.synsets("sex")[0].hypernyms()

[Synset('bodily_process.n.01')]

In [66]:
# Hyponyms: the more specific synsets.
wordnet.synsets("sex")[0].hyponyms()

[Synset('autoeroticism.n.01'),
 Synset('bestiality.n.02'),
 Synset('bisexuality.n.02'),
 Synset('bondage.n.03'),
 Synset('carnal_abuse.n.01'),
 Synset('conception.n.02'),
 Synset('coupling.n.03'),
 Synset('foreplay.n.01'),
 Synset('heterosexuality.n.01'),
 Synset('homosexuality.n.01'),
 Synset('lechery.n.01'),
 Synset('outercourse.n.01'),
 Synset('perversion.n.02'),
 Synset('pleasure.n.05'),
 Synset('promiscuity.n.01'),
 Synset('reproduction.n.05'),
 Synset('safe_sex.n.01'),
 Synset('sexual_intercourse.n.01'),
 Synset('sexual_love.n.02')]

In [67]:
# Top of the hypernym hierarchy reached from this noun synset.
wordnet.synsets("sex")[0].root_hypernyms()

[Synset('entity.n.01')]

In [72]:
# This verb synset resolves to a different root (make.v.03) than the noun
# synset in the previous cell (entity.n.01).
wordnet.synset("can.v.01").root_hypernyms()

[Synset('make.v.03')]

In [75]:
wordnet.synsets("sex")[0].hypernym_paths()
# Why a list of lists? Each inner list is one path from the root down to the
# synset. A synset can have more than one hypernym and therefore more than one
# path to the root; this synset has a single path.

[[Synset('entity.n.01'),
  Synset('physical_entity.n.01'),
  Synset('process.n.06'),
  Synset('organic_process.n.01'),
  Synset('bodily_process.n.01'),
  Synset('sexual_activity.n.01')]]

### Part-of-speech(POS)