In [2]:
import nltk

In [3]:
nltk.__version__

'3.2.1'

### Tokenizing text into sentences

In [4]:
from nltk.tokenize import sent_tokenize
# sent_tokenize uses an instance of PunktSentenceTokenizer 
#    from the nltk.tokenize.punkt

In [5]:
para = "Hello World. É bom ver você. Obrigado por comprar este livro, Mr. Abelardo."

In [6]:
sent_tokenize(para)
# no language is passed here, so the default (English) is used

['Hello World.',
 'É bom ver você.',
 'Obrigado por comprar este livro, Mr. Abelardo.']

Cada chamada a `sent_tokenize` carrega um tokenizer; para tokenizar vários textos, é melhor instanciar um tokenizer uma única vez e reutilizá-lo.

In [7]:
import nltk.data

# Load the Portuguese Punkt model once so the same tokenizer object can be
# reused for every text instead of being reloaded on each call.
tokenizer = nltk.data.load("tokenizers/punkt/portuguese.pickle")

tokenizer.tokenize(para)

['Hello World.',
 'É bom ver você.',
 'Obrigado por comprar este livro, Mr. Abelardo.']

### Tokenizing sentences into words

In [8]:
from nltk.tokenize import word_tokenize

word_tokenize("Hello World.")
# no language is passed here, so the default (English) is used

['Hello', 'World', '.']

#### TreebankWordTokenizer

In [9]:
from nltk.tokenize import TreebankWordTokenizer

treebank_tokenizer = TreebankWordTokenizer()

treebank_tokenizer.tokenize("Hello World!")

['Hello', 'World', '!']

In [10]:
treebank_tokenizer.tokenize("can't")

['ca', "n't"]

#### PunktWordTokenizer

Removido na versão 3 do NLTK; ver https://github.com/nltk/nltk/pull/746

#### WordPunctTokenizer

In [11]:
from nltk.tokenize import WordPunctTokenizer

word_punct_tokenizer = WordPunctTokenizer()

word_punct_tokenizer.tokenize("can't")

['can', "'", 't']

### Tokenizing sentences using regular expressions

In [12]:
from nltk.tokenize import regexp_tokenize

# Raw string: "\w" is a regex escape, not a Python string escape, so a plain
# literal triggers an invalid-escape warning on newer Python versions.
regexp_tokenize("can't do anything", r"[\w']+")

["can't", 'do', 'anything']

In [13]:
from nltk.tokenize import RegexpTokenizer

# Tokens are runs of word characters and apostrophes, so punctuation is dropped.
# The pattern is a raw string so that "\w" reaches the regex engine untouched
# instead of being parsed as an (invalid) Python string escape.
regexp_tokenizer_1 = RegexpTokenizer(r"[\w']+")

regexp_tokenizer_1.tokenize("Não posso mais te ver! Não insista!")

['Não', 'posso', 'mais', 'te', 'ver', 'Não', 'insista']

In [14]:
# gaps=True: the pattern describes the separators (whitespace) rather than the
# tokens, so punctuation stays attached to the neighbouring word.
# Raw string avoids the invalid "\s" Python string escape.
regexp_tokenizer_2 = RegexpTokenizer(r"\s+", gaps=True)

regexp_tokenizer_2.tokenize("Não posso mais te ver! Não insista!")

['Não', 'posso', 'mais', 'te', 'ver!', 'Não', 'insista!']

In [15]:
from nltk.tokenize import BlanklineTokenizer

blankline_tokenizer = BlanklineTokenizer()

blankline_tokenizer.tokenize("can't do anything \n\n anything")

["can't do anything", 'anything']

### Filtering stopwords in a tokenized sentence

In [18]:
from nltk.corpus import stopwords

pt_stops = set(stopwords.words("portuguese"))
# stopwords.words() called without arguments returns the stopwords of all available languages

words = regexp_tokenizer_1.tokenize("Não posso mais te ver! Não insista!")

# NOTE(review): this membership test is case-sensitive, and the capitalised
# "Não" survives in the output — presumably the stopword list is lowercase.
# Lowercase each token before comparing if that is not intended.
[word for word in words if word not in pt_stops]

['Não', 'posso', 'ver', 'Não', 'insista']

In [19]:
len(pt_stops), list(pt_stops)[10:20]

(203,
 ['pela',
  'esteja',
  'tenho',
  'são',
  'depois',
  'tínhamos',
  'houveriam',
  'da',
  'hajamos',
  'suas'])

#### linguagens disponíveis

In [20]:
stopwords.fileids()

['danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'hungarian',
 'italian',
 'kazakh',
 'norwegian',
 'portuguese',
 'russian',
 'spanish',
 'swedish',
 'turkish']

### Looking up synsets for a word in WordNet

In [22]:
# Fetch only the WordNet corpus used in this section. Calling nltk.download()
# with no arguments opens the interactive downloader, which blocks an
# unattended "Restart & Run All".
nltk.download("wordnet")

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [23]:
from nltk.corpus import wordnet

syn = wordnet.synsets("cookbook")[0]

syn.name(), syn.definition()

('cookbook.n.01', 'a book of recipes and cooking directions')

In [24]:
wordnet.synset(syn.name())

Synset('cookbook.n.01')

In [25]:
wordnet.synsets("sex")[0].examples()

['they had sex in the back seat']

#### hypernym

In [26]:
# less specific (more general) synsets
wordnet.synsets("sex")[0].hypernyms()

[Synset('bodily_process.n.01')]

In [27]:
# more specific synsets
wordnet.synsets("sex")[0].hyponyms()

[Synset('autoeroticism.n.01'),
 Synset('bestiality.n.02'),
 Synset('bisexuality.n.02'),
 Synset('bondage.n.03'),
 Synset('carnal_abuse.n.01'),
 Synset('conception.n.02'),
 Synset('coupling.n.03'),
 Synset('foreplay.n.01'),
 Synset('heterosexuality.n.01'),
 Synset('homosexuality.n.01'),
 Synset('lechery.n.01'),
 Synset('outercourse.n.01'),
 Synset('perversion.n.02'),
 Synset('pleasure.n.05'),
 Synset('promiscuity.n.01'),
 Synset('reproduction.n.05'),
 Synset('safe_sex.n.01'),
 Synset('sexual_intercourse.n.01'),
 Synset('sexual_love.n.02')]

In [28]:
wordnet.synsets("sex")[0].root_hypernyms()

[Synset('entity.n.01')]

In [29]:
wordnet.synset("can.v.01").root_hypernyms()

[Synset('make.v.03')]

In [30]:
wordnet.synsets("sex")[0].hypernym_paths()
# Open question: why is this a list of lists? Presumably one inner list per
# distinct path from a root hypernym down to this synset — TODO confirm.

[[Synset('entity.n.01'),
  Synset('physical_entity.n.01'),
  Synset('process.n.06'),
  Synset('organic_process.n.01'),
  Synset('bodily_process.n.01'),
  Synset('sexual_activity.n.01')]]

### Part-of-speech (POS)

In [32]:
syn.pos()

'n'

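There are four common parts of speech (POS) in WordNet. Some adjective synsets carry a fifth tag, `s`, which is what the `great` example below returns.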

<table>
    <tr>
        <th>Part-of-speech</th>
        <th>Tag</th>
    </tr>
    <tr>
        <td>Noun</td>
        <td>n</td>
    </tr>
    <tr>
        <td>Adjective</td>
        <td>a</td>
    </tr>
    <tr>
        <td>Adverb</td>
        <td>r</td>
    </tr>
    <tr>
        <td>Verb</td>
        <td>v</td>
    </tr>
</table>

In [51]:
from functools import reduce
from collections import defaultdict

def redutor(d, s):
    """Fold step for ``reduce``: file synset ``s`` under its POS tag in ``d``.

    ``d`` maps a POS tag to the list of synsets collected so far for that tag.
    It is modified in place and returned so it can act as the accumulator.
    """
    same_pos = d[s.pos()]
    same_pos.append(s)
    return d

reduce(redutor, wordnet.synsets("great"), defaultdict(list))

defaultdict(list,
            {'n': [Synset('great.n.01')],
             's': [Synset('great.s.01'),
              Synset('great.s.02'),
              Synset('great.s.03'),
              Synset('bang-up.s.01'),
              Synset('capital.s.03'),
              Synset('big.s.13')]})

### Looking up lemmas and synonyms in WordNet

In [91]:
def lemas(word, syn=False):
    """Print each synset matching ``word`` followed by the names of its lemmas.

    When ``syn`` is true, ``word`` is treated as a synset name and resolved
    with ``wordnet.synset``; otherwise every synset returned by
    ``wordnet.synsets(word)`` is listed. Nothing is returned.
    """
    print("synsets:\n")
    # A distinct loop name keeps the ``syn`` flag from being rebound below.
    found = [wordnet.synset(word)] if syn else wordnet.synsets(word)

    for synset in found:
        print("[{}]".format(synset.name()))
        print("lemmas:")
        for lemma in synset.lemmas():
            print("\t{}".format(lemma.name()))

In [92]:
lemas("cookbook")

synsets:

[cookbook.n.01]
lemmas:
	cookbook
	cookery_book


In [93]:
lemas("button")

synsets:

[button.n.01]
lemmas:
	button
[push_button.n.01]
lemmas:
	push_button
	push
	button
[button.n.03]
lemmas:
	button
[button.n.04]
lemmas:
	button
[clitoris.n.01]
lemmas:
	clitoris
	clit
	button
[release.n.08]
lemmas:
	release
	button
[button.n.07]
lemmas:
	button
[button.v.01]
lemmas:
	button
[button.v.02]
lemmas:
	button


#### Antonyms

##### good 1

In [94]:
good = wordnet.synset("good.n.02")

good.definition()

'moral excellence or admirableness'

In [100]:
evil = good.lemmas()[0].antonyms()[0]

evil.name()

'evil'

In [102]:
evil.synset().definition()

'the quality of being morally wrong in principle or practice'

##### good 2

In [105]:
good = wordnet.synset("good.a.01")

good.definition()

'having desirable or positive qualities especially those suitable for a thing specified'

In [106]:
bad = good.lemmas()[0].antonyms()[0]

bad.name()

'bad'

In [107]:
bad.synset().definition()

'having undesirable or negative qualities'

### Calculating WordNet synset similarity

*wup_similarity* is short for *Wu-Palmer Similarity*, which is a scoring method based on how similar the word senses are and where the synsets occur relative to each other in the hypernym tree

In [109]:
cb = wordnet.synset("cookbook.n.01")
ib = wordnet.synset("instruction_book.n.01")

cb.wup_similarity(ib), ib.wup_similarity(cb)

(0.9166666666666666, 0.9166666666666666)

In [112]:
cb.hypernyms(), ib.hypernyms()
# same hypernyms

([Synset('reference_book.n.01')], [Synset('reference_book.n.01')])

In [113]:
cb.wup_similarity(wordnet.synset("book.n.01"))

0.9090909090909091

In [116]:
cb.shortest_path_distance(wordnet.synset("book.n.01"))

2

In [117]:
cb.common_hypernyms(wordnet.synset("book.n.01"))

[Synset('work.n.02'),
 Synset('entity.n.01'),
 Synset('object.n.01'),
 Synset('book.n.01'),
 Synset('product.n.02'),
 Synset('physical_entity.n.01'),
 Synset('creation.n.02'),
 Synset('artifact.n.01'),
 Synset('whole.n.02'),
 Synset('publication.n.01')]

In [120]:
cb.wup_similarity(wordnet.synset("dog.n.01"))

0.38095238095238093

#### Comparing verbs

In [122]:
cook = wordnet.synset("cook.v.01")
bake = wordnet.synset("bake.v.02")

cook.wup_similarity(bake)
# apparently the similarity value has changed :M

0.6666666666666666

#### Path and LCH similarity

In [137]:
cb.path_similarity(ib)

0.3333333333333333

In [138]:
cb.path_similarity(wordnet.synset("dog.n.01"))

0.07142857142857142

In [142]:
%%time

cb.lch_similarity(ib)

Wall time: 500 µs


2.538973871058276

In [143]:
cb.lch_similarity(wordnet.synset("dog.n.01"))

0.9985288301111273

### Discovering word collocations

In [163]:
from nltk.corpus import webtext
from nltk.collocations import BigramCollocationFinder
from nltk.collocations import TrigramCollocationFinder
from nltk.collocations import TrigramAssocMeasures
from nltk.collocations import BigramAssocMeasures

In [147]:
# Lowercase every token so differently-cased occurrences count as the same word
words = [w.lower() for w in webtext.words("grail.txt")]

bcf = BigramCollocationFinder.from_words(words)

# Ask for the 4 best bigrams according to the likelihood-ratio measure
bcf.nbest(BigramAssocMeasures.likelihood_ratio, 4)

[("'", 's'), ('arthur', ':'), ('#', '1'), ("'", 't')]

Let's filter out short tokens (which removes the punctuation) and stopwords.

In [148]:
stopset = set(stopwords.words("english"))


def filter_stops(w):
    """Return True for tokens to discard: shorter than 3 characters or in ``stopset``."""
    if len(w) < 3:
        return True
    return w in stopset


# The filter is applied to the existing finder, so the ranking below differs
# from the unfiltered one computed earlier.
bcf.apply_word_filter(filter_stops)

bcf.nbest(BigramAssocMeasures.likelihood_ratio, 4)

[('black', 'knight'),
 ('clop', 'clop'),
 ('head', 'knight'),
 ('mumble', 'mumble')]

## Clarice

In [179]:
import codecs

def make_cfs(filename, min_freq=3, min_len=3, language="portuguese", ignore_case=False):
    """Build filtered bigram and trigram collocation finders for a UTF-8 text file.

    The file is read, tokenized with ``word_tokenize`` and fed to both finders.
    Each finder then gets a word filter (short tokens and stopwords) followed
    by a frequency filter.

    Parameters
    ----------
    filename : str
        Path of the UTF-8 encoded text file to analyse.
    min_freq : int, optional
        Threshold passed to ``apply_freq_filter`` (default 3).
    min_len : int, optional
        Tokens shorter than this many characters are rejected (default 3).
    language : str, optional
        Name passed both to ``word_tokenize`` and to ``stopwords.words``; it
        must be available for both resources (default "portuguese").
    ignore_case : bool, optional
        When True, tokens are lowercased before the stopword lookup. With the
        default (False) the lookup is case-sensitive, so capitalised stopwords
        such as a sentence-initial "Não" are presumably not filtered out
        (assuming the stopword list is lowercase).

    Returns
    -------
    tuple
        ``(bcf, tcf)``: the filtered bigram and trigram finders.
    """
    with codecs.open(filename, 'r', 'utf-8') as f:
        text = ' '.join(f.readlines())

    words = word_tokenize(text, language=language)
    stopset = set(stopwords.words(language))

    def filter_stops(w):
        # Length is always checked on the original token; only the stopword
        # lookup is optionally case-folded.
        candidate = w.lower() if ignore_case else w
        return len(w) < min_len or candidate in stopset

    bcf = BigramCollocationFinder.from_words(words)
    tcf = TrigramCollocationFinder.from_words(words)

    # Same filters, same order, for both finders.
    for finder in (bcf, tcf):
        finder.apply_word_filter(filter_stops)
        finder.apply_freq_filter(min_freq)

    return bcf, tcf

In [184]:
clarice_bcf, clarice_tcf = make_cfs('perto_do_coracao_selvagem.txt')

clarice_bcf.nbest(BigramAssocMeasures.likelihood_ratio, 5)

[('alguma', 'coisa'),
 ('cada', 'vez'),
 ('Aos', 'poucos'),
 ('olhos', 'abertos'),
 ('Não', 'sei')]

In [185]:
clarice_tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 5)

[('havia', 'alguma', 'coisa'),
 ('alguma', 'coisa', '...'),
 ('Não', 'sei', '...')]

In [186]:
casmurro_bcf, casmurro_tcf = make_cfs('dom_casmurro.txt')

casmurro_bcf.nbest(BigramAssocMeasures.likelihood_ratio, 5)

[('José', 'Dias'),
 ('prima', 'Justina'),
 ('tio', 'Cosme'),
 ('Minha', 'mãe'),
 ('Padre', 'Cabral')]

In [187]:
casmurro_tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 5)

[('José', 'Dias', 'achou'),
 ('Sr.', 'José', 'Dias'),
 ('posso', 'ser', 'padre'),
 ('veio', 'ter', 'comigo')]

#### Scoring ngrams

In [206]:
sum((1,2))

3

In [215]:
def filtra(juntas, pt1_pt2, N):
    """Score an n-gram as ``juntas`` minus the sum of the values in ``pt1_pt2``.

    ``juntas`` is presumably the count of the words occurring together and
    ``pt1_pt2`` the per-word counts (TODO confirm against the finder's scoring
    callback contract). ``N`` is accepted to match that signature but is unused.
    """
    soma_partes = sum(pt1_pt2)
    return -(soma_partes - juntas)

list(clarice_bcf.above_score(filtra, -15))

[('Direito', 'Público'),
 ('atenção', 'voltada'),
 ('segunda', 'vertigem'),
 ('prima', 'Isabel'),
 ('profundezas', 'chamo'),
 ('profúndezas', 'chamo'),
 ('Papai', 'morreu'),
 ('Estou', 'sofrendo')]

In [221]:
list(clarice_bcf.score_ngrams(filtra))[:10]

[(('Direito', 'Público'), -4.0),
 (('atenção', 'voltada'), -8.0),
 (('segunda', 'vertigem'), -9.0),
 (('prima', 'Isabel'), -11.0),
 (('profundezas', 'chamo'), -12.0),
 (('profúndezas', 'chamo'), -12.0),
 (('Papai', 'morreu'), -13.0),
 (('Estou', 'sofrendo'), -14.0),
 (('primeiro', 'romance'), -17.0),
 (('meia', 'escuridão'), -18.0)]

In [222]:
list(casmurro_bcf.score_ngrams(filtra))[:10]

[(('Passeio', 'Público'), -4.0),
 (('cigana', 'oblíqua'), -5.0),
 (('Dom', 'Casmurro'), -7.0),
 (('santos', 'óleos'), -7.0),
 (('Engenho', 'Novo'), -9.0),
 (('Nosso', 'Senhor'), -10.0),
 (('Santa', 'Mônica'), -11.0),
 (('arte', 'fina'), -13.0),
 (('juízo', 'final'), -13.0),
 (('serás', 'feliz'), -13.0)]