# Mineração de Texto

In [1]:
import string
from unidecode import unidecode

import pandas as pd
import numpy as np

import nltk
from nltk.probability import FreqDist
from nltk.corpus import brown, treebank, movie_reviews, reuters

import re
import gensim
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import word2vec
import spacy
from scipy.spatial import distance

# !pip install pyemd
# import pyemd

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
%matplotlib inline

## Reconhecimento de Entidades

In [2]:
nlp = spacy.load("pt_core_news_sm")
doc = nlp("Pedro Álvares Cabral descobriu o Brasil em 22/04/1500.")

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Pedro Álvares Cabral 0 20 PER
Brasil 33 39 LOC


In [3]:
ner = nlp.get_pipe('ner')
print(ner.labels)

('LOC', 'MISC', 'ORG', 'PER')


In [4]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Brazil was officially discovered in 1500, when a fleet commanded by Portuguese diplomat Pedro Álvares Cabral, \
on its way to India, landed in Porto Seguro, between Salvador and Rio de Janeiro. There is, however, strong evidence that \
other Portuguese adventurers preceded him. Duarte Pacheco Pereira, in his book De Situ Orbis, tells of being in Brazil \
in 1498, sent by King Manuel of Portugal.")

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Brazil 0 6 GPE
1500 36 40 CARDINAL
Portuguese 68 78 NORP
Pedro Álvares Cabral 88 108 PERSON
India 124 129 GPE
Porto Seguro 141 153 PERSON
Salvador 163 171 GPE
Rio de Janeiro 176 190 GPE
Portuguese 238 248 NORP
Duarte Pacheco Pereira 275 297 PERSON
De Situ Orbis 311 324 WORK_OF_ART
Brazil 344 350 GPE
1498 354 358 DATE
King Manuel 368 379 PERSON
Portugal 383 391 GPE


In [5]:
ner = nlp.get_pipe('ner')
print(ner.labels)

('CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART')


In [6]:
from spacy import displacy
displacy.render(doc, style="ent")

In [7]:
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
print(ents)

[('Brazil', 0, 6, 'GPE'), ('1500', 36, 40, 'CARDINAL'), ('Portuguese', 68, 78, 'NORP'), ('Pedro Álvares Cabral', 88, 108, 'PERSON'), ('India', 124, 129, 'GPE'), ('Porto Seguro', 141, 153, 'PERSON'), ('Salvador', 163, 171, 'GPE'), ('Rio de Janeiro', 176, 190, 'GPE'), ('Portuguese', 238, 248, 'NORP'), ('Duarte Pacheco Pereira', 275, 297, 'PERSON'), ('De Situ Orbis', 311, 324, 'WORK_OF_ART'), ('Brazil', 344, 350, 'GPE'), ('1498', 354, 358, 'DATE'), ('King Manuel', 368, 379, 'PERSON'), ('Portugal', 383, 391, 'GPE')]


In [8]:
gpeEntities = list(set([e.text for e in doc.ents if e.label_ == "GPE"]))
personEntities = list(set([e.text for e in doc.ents if e.label_ == "PERSON"]))

In [9]:
gpeEntities

['Portugal', 'India', 'Brazil', 'Salvador', 'Rio de Janeiro']

In [10]:
personEntities

['Porto Seguro',
 'King Manuel',
 'Pedro Álvares Cabral',
 'Duarte Pacheco Pereira']

In [11]:
from itertools import *
entities = {key: list(g) for key, g in groupby(sorted(doc.ents, key=lambda x: x.label_), lambda x: x.label_)}
print(entities['GPE'], entities['PERSON'])

[Brazil, India, Salvador, Rio de Janeiro, Brazil, Portugal] [Pedro Álvares Cabral, Porto Seguro, Duarte Pacheco Pereira, King Manuel]


## Relacionamento de Entidades

In [12]:
doc = nlp("Portuguese sailor discovered Brazil in 1500.")
displacy.render(doc, style="dep")

In [13]:
for token in doc:
    print(token.text, token.dep_, token.head.text, token.head.pos_,
            [child for child in token.children])

Portuguese amod sailor NOUN []
sailor nsubj discovered VERB [Portuguese]
discovered ROOT discovered VERB [sailor, Brazil, in, .]
Brazil dobj discovered VERB []
in prep discovered VERB [1500]
1500 pobj in ADP []
. punct discovered VERB []


In [14]:
spacy.explain('amod')

'adjectival modifier'

In [15]:
spacy.explain('nsubj')

'nominal subject'

In [16]:
spacy.explain('dobj')

'direct object'

In [17]:
spacy.explain('prep')

'prepositional modifier'

In [18]:
spacy.explain('pobj')

'object of preposition'

In [19]:
doc = nlp("The book is on the table. The cat is on the roof. The book is on the roof. The cat is on the table.")
sentence_spans = list(doc.sents)
options = {"compact": True, "bg": "#09a3d5", "color": "white", "font": "Courier New"}
displacy.render(sentence_spans, style="dep", options=options)

In [20]:
doc = nlp("Brazil was officially discovered in 1500, when a fleet commanded by Portuguese diplomat Pedro Álvares Cabral, \
on its way to India, landed in Porto Seguro, between Salvador and Rio de Janeiro. There is, however, strong evidence that \
other Portuguese adventurers preceded him. Duarte Pacheco Pereira, in his book De Situ Orbis, tells of being in Brazil \
in 1498, sent by King Manuel of Portugal.")

sent = [sent.text for sent in doc.sents]
sent

['Brazil was officially discovered in 1500, when a fleet commanded by Portuguese diplomat Pedro Álvares Cabral, on its way to India, landed in Porto Seguro, between Salvador and Rio de Janeiro.',
 'There is, however, strong evidence that other Portuguese adventurers preceded him.',
 'Duarte Pacheco Pereira, in his book De Situ Orbis, tells of being in Brazil in 1498, sent by King Manuel of Portugal.']

In [21]:
from pathlib import Path
for sent in doc.sents:
    texto = nlp(str(sent.text))
    svg = displacy.render(texto, style="dep", jupyter=False)
    file_name = '-'.join([w.text for w in texto if not w.is_punct]) + ".svg"
    output_path = Path("./imagens/" + file_name)
    output_path.open("w", encoding="utf-8").write(str(svg))

# POS Tagging

In [22]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Brazil was officially discovered in 1500, when a fleet commanded by Portuguese diplomat Pedro Álvares Cabral, \
on its way to India, landed in Porto Seguro, between Salvador and Rio de Janeiro. There is, however, strong evidence that \
other Portuguese adventurers preceded him. Duarte Pacheco Pereira, in his book De Situ Orbis, tells of being in Brazil \
in 1498, sent by King Manuel of Portugal.")

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

Brazil Brazil PROPN NNP nsubjpass Xxxxx True False
was be AUX VBD auxpass xxx True True
officially officially ADV RB advmod xxxx True False
discovered discover VERB VBN ROOT xxxx True False
in in ADP IN prep xx True True
1500 1500 NUM CD pobj dddd False False
, , PUNCT , punct , False False
when when ADV WRB advmod xxxx True True
a a DET DT det x True True
fleet fleet NOUN NN nsubj xxxx True False
commanded command VERB VBN acl xxxx True False
by by ADP IN agent xx True True
Portuguese portuguese ADJ JJ amod Xxxxx True False
diplomat diplomat NOUN NN compound xxxx True False
Pedro Pedro PROPN NNP compound Xxxxx True False
Álvares Álvares PROPN NNP compound Xxxxx True False
Cabral Cabral PROPN NNP pobj Xxxxx True False
, , PUNCT , punct , False False
on on ADP IN prep xx True True
its -PRON- DET PRP$ poss xxx True True
way way NOUN NN pobj xxx True False
to to ADP IN prep xx True True
India India PROPN NNP pobj Xxxxx True False
, , PUNCT , punct , False False
landed land VERB VBD relcl 

In [23]:
tag_lst = nlp.pipe_labels['tagger']
print(tag_lst)

['$', "''", ',', '-LRB-', '-RRB-', '.', ':', 'ADD', 'AFX', 'CC', 'CD', 'DT', 'EX', 'FW', 'HYPH', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NFP', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', 'XX', '_SP', '``']


In [24]:
POS_counts = doc.count_by(spacy.attrs.POS)
for k,v in sorted(POS_counts.items()):
    print(f'{k:{4}}. {doc.vocab[k].text:{5}}: {v}')

  84. ADJ  : 4
  85. ADP  : 12
  86. ADV  : 3
  87. AUX  : 3
  89. CCONJ: 1
  90. DET  : 3
  92. NOUN : 6
  93. NUM  : 2
  95. PRON : 2
  96. PROPN: 21
  97. PUNCT: 12
  98. SCONJ: 1
 100. VERB : 6


In [25]:
TAG_counts = doc.count_by(spacy.attrs.TAG)
for k,v in sorted(TAG_counts.items()):
    print(f'{k}. {doc.vocab[k].text:{4}}: {v}')

164681854541413346. RB  : 2
783433942507015291. NNS : 1
1292078113972184607. IN  : 13
1534113631682161808. VBG : 1
2593208677638477497. ,   : 9
3822385049556375858. VBN : 3
4062917326063685704. PRP$: 2
8427216679587749980. CD  : 2
10554686591937588953. JJ  : 4
12646065887601541794. .   : 3
13656873538139661788. PRP : 1
13927759927860985106. VBZ : 2
15267657372422890137. DT  : 1
15308085513773655218. NN  : 5
15361090031084224697. EX  : 1
15794550382381185553. NNP : 21
17109001835818727656. VBD : 3
17524233984504158541. WRB : 1
17571114184892886314. CC  : 1


In [26]:
# Brown corpus
print(str(nltk.corpus.brown).replace('\\\\','/'))
# Penn Treebank Corpus
print(str(nltk.corpus.treebank).replace('\\\\','/'))

<CategorizedTaggedCorpusReader in '.../corpora/brown' (not loaded yet)>
<BracketParseCorpusReader in '.../corpora/treebank/combined' (not loaded yet)>


In [27]:
print(treebank.fileids())

['wsj_0001.mrg', 'wsj_0002.mrg', 'wsj_0003.mrg', 'wsj_0004.mrg', 'wsj_0005.mrg', 'wsj_0006.mrg', 'wsj_0007.mrg', 'wsj_0008.mrg', 'wsj_0009.mrg', 'wsj_0010.mrg', 'wsj_0011.mrg', 'wsj_0012.mrg', 'wsj_0013.mrg', 'wsj_0014.mrg', 'wsj_0015.mrg', 'wsj_0016.mrg', 'wsj_0017.mrg', 'wsj_0018.mrg', 'wsj_0019.mrg', 'wsj_0020.mrg', 'wsj_0021.mrg', 'wsj_0022.mrg', 'wsj_0023.mrg', 'wsj_0024.mrg', 'wsj_0025.mrg', 'wsj_0026.mrg', 'wsj_0027.mrg', 'wsj_0028.mrg', 'wsj_0029.mrg', 'wsj_0030.mrg', 'wsj_0031.mrg', 'wsj_0032.mrg', 'wsj_0033.mrg', 'wsj_0034.mrg', 'wsj_0035.mrg', 'wsj_0036.mrg', 'wsj_0037.mrg', 'wsj_0038.mrg', 'wsj_0039.mrg', 'wsj_0040.mrg', 'wsj_0041.mrg', 'wsj_0042.mrg', 'wsj_0043.mrg', 'wsj_0044.mrg', 'wsj_0045.mrg', 'wsj_0046.mrg', 'wsj_0047.mrg', 'wsj_0048.mrg', 'wsj_0049.mrg', 'wsj_0050.mrg', 'wsj_0051.mrg', 'wsj_0052.mrg', 'wsj_0053.mrg', 'wsj_0054.mrg', 'wsj_0055.mrg', 'wsj_0056.mrg', 'wsj_0057.mrg', 'wsj_0058.mrg', 'wsj_0059.mrg', 'wsj_0060.mrg', 'wsj_0061.mrg', 'wsj_0062.mrg', 'wsj_00

In [28]:
print(treebank.words('wsj_0003.mrg'))

['A', 'form', 'of', 'asbestos', 'once', 'used', '*', ...]


In [29]:
print(treebank.tagged_words('wsj_0003.mrg'))

[('A', 'DT'), ('form', 'NN'), ('of', 'IN'), ...]


In [30]:
print(treebank.parsed_sents('wsj_0003.mrg')[0])

(S
  (S-TPC-1
    (NP-SBJ
      (NP (NP (DT A) (NN form)) (PP (IN of) (NP (NN asbestos))))
      (RRC
        (ADVP-TMP (RB once))
        (VP
          (VBN used)
          (NP (-NONE- *))
          (S-CLR
            (NP-SBJ (-NONE- *))
            (VP
              (TO to)
              (VP
                (VB make)
                (NP (NNP Kent) (NN cigarette) (NNS filters))))))))
    (VP
      (VBZ has)
      (VP
        (VBN caused)
        (NP
          (NP (DT a) (JJ high) (NN percentage))
          (PP (IN of) (NP (NN cancer) (NNS deaths)))
          (PP-LOC
            (IN among)
            (NP
              (NP (DT a) (NN group))
              (PP
                (IN of)
                (NP
                  (NP (NNS workers))
                  (RRC
                    (VP
                      (VBN exposed)
                      (NP (-NONE- *))
                      (PP-CLR (TO to) (NP (PRP it)))
                      (ADVP-TMP
                        (NP
                 

In [31]:
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [32]:
movie_reviews.categories()

['neg', 'pos']

In [33]:
print(reuters.categories())

['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-oil', 'heat', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel', 'pet-chem', 'platinum', 'potato', 'propane', 'rand', 'rape-oil', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tea', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc']


In [34]:
from nltk.corpus import names
names.fileids()

['female.txt', 'male.txt']

In [35]:
len(names.words('female.txt'))

5001

In [36]:
len(names.words('male.txt'))

2943

# MAC_Morpho

In [37]:
from nltk.corpus import mac_morpho
from nltk.tag import UnigramTagger

In [38]:
mac_morpho.words()

['Jersei', 'atinge', 'média', 'de', 'Cr$', '1,4', ...]

In [39]:
mac_morpho.sents()

[['Jersei', 'atinge', 'média', 'de', 'Cr$', '1,4', 'milhão', 'em', 'a', 'venda', 'de', 'a', 'Pinhal', 'em', 'São', 'Paulo'], ['Programe', 'sua', 'viagem', 'a', 'a', 'Exposição', 'Nacional', 'do', 'Zebu', ',', 'que', 'começa', 'dia', '25'], ...]

In [40]:
mac_morpho.tagged_words()

[('Jersei', 'N'), ('atinge', 'V'), ('média', 'N'), ...]

In [41]:
mac_morpho.tagged_sents()

[[('Jersei', 'N'), ('atinge', 'V'), ('média', 'N'), ('de', 'PREP'), ('Cr$', 'CUR'), ('1,4', 'NUM'), ('milhão', 'N'), ('em', 'PREP|+'), ('a', 'ART'), ('venda', 'N'), ('de', 'PREP|+'), ('a', 'ART'), ('Pinhal', 'NPROP'), ('em', 'PREP'), ('São', 'NPROP'), ('Paulo', 'NPROP')], [('Programe', 'V'), ('sua', 'PROADJ'), ('viagem', 'N'), ('a', 'PREP|+'), ('a', 'ART'), ('Exposição', 'NPROP'), ('Nacional', 'NPROP'), ('do', 'NPROP'), ('Zebu', 'NPROP'), (',', ','), ('que', 'PRO-KS-REL'), ('começa', 'V'), ('dia', 'N'), ('25', 'N|AP')], ...]

In [42]:
doc = "Pedro Álvares Cabral descobriu o Brasil em 22/04/1500."
tokens = nltk.word_tokenize(doc)
print(*tokens)

Pedro Álvares Cabral descobriu o Brasil em 22/04/1500 .


In [43]:
sentencas = mac_morpho.tagged_sents()
label = UnigramTagger(sentencas)
tags = label.tag(tokens)
print(tags)

[('Pedro', 'NPROP'), ('Álvares', 'NPROP'), ('Cabral', 'NPROP'), ('descobriu', 'V'), ('o', 'ART'), ('Brasil', 'NPROP'), ('em', 'PREP|+'), ('22/04/1500', None), ('.', '.')]


In [44]:
doc = "Quando meu time é campeão, eu morro de felicidade. Com grito de campeão, o morro é só felicidade."
tokens = nltk.word_tokenize(doc)
print(*tokens)

Quando meu time é campeão , eu morro de felicidade . Com grito de campeão , o morro é só felicidade .


In [45]:
sentencas = mac_morpho.tagged_sents()
label = UnigramTagger(sentencas)
tags = label.tag(tokens)
print(tags)

[('Quando', 'KS'), ('meu', 'PROADJ'), ('time', 'N'), ('é', 'V'), ('campeão', 'N'), (',', ','), ('eu', 'PROPESS'), ('morro', 'N'), ('de', 'PREP'), ('felicidade', 'N'), ('.', '.'), ('Com', 'PREP'), ('grito', 'N'), ('de', 'PREP'), ('campeão', 'N'), (',', ','), ('o', 'ART'), ('morro', 'N'), ('é', 'V'), ('só', 'PDEN'), ('felicidade', 'N'), ('.', '.')]


In [46]:
nlp = spacy.load("pt_core_news_sm")
doc = nlp("Quando meu time é campeão, eu morro de felicidade. Com grito de campeão, o morro é só felicidade.")

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

Quando Quando ADV <rel>|ADV|@ADVL> advmod Xxxxx True True
meu meu DET <poss>|DET|M|S|@>N det xxx True True
time time NOUN <np-def>|N|M|S|@SUBJ> nsubj xxxx True False
é ser VERB <mv>|V|PR|3S|IND|@FS-<ACC cop x True True
campeão campeão ADJ ADJ|M|S|@<SC ROOT xxxx True False
, , PUNCT PU|@PU punct , False False
eu eu PRON PERS|M|1S|NOM|@SUBJ> appos xx True True
morro morrer PROPN PROP|M|S|@NPHR acl xxxx True False
de de ADP PRP|@A< case xx True True
felicidade felicidade NOUN <np-idf>|N|F|S|@P< nmod xxxx True False
. . PUNCT PU|@PU punct . False False
Com Com ADP PRP|@ADVL> case Xxx True True
grito gritar NOUN <np-idf>|N|M|S|@P< nmod xxxx True False
de de ADP PRP|@N< case xx True True
campeão campeão NOUN <np-idf>|N|M|S|@P< nmod xxxx True False
, , PUNCT PU|@PU punct , False False
o o DET <artd>|ART|M|S|@>N det x True True
morro morrer NOUN <np-def>|N|M|S|@SUBJ> nsubj xxxx True False
é ser VERB <mv>|V|PR|3S|IND|@FS-STA cop x True True
só só ADV ADV|@>N advmod xx True True
felicidade felic

In [47]:
displacy.render(doc, style="dep")

In [48]:
nlp = spacy.load("pt_core_news_sm")
doc = nlp("Pedro Álvares Cabral descobriu o Brasil em 22/04/1500.")

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

Pedro Pedro PROPN PROPN nsubj Xxxxx True False
Álvares Álvares PROPN PROPN flat:name Xxxxx True False
Cabral Cabral PROPN PROPN flat:name Xxxxx True False
descobriu descobrir VERB <mv>|V|PS|3S|IND|@FS-STA ROOT xxxx True False
o o DET <artd>|ART|M|S|@>N det x True True
Brasil Brasil PROPN PROP|M|S|@<ACC obj Xxxxx True False
em em ADP PRP|@<ADVL case xx True True
22/04/1500 22/04/1500 NUM <card>|NUM|M|S|@P< obl dd/dd/dddd False False
. . PUNCT PU|@PU punct . False False


In [49]:
displacy.render(doc, style="dep")

In [50]:
from nltk.corpus import floresta
floresta.words()

['Um', 'revivalismo', 'refrescante', 'O', '7_e_Meio', ...]

In [51]:
floresta.sents()

[['Um', 'revivalismo', 'refrescante'], ['O', '7_e_Meio', 'é', 'um', 'ex-libris', 'de', 'a', 'noite', 'algarvia', '.'], ...]

In [52]:
def simplify_tag(t):
    if "+" in t:
        return t[t.index("+")+1:]
    else:
        return t

In [53]:
twords = nltk.corpus.floresta.tagged_words()
twords = [(w.lower(),simplify_tag(t)) for (w,t) in twords]
twords[:10]

[('um', 'art'),
 ('revivalismo', 'n'),
 ('refrescante', 'adj'),
 ('o', 'art'),
 ('7_e_meio', 'prop'),
 ('é', 'v-fin'),
 ('um', 'art'),
 ('ex-libris', 'n'),
 ('de', 'prp'),
 ('a', 'art')]

In [54]:
words = floresta.words()
len(words)
fd = nltk.FreqDist(words)
len(fd)
fd

FreqDist({'de': 14569, ',': 13444, 'a': 12656, 'o': 10025, '.': 7725, 'em': 5505, 'e': 3981, 'que': 3956, 'os': 3223, '«': 2369, ...})

In [55]:
tags = [simplify_tag(tag) for (word,tag) in floresta.tagged_words()]
fd = nltk.FreqDist(tags)
fd.most_common(20)

[('n', 40081),
 ('prp', 32442),
 ('art', 29360),
 ('v-fin', 15802),
 (',', 13444),
 ('prop', 11652),
 ('adj', 10725),
 ('adv', 9096),
 ('.', 7725),
 ('conj-c', 5119),
 ('v-inf', 5015),
 ('pron-det', 4972),
 ('v-pcp', 4661),
 ('num', 4157),
 ('pron-indp', 3278),
 ('pron-pers', 2748),
 ('«', 2369),
 ('»', 2310),
 ('conj-s', 2284),
 ('}', 1047)]

In [56]:
def concordance(word, context=30):
    for sent in floresta.sents():
        if word in sent:
            pos = sent.index(word)
            left = ' '.join(sent[:pos])
            right = ' '.join(sent[pos+1:])
            print('%*s %s %-*s'%(context, left[-context:], word, context, right[:context]))

In [57]:
concordance("morro")

 a principal via de acesso a o morro .                             
Southern_Airlines , bate em um morro pouco antes de aterrissar em a
ificaram como traficantes de o morro Azul { em o Flamengo } .      


# Similaridade Semântica


In [58]:
sent_1 = 'The book is on the table.'
sent_2 = 'The cat is on the roof.'
sent_3 = 'The book is on the roof.'
sent_4 = 'The cat is on the table.'

In [59]:
docs = [sent_1, sent_2, sent_3, sent_4]

In [60]:
count_vectorizer = CountVectorizer(stop_words='english')
count_vectorizer = CountVectorizer()
sparse_matrix = count_vectorizer.fit_transform(docs)
doc_term_matrix = sparse_matrix.todense()

df = pd.DataFrame(doc_term_matrix, 
                  columns=count_vectorizer.get_feature_names(), 
                  index=['sent_1', 'sent_2', 'sent_3', 'sent_4'])
df

Unnamed: 0,book,cat,is,on,roof,table,the
sent_1,1,0,1,1,0,1,2
sent_2,0,1,1,1,1,0,2
sent_3,1,0,1,1,1,0,2
sent_4,0,1,1,1,0,1,2


In [61]:
def Jaccard_Similarity(sent_a, sent_b): 
    
    words_sent_a = set(sent_a.lower().split()) 
    words_sent_b = set(sent_b.lower().split())
    
    intersection = words_sent_a.intersection(words_sent_b)
    union = words_sent_a.union(words_sent_b)
    
    return float(len(intersection)) / len(union)

In [62]:
Jaccard_Similarity(sent_1, sent_2)

0.42857142857142855

In [63]:
Jaccard_Similarity(sent_1, sent_3)

0.6666666666666666

In [64]:
Jaccard_Similarity(sent_1, sent_4)

0.6666666666666666

In [65]:
from sklearn.metrics.pairwise import cosine_similarity
print(cosine_similarity(df, df))

[[1.    0.75  0.875 0.875]
 [0.75  1.    0.875 0.875]
 [0.875 0.875 1.    0.75 ]
 [0.875 0.875 0.75  1.   ]]


In [66]:
# Word Mover's Distance
# Precisa instalar o pyemd
# PyEmd precisa do Microsoft Visual Studio C++
# path = "C:\\Temp\\GoogleNews-vectors-negative300.bin"
# model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)

In [67]:
# sent_5 = 'Obama speaks to the media in Illinois'
# sent_6 = 'The president greets the press in Chicago'
# distance = model.wmdistance(sent_7, sent_8)

# Parsing

In [68]:
nlp.pipe_names

['tagger', 'parser', 'ner']

In [69]:
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x256ec993100>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x256e62cdf40>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x256ed0c58e0>)]

In [70]:
import nltk
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

In [71]:
from nltk import pos_tag, word_tokenize, RegexpParser

tags = pos_tag(word_tokenize(sent_1))
chunker = RegexpParser("""
                       NP: {<DT>?<JJ>*<NN>}    #To extract Noun Phrases
                       P: {<IN>}               #To extract Prepositions
                       V: {<V.*>}              #To extract Verbs
                       PP: {<p> <NP>}          #To extract Prepositional Phrases
                       VP: {<V> <NP|PP>*}      #To extract Verb Phrases
                       """)
 
output = chunker.parse(tags)
print("Chunks:\n", output)

Chunks:
 (S
  (NP The/DT book/NN)
  (VP (V is/VBZ))
  (P on/IN)
  (NP the/DT table/NN)
  ./.)


In [72]:
# output.draw()

In [73]:
from nltk.tree import Tree
from nltk.draw.tree import TreeView
t = Tree.fromstring(str(output))
TreeView(t)._cframe.print_to_file('./output.ps')

In [74]:
# Instalar ghostscript e adicionar caminho ao path
# pip install ghostscript
from PIL import Image

psimage=Image.open('./output.ps')
psimage.save('./output.png')

OSError: Unable to locate Ghostscript on paths

In [75]:
sentencas = mac_morpho.tagged_sents()
label = UnigramTagger(sentencas)
tags = label.tag(tokens)
print(tags)

[('Quando', 'KS'), ('meu', 'PROADJ'), ('time', 'N'), ('é', 'V'), ('campeão', 'N'), (',', ','), ('eu', 'PROPESS'), ('morro', 'N'), ('de', 'PREP'), ('felicidade', 'N'), ('.', '.'), ('Com', 'PREP'), ('grito', 'N'), ('de', 'PREP'), ('campeão', 'N'), (',', ','), ('o', 'ART'), ('morro', 'N'), ('é', 'V'), ('só', 'PDEN'), ('felicidade', 'N'), ('.', '.')]


In [76]:
from nltk.chunk import RegexpParser
pattern = 'NP:{<NPROP><NPROP>|<N><N>}'
analysis = RegexpParser(pattern)
arvore = analysis.parse(tags)
print(arvore)

(S
  Quando/KS
  meu/PROADJ
  time/N
  é/V
  campeão/N
  ,/,
  eu/PROPESS
  morro/N
  de/PREP
  felicidade/N
  ./.
  Com/PREP
  grito/N
  de/PREP
  campeão/N
  ,/,
  o/ART
  morro/N
  é/V
  só/PDEN
  felicidade/N
  ./.)


## Extração de Tópicos

In [77]:
corpus = "Brazil was officially discovered in 1500, when a fleet commanded by Portuguese diplomat Pedro Álvares Cabral, \
on its way to India, landed in Porto Seguro, between Salvador and Rio de Janeiro. There is, however, strong evidence that \
other Portuguese adventurers preceded him. Duarte Pacheco Pereira, in his book De Situ Orbis, tells of being in Brazil \
in 1498, sent by King Manuel of Portugal."

In [78]:
import texthero as hero
from nltk.corpus import stopwords
stops = stopwords.words('english')

texto = pd.Series(corpus)
corpus = hero.remove_stopwords(texto, stops).values[0]

In [None]:
texto = pd.Series(corpus)
corpus = hero.remove_punctuation(texto).values[0]
texto = pd.Series(corpus)
tokens = hero.tokenize(texto).values[0]

In [None]:
dictionary = gensim.corpora.Dictionary([tokens])
print(dictionary)

In [None]:
bow_corpus = [dictionary.doc2bow(doc) for doc in [tokens]]
print(bow_corpus)

In [None]:
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = 2, 
                                   id2word = dictionary,                                    
                                   passes = 20,
                                   workers = 2)

In [None]:
for idx, topic in lda_model.show_topics(formatted=False, num_words= 10):
    print('Topic: {} \nWords: {}'.format(idx, '|'.join([w[0] for w in topic])))

In [None]:
for idx, topic in lda_model.show_topics(formatted=False, num_words= 20):
    print('Topic: {} \nWords: {}'.format(idx, '|'.join([w[0] for w in topic])))

In [None]:
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = 4, 
                                   id2word = dictionary,                                    
                                   passes = 20,
                                   workers = 2)

In [None]:
for idx, topic in lda_model.show_topics(formatted=False, num_words= 10):
    print('Topic: {} \nWords: {}'.format(idx, '|'.join([w[0] for w in topic])))

In [None]:
for idx, topic in lda_model.show_topics(formatted=False, num_words= 20):
    print('Topic: {} \nWords: {}'.format(idx, '|'.join([w[0] for w in topic])))

In [None]:
import json
with open("./datasets/Medical/Corona2.json") as f:
    corona2 = json.load(f)

In [None]:
def extrai_topicos(corpus):
    texto = pd.Series(corpus)
    corpus = hero.remove_stopwords(texto, stops).values[0]
    texto = pd.Series(corpus)
    corpus = hero.remove_punctuation(texto).values[0]
    texto = pd.Series(corpus)
    tokens = hero.tokenize(texto).values[0]
    dictionary = gensim.corpora.Dictionary([tokens])
    bow_corpus = [dictionary.doc2bow(doc) for doc in [tokens]]
    lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   id2word = dictionary,
                                   num_topics = 4, 
                                   passes = 10,
                                   workers = 2)
    return lda_model

In [None]:
for example in corona2["examples"]:
    content = example["content"]
    print(content)
    print('\n')
    lda_model = extrai_topicos(content)
    for idx, topic in lda_model.show_topics(formatted=False, num_words= 10):
        print('Topic: {} \nWords: {}'.format(idx, '|'.join([w[0] for w in topic])))
    print('\n\n')