# Tokenização com spaCy

Tokenização é o processo de dividir um documento em tokens, isto é, representações padronizadas de palavras e de sinais de pontuação.

In [1]:
import spacy
from utils import table

## As tabelas apresentam as seguintes informações:

Text: The original word text.

Lemma: The base form of the word.

POS: The simple part-of-speech tag.

Tag: The detailed part-of-speech tag.

Dep: Syntactic dependency, i.e. the relation between tokens.

Shape: The word shape – capitalisation, punctuation, digits.

is alpha: Does the token consist only of alphabetic characters?

is stop: Is the token part of a stop list, i.e. the most common words of the language?

In [4]:
# Load the English pipeline and run it over a sample sentence.
# NOTE(review): 'en' is a shortcut-link name; spaCy v3 dropped shortcut links,
# so a full package name would be required there -- confirm the installed version.
nlp_en = spacy.load('en')
en_str = "The quick brown fox jumps over the lazy dog."
en_doc = nlp_en(en_str)

# Column titles, in the same order as the values collected for each token below.
headers = ["Token", "Text", "Lemma", "Pos", "Tag", "Dep", "Shape", "Alpha", "Stop", "Vectors"]

# One row per token: the token object itself, its string/boolean attributes,
# and only the first three components of its vector to keep the table narrow.
rows = [
    [
        tok,
        tok.text, tok.lemma_,
        tok.pos_, tok.tag_, tok.dep_,
        tok.shape_,
        tok.is_alpha, tok.is_stop,
        tok.vector[:3],
    ]
    for tok in en_doc
]

table(headers, rows)

Token,Text,Lemma,Pos,Tag,Dep,Shape,Alpha,Stop,Vectors
The,The,the,DET,DT,det,Xxx,True,False,[0.80844855 0.36085728 0.33025074]
quick,quick,quick,ADJ,JJ,amod,xxxx,True,False,[-2.8631716 1.2935663 -0.41818547]
brown,brown,brown,ADJ,JJ,amod,xxxx,True,False,[ 0.32617 -0.58817935 0.35415292]
fox,fox,fox,NOUN,NN,nsubj,xxx,True,False,[0.01345855 2.4567294 0.98378897]
jumps,jumps,jump,VERB,VBZ,ROOT,xxxx,True,False,[-1.9692166 1.5927203 0.08396816]
over,over,over,ADP,IN,prep,xxxx,True,True,[ 1.4140644 -0.77489394 1.6844331 ]
the,the,the,DET,DT,det,xxx,True,True,[-0.06463778 -0.11468075 0.7628048 ]
lazy,lazy,lazy,ADJ,JJ,amod,xxxx,True,False,[-1.3845626 -2.2873337 -0.632537 ]
dog,dog,dog,NOUN,NN,pobj,xxx,True,False,[1.846381 2.2142863 2.315588 ]
.,.,.,PUNCT,.,punct,.,False,False,[ 3.6785886 -0.66314673 1.6109401 ]


In [5]:
# Load the Portuguese pipeline and run it over a sample sentence.
# NOTE(review): 'pt' is a shortcut-link name; spaCy v3 dropped shortcut links,
# so a full package name would be required there -- confirm the installed version.
nlp_br = spacy.load('pt')
pt_str = "Estamos apresentando o seminário de Inteligência Artificial."
pt_doc = nlp_br(pt_str)

# Column titles, in the same order as the values collected for each token below.
headers = ["Token", "Text", "Lemma", "Pos", "Tag", "Dep", "Shape", "Alpha", "Stop", "Vectors"]

# One row per token: the token object itself, its string/boolean attributes,
# and only the first three components of its vector to keep the table narrow.
rows = [
    [
        tok,
        tok.text, tok.lemma_,
        tok.pos_, tok.tag_, tok.dep_,
        tok.shape_,
        tok.is_alpha, tok.is_stop,
        tok.vector[:3],
    ]
    for tok in pt_doc
]

table(headers, rows)

Token,Text,Lemma,Pos,Tag,Dep,Shape,Alpha,Stop,Vectors
Estamos,Estamos,Estamos,AUX,|V|PR|1P|IND|@FS-STA,aux,Xxxxx,True,False,[-5.4169607 2.5968633 4.549823 ]
apresentando,apresentando,apresentar,VERB,|V|GER|@ICL-AUX<,ROOT,xxxx,True,False,[-4.783967 7.909391 -8.671597]
o,o,o,DET,|ART|M|S|@>N,det,x,True,False,[-0.33449972 3.8372946 3.7064953 ]
seminário,seminário,seminário,NOUN,|N|M|S|@,obj,xxxx,True,False,[6.4645123 0.06163967 3.4980528 ]
de,de,de,ADP,PRP|@N<,case,xx,True,True,[-4.913734 5.7201037 6.324004 ]
Inteligência,Inteligência,Inteligência,PROPN,PROPN,nmod,Xxxxx,True,False,[ 1.5684817 -0.96657425 6.945867 ]
Artificial,Artificial,Artificial,PROPN,PROP|@N<,flat:name,Xxxxx,True,False,[ 0.99761987 -3.7495167 4.514478 ]
.,.,.,PUNCT,PU|@PU,punct,.,False,False,[ 2.0695763 -2.4045844 2.5616748]
