# Vetorização de Textos

A biblioteca `nlpbox` disponibiliza uma gama de estratégias de vetorização para o português em seu pacote `nlpbox.vectorizers`. Extratores de características são uma forma de obter uma representação numérica de um texto; todavia, existem outras, como os vetorizadores apresentados a seguir.

In [1]:
from nlpbox.vectorizers.tfidf_vectorizer import TFIDFVectorizer
from nlpbox.vectorizers.bert_vectorizer import BertVectorizer
from nlpbox.vectorizers.fasttext_word_vectorizer import FasttextWordVectorizer

In [2]:
# === TF-IDF vectorization ===
# Some vectorizers, such as TF-IDF, are trainable,
#   so `fit(...)` must be called before
#   vectorizing any text.
vectorizer = TFIDFVectorizer()
vectorizer.fit(["O poeta modernista Oswald de Andrade relata..."])
# NOTE(review): the query below spells "Oswaldo" while the fitted text
#   has "Oswald"; presumably that token is out of vocabulary, which
#   would explain why the displayed vector has only two non-zero
#   entries -- confirm whether the spelling difference is intentional.
vectorizer.vectorize("Oswaldo de Andrade")

array([0.70710678, 0.70710678, 0.        , 0.        , 0.        ,
       0.        ])

In [3]:
# Same text and fitted TF-IDF vectorizer as the previous cell, but
#   passing `vector_type='torch'`; the displayed result is a torch
#   tensor instead of an array.
vectorizer.vectorize("Oswaldo de Andrade", vector_type='torch')

tensor([0.7071, 0.7071, 0.0000, 0.0000, 0.0000, 0.0000], dtype=torch.float64)

In [4]:
# === BERT vectorization ===
# No `fit(...)` call is made in this cell before vectorizing.
# The displayed output is a single flat array for the whole text.
# NOTE(review): the log line in the output mentions
#   sentence_transformers and a CUDA device; presumably the model is
#   loaded through that library -- confirm.
vectorizer = BertVectorizer()
vectorizer.vectorize("Oswaldo de Andrade")

INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cuda


array([ 2.64385343e-01,  2.16397747e-01,  5.14491081e-01,  3.07694674e-01,
        6.47188604e-01,  5.84586203e-01,  2.58738935e-01, -4.37633507e-02,
        8.35895613e-02,  2.77667075e-01, -1.18047923e-01, -1.56391934e-01,
       -4.75931689e-02, -1.29662514e-01,  4.34319377e-02,  2.15437591e-01,
        1.53155431e-01, -8.56867060e-04, -3.65796685e-02,  6.48877978e-01,
        1.62571013e-01,  2.08059058e-01,  6.16300285e-01, -3.37160006e-02,
        2.76481748e-01,  1.36987612e-01, -1.33273184e-01,  6.53725088e-01,
        1.52791157e-01, -3.59704494e-02, -1.21531323e-01,  2.93516457e-01,
        8.18944629e-03,  4.28493440e-01, -1.14869095e-01,  8.82675126e-02,
       -1.32777229e-01,  5.88505119e-02, -6.43031448e-02, -1.24747761e-01,
        4.90952991e-02, -1.24715969e-01, -2.13417485e-01, -3.14205796e-01,
        2.77461439e-01, -1.39166921e-01,  8.68733302e-02, -2.25311935e-01,
       -5.67873240e-01,  9.08298120e-02, -9.34687555e-02, -1.02077410e-01,
        2.78992951e-01,  

In [5]:
# Same text with the BERT vectorizer, passing `vector_type='torch'`;
#   the displayed result is a torch tensor.
vectorizer.vectorize("Oswaldo de Andrade", vector_type='torch')

tensor([ 2.6439e-01,  2.1640e-01,  5.1449e-01,  3.0769e-01,  6.4719e-01,
         5.8459e-01,  2.5874e-01, -4.3763e-02,  8.3590e-02,  2.7767e-01,
        -1.1805e-01, -1.5639e-01, -4.7593e-02, -1.2966e-01,  4.3432e-02,
         2.1544e-01,  1.5316e-01, -8.5687e-04, -3.6580e-02,  6.4888e-01,
         1.6257e-01,  2.0806e-01,  6.1630e-01, -3.3716e-02,  2.7648e-01,
         1.3699e-01, -1.3327e-01,  6.5373e-01,  1.5279e-01, -3.5970e-02,
        -1.2153e-01,  2.9352e-01,  8.1894e-03,  4.2849e-01, -1.1487e-01,
         8.8268e-02, -1.3278e-01,  5.8851e-02, -6.4303e-02, -1.2475e-01,
         4.9095e-02, -1.2472e-01, -2.1342e-01, -3.1421e-01,  2.7746e-01,
        -1.3917e-01,  8.6873e-02, -2.2531e-01, -5.6787e-01,  9.0830e-02,
        -9.3469e-02, -1.0208e-01,  2.7899e-01,  1.4285e-01,  3.2249e-01,
        -1.3989e-02,  7.2039e-02, -3.7924e-02,  1.3590e-02,  1.8673e-01,
         8.8871e-02, -2.5229e-01, -6.1246e-02, -8.4979e-02,  8.7489e-02,
        -1.9505e-01,  3.0995e-01, -4.4238e-01, -2.1

In [6]:
# === FastText vectorization ===
# This vectorizer produces word-level embeddings,
#   in contrast with BERT.
# The displayed output is a 2-D array whose first row has 50 values,
#   matching `dims=50`.
# NOTE(review): presumably there is one row per word of the input
#   text -- the output shown is truncated, so confirm.
vectorizer = FasttextWordVectorizer(language='pt', dims=50)
vectorizer.vectorize("Oswaldo de Andrade")



array([[-5.2886888e-02, -1.7561170e-01,  1.3918166e-01,  6.1004467e-02,
        -5.3558713e-03,  1.3114677e-01,  8.0130771e-02, -7.0898794e-02,
         4.0646223e-03,  7.0829466e-02,  1.0253237e-01, -6.7408614e-02,
        -2.9848602e-02,  2.8654059e-02, -1.4011158e-02, -6.5185040e-02,
        -2.7422376e-02, -8.0749080e-02, -7.2940784e-03, -2.1041073e-02,
        -2.4684085e-04, -1.2846577e-02, -2.5830334e-02, -3.7927028e-02,
         7.3005073e-02,  5.1672690e-02, -3.9212778e-02, -3.9599404e-02,
        -5.8107093e-02, -3.5409156e-02,  1.0954356e-02,  8.5326899e-03,
        -2.0045838e-03,  2.0096442e-02,  5.0923387e-03,  5.2462842e-02,
         8.1380792e-02, -1.5884088e-01,  1.7451486e-01,  3.1203162e-03,
        -7.4836999e-02, -1.1107292e-02,  8.3753634e-03, -1.3731433e-01,
         5.3399377e-02,  2.4465170e-02,  8.5646110e-03,  4.2264380e-02,
        -2.5862515e-02,  3.4345940e-02],
       [-1.9987541e-01,  3.9156839e-01, -2.0347607e-01,  2.1562086e-01,
         2.1254006e-01,

In [7]:
# Same text with the FastText vectorizer, passing
#   `vector_type='torch'`; the displayed result is a 2-D torch tensor.
vectorizer.vectorize("Oswaldo de Andrade", vector_type='torch')

tensor([[-5.2887e-02, -1.7561e-01,  1.3918e-01,  6.1004e-02, -5.3559e-03,
          1.3115e-01,  8.0131e-02, -7.0899e-02,  4.0646e-03,  7.0829e-02,
          1.0253e-01, -6.7409e-02, -2.9849e-02,  2.8654e-02, -1.4011e-02,
         -6.5185e-02, -2.7422e-02, -8.0749e-02, -7.2941e-03, -2.1041e-02,
         -2.4684e-04, -1.2847e-02, -2.5830e-02, -3.7927e-02,  7.3005e-02,
          5.1673e-02, -3.9213e-02, -3.9599e-02, -5.8107e-02, -3.5409e-02,
          1.0954e-02,  8.5327e-03, -2.0046e-03,  2.0096e-02,  5.0923e-03,
          5.2463e-02,  8.1381e-02, -1.5884e-01,  1.7451e-01,  3.1203e-03,
         -7.4837e-02, -1.1107e-02,  8.3754e-03, -1.3731e-01,  5.3399e-02,
          2.4465e-02,  8.5646e-03,  4.2264e-02, -2.5863e-02,  3.4346e-02],
        [-1.9988e-01,  3.9157e-01, -2.0348e-01,  2.1562e-01,  2.1254e-01,
          3.4088e-01, -7.3346e-02, -1.6893e-01, -1.9249e-01,  7.1387e-02,
         -1.9757e-01, -7.9163e-02, -2.1747e-01, -8.6211e-02,  3.1767e-01,
         -1.9370e-01, -8.1982e-02, -5