### Carrega pacotes

In [None]:
!pip install transformers scipy torch --quiet

In [None]:
# Download the `pt_core_news_md` spaCy pipeline that later cells load with `spacy.load`.
!python -m spacy download pt_core_news_md --quiet

2023-11-18 19:16:40.449344: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-18 19:16:40.449397: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-18 19:16:40.449431: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-18 19:16:40.460053: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-11-18 19:16:44.879782: I tensorflow/c

In [None]:

import numpy as np
import pandas as pd

import spacy
from sklearn.decomposition import PCA
from transformers import BertTokenizer, BertModel
import torch


import plotly.express as px
from plotly.subplots import make_subplots

### Define variáveis e funções

Agora vamos definir o dispositivo de execução. Mais adiante, o modelo e o tokenizer serão carregados e o modelo será movido para esse dispositivo. Se estiver no Google Colab, ative a GPU em **Runtime** > **Change runtime type**.


In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

#### De palavras para tokens

In [None]:
# Load the Portuguese spaCy pipeline
nlp = spacy.load('pt_core_news_md')

# Run it on a sample text that ends with two non-Latin characters
doc = nlp("Caracteres em pt-br. ヾ 々")

# For each token, show its text and whether the pipeline has a vector for it
for tok in doc:
    print(tok.text, tok.has_vector)


Caracteres True
em True
pt-br True
. True
ヾ False
々 False


In [None]:
# Collect the embedding of every token that actually has a vector
vectors = []
for tok in doc:
    if tok.has_vector:
        vectors.append(tok.vector)

# Peek at the first five dimensions of the second collected vector
vectors[1][:5]

array([-2.6585 ,  0.91233, -2.269  ,  4.6615 , -5.4004 ], dtype=float32)

---

### Vetorizando palavras

In [None]:
# Terms embedded and plotted by the cells below (one entry per point in the scatter plots)
palavras = [
    'vereador',
    'prefeitura',
    'municipio',
    'governador',
    'câmara',
    'câmera',
    'fotográfica',
    'rio de janeiro',
    'são paulo',
    'recife',
    'coral',
    'hospital',
    'melancia',
    'morango',
    'maca',
    'maçã',
]

### Carrega modelo Spacy

In [None]:
# Load the Portuguese spaCy pipeline
nlp = spacy.load('pt_core_news_md')

# One vector per entry of `palavras`, taken from the processed document
X = [documento.vector for documento in map(nlp, palavras)]

# Fit a 4-component PCA on those vectors (`fit` returns the estimator itself)
pca = PCA(n_components=4).fit(X)

# Project the same vectors onto the 4 principal components
transformed = pca.transform(X)

transformed

array([[-1.69043373e+01, -1.08809820e+01,  1.62807690e+00,
        -6.93810566e+00],
       [-1.32155993e+01, -5.71130559e+00, -5.54064050e-03,
        -3.22779890e+00],
       [-1.75667056e+01, -1.44325712e+00,  4.75436194e+00,
         1.63755703e+00],
       [-1.56725021e+01, -7.09602976e+00,  1.57438138e+00,
        -1.59314608e+00],
       [-1.73126280e-01, -8.53893757e+00, -1.02905786e+01,
        -8.67512267e+00],
       [ 1.15215225e+01, -7.34196081e+00, -1.61001337e+01,
        -1.28068979e+01],
       [ 2.04961101e+00,  8.95549679e-01, -5.46079445e+00,
        -4.87788400e+00],
       [-1.08194361e+01,  6.53684892e+00,  2.45891564e+00,
         1.45447617e+01],
       [-3.96316635e+00,  3.54398700e+01, -1.45016469e+01,
        -2.38156221e+00],
       [ 3.67034634e+00,  2.89803511e+00,  4.67301999e+00,
         7.52567457e+00],
       [ 3.18690868e+00,  1.58793901e+00,  7.07597628e+00,
         9.14514414e+00],
       [-7.24216679e+00, -4.57696668e+00, -1.07752079e+00,
      

In [None]:
def positive_values(vector):
  """Shift an array so that its smallest entry becomes zero.

  The offset is the minimum over the whole array (not per column), so the
  differences between entries are preserved.

  Args:
    vector: array-like of numbers; converted with ``np.asarray``.

  Returns:
    A new array equal to ``vector - vector.min()`` when the minimum is
    negative. When there is no negative entry, or the input is empty, the
    (array-converted) input is returned unchanged.
  """
  vector = np.asarray(vector)

  # An empty array has no minimum, so there is nothing to shift.
  if vector.size == 0:
    return vector

  min_val = vector.min()
  if min_val < 0:
    # Build a new array instead of using `+=`, which would silently modify
    # the caller's data.
    return vector - min_val
  return vector

# Shift the PCA projection so that it holds no negative values
transformed = positive_values(transformed)

# Name the four components after the plot roles used by `mostra`:
# x / y position, s = marker size, c = marker colour
df = pd.DataFrame(data=transformed, columns=list('xysc'))

df

Unnamed: 0,x,y,s,c
0,0.662368,6.685724,19.194783,10.6286
1,4.351106,11.8554,17.561165,14.338907
2,0.0,16.123448,22.321068,19.204263
3,1.894204,10.470676,19.141087,15.97356
4,17.393579,9.027768,7.276127,8.891583
5,29.088228,10.224745,1.466572,4.759808
6,19.616317,18.462255,12.105911,12.688822
7,6.747269,24.103555,20.025621,32.111467
8,13.603539,53.006576,3.065059,15.185143
9,21.237052,20.464741,22.239726,25.09238


In [None]:
def mostra(df,title):
  """Build a labelled scatter plot from a frame with columns x, y, s and c.

  Columns ``x`` and ``y`` give the position, ``s`` the marker size and ``c``
  the marker colour. Point labels come from the notebook-level ``palavras``
  list.

  Args:
    df: DataFrame holding the columns ``x``, ``y``, ``s`` and ``c``.
    title: title shown above the figure.

  Returns:
    The figure created by ``px.scatter``.
  """
  return px.scatter(
      df,
      x='x',
      y='y',
      size='s',
      color='c',
      text=palavras,
      title=title,
  )

# Scatter plot of the spaCy-based projection
fig1 = mostra(df, title="Spacy")

fig1.show()

### Transformer (BERT)

In [None]:
# Hugging Face checkpoint used for both the tokenizer and the model
hf_model = "ricardo-filho/bert-base-portuguese-cased-nli-assin-2"

tokenizer = BertTokenizer.from_pretrained(hf_model)

# `Module.to` returns the module itself, so loading and device placement are chained
model = BertModel.from_pretrained(hf_model).to(device)

model

tokenizer_config.json:   0%|          | 0.00/530 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/210k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/438k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/861 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(29794, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [None]:
def vetoriza(word):
    """Embed a word or short expression with the loaded BERT model.

    The text is tokenized with the notebook-level ``tokenizer``, run through
    ``model`` on ``device`` without gradient tracking, and the hidden states of
    all tokens between the first and last positions are averaged. Averaging
    (instead of taking only position 1) matters because a single entry such as
    'rio de janeiro' is split into several tokens; position 1 alone would
    represent just its first piece.

    Args:
        word: text to embed.

    Returns:
        1-D tensor holding the mean of the last-layer hidden states of the
        text's own tokens.

    Raises:
        ValueError: if tokenization yields no token besides the first and last
            positions (e.g. an empty string).
    """
    # Named `input_ids` rather than `input` so the builtin is not shadowed
    input_ids = tokenizer.encode(word, return_tensors='pt').to(device)

    with torch.no_grad():
        output = model(input_ids)

    # Drop the first and last positions ([CLS] and [SEP]) of the only sequence in the batch
    token_embeddings = output.last_hidden_state[0, 1:-1, :]

    if token_embeddings.shape[0] == 0:
        # Averaging zero rows would return NaNs; fail loudly instead
        raise ValueError(f"Nenhum token para vetorizar em {word!r}")

    return token_embeddings.mean(dim=0)

In [None]:
# Carrega os embeddings
X = [vetoriza(palavra).detach().cpu().numpy() for palavra in palavras]

# Inicia PCA para reduzir as dimensões
pca.fit(X)

# Reduz o n. de componentes e transforma todos valores para positivo
X = positive_values(pca.transform(X))


In [None]:
# Use the same component-to-column mapping as the spaCy frame (x, y, s, c), so that
# marker size and colour encode the same PCA components in both plots being compared.
df_bert = pd.DataFrame(X, columns=['x', 'y', 's', 'c'])

fig2 = mostra(df_bert,"BERT")

fig2

In [None]:
# One row, two columns: the spaCy plot on the left and the BERT plot on the right
fig = make_subplots(rows=1, cols=2, subplot_titles=('Spacy', 'BERTimbau'))

# Copy the traces of each standalone figure into its own subplot column
for coluna, origem in enumerate((fig1, fig2), start=1):
    for trace in origem['data']:
        fig.add_trace(trace, row=1, col=coluna)

fig.show()

# Calculando palavras

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
nlp = spacy.load('pt_core_news_md')

# Word-vector arithmetic: padre - igreja + escola
resultado_vector = (nlp.vocab['padre'].vector - nlp.vocab['igreja'].vector) + nlp.vocab["escola"].vector

# Keys of the query words: they are skipped below so the search cannot simply
# hand back one of its own inputs.
chaves_consulta = {nlp.vocab[palavra].orth for palavra in ('padre', 'igreja', 'escola')}

# The norm of the query is the same for every candidate, so compute it once
norma_resultado = np.linalg.norm(resultado_vector)

chave_mais_proxima = None
maior_similaridade = -1

for key, vector in nlp.vocab.vectors.items():
    if key in chaves_consulta or vector.shape != resultado_vector.shape:
        continue

    # Cosine similarity computed directly with NumPy; calling sklearn's
    # `cosine_similarity` once per vocabulary entry pays its validation
    # overhead on every iteration.
    denominador = float(norma_resultado * np.linalg.norm(vector))
    # A zero vector has no direction; treat its similarity as 0
    sim = float(np.dot(resultado_vector, vector)) / denominador if denominador else 0.0

    if sim > maior_similaridade:
        maior_similaridade = sim
        chave_mais_proxima = key

# Resolve the winning key to its text once, after the search
mais_proximo = nlp.vocab[chave_mais_proxima].text if chave_mais_proxima is not None else None

print(f"A palavra mais próxima é: {mais_proximo}")


A palavra mais próxima é: professor


# Outras refs.

https://turbomaze.github.io/word2vecjson/

[Vetores, o que são eles afinal? | A essência da álgebra linear, capítulo 1 (3Blue1Brown)](https://www.youtube.com/watch?v=fNk_zzaMoSs)