In [None]:
!pip -q install gensim

In [11]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
import gensim.downloader as api

# Checkpoint identifier handed to both `from_pretrained` calls below.
model_name = "sentence-transformers/all-MiniLM-L6-v2"

# Prefer the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the weights first, then move the module to the selected device.
model = AutoModel.from_pretrained(model_name)
model = model.to(device)

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [17]:
# ----------------------------
# Funções auxiliares
# ----------------------------
def mean_pooling(last_hidden_state: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """Average the token embeddings of each sequence, skipping masked positions.

    Args:
        last_hidden_state: Token embeddings; the code reduces over dim 1
            (annotated [B, T, H] in the original comments).
        attention_mask: Mask with one entry per token ([B, T]); positions
            holding 0 contribute nothing to the average.

    Returns:
        One pooled vector per sequence ([B, H]).
    """
    # Add a trailing axis and match dtype/device so the mask broadcasts over H.
    weights = attention_mask[..., None].to(last_hidden_state)        # [B, T, 1]
    masked_total = torch.sum(last_hidden_state * weights, dim=1)     # [B, H]
    # Lower-bound the token count so a fully masked row does not divide by zero.
    token_count = torch.clamp(weights.sum(dim=1), min=1e-9)          # [B, 1]
    return torch.div(masked_total, token_count)

@torch.no_grad()
def embed_sentence(text: str) -> torch.Tensor:
    """Encode one sentence into a normalized embedding (convenient for cosine).

    The text is tokenized (with truncation), run through the notebook-level
    `model`, mean-pooled over its tokens using the attention mask, and then
    normalized along the feature dimension.

    Args:
        text: The sentence to embed.

    Returns:
        The embedding of the single input sentence as a 1-D CPU tensor ([H]).
    """
    encoded = tokenizer(text, truncation=True, return_tensors="pt").to(device)
    hidden_states = model(**encoded).last_hidden_state
    pooled = mean_pooling(hidden_states, encoded["attention_mask"])
    unit = F.normalize(pooled, dim=1)
    # The batch holds exactly one sentence; hand back its row on the CPU.
    return unit[0].detach().cpu()  # [H]

def cosine_similarity(a: torch.Tensor, b: torch.Tensor) -> float:
    """Return the cosine similarity between two 1-D floating-point tensors.

    The dot product is divided by the product of the two L2 norms, so the
    result is a true cosine whether or not the inputs are already
    unit-length. For inputs that are already normalized the value matches the
    plain dot product up to floating-point rounding.

    Args:
        a: First vector (1-D).
        b: Second vector (1-D, same length and dtype as `a`).

    Returns:
        The cosine of the angle between `a` and `b` as a Python float.
        If either vector has zero norm the result is 0.0.
    """
    dot = torch.dot(a, b)
    # Lower-bound the denominator so an all-zero vector yields 0.0, not NaN.
    scale = (a.norm() * b.norm()).clamp(min=1e-12)
    return float((dot / scale).item())

def token_vector(word: str, embedding_weight: torch.Tensor) -> torch.Tensor:
    """Look up the vocabulary embedding for `word` and return it normalized.

    The word is split by the notebook-level `tokenizer`. When it maps to
    several sub-tokens, their embedding rows are averaged (a simplification
    intended for teaching purposes only).

    Args:
        word: The word to look up.
        embedding_weight: Embedding matrix indexed by token id along dim 0.

    Returns:
        The (averaged) embedding row, normalized along dim 0.

    Raises:
        ValueError: If `word` produces no tokens (e.g. an empty or
            whitespace-only string). Without this check the mean over zero
            rows would silently return a vector of NaNs.
    """
    toks = tokenizer.tokenize(word)
    if not toks:
        raise ValueError(f"word {word!r} produced no tokens; cannot build a token vector")
    ids = tokenizer.convert_tokens_to_ids(toks)
    vec = embedding_weight[ids].mean(dim=0)
    return F.normalize(vec, dim=0)

def show_analogy(positive, negative, topn=10):
    """Print the nearest terms for the analogy sum(positive) - sum(negative).

    Uses the notebook-level word-vector model `wv` and its `most_similar`
    query, printing one line per returned (word, score) pair.

    Args:
        positive: Words passed to `most_similar` as `positive`.
        negative: Words passed to `most_similar` as `negative`.
        topn: Number of neighbours to request.
    """
    print(f"\nAnalogia: +{positive} -{negative} => ?")
    for term, similarity in wv.most_similar(positive=positive, negative=negative, topn=topn):
        # Left-align the word in a 12-character column so the scores line up.
        print(f"  {term:<12} cosine={similarity:.4f}")

def show_direction_terms(base_word, target_word, topn=10):
    """Print the words closest to the direction vector (target - base).

    Example: the France -> Italy direction tends to surface related
    countries/attributes.

    Args:
        base_word: Word whose vector is subtracted.
        target_word: Word whose vector is the minuend.
        topn: Number of neighbours to request from `wv.similar_by_vector`.
    """
    offset = wv[target_word] - wv[base_word]
    neighbours = wv.similar_by_vector(offset, topn=topn)
    print(f"\nDireção: ({target_word} - {base_word}) -> termos próximos")
    for term, similarity in neighbours:
        print(f"  {term:<12} cosine={similarity:.4f}")

In [3]:
# ============================================================
# 1) Embedding model and tokenizer: the output depends on tokenization
# ============================================================
text = "Paris is the capital of France."
encoded = tokenizer(text, return_tensors="pt").to(device)

# Token ids of the single encoded sentence, reused for the lookup and the table.
token_ids = encoded["input_ids"][0].tolist()
tokens = tokenizer.convert_ids_to_tokens(token_ids)

# Forward pass without gradient tracking; we only inspect the activations.
with torch.no_grad():
    out = model(**encoded)
token_embs = out.last_hidden_state[0].detach().cpu()  # [T, H]

print("=== (1) Tokenização e embeddings por token ===")
print("Texto:", text)
print("Tokens:", tokens)
print("Shape do output (last_hidden_state):", tuple(out.last_hidden_state.shape), "=> [batch, tokens, hidden]")

print("\nPrimeiras dimensões do embedding de cada token (só para visualizar):")
for i, tok_id in enumerate(token_ids):
    tok = tokens[i]
    # Only the first six dimensions are shown, rounded for readability.
    preview = token_embs[i, :6].tolist()
    rounded_preview = [round(x, 4) for x in preview]
    print(f"  {i:02d}  token={tok:<12} id={tok_id:<6} emb[:6]={rounded_preview}")

print("\nExemplos rápidos de como a tokenização muda:")
examples = ["unbelievable", "un-believable", "ChatGPT", "chat gpt", "São Paulo", "Sao Paulo"]
for ex in examples:
    pieces = tokenizer.tokenize(ex)
    print(f"  {ex!r} -> {pieces}")

=== (1) Tokenização e embeddings por token ===
Texto: Paris is the capital of France.
Tokens: ['[CLS]', 'paris', 'is', 'the', 'capital', 'of', 'france', '.', '[SEP]']
Shape do output (last_hidden_state): (1, 9, 384) => [batch, tokens, hidden]

Primeiras dimensões do embedding de cada token (só para visualizar):
  00  token=[CLS]        id=101    emb[:6]=[0.4117, 0.0816, 0.4398, -0.334, 0.4184, -0.1416]
  01  token=paris        id=3000   emb[:6]=[0.5774, 0.6534, -0.1398, -0.5575, 0.3999, -0.1288]
  02  token=is           id=2003   emb[:6]=[0.8619, 0.2646, 0.2367, -0.1776, 0.1201, -0.2658]
  03  token=the          id=1996   emb[:6]=[0.2954, 0.0948, 0.2259, -0.1332, 0.3016, -0.2222]
  04  token=capital      id=3007   emb[:6]=[1.3386, 0.2528, 0.0696, 0.0244, 0.501, -0.7151]
  05  token=of           id=1997   emb[:6]=[0.3472, 0.1532, 0.0655, -0.1498, 0.1816, -0.3127]
  06  token=france       id=2605   emb[:6]=[-0.0092, 0.1243, -0.0342, -0.5429, 0.6265, -0.1939]
  07  token=.            id=1

In [4]:
# ============================================================
# 2) Cosine similarity between sentences
# ============================================================
print("\n=== (2) Similaridade cosseno entre sentenças ===")

# Each tuple holds two sentences to compare.
pairs = [
    ("A cat sits on the mat.", "A kitten is sitting on a mat."),
    ("I love pizza.", "The sky is blue."),
    ("He went to the bank to deposit money.", "He deposited cash at the bank."),
]

for s1, s2 in pairs:
    e1, e2 = embed_sentence(s1), embed_sentence(s2)
    sim = cosine_similarity(e1, e2)
    # One print call, three lines; the trailing "\n" leaves a blank line after each pair.
    print(
        f"- s1: {s1}",
        f"  s2: {s2}",
        f"  cosine_similarity: {sim:.4f}\n",
        sep="\n",
    )


=== (2) Similaridade cosseno entre sentenças ===
- s1: A cat sits on the mat.
  s2: A kitten is sitting on a mat.
  cosine_similarity: 0.8582

- s1: I love pizza.
  s2: The sky is blue.
  cosine_similarity: 0.0479

- s1: He went to the bank to deposit money.
  s2: He deposited cash at the bank.
  cosine_similarity: 0.8920



In [13]:
# Identifier of the pretrained word vectors requested through gensim's downloader.
# NOTE(review): this reassigns `model_name`, which an earlier cell set to the
# sentence-transformers checkpoint; the name is kept because cells outside
# this view may read it.
model_name = "glove-wiki-gigaword-50"
wv = api.load(model_name)

# Report the model name, the vector dimensionality and the vocabulary size.
summary = (
    ("Modelo carregado:", model_name),
    ("Dimensão dos vetores:", wv.vector_size),
    ("Tamanho do vocabulário:", len(wv.key_to_index)),
)
for label, value in summary:
    print(label, value)

Modelo carregado: glove-wiki-gigaword-50
Dimensão dos vetores: 50
Tamanho do vocabulário: 400000


In [16]:
# -----------------------------------
# 1) Demonstration: classic analogies
# -----------------------------------
# Not every analogy works in every model, but with "classic" embeddings
# the expected answer usually shows up clearly.
analogies = [
    (["paris", "italy"], ["france"]),   # Paris - France + Italy
    (["rome", "france"], ["italy"]),    # Rome - Italy + France
    (["king", "woman"], ["man"]),       # king - man + woman ~ queen
]
for positive_terms, negative_terms in analogies:
    show_analogy(positive=positive_terms, negative=negative_terms, topn=10)


Analogia: +['paris', 'italy'] -['france'] => ?
  rome         cosine=0.8466
  milan        cosine=0.7766
  turin        cosine=0.7666
  venice       cosine=0.7592
  madrid       cosine=0.7566
  italian      cosine=0.7514
  aires        cosine=0.7429
  naples       cosine=0.7406
  buenos       cosine=0.7357
  lisbon       cosine=0.7245

Analogia: +['rome', 'france'] -['italy'] => ?
  paris        cosine=0.8582
  prohertrib   cosine=0.7304
  vienna       cosine=0.7211
  french       cosine=0.7168
  saint        cosine=0.7058
  gaulle       cosine=0.6880
  petersburg   cosine=0.6789
  berlin       cosine=0.6758
  geneva       cosine=0.6753
  strasbourg   cosine=0.6597

Analogia: +['king', 'woman'] -['man'] => ?
  queen        cosine=0.8524
  throne       cosine=0.7664
  prince       cosine=0.7592
  daughter     cosine=0.7474
  elizabeth    cosine=0.7460
  princess     cosine=0.7425
  kingdom      cosine=0.7337
  monarch      cosine=0.7214
  eldest       cosine=0.7185
  widow        cosin