In [None]:
!pip install transformers==4.41.2 --quiet
!pip install ipywidgets --quiet


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m105.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m98.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m61.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patheffects as pe

import ipywidgets as widgets
from transformers import AutoTokenizer, AutoModel

# 1. Load the tokenizer and the model for the chosen checkpoint.
#    The original note suggests "distilbert-base-multilingual-cased" as an
#    alternative checkpoint name to try here.
MODEL_NAME = "dccuchile/bert-base-spanish-wwm-cased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME, output_attentions=True)

# 2. Example sentence
sentence = "El perro que persiguió al gato cruzó la calle."

# 3. Tokenize into PyTorch tensors and run a forward pass without gradients
inputs = tokenizer(sentence, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Attention weights requested through output_attentions=True. The plotting
# code indexes them as att_matrices[layer][0, head], which it treats as a
# (seq_len, seq_len) matrix.
att_matrices = outputs.attentions

# 4. Convert the ids of the first (only) sequence back to token strings
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

# 5. Bar-chart view of the attention that one source token pays to every token
def plot_attention(layer=0, head=0, token_idx=0):
    """Plot the attention row of one source token as horizontal bars.

    Reads the module-level ``att_matrices`` and ``tokens``.

    Args:
        layer: Index into ``att_matrices`` selecting the layer.
        head: Index of the attention head within that layer.
        token_idx: Position of the source token whose attention row is drawn.
    """
    att = att_matrices[layer][0, head].cpu().numpy()  # (seq_len, seq_len)
    scores = att[token_idx]

    # Use numeric bar positions. Passing the token strings themselves as the
    # y values makes matplotlib treat them as categories, so a token that
    # occurs more than once would be collapsed onto a single bar and the
    # value labels below would no longer line up with their bars.
    positions = np.arange(len(tokens))

    fig, ax = plt.subplots(figsize=(7, 5))
    ax.barh(positions, scores, color='lightblue')
    ax.set_yticks(positions)
    ax.set_yticklabels(tokens)
    ax.set_xlabel('Attention Score')
    ax.set_title(f'Layer {layer}, Head {head}: attention from "{tokens[token_idx]}"')
    ax.set_xlim(0, 1)

    # Print each score just to the right of the end of its bar.
    for pos, score in zip(positions, scores):
        ax.text(score + 0.01, pos, f"{score:.2f}", va='center')

    # First token at the top, matching reading order.
    ax.invert_yaxis()
    fig.tight_layout()
    plt.show()




tokenizer_config.json:   0%|          | 0.00/364 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/648 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patheffects as pe

def plot_attention_lines_ultra_wide(layer=0, head=0, token_idx=0, thresh=0.02):
    """Draw the attention of one source token as lines between two token columns.

    The tokens are listed twice, as a left (source) and a right (destination)
    column. The selected source token is highlighted on the left, the
    destination with the highest score is highlighted on the right, and a
    line is drawn from the source to every destination whose score exceeds
    ``thresh``. Reads the module-level ``att_matrices`` and ``tokens``.

    Args:
        layer: Index into ``att_matrices`` selecting the layer.
        head: Index of the attention head within that layer.
        token_idx: Position of the source token.
        thresh: Scores at or below this value get neither a line nor a
            numeric label.
    """
    att = att_matrices[layer][0, head].cpu().numpy()
    scores = att[token_idx]  # attention row of the selected source token
    tokens_col = list(tokens)
    N = len(tokens_col)
    # Destination with the highest score; computed once instead of on every
    # iteration of the destination loop.
    max_dest = np.argmax(scores)

    # Very wide figure; height grows with the number of tokens.
    fig, ax = plt.subplots(figsize=(15, max(7, N * 0.48)))
    ax.axis('off')

    # Reversed so that the first token is drawn at the top.
    y_pos = np.arange(N)[::-1]
    norm = plt.Normalize(scores.min(), scores.max())
    cmap = plt.get_cmap('coolwarm')

    # Left column (source tokens)
    for i, token in enumerate(tokens_col):
        selected = i == token_idx
        ax.text(
            0.08, y_pos[i] / N, token, fontsize=19, ha='right', va='center',
            color='#2176ae' if selected else 'black',
            fontweight='bold' if selected else 'normal',
            family='monospace',
            bbox=dict(facecolor='#e3f3ff', edgecolor='#2176ae', boxstyle='round,pad=0.23') if selected else None,
            path_effects=[pe.withStroke(linewidth=3, foreground='white')] if selected else None,
        )

    # Right column (destination tokens)
    for i, token in enumerate(tokens_col):
        strongest = i == max_dest
        fw = 'bold' if strongest else 'normal'
        ax.text(
            0.92, y_pos[i] / N, token, fontsize=19, ha='left', va='center',
            color='black', fontweight=fw, family='monospace',
            bbox=dict(facecolor='#ffe6e6', edgecolor='#c44741', boxstyle='round,pad=0.23') if strongest else None,
            path_effects=[pe.withStroke(linewidth=3, foreground='white')] if strongest else None,
        )
        # Numeric score, placed well to the right of the token text
        if scores[i] > thresh:
            ax.text(1.38, y_pos[i] / N, f"{scores[i]:.2f}", fontsize=18, color='#c44741',
                    ha='right', va='center', family='monospace', fontweight=fw)

    # Attention lines: colour and width both follow the score
    for j in range(N):
        score = scores[j]
        if score > thresh:
            ax.plot([0.12, 0.91], [y_pos[token_idx] / N, y_pos[j] / N],
                    color=cmap(norm(score)), alpha=0.88,
                    linewidth=1.3 + 7 * score, solid_capstyle='round')

    # Extra room on the right so the numeric labels are not cut off
    ax.set_xlim(0, 1.45)
    ax.set_ylim(-0.1, 1.1)
    ax.set_title(f'Attention from {tokens_col[token_idx]}', fontsize=24, fontweight='bold', color='#2176ae', pad=20)
    # Figure-level text is positioned in figure-fraction coordinates (0..1),
    # not in the data coordinates of the axes; x must stay below 1 for the
    # label to fall inside the canvas.
    fig.text(0.98, 0.025, f"Σ attention: {scores.sum():.2f}", fontsize=13, color='gray', ha='right')
    fig.tight_layout(rect=[0, 0.04, 1, 0.98])
    plt.show()




In [None]:
# Sliders for choosing the layer, the attention head and the source token
layer_selector = widgets.IntSlider(description='Layer:', min=0, max=len(att_matrices) - 1, step=1, value=0)
head_selector = widgets.IntSlider(description='Head:', min=0, max=att_matrices[0].shape[1] - 1, step=1, value=0)
token_selector = widgets.IntSlider(description='Token:', min=0, max=len(tokens) - 1, step=1, value=1)

# Stack the sliders vertically and redraw the plot whenever one of them changes
ui = widgets.VBox(children=[layer_selector, head_selector, token_selector])
out = widgets.interactive_output(
    plot_attention_lines_ultra_wide,
    dict(layer=layer_selector, head=head_selector, token_idx=token_selector),
)

display(ui, out)


VBox(children=(IntSlider(value=0, description='Layer:', max=11), IntSlider(value=0, description='Head:', max=1…

Output()