In [1]:
# Colab-only step: attach Google Drive so files stored there are reachable
# under /content/drive for the rest of the notebook.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
import os
import sys

In [3]:
# Folder on the mounted Drive that the project-local imports in this cell
# (`model`, `config`, `train`) are resolved from.
PROJECT_DIR = '/content/drive/MyDrive/transformers_demo'

# Make the project modules importable. The membership check keeps sys.path
# free of duplicate entries when this cell is executed more than once.
if PROJECT_DIR not in sys.path:
    sys.path.append(PROJECT_DIR)

# Work from inside the project folder so relative paths resolve against it.
os.chdir(PROJECT_DIR)

import torch
import torch.nn as nn
import altair as alt
import pandas as pd
import numpy as np

from model import Transformer
from config import get_config,get_weights_file_path
from train import get_model,get_ds,greedy_decode






In [4]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print (f'using device {device}')

# Build the dataloaders, the source/target vocabularies and a model sized to
# the two vocabularies, then move the model to the selected device.
config = get_config()
train_dataloader,val_dataloader,vocab_src ,vocab_tgt = get_ds(config)
model = get_model(config,vocab_src.get_vocab_size(),vocab_tgt.get_vocab_size()).to(device)

# This notebook only runs inference, so put the model in evaluation mode.
# Training-only behaviour (e.g. any dropout layers the model may contain)
# would otherwise be active while the attention maps are captured.
model.eval()

# Restore the weights saved under the identifier "19".
# map_location remaps the stored tensors onto `device`, so a checkpoint that
# was written on a GPU machine still loads when only a CPU is available.
model_filename = get_weights_file_path(config, "19")
state = torch.load(model_filename, map_location=device)
model.load_state_dict(state['model_state_dict'])

using device cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

en-it/train-00000-of-00001.parquet:   0%|          | 0.00/5.73M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/32332 [00:00<?, ? examples/s]

max length of source:309
max length of target:274


<All keys matched successfully>

In [5]:
def load_next_batch():
  """Fetch one validation batch, run greedy decoding on it, and return its tokens.

  The decoding result itself is not used. The call is made so that the model
  performs a forward pass on this batch before the plotting helpers in this
  notebook read ``attention_scores`` from the model's layers
  (presumably those are populated during the forward pass -- confirm in model.py).

  NOTE(review): ``next(iter(val_dataloader))`` creates a fresh iterator on
  every call, so which batch is returned depends on how the dataloader is
  configured (e.g. whether it shuffles) -- confirm against ``get_ds``.

  Returns:
    tuple: ``(batch, encoder_input_tokens, decoder_input_tokens)`` where
    ``batch`` is the raw batch and the two lists hold the token strings of
    the first item's encoder / decoder input ids.
  """
  batch = next(iter(val_dataloader))
  encoder_input = batch['encoder_input'].to(device)
  encoder_mask = batch['encoder_mask'].to(device)

  # Only the first item of the batch is turned into token strings.
  encoder_input_tokens = [vocab_src.id_to_token(idx) for idx in encoder_input[0].cpu().numpy()]
  # The decoder input is only needed for its token labels, so it is read
  # directly instead of being moved to the device and back.
  decoder_input_tokens = [vocab_tgt.id_to_token(idx) for idx in batch['decoder_input'][0].cpu().numpy()]

  # Inference only: evaluation mode and no autograd bookkeeping.
  model.eval()
  with torch.no_grad():
    greedy_decode(model,encoder_input,encoder_mask,vocab_src,vocab_tgt,config['seq_len'],device)

  return batch, encoder_input_tokens, decoder_input_tokens



In [6]:
def mtx2df(m, max_row, max_col, row_tokens, col_tokens):
    """Flatten the top-left corner of a 2-D matrix into a long-format DataFrame.

    Args:
        m: 2-D array-like exposing ``.shape`` and ``m[r, c]`` indexing whose
            elements can be converted with ``float()``.
        max_row: maximum number of rows to keep (integer).
        max_col: maximum number of columns to keep (integer).
        row_tokens: labels for the rows; indices past its end are labelled
            ``<blank>``.
        col_tokens: labels for the columns; indices past its end are labelled
            ``<blank>``.

    Returns:
        pandas.DataFrame with one record per kept cell, in row-major order,
        and the columns ``row``, ``column``, ``value``, ``row_token``,
        ``col_token``. The token columns are prefixed with the zero-padded
        three-digit index (e.g. ``"007 word"``).
    """
    # Bound the ranges up front instead of walking the whole matrix and
    # filtering afterwards: only the kept cells are ever read from `m`.
    n_rows = min(m.shape[0], max_row)
    n_cols = min(m.shape[1], max_col)

    def _label(idx, tokens):
        # Zero-padded index keeps the labels unique and lexically ordered.
        return "%.3d %s" % (idx, tokens[idx] if len(tokens) > idx else "<blank>")

    # A label depends on a single index, so build each one exactly once.
    row_labels = [_label(r, row_tokens) for r in range(n_rows)]
    col_labels = [_label(c, col_tokens) for c in range(n_cols)]

    return pd.DataFrame(
        [
            (r, c, float(m[r, c]), row_labels[r], col_labels[c])
            for r in range(n_rows)
            for c in range(n_cols)
        ],
        columns=["row", "column", "value", "row_token", "col_token"],
    )

def get_attn_map(attn_type: str, layer: int, head: int):
    """Return the stored attention scores of one head for the first batch item.

    Args:
        attn_type: ``"encoder"`` (encoder self-attention), ``"decoder"``
            (decoder self-attention) or ``"encoder-decoder"`` (decoder
            cross-attention).
        layer: index into ``model.encoder.layers`` / ``model.decoder.layers``.
        head: index of the attention head.

    Returns:
        ``attention_scores[0, head]`` of the selected attention block,
        detached from the autograd graph.

    Raises:
        ValueError: if ``attn_type`` is not one of the three supported values.
    """
    if attn_type == "encoder":
        block = model.encoder.layers[layer].self_attention_block
    elif attn_type == "decoder":
        block = model.decoder.layers[layer].self_attention_block
    elif attn_type == "encoder-decoder":
        block = model.decoder.layers[layer].cross_attention_block
    else:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError(
            f"Unknown attn_type {attn_type!r}; expected 'encoder', 'decoder' or 'encoder-decoder'"
        )
    # Index 0 selects the first item of the batch.
    return block.attention_scores[0, head].detach()

def attn_map(attn_type, layer, head, row_tokens, col_tokens, max_sentence_len):
    """Build an Altair heat map for one attention head.

    The scores come from ``get_attn_map(attn_type, layer, head)`` and are
    cropped to ``max_sentence_len`` rows and columns by ``mtx2df``. Column
    tokens go on the x axis, row tokens on the y axis, and the score drives
    the colour. The chart is 400x400, titled ``Layer <layer> Head <head>``,
    and returned after calling ``.interactive()`` on it.
    """
    scores = get_attn_map(attn_type, layer, head)
    cells = mtx2df(scores, max_sentence_len, max_sentence_len, row_tokens, col_tokens)

    heatmap = alt.Chart(data=cells).mark_rect()
    heatmap = heatmap.encode(
        # Axis titles are blanked; the tick labels already carry the tokens.
        x=alt.X("col_token", axis=alt.Axis(title="")),
        y=alt.Y("row_token", axis=alt.Axis(title="")),
        color="value",
        tooltip=["row", "column", "value", "row_token", "col_token"],
    )
    heatmap = heatmap.properties(height=400, width=400, title=f"Layer {layer} Head {head}")
    return heatmap.interactive()

def get_all_attention_maps(attn_type: str, layers: list[int], heads: list[int], row_tokens: list, col_tokens, max_sentence_len: int):
    """Arrange the attention heat maps of several layers and heads in a grid.

    Each requested layer becomes one horizontal strip (``alt.hconcat``) with
    one ``attn_map`` chart per requested head; the strips are then stacked
    vertically (``alt.vconcat``) in the order the layers were given.
    """
    def _layer_strip(layer):
        # One chart per head, laid out side by side.
        return alt.hconcat(
            *[
                attn_map(attn_type, layer, head, row_tokens, col_tokens, max_sentence_len)
                for head in heads
            ]
        )

    return alt.vconcat(*[_layer_strip(layer) for layer in layers])

In [7]:
# Decode a single validation example. One call is enough: the plots below read
# the attention scores that the model holds after this call, so an earlier
# extra call would only be wasted work.
batch, encoder_input_tokens, decoder_input_tokens = load_next_batch()
print(f'Source: {batch["src_text"][0]}')
print(f'Target: {batch["tgt_text"][0]}')

# Number of source tokens before the first "[PAD]" token. When the list holds
# no "[PAD]" token at all, use its full length instead of letting `.index`
# raise ValueError.
sentence_len = (
    encoder_input_tokens.index("[PAD]")
    if "[PAD]" in encoder_input_tokens
    else len(encoder_input_tokens)
)




Source: And up the slope of Cooper's Hill, just opposite, are gathered the wondering rustics and curious townsfolk, who have run from Staines, and none are quite sure what the bustle is about, but each one has a different version of the great event that they have come to see; and some say that much good to all the people will come from this day's work; but the old men shake their heads, for they have heard such tales before.
Target: E su per il declivio della collina di Cooper, precisamente di fronte, si son raccolti i villani stupiti e gli abitanti della città, incuriositi, accorsi da Staines. Nessuno è certo di che si tratti, e ciascuno ha una versione del grande evento che si deve vedere; e alcuni dicono che molto bene verrà al popolo dall’opera di quel giorno; ma i vecchi scuotono il capo, perchè da tempo hanno sentito le stesse cose.


In [8]:
# Layers and heads to draw: the grid has one row per layer and one column per head.
layers = [1, 2, 3]
heads = list(range(8))

# Encoder self-attention: source tokens label both axes; at most the first
# 20 positions (or fewer, if the sentence is shorter) are shown.
get_all_attention_maps(
    attn_type="encoder",
    layers=layers,
    heads=heads,
    row_tokens=encoder_input_tokens,
    col_tokens=encoder_input_tokens,
    max_sentence_len=min(20, sentence_len),
)

In [9]:
# Decoder self-attention: target tokens label both axes.
# NOTE(review): the crop length is derived from the *source* sentence
# (`sentence_len`) -- confirm this is intended for the decoder maps.
get_all_attention_maps(
    attn_type="decoder",
    layers=layers,
    heads=heads,
    row_tokens=decoder_input_tokens,
    col_tokens=decoder_input_tokens,
    max_sentence_len=min(20, sentence_len),
)

In [10]:
# Decoder cross-attention: source tokens are passed as row labels and target
# tokens as column labels.
# NOTE(review): if the stored cross-attention matrix is laid out as
# (decoder position, encoder position), these two token lists are swapped --
# confirm against the model's attention implementation.
get_all_attention_maps(
    attn_type="encoder-decoder",
    layers=layers,
    heads=heads,
    row_tokens=encoder_input_tokens,
    col_tokens=decoder_input_tokens,
    max_sentence_len=min(20, sentence_len),
)