In [3]:
# Install the packages listed in ../requirements.txt into the running kernel's
# environment (%pip targets the kernel, unlike !pip); -q keeps pip's output quiet.
%pip install -r ../requirements.txt -q

Note: you may need to restart the kernel to use updated packages.


In [105]:
import sys
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModel, utils
from bertviz import model_view, head_view

# Restrict transformers' own logger to error-level messages.
utils.logging.set_verbosity_error()

# Optional, currently disabled: set TOKENIZERS_PARALLELISM to "false".
# NOTE(review): presumably kept to silence the tokenizers parallelism warning — confirm before enabling.
# import os
# os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [71]:
# Hugging Face Hub identifiers of the checkpoints this notebook can load.
MODEL_MSFT_L12_H384 = 'microsoft/xtremedistil-l12-h384-uncased'
MODEL_MSFT_L6_H384 = 'microsoft/xtremedistil-l6-h384-uncased'
MODEL_DISTILL = 'distilbert-base-uncased'
MODEL_BERT_BASE = 'bert-base-uncased'
# The original line ended with a stray quote character, which made the whole cell a SyntaxError.
MODEL_BERT_TINY = 'prajjwal1/bert-tiny'

In [54]:
# Data directory, relative to the notebook's location.
DATA_DIR = '../data'
# CSV read below; the cells further down select its 'specific' and 'abstract' columns.
SPECIFIC_ABSTRACT_CSV = f'{DATA_DIR}/merge/specific_abstract.csv'
SPECIFIC_ABSTRACT_DATA = pd.read_csv(SPECIFIC_ABSTRACT_CSV)
# Bare last expression so the notebook renders the frame as the cell output.
SPECIFIC_ABSTRACT_DATA

Unnamed: 0,word,specific,abstract
0,Beautiful,Beautiful girl,Beautiful soul
1,World,The world is very old,He lives in his own world
2,School,The school is near the park.,School is a garden to nurture the mind
3,Oxygen,Oxygen is crucial to life,Music is my oxygen


In [16]:
def model_overview(model = None, model_name = None):
    """Summarise a transformer's size as a one-row DataFrame.

    Args:
        model: An already-loaded model exposing ``config.num_hidden_layers``,
            ``config.num_attention_heads`` and ``num_parameters()``. When
            omitted, the model is loaded from ``model_name``.
        model_name: Identifier passed to ``AutoModel.from_pretrained`` when
            ``model`` is None. When given, it is also used as the row label.

    Returns:
        A pandas DataFrame with the columns 'Layers', 'Heads' and 'Parameters'
        and a single row labelled with the model's name.

    Raises:
        ValueError: If neither ``model`` nor ``model_name`` is provided.
    """
    if model is None:
        if model_name is None:
            raise ValueError("One of model or model_name should be provided")
        model = AutoModel.from_pretrained(model_name, output_attentions=True)

    # Previously a caller passing only `model` got a row labelled None.
    # Fall back to the name recorded on the config, then to the class name.
    row_label = model_name
    if row_label is None:
        row_label = getattr(model.config, 'name_or_path', None) or type(model).__name__

    return pd.DataFrame(
        {
            'Layers': model.config.num_hidden_layers,
            'Heads': model.config.num_attention_heads,
            'Parameters': model.num_parameters(),
        },
        index=[row_label],
    )

In [17]:
# Load the 6-layer XtremeDistil checkpoint by name and display its layer/head/parameter counts.
model_overview(model_name=MODEL_MSFT_L6_H384)

config.json:   0%|          | 0.00/526 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Unnamed: 0,Layers,Heads,Parameters
microsoft/xtremedistil-l6-h384-uncased,6,12,22713216


In [104]:
def prepare_tf(model_name):
    """Load the tokenizer and model for ``model_name``.

    Args:
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained`` and
            ``AutoModel.from_pretrained``.

    Returns:
        A ``(model, tokenizer, config)`` tuple, where ``config`` is the loaded
        model's ``config`` attribute.
    """
    # NOTE(review): `resume_download` is reported as deprecated in recent
    # huggingface_hub releases — confirm against the versions in requirements.txt.
    tokenizer = AutoTokenizer.from_pretrained(model_name, resume_download=True)
    model = AutoModel.from_pretrained(model_name, resume_download=True, output_attentions=True)
    # `config` was never assigned in the original, so the return raised NameError
    # unless a stale global named `config` happened to exist in the kernel.
    config = model.config
    return (model, tokenizer, config)


def show_head_view(model, tokenizer, sentence_a, sentence_b=None):
    """
    Show bertviz's head view of the attention for one sentence or a sentence pair.

    Visualizing a sentence pair is model specific and not supported by all models:
    the tokenizer must return ``token_type_ids`` (otherwise a KeyError is raised).
    Models that support pair visualization:
        - bert-base-uncased

    Args:
        model: Transformer model, called as ``model(input_ids, ...)``.
        tokenizer: Tokenizer matching ``model``.
        sentence_a: First (or only) sentence.
        sentence_b: Optional second sentence. A falsy value (None or "") selects
            the single-sentence view.

    Returns:
        None. The visualization is rendered by ``head_view`` as a side effect.
    """
    # Calling the tokenizer directly replaces encode_plus; special tokens are
    # still requested explicitly, as in the original call.
    inputs = tokenizer(sentence_a, sentence_b, return_tensors='pt', add_special_tokens=True)
    input_ids = inputs['input_ids']

    # Ask for attentions explicitly so the result does not depend on how the
    # model was loaded.
    forward_kwargs = {'output_attentions': True}
    if sentence_b:
        token_type_ids = inputs['token_type_ids']
        forward_kwargs['token_type_ids'] = token_type_ids
        # The first position whose segment id is 1 is where sentence B starts.
        sentence_b_start = token_type_ids[0].tolist().index(1)
    else:
        sentence_b_start = None

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(input_ids, **forward_kwargs)

    # Prefer the named field. Tuple outputs have no `.attentions`; there the
    # original positional lookup is kept as the fallback.
    attention = getattr(outputs, 'attentions', None)
    if attention is None:
        attention = outputs[-1]

    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
    head_view(attention, tokens, sentence_b_start)

In [106]:
# Load the 6-layer XtremeDistil model and tokenizer; the visualization cells below reuse these globals.
model, tokenizer, config = prepare_tf(MODEL_MSFT_L6_H384)

In [110]:
# First row of the dataset: compare the 'specific' and 'abstract' sentences in the head view.
beautiful = SPECIFIC_ABSTRACT_DATA[['specific', 'abstract']].iloc[0]
show_head_view(model, tokenizer, beautiful.specific, beautiful.abstract)

<IPython.core.display.Javascript object>

In [109]:
# Third row of the dataset (positional index 2): same head-view comparison as for the first row.
school = SPECIFIC_ABSTRACT_DATA[['specific', 'abstract']].iloc[2]
show_head_view(model, tokenizer, school.specific, school.abstract)

<IPython.core.display.Javascript object>

In [111]:
# `attention` and `tokens` exist only as locals inside show_head_view, so this
# cell raised NameError on a fresh kernel. Compute them here for the "school"
# sentence pair before rendering the model view.
school_inputs = tokenizer(school.specific, school.abstract, return_tensors='pt')
with torch.no_grad():  # inference only, no gradients needed
    school_outputs = model(**school_inputs, output_attentions=True)
attention = school_outputs.attentions
tokens = tokenizer.convert_ids_to_tokens(school_inputs['input_ids'][0].tolist())
model_view(attention, tokens)

<IPython.core.display.Javascript object>

In [112]:
from bertviz import model_view

def visualize_attention(attention, input_ids):
    """Render bertviz's model view for ``attention``.

    ``input_ids`` is converted to token strings with the notebook-level
    ``tokenizer``; those strings are passed to ``model_view`` with ``attention``.
    """
    model_view(attention, tokenizer.convert_ids_to_tokens(input_ids))

def visualize_embeddings(embeddings, input_ids=None):
    """Render bertviz's model view for ``embeddings``, labelled with decoded tokens.

    Args:
        embeddings: Value passed straight through to ``model_view``.
            NOTE(review): ``model_view`` is used for attention elsewhere in this
            notebook — confirm that embeddings are a valid input for it.
        input_ids: Token ids to convert with the notebook-level ``tokenizer``.
            The original body read an ``input_ids`` name that was neither a
            parameter nor defined at notebook level. Pass it explicitly; if
            omitted, a notebook-level ``input_ids`` is used when one exists
            (legacy behaviour).

    Raises:
        ValueError: If no ``input_ids`` is passed and no notebook-level
            ``input_ids`` exists.
    """
    if input_ids is None:
        # Legacy fallback for callers that relied on a global `input_ids`.
        input_ids = globals().get('input_ids')
    if input_ids is None:
        raise ValueError("input_ids must be provided to visualize_embeddings")
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    model_view(embeddings, tokens)

In [26]:
# Tokenize a sample sentence into PyTorch tensors and run one forward pass through the model.
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)

#### References