In [863]:
import numpy as np

In [112]:
from transformers import BartTokenizer, BartForCausalLM

tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
model = BartForCausalLM.from_pretrained("facebook/bart-base", add_cross_attention=False)
# assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."

Some weights of the model checkpoint at facebook/bart-base were not used when initializing BartForCausalLM: ['encoder.layers.1.self_attn.v_proj.weight', 'encoder.layers.5.fc2.bias', 'encoder.layers.1.final_layer_norm.bias', 'encoder.layers.4.fc1.weight', 'encoder.layers.2.self_attn.k_proj.bias', 'encoder.layers.3.self_attn.k_proj.bias', 'encoder.layers.5.self_attn.q_proj.bias', 'encoder.layers.1.self_attn.out_proj.weight', 'encoder.layers.0.fc1.bias', 'encoder.layers.1.self_attn_layer_norm.weight', 'encoder.layers.0.fc2.weight', 'encoder.layers.0.self_attn_layer_norm.bias', 'encoder.layers.0.self_attn.q_proj.bias', 'encoder.layers.3.fc2.weight', 'encoder.layers.1.fc2.weight', 'encoder.layers.3.final_layer_norm.bias', 'encoder.layers.3.self_attn.q_proj.weight', 'encoder.layers.5.final_layer_norm.weight', 'encoder.layers.2.self_attn.q_proj.weight', 'encoder.embed_tokens.weight', 'encoder.layers.2.final_layer_norm.weight', 'encoder.layers.4.final_layer_norm.bias', 'encoder.layers.2.self_a

In [4]:
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)

logits = outputs.logits

In [5]:
expected_shape = [1, inputs.input_ids.shape[-1], model.config.vocab_size]
list(logits.shape) == expected_shape

True

In [7]:
logits.shape 

torch.Size([1, 8, 50265])

# Roberta Causal LM

In [94]:
from transformers import RobertaTokenizer, RobertaForMaskedLM
import torch

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaForMaskedLM.from_pretrained("roberta-base")

In [None]:
model.generate()

In [110]:
inputs = tokenizer("He said Jonathan <mask>", return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

# retrieve index of <mask>
mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]

predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
tokenizer.decode(predicted_token_id)

':'

In [91]:
from transformers import RobertaTokenizer, RobertaForCausalLM, RobertaConfig
config = RobertaConfig.from_pretrained("roberta-base")
config.is_decoder = True
model = RobertaForCausalLM.from_pretrained("roberta-base", config=config)
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)

In [48]:
outputs.logits

tensor([[[27.7075, -4.6183, 25.2726,  ...,  0.9433,  2.5662, 11.1329],
         [12.9955, -3.3435, 32.1472,  ...,  2.9392,  0.1569,  9.3820],
         [ 9.7384, -3.5373, 21.8192,  ...,  0.4942, -0.2668,  6.7838],
         ...,
         [ 5.6973, -4.3181, 21.6467,  ..., -3.2017, -4.4629,  4.5802],
         [ 4.9998, -4.6587, 15.6738,  ..., -3.9533, -4.5533,  2.9168],
         [24.6395, -4.3410, 23.1517,  ...,  0.3120,  2.7200,  9.1151]]],
       grad_fn=<AddBackward0>)

In [53]:
outputs.logits.argmax(axis=-1)[0]

tensor([0, 2, 6, 2, 2, 2, 2, 0])

In [52]:
inputs['input_ids'].shape 

torch.Size([1, 8])

In [128]:
tokenizer.add_special_tokens({'additional_special_tokens': ['<ANS>']})

0

In [121]:
tokenizer.add_tokens('<ANS>')

1

In [131]:
from transformers import DataCollatorForLanguageModeling

In [136]:
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [137]:
x_seq = tokenizer.encode('hello my name is')

In [138]:
data_collator([x_seq])

{'input_ids': tensor([[31373,   616,  1438,   318]]),
 'labels': tensor([[31373,   616,  1438,   318]])}

In [145]:
tokenizer.sep_token_id

In [129]:
tokenizer.additional_special_tokens_ids

[50257]

In [101]:
tokenizer('Hello <ANS>')

{'input_ids': [0, 31414, 50265, 2], 'attention_mask': [1, 1, 1, 1]}

In [104]:
tokenizer.mask_token

'<mask>'

In [147]:
tokenizer.eos_token_id

50256

In [76]:
tokenizer.encode('[SOURCE]')

[0, 10975, 48560, 742, 2]

In [67]:
import pandas as pd 
training_df = pd.read_csv('../data/our-annotated-data-full.csv.gz')

In [186]:
from transformers import RobertaForCausalLM

In [225]:
from transformers import RobertaModel, BertModel
from transformers import BertTokenizer

In [257]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [259]:
model.embeddings.token_type_embeddings

Embedding(2, 768)

In [169]:
from sklearn.model_selection import train_test_split
entry_ids = training_df['entry_id'].unique().tolist()

train_files, test_files = train_test_split(entry_ids)

split_df = pd.concat([
    pd.Series(train_files).to_frame('file_id').assign(split='train'),
    pd.Series(test_files).to_frame('file_id').assign(split='test')
])

(
    training_df[['sentence', 'head', 'entry_id', 'sent_idx']]
     .assign(head=lambda df: df['head'].fillna('None'))
     .merge(split_df, left_on='entry_id', right_on='file_id')
     .assign(entry_id=lambda df: '/' + df['split'] + '/' + df['entry_id'])
     .to_csv('../models_neural/quote_attribution/data/our-annotated-data__stage-2.tsv', sep='\t', index=False, header=False)
)

In [228]:
input_x = t('hello there', 'goodbye there', return_tensors='pt')

In [229]:
model(**input_x)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.4661,  0.4146, -0.2632,  ..., -0.6092,  0.3328,  0.2058],
         [ 0.0529, -0.1565,  0.6466,  ..., -0.1399,  0.5327, -0.0591],
         [-0.5668,  0.3817,  0.0310,  ...,  0.1186,  0.1656, -0.8779],
         ...,
         [ 0.5606, -0.7884,  0.8330,  ...,  0.1023, -0.4338,  0.1270],
         [-0.3168,  0.7927,  0.4076,  ..., -0.2205, -0.1307, -0.7212],
         [ 0.7880, -0.0190, -0.3237,  ...,  0.2183, -0.6140, -0.4414]]],
       grad_fn=<NativeLayerNormBackward>), pooler_output=tensor([[-0.9465, -0.6739, -0.9829,  0.9434,  0.8044, -0.4242,  0.9825,  0.4936,
         -0.9570, -1.0000, -0.6956,  0.9828,  0.9738,  0.8625,  0.9699, -0.8913,
         -0.7156, -0.7530,  0.5997, -0.8519,  0.8314,  1.0000, -0.2099,  0.5029,
          0.6551,  0.9992, -0.9150,  0.9447,  0.9711,  0.6582, -0.9089,  0.3807,
         -0.9853, -0.4662, -0.9850, -0.9959,  0.6584, -0.8156, -0.2740, -0.2343,
         -0.9203,  0.5705,  1.000

In [260]:
model = RobertaModel.from_pretrained('/Users/alex/.cache/torch/transformers/named-models/roberta-base-expanded-embeddings')
tokenizer = RobertaTokenizer.from_pretrained('/Users/alex/.cache/torch/transformers/named-models/roberta-base-expanded-embeddings') 

model = RobertaModel.from_pretrained('roberta-base')
tokenizer = RobertaTokenizer.from_pretrained('roberta-base') 

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [242]:
x = tokenizer.encode('hello there', return_tensors='pt')

In [243]:
x = tokenizer('hello there. goodbye there', return_tensors='pt')

In [244]:
x

{'input_ids': tensor([[    0, 42891,    89,     4, 15364,    89,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}

In [271]:
token_type_ids = torch.tensor([[0, 0, 0, 1, 1, 2, 2]])

In [272]:
token_type_ids.shape 

torch.Size([1, 7])

In [273]:
x['input_ids'].shape 

torch.Size([1, 7])

In [274]:
model.forward(**x, token_type_ids=token_type_ids)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0946,  0.1196, -0.0174,  ..., -0.0560, -0.0533,  0.0211],
         [-0.1686,  0.0156, -0.0141,  ..., -0.2361,  0.0356,  0.2221],
         [-0.0015, -0.0080, -0.1995,  ..., -0.2882, -0.0861,  0.4210],
         ...,
         [-0.1293,  0.1659,  0.0207,  ..., -0.6263,  0.0744,  0.2371],
         [-0.0430,  0.0038, -0.1387,  ..., -0.2977, -0.0662,  0.3205],
         [-0.0884,  0.1214, -0.0362,  ..., -0.1134, -0.0571, -0.0015]]],
       grad_fn=<NativeLayerNormBackward>), pooler_output=tensor([[ 1.5123e-03, -2.0938e-01, -2.2946e-01, -6.5062e-02,  1.2375e-01,
          1.8425e-01,  2.6301e-01, -6.8447e-02, -6.4302e-02, -1.9072e-01,
          2.1776e-01, -5.5137e-03, -1.2040e-01,  1.1931e-01, -1.3478e-01,
          4.9272e-01,  2.0823e-01, -4.7860e-01,  6.7879e-02, -3.0249e-02,
         -2.8252e-01,  9.0588e-02,  4.7406e-01,  3.5760e-01,  1.2559e-01,
          6.0991e-02, -1.4532e-01,  6.8363e-03,  1.7409e-01,  2.3602

In [256]:
model.embeddings.token_type_embeddings

Embedding(1, 768)

In [267]:
t = model.embeddings.token_type_embeddings

In [268]:
model.embeddings.token_type_embeddings

Embedding(1, 768)

In [269]:
model.config.type_vocab_size = 3
single_emb = model.embeddings.token_type_embeddings
model.embeddings.token_type_embeddings = torch.nn.Embedding(3, single_emb.embedding_dim)
model.embeddings.token_type_embeddings.weight = torch.nn.Parameter(single_emb.weight.repeat([3, 1]))

In [None]:
model

In [283]:
from transformers import RobertaForQuestionAnswering
tokenizer = RobertaTokenizer.from_pretrained("deepset/roberta-base-squad2")
model = RobertaForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")

In [284]:
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"

inputs = tokenizer(question, text, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

In [288]:
tokenizer.decode(inputs['input_ids'][0])

'<s>Who was Jim Henson?</s></s>Jim Henson was a nice puppet</s>'

# GPT2

In [120]:
tokenizer.encode('hello my name is')

[31373, 616, 1438, 318]

In [275]:
from transformers import GPT2Model

In [276]:
model = GPT2Model.from_pretrained('gpt2')

In [119]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
input_context = "My cute dog"

# get tokens of words that should not be generated
bad_words_ids = [
    tokenizer(bad_word, add_prefix_space=True).input_ids 
    for bad_word in ["idiot", "stupid", "shut up"]
]

# encode input context
input_ids = tokenizer(input_context, return_tensors="pt").input_ids

# generate sequences without allowing bad_words to be generated
outputs = model.generate(
    input_ids=input_ids, 
    max_length=20, 
    do_sample=True, 
    bad_words_ids=bad_words_ids
)

TypeError: _batch_encode_plus() got an unexpected keyword argument 'add_prefix_space'

# Multiheaded Self-Attention

In [327]:
import torch

In [935]:
S = 5
N = 10
E = 512

L = 1
x = torch.rand(( N, S, E))

query_layer = torch.nn.Linear(S, L)
key_layer = torch.nn.Linear(S, S)
value_layer = torch.nn.Linear(S, S)
attention = torch.nn.MultiheadAttention(E, 8, batch_first=True)

batch_size, seq_len, e_dim = x.shape

x_t = x.view(batch_size, e_dim, seq_len)
Q = query_layer(x_t).view(batch_size, L, e_dim)
K = key_layer(x_t).view(batch_size, seq_len, e_dim)
V = value_layer(x_t).view(batch_size, seq_len, e_dim)

o, attn_weights = attention.forward(Q, K, V)

In [937]:
x_t.shape 

torch.Size([10, 512, 5])

In [938]:
Q.shape 

torch.Size([10, 1, 512])

In [939]:
o.shape

torch.Size([10, 1, 512])

In [454]:
o.shape

torch.Size([10, 1, 512])

In [455]:
attn_weights[0].shape

torch.Size([1, 5])

In [456]:
attn_weights[0].sum(axis=0)

tensor([0.2011, 0.1972, 0.2087, 0.1930, 0.2000], grad_fn=<SumBackward1>)

# Process Special Tokens

In [894]:
import spacy
import sys
import utils_params as params
from importlib import reload
reload(utils_params)
import jellyfish


nlp = spacy.load('en_core_web_lg')
sys.path.append('../models_neural/quote_attribution/')

In [504]:
example_doc = [
    ['BANGKOK', 'None', '/test/doc_902', '0', 'doc_902', 'test'], 
    ['—', 'None', '/test/doc_902', '1', 'doc_902', 'test'], 
    ['A plane carrying key senior Laotian government officials crashed Saturday morning , leaving at least four people dead , Laotian diplomats said Saturday .  ', 'Laotian diplomats', '/test/doc_902', '2', 'doc_902', 'test'], 
    ['Killed in the crash were two top figures in the security apparatus of the authoritarian Lao government : the deputy prime minister , Douangchay Phichit , and Thongbane Sengaphone , the minister of public security , according to two Lao diplomats .  ', 'None', '/test/doc_902', '3', 'doc_902', 'test'], ['For a Communist party that relies on force and intimidation to stay in power , the loss of what were arguably the two most powerful people in the security apparatus was a significant blow .  ', 'None', '/test/doc_902', '4', 'doc_902', 'test'], ['The governor of Vientiane province was also killed in the crash .  ', 'None', '/test/doc_902', '5', 'doc_902', 'test'], ['In addition to his post as deputy prime minister Mr. Douangchay was defense minister and a member of the Politburo , the highest decision - making body of the Communist party .', 'None', '/test/doc_902', '6', 'doc_902', 'test'], ['Mr. Thongbane , the public security head , was feared in the country and was said to be one of the officials leading a crackdown against dissent over the past year and half .  ', 'None', '/test/doc_902', '7', 'doc_902', 'test'], ['That crackdown included the disappearance of the most prominent civic leader in the country , Sombath Somphone , a United States - trained agriculture specialist who led efforts to liberalize the hermetic communist leadership .  ', 'None', '/test/doc_902', '8', 'doc_902', 'test'], ['Mr. Sombath was stopped at a police checkpoint in Dec. 2012 and has not been seen again .  ', 'None', '/test/doc_902', '9', 'doc_902', 'test'], ['The Lao news agency posted photos to a web site Saturday showing the mangled wreckage of the plane , a Russian - made Antonov AN-74TK300 .  ', 'The Lao news agency', '/test/doc_902', '10', 'doc_902', 'test'], ['The news agency , citing a statement from the prime minister ’s office , said it was a Laotian Air Force aircraft', 'The Lao news agency', '/test/doc_902', '11', 'doc_902', 'test'], ['and it had crashed in Nadi village , west of the Xiangkhouang airport and not far from a major archeological site of prehistoric carved stone vessels , the Plain of Jars .  ', 'The Lao news agency', '/test/doc_902', '12', 'doc_902', 'test'], ['The authorities were “ helping to rescue the survivors , ” the news agency said , without offering details on the number of people killed .  ', 'The Lao news agency', '/test/doc_902', '13', 'doc_902', 'test'], ['The plane was traveling from the capital , Vientiane , to the mountainous northeastern province of Xiangkhouang , where the officials were due to attend a military ceremony .  ', 'None', '/test/doc_902', '14', 'doc_902', 'test'], ['State television in Laos showed footage of rescue workers recovering debris from the aircraft , which appeared to have crashed in a jungle - covered area .  ', 'State television', '/test/doc_902', '15', 'doc_902', 'test'], ['A news presenter said the crash occurred at around 7 a.m.', 'State television', '/test/doc_902', '16', 'doc_902', 'test'], ['“', 'None', '/test/doc_902', '17', 'doc_902', 'test'], ['The cause of the plane crash is still unknown , ” the presenter said .  ', 'State television', '/test/doc_902', '18', 'doc_902', 'test'], ['A Lao Facebook page showed images of thick black smoke rising up near what appeared to be an airport runway .', 'The Lao news agency', '/test/doc_902', '19', 'doc_902', 'test'], ['The images could not be independently confirmed .  ', 'None', '/test/doc_902', '20', 'doc_902', 'test'], ['The crash was the second in Laos in the last year .', 'None', '/test/doc_902', '21', 'doc_902', 'test'], ['Last October 49 people were killed when a Lao Airlines flight crashed in the south of the country .  ', 'None', '/test/doc_902', '22', 'doc_902', 'test'], ['The Foreign Ministry in neighboring Thailand said it “ received reports ” about the crash .  ', 'Foreign Ministry', '/test/doc_902', '23', 'doc_902', 'test'], ['“', 'None', '/test/doc_902', '24', 'doc_902', 'test'], ['There were about 20 passengers on board of which most were of high stature , ” said Sek Wannamethee , a spokesman for the Thai Foreign Ministry .', 'Sek Wannamethee', '/test/doc_902', '25', 'doc_902', 'test']]

In [736]:
# step 1. extract a candidate list
# step 2. reconcile the extracted candidate list with the list of annotated sources to see 
#             if any are slight deviations. Use the extracted source version
# step 3. get the word offsets of the first full mention of the source in the document
# step 4. generate token sequences for each source, generate token sequences for each sentences
# step 5. get the cross-product of source and sentence token sequences

In [738]:
ent_candidates = get_unique_spacy_ents(ent_candidates)

In [None]:
all_ent_candidates_df['source_tokenized'] = output

In [None]:
all_ent_candidates_df = pd.DataFrame(all_ent_candidates).drop_duplicates('candidate')

In [1012]:
pd.Series(output_data[-1])

source_ind_tokens      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
sentence_ind_tokens    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
doc_tokens             [0, 387, 9298, 530, 9335, 2, 0, 578, 2, 0, 250...
label                                                              False
dtype: object

In [2064]:
len(output_data)

951

In [1018]:
candidate_set

['BANGKOK',
 'Laotian',
 'government officials',
 'Laotian diplomats',
 'four people',
 'Lao',
 'Douangchay Phichit',
 'Thongbane Sengaphone',
 'diplomats',
 'Communist',
 'people',
 'Vientiane province',
 'Douangchay',
 'Politburo',
 'Thongbane',
 'officials',
 'a United States',
 'Sombath',
 'police',
 'Russian',
 'Antonov',
 'the lao news agency',
 'Air Force',
 'Nadi',
 'Xiangkhouang',
 'authorities',
 'Vientiane',
 'Laos',
 'state television',
 'Lao Airlines',
 'The Foreign Ministry',
 'Thailand',
 'reports',
 'Sek Wannamethee',
 'the Thai Foreign Ministry',
 'spokesman']

In [1029]:
from transformers import AutoConfig
from transformers import BertModel

In [1030]:
c = AutoConfig.from_pretrained('bert-base-uncased')

In [1042]:
c

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.12.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [1053]:
from transformers import AutoTokenizer

In [None]:
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [1043]:
S = 5
N = 10
E = 768

In [1044]:
hidden = torch.rand((N, S, E))

In [1047]:
model.encoder.layer[0].attention.self.forward(hidden)[0].shape 

torch.Size([10, 5, 768])

In [1295]:
input_ids = list(map(lambda x: tokenizer.encode( x, return_tensors='pt').squeeze() , [
    'Hello my name is Alex',
    'Alex is my name'
]))

input_ids = pad_sequence( input_ids, batch_first=True)

In [1064]:
from torch.nn.utils.rnn import pack_sequence, pad_packed_sequence, pad_sequence

In [1080]:
model.embeddings.forward(input_ids).shape

torch.Size([2, 7, 768])

In [1083]:
import torch 
max_seq_len = 10        # S
emb_dim = 512           # E
hidden_dim = 768        # H
output_seq_len = 1      # L
num_heads = 12          # M

query_layer = torch.nn.Linear(emb_dim, hidden_dim)
compressor_layer = torch.nn.Linear(max_seq_len, output_seq_len)
key_layer = torch.nn.Linear(emb_dim, hidden_dim)
value_layer = torch.nn.Linear(emb_dim, hidden_dim)
attention = torch.nn.MultiheadAttention(hidden_dim, num_heads, batch_first=True)

batch_size = 15
x_seq_len = 5
x = torch.rand((batch_size, x_seq_len, emb_dim))

z = torch.zeros(batch_size, max_seq_len - x_seq_len, emb_dim)
x_inp = torch.hstack([x, z])

Q = query_layer(x_inp)
V = value_layer(x_inp)
K = key_layer(x_inp)

Q_s = compressor_layer(Q.permute(0, 2, 1)).permute(0, 2, 1)

K.shape 

Q.shape 

o, _ = attention.forward(Q_s, K, V, need_weights=False)

o.squeeze()

In [1161]:
o.squeeze().shape

torch.Size([15, 768])

In [1184]:
from torch import nn
model_dim = 512

In [1193]:
x.shape 

torch.Size([15, 5, 512])

In [1209]:
attn_mask = torch.tensor([[[1.,1.,1.,0.,0.]]]*15)

In [1210]:
attn_mask.shape 

torch.Size([15, 1, 5])

In [1211]:
query_layer = torch.nn.Linear(emb_dim, hidden_dim)
key_layer = torch.nn.Linear(emb_dim, hidden_dim)
value_layer = torch.nn.Linear(emb_dim, hidden_dim)

In [1212]:
cls_repr = nn.Parameter(torch.zeros(hidden_dim))
attn = torch.nn.MultiheadAttention(hidden_dim, 8, batch_first=True)

B, T, D = x.size()  # [Batch, Time, Dim]
query = cls_repr.view(1, 1, hidden_dim).repeat(B, 1, 1)

key = key_layer(x)
value = value_layer(x)

In [1213]:
# Args: Query, Key, Value, Mask
cls_repr, _  = attn.forward(query, key, value, need_weights=False, attn_mask=attn_mask)

RuntimeError: The shape of the 3D attn_mask is torch.Size([15, 1, 5]), but should be (120, 1, 5).

In [1192]:
cls_repr.view(B, hidden_dim)  # [B, D]

tensor([[-0.0860,  0.1208,  0.0414,  ...,  0.0887,  0.1254, -0.0819],
        [-0.1348,  0.1030,  0.0248,  ...,  0.1277,  0.0863, -0.0379],
        [-0.1329,  0.1024,  0.0467,  ...,  0.1019,  0.1245, -0.0619],
        ...,
        [-0.1245,  0.0934,  0.0171,  ...,  0.1142,  0.1566, -0.0690],
        [-0.1655,  0.0969,  0.0866,  ...,  0.1305,  0.1565, -0.0559],
        [-0.0945,  0.1202,  0.0599,  ...,  0.0963,  0.0844, -0.0768]],
       grad_fn=<ViewBackward>)

In [1181]:
cls_repr.shape

torch.Size([15, 1, 512])

In [1217]:
a = torch.randn(1, 2, 3, 4)
b = a.transpose(1, 2)  # Swaps 2nd and 3rd dimension
c = a.view(1, 3, 2, 4) 

In [1224]:
ordered_inp = torch.arange(15*768).reshape(1, 15, 768)

In [1225]:
ordered_inp

tensor([[[    0,     1,     2,  ...,   765,   766,   767],
         [  768,   769,   770,  ...,  1533,  1534,  1535],
         [ 1536,  1537,  1538,  ...,  2301,  2302,  2303],
         ...,
         [ 9216,  9217,  9218,  ...,  9981,  9982,  9983],
         [ 9984,  9985,  9986,  ..., 10749, 10750, 10751],
         [10752, 10753, 10754,  ..., 11517, 11518, 11519]]])

In [1226]:
ordered_inp.contiguous().view(1, 15 * 12, 64)

tensor([[[    0,     1,     2,  ...,    61,    62,    63],
         [   64,    65,    66,  ...,   125,   126,   127],
         [  128,   129,   130,  ...,   189,   190,   191],
         ...,
         [11328, 11329, 11330,  ..., 11389, 11390, 11391],
         [11392, 11393, 11394,  ..., 11453, 11454, 11455],
         [11456, 11457, 11458,  ..., 11517, 11518, 11519]]])

In [None]:
.transpose(0, 1).shape 
Out[2]: torch.Size([180, 1, 64])
q.contiguous().shape 
Out[3]: torch.Size([1, 15, 768])
q.contiguous()

In [1229]:
s = nn.Softmax()

In [1238]:
s(torch.tensor([1.,2.,3., -np.inf, -np.inf]))

  """Entry point for launching an IPython kernel.


tensor([0.0900, 0.2447, 0.6652, 0.0000, 0.0000])

In [1510]:
emb_dim = 768
hidden_dim = 768

query_layer = torch.nn.Linear(emb_dim, hidden_dim)
key_layer = torch.nn.Linear(emb_dim, hidden_dim)
value_layer = torch.nn.Linear(emb_dim, hidden_dim)
compressor_layer = torch.nn.Linear(max_sequence_len, 1)
attn = torch.nn.MultiheadAttention(hidden_dim, 12, batch_first=True)

In [1882]:
input_ids.shape 

torch.Size([3, 10])

In [2039]:
input_ids = list(map(lambda x:
                     tokenizer.encode(x, return_tensors='pt').squeeze(),
                     ['Hello my name is Alex there man',
                      'Alex is there, man, man, man.',
                      'Alex is there, man.',
                     ]
                    )
)

max_sequence_len = 13

seq_lens = list(map(len, input_ids))
max_seq_len = max(seq_lens)
attn_mask = list(map(lambda x: [1] * x + [0] * (max_seq_len - x), seq_lens))
attn_mask = torch.tensor(attn_mask)
input_ids = pad_sequence(input_ids, batch_first=True)
batch_size, _ = input_ids.shape 
hidden = model.embeddings(input_ids)

num_cols_zeros = max_sequence_len - hidden.shape[1]
hidden = torch.hstack([hidden, torch.zeros((batch_size, num_cols_zeros, 768))])
attn_mask = torch.hstack([attn_mask, torch.zeros((batch_size, num_cols_zeros))])

In [2040]:
hidden_test = hidden

In [2053]:
hidden_test = hidden.where(hidden != 0, torch.ones_like(hidden) * 100)

In [2054]:
Q = query_layer(hidden_test)
K = key_layer(hidden_test)
V = value_layer(hidden_test)

t_a = attn_mask.unsqueeze(-1).repeat(1, 1, hidden_dim)
masked_Q = Q + torch.where(t_a != 0, torch.zeros_like(Q), -Q)
masked_K = K + torch.where(t_a != 0, torch.zeros_like(K), -K)
masked_V = V + torch.where(t_a != 0, torch.zeros_like(V), -V)

Q_s = compressor_layer(masked_Q.permute(0, 2, 1)).permute(0, 2, 1)

In [2055]:
Q_first_row_old = Q_s[0]
V_first_row_old = masked_V[0]
K_first_row_old = masked_K[0]

In [2056]:
Q_first_row_old.isclose(Q_s[0]).all()

tensor(True)

In [2057]:
V_first_row_old.isclose(masked_V[0]).all()

tensor(True)

In [2058]:
K_first_row_old.isclose(masked_K[0]).all()

tensor(True)

In [2059]:
# attn_mask.repeat(1, 1, 12).reshape(24, 1, 13)

In [2060]:
t = attn_mask.unsqueeze(1).repeat(12, 1, 1)

In [2061]:
t_a_2 = attn_mask.unsqueeze(1).repeat(12, 1, 1)
t_a_2 = torch.log(t_a_2)

In [2062]:
o, _ = attn.forward(Q_s, masked_K, masked_V, need_weights=False)#, attn_mask=t_a_2)

In [2051]:
t_o = o

In [2063]:
o

tensor([[[-0.0601, -0.0689, -0.0302,  ...,  0.0335,  0.0272,  0.0230]],

        [[-0.0581, -0.0296, -0.0408,  ..., -0.0189,  0.0677,  0.0551]],

        [[-0.0462, -0.0207, -0.0120,  ...,  0.0096,  0.0374,  0.0389]]],
       grad_fn=<TransposeBackward0>)

In [2038]:
o

tensor([[[-0.0601, -0.0689, -0.0302,  ...,  0.0335,  0.0272,  0.0230]],

        [[-0.0510, -0.0283, -0.0287,  ..., -0.0005,  0.0516,  0.0473]],

        [[-0.0462, -0.0207, -0.0120,  ...,  0.0096,  0.0374,  0.0389]]],
       grad_fn=<TransposeBackward0>)

In [1964]:
o.shape 

torch.Size([3, 1, 768])

In [1931]:
input_ids = list(map(lambda x:
                     tokenizer.encode(x, return_tensors='pt').squeeze(),
                     ['Hello my name is Alex there man',
                      'Alex is there, man man man.',
                      'Alex is there, man man man.'
                     ]
                    )
)

max_sequence_len = 13

seq_lens = list(map(len, input_ids))
max_seq_len = max(seq_lens)
attn_mask = list(map(lambda x: [1] * x + [0] * (max_seq_len - x), seq_lens))
attn_mask = torch.tensor(attn_mask)
input_ids = pad_sequence(input_ids, batch_first=True)
batch_size, _ = input_ids.shape 
hidden = model.embeddings(input_ids)

In [1932]:
Q = query_layer(hidden)
K = key_layer(hidden)
V = value_layer(hidden)

In [1933]:
o, _ = attention.forward(Q, K, V)

In [1930]:
old_o = o

In [1937]:
old_o.shape 

torch.Size([2, 10, 768])

In [1945]:
o[0].isclose( old_o[0]).sum()

tensor(7649)

In [1920]:
o.shape 

torch.Size([3, 10, 768])

In [1280]:
small_attn = torch.nn.MultiheadAttention(4, 2, batch_first=True)

In [1288]:
attn_mask = torch.tensor([
    [[1, 0]],
    [[1, 0]]
])
attn_mask = torch.log(attn_mask)

In [1291]:
o, _ = small_attn(q, k, v, attn_mask=attn_mask)

In [1292]:
o

tensor([[[-2.7300e-02,  8.3342e-05, -2.8119e-02,  3.0445e-02]]],
       grad_fn=<TransposeBackward0>)

In [1311]:
from torch.nn.utils.rnn import pad_sequence

In [1318]:
def get_extended_attention_mask(attention_mask, dtype=torch.float32):
    """
    Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

    Arguments:
        attention_mask (:obj:`torch.Tensor`):
            Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
        device: (:obj:`torch.device`):
            The device of the input to the model.

    Returns:
        :obj:`torch.Tensor` The extended attention mask, with a the same dtype as :obj:`attention_mask.dtype`.
    """
    # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
    # ourselves in which case we just need to make it broadcastable to all heads.
    if attention_mask.dim() == 3:
        extended_attention_mask = attention_mask[:, None, :, :]
    elif attention_mask.dim() == 2:
        # Provided a padding mask of dimensions [batch_size, seq_length]
        # - make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
        extended_attention_mask = attention_mask[:, None, None, :]
    extended_attention_mask = extended_attention_mask.to(dtype=dtype)  # fp16 compatibility
    extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
    return extended_attention_mask

In [1321]:
get_extended_attention_mask(attn_mask)

torch.Size([2, 1, 1, 7])