In [None]:
# Install Hugging Face transformers (unpinned). The recorded output of this cell shows
# that version 4.9.2 was installed when the notebook was last run.
!pip install transformers
from google.colab import drive  # to mount Google Drive to Colab notebook
# Mount Drive at /content/gdrive. Other cells use relative './gdrive/...' paths, which
# presumably relies on the working directory being /content -- TODO confirm.
drive.mount('/content/gdrive')
# Dataset directory on the mounted Drive.
# NOTE(review): `path` is reassigned to a model identifier in a later cell.
path = './gdrive/My Drive/datasets/'
import collections
import pickle
import numpy as np
import json
import pandas as pd
from transformers import LongformerModel, LongformerTokenizer, LongformerForMaskedLM
import warnings
warnings.filterwarnings('ignore')

from typing import List, Tuple, Dict, Iterable, Any, Callable, Union
import torch

Collecting transformers
  Downloading transformers-4.9.2-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 4.3 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 43.1 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 38.0 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 47.9 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninsta

In [None]:
# Checkpoint directory on Drive; in this cell it is only used to load the tokenizer.
# NOTE(review): the evaluation cell loads the 'CD-LM' model from './gdrive/My Drive/cdlm',
# which is a different directory name than this one -- confirm both are intended.
path_reg = './gdrive/My Drive/cd-mlm'
# Alternative checkpoint directories, currently disabled.
# path_rand = './gdrive/My Drive/random_cdlm'
# path_bb = './gdrive/My Drive/prefix_cdlm'
# Hub identifier of the base Longformer checkpoint (overwrites the dataset `path` set earlier).
path = 'allenai/longformer-base-4096'
# The tokenizer is loaded from `path_reg`, not from `path`; presumably the checkpoint's
# tokenizer carries extra special tokens that the base one lacks -- TODO confirm.
tokenizer = LongformerTokenizer.from_pretrained(path_reg)

In [None]:
"""
Compute the model's (Longformer) layer- and head-averaged attention scores for a sequence
"""
def get_attention_for_sentence_aut(sentence: list, model: object, token_lst:list=None):
  """Return one averaged attention score per position for a single token-id sequence.

  Args:
    sentence: token ids of one sequence; it is wrapped into a batch of size 1.
    model: callable exposing `.device`, invoked as
      `model(input_ids, global_attention_mask=...)`. The LAST element of its output is
      taken as the sequence of per-layer attention tensors.
      NOTE(review): for a Hugging Face Longformer built with `output_attentions=True`
      the last output appears to be the *global* attentions whenever `token_lst` marks
      at least one position -- confirm against the installed transformers version.
    token_lst: positions whose global-attention flag is set to 1. `None` (or an empty
      list) leaves the global-attention mask all zeros.

  Returns:
    numpy array obtained by stacking the per-layer tensors, removing the batch axis,
    averaging over the last axis and then over the first two axes. Assuming each layer
    tensor is (batch, heads, seq_len, k) this is a 1-D array of length seq_len.

  The model is left on whatever device it was on; this function no longer moves it to
  the CPU (doing so forced every call after the first one to run on the CPU).
  """
  input_ids = torch.tensor(sentence, dtype=torch.long).unsqueeze(0)

  # Mark the requested positions as globally attending.
  global_attention = torch.zeros_like(input_ids)
  if token_lst is not None:
    for tok in token_lst:
      global_attention[:, tok] = 1

  device = model.device
  # Pure inference: no autograd graph is needed, which keeps activation memory low.
  with torch.no_grad():
    attentions = model(input_ids.to(device),
                       global_attention_mask=global_attention.to(device))[-1]
    # Drop only the batch axis (size 1 by construction). A bare squeeze() would also
    # drop a layer/head/position axis that happens to have size 1 and change the result.
    attentions = torch.stack(attentions).squeeze(1)
    if attentions.dim() == 4:
      attentions = attentions.mean(-1)
    averaged = attentions.mean((0, 1))
  return averaged.cpu().numpy()

def get_all_attention_maps_aut(sentences: List[List[int]], token_lst:List[int], model:object) -> np.ndarray:
  """Return the averaged attention vector of the first sequence in `sentences`.

  Thin wrapper around `get_attention_for_sentence_aut`. Only `sentences[0]` is
  processed; any additional sequences in the list are ignored.

  Args:
    sentences: list of token-id sequences (only the first one is used).
    token_lst: positions that receive global attention.
    model: model forwarded unchanged to `get_attention_for_sentence_aut`.

  Returns:
    Whatever `get_attention_for_sentence_aut` returns for `sentences[0]`
    (a numpy array, not a torch tensor).
  """
  return get_attention_for_sentence_aut(sentences[0], model, token_lst)

In [None]:
def check_word_relations_aut(first_w, second_w, what_doc, model, doc_to_look=2):
  """Rank the attention received by the `second_w` span among candidate tokens.

  The positions in `first_w` are given global attention, the averaged attention vector
  of `what_doc` is computed, and the position `second_w[0]` is ranked (ascending
  attention) among the remaining candidate positions.

  Args:
    first_w: list of positions in `what_doc` that receive global attention; they are
      excluded from the candidates.
    second_w: list of positions of the target span. Their attention values are averaged
      and stored at `second_w[0]`; the other positions of the span are excluded.
    what_doc: list of token ids. It must contain the id 50266 (presumably a
      document-boundary marker -- TODO confirm); otherwise an IndexError is raised.
    model: model forwarded to `get_all_attention_maps_aut`.
    doc_to_look: 1 keeps candidates before the first 50266 token, 2 keeps candidates
      at positions greater than that index + 1, 3 keeps all positions. For 1/2/3,
      tokens whose id is in the forbidden set (ids 0, 2, 50265, 50266 and the ids the
      tokenizer produces for '.', ',', ':', '?', '!') are dropped as well. Any other
      value applies no filtering at all.

  Returns:
    numpy array with a single value: rank of `second_w[0]` divided by the number of
    candidates (0 means the lowest attention among the candidates). The array is
    empty when `second_w[0]` itself was filtered out.

  All candidate positions are kept as indices into the ORIGINAL document. The previous
  implementation deleted entries from the attention vector and then compared the
  post-deletion indices with original-document indices, so the lookups were shifted
  whenever a removed position preceded them.
  """
  first_w = list(first_w)
  second_w = list(second_w)
  doc_ids = np.asarray(what_doc)

  attn = get_all_attention_maps_aut([what_doc], first_w, model)
  # Keep only the positions that belong to the document; work on a private copy.
  attn = np.array(attn[:len(what_doc)], copy=True)
  torch.cuda.empty_cache()

  # Collapse a multi-token target span onto its first position.
  attn[second_w[0]] = np.mean(attn[second_w])

  # Exclude the rest of the target span and the globally-attending source positions,
  # while remembering the original index of every surviving position.
  keep = np.ones(len(attn), dtype=bool)
  keep[np.asarray(second_w[1:] + first_w, dtype=int)] = False
  positions = np.flatnonzero(keep)
  ranked = positions[np.argsort(attn[positions])]  # original indices, ascending attention

  start = np.where(doc_ids == 50266)[0][0]
  forbidden = {0, 2, 50266, 50265}
  for mark in ('.', ',', ':', '?', '!'):
    forbidden.update(tokenizer.encode(mark, add_special_tokens=False, add_prefix_space=False))

  if doc_to_look in (1, 2, 3):
    allowed = ~np.isin(doc_ids[ranked], list(forbidden))
    if doc_to_look == 1:
      allowed &= ranked < start
    elif doc_to_look == 2:
      allowed &= ranked > start + 1
    ranked = ranked[allowed]

  cands_res = np.where(ranked == second_w[0])[0]
  return cands_res / len(ranked)


In [None]:
from tqdm import tqdm

# Load the evaluation examples. Each example is read below through the keys
# 'toks', 'start', 'end' and 'label'.
with open('./gdrive/My Drive/datasets/ecb+/ecb_qual_2k_events.json') as json_file:
  data = json.load(json_file)

# Not used in this cell; kept because other cells may rely on these names.
cdmlm_pos = []
cdmlm_neg = []
rand_pos = []
rand_neg = []
long_pos = []
long_neg = []

# One row per evaluated example, one score column per model, plus the binary label.
df = pd.DataFrame(columns=['CD-LM', 'Longformer', 'label'])
models = {'CD-LM':'./gdrive/My Drive/cdlm', 'Longformer':'allenai/longformer-base-4096'}
for m, path in models.items():
  model = LongformerModel.from_pretrained(path, output_attentions=True)
  # Match the embedding matrix to the tokenizer's vocabulary size.
  model.resize_token_embeddings(len(tokenizer))
  model.cuda()
  # Wrap the data (not the enumerate object) so tqdm knows the total and can show progress.
  for i, d in enumerate(tqdm(data, desc=m)):
    # Skip examples longer than 2000 tokens.
    if len(d['toks']) > 2000:
      continue
    with torch.no_grad():  # inference only
      scores = check_word_relations_aut(d['start'], d['end'], d['toks'], model)
    # `scores` is an array with at most one element; store a scalar, or NaN when the
    # target position was filtered out and the array is empty.
    df.loc[i, m] = scores[0] if len(scores) else np.nan
    df.loc[i, 'label'] = 1 if d['label'] == 1 else 0
  # Checkpoint the results after each model finishes.
  df.to_csv('./gdrive/My Drive/datasets/ecb+/qual_results_2k.csv')

In [None]:
# Number of examples loaded from the evaluation JSON file.
len(data)

2000

In [None]:
# Split the result rows by label: 1 -> positive examples, 0 -> negative examples.
poss = df.loc[df['label'].eq(1)]
negs = df.loc[df['label'].eq(0)]

In [None]:
# Column-wise mean over the rows labelled 1.
# NOTE(review): the recorded output lists a 'Rand CD-LM' column that the visible
# evaluation cell never creates -- the output may come from an earlier run; re-run to confirm.
poss.mean()

CD-LM         0.702665
Rand CD-LM    0.691109
Longformer    0.597223
label         1.000000
dtype: float64

In [None]:
# Column-wise mean over the rows labelled 0.
# NOTE(review): the recorded output lists a 'Rand CD-LM' column that the visible
# evaluation cell never creates -- the output may come from an earlier run; re-run to confirm.
negs.mean()

CD-LM         0.556482
Rand CD-LM    0.544480
Longformer    0.505287
label         0.000000
dtype: float64