In [1]:
# Setup
%matplotlib inline
%load_ext autoreload
%autoreload 2
# NOTE: the relative order of these imports is kept on purpose — the star
# imports below may rebind names, so reordering could change what a name
# refers to.
import warnings
import spacy
import torch  # used directly in this notebook; previously only reachable through a star import
from optimized_anchor import anchor_text, anchor_base
import pickle
import myUtils
from myUtils import *
from transformer.utils import *
from dataset.dataset_loader import *
import datetime
%load_ext line_profiler

# --- Reproducibility and runtime configuration ---
# Install a blanket "ignore" filter so library warnings do not clutter cell output.
warnings.simplefilter("ignore")

# Fixed seed for PyTorch's global random number generator.
SEED = 84
torch.manual_seed(SEED)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [2]:
# Name of the dataset to work with; can be sentiment/spam/offensive.
# The same name is reused further down to build the model file paths.
dataset_name = 'sentiment'
# get_dataset is defined outside this notebook (it arrives via a star import).
# Judging by the names, it yields parsers for text and labels plus the
# training and validation sets — TODO confirm against dataset_loader.
text_parser, label_parser, ds_train, ds_val = get_dataset(dataset_name)

Number of tokens in training samples: 6407
Number of tokens in training labels: 2


In [3]:
# Sanity check: number of items in ds_val.
len(ds_val)

1433

In [4]:
# Load the GRU model for the selected dataset from transformer/<dataset>/gru.pt.
# load_model is defined outside this notebook (it arrives via a star import).
model = load_model('gru' , f'transformer/{dataset_name}/gru.pt', text_parser)
# Compile the loaded module to TorchScript.
model = torch.jit.script(model)
# Publish the model and the text parser as module-level attributes of myUtils.
# NOTE(review): presumably helper functions in myUtils read these globals — confirm.
myUtils.model = model
myUtils.text_parser = text_parser

{'embedding_dim': 100, 'batch_size': 32, 'hidden_dim': 256, 'num_layers': 2, 'dropout': 0.3, 'lr': 0.0001, 'early_stopping': 5, 'output_classes': 2}
VanillaGRU(
  (embedding_layer): Embedding(6407, 100)
  (GRU_layer): GRU(100, 256, num_layers=2, dropout=0.3)
  (dropout_layer): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=2, bias=True)
  (log_softmax): LogSoftmax(dim=1)
)


In [5]:
# spaCy pipeline; it is handed to anchor_text.AnchorText when the explainer is built.
nlp = spacy.load('en_core_web_sm')

In [6]:
# Derive the five collections used in the rest of the notebook from ds_train.
# `test` and `anchor_examples` are later passed to TextUtils.
# NOTE(review): what each collection contains depends on preprocess_examples,
# which is not visible here — confirm before relying on the names.
train, train_labels, test, test_labels, anchor_examples = preprocess_examples(ds_train)

In [7]:
# Compute the `ignored` collection from the anchor examples.
# NOTE(review): a later cell rebinds `ignored` to an empty list, so this value
# is discarded when the cells run in file order.
ignored = get_ignored(anchor_examples)
# Occurrence data computed from the anchor examples; only used to build BestGroup below.
normal_occurences = get_occurences(anchor_examples)
# Install a BestGroup instance as a class-level attribute on AnchorBaseBeam.
# The 'check' argument and both flags are passed through unchanged; their
# meaning is defined by BestGroup, which is not visible here.
anchor_base.AnchorBaseBeam.best_group = BestGroup('check', normal_occurences, filter_anchors = False, desired_optimize = False)

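## Notice: the next two cells discard the computed `ignored` list (`ignored = []`) and keep only a single example (`anchor_examples[1:2]`)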

In [8]:
# Override: replace the `ignored` value computed by get_ignored with an empty list.
ignored = []

In [9]:
# Keep only the example at index 1 (as a one-element slice).
# Not idempotent: re-running this cell slices the already-sliced sequence and leaves it empty.
anchor_examples = anchor_examples[1:2]

In [150]:
# Shell escape: print the current GPU status reported by nvidia-smi.
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Fri Aug  5 19:17:55 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.54       Driver Version: 510.54       CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Quadro RTX 6000     Off  | 00000000:AF:00.0 Off |                  Off |
| 33%   31C    P8    16W / 260W |   6393MiB / 24576MiB |      0%      Default |
|                               |            

In [129]:
# Flag for the optimized code path; it is set on AnchorText here and passed to
# TextUtils again when the profiling run is set up.
optimize = True
anchor_text.AnchorText.set_optimize(optimize)
# Build the text explainer on top of the spaCy pipeline.
# NOTE(review): the list presumably holds the class names in label-index order —
# confirm that 'positive'/'negative' matches the model's output indices.
explainer = anchor_text.AnchorText(nlp, ['positive', 'negative'], use_unk_distribution=False)

In [12]:
from transformers import AutoTokenizer
# Load the serialized TorchScript ELECTRA model for the selected dataset and
# move it to `device`. This rebinds `model`, replacing the scripted GRU.
model = torch.jit.load(f'transformer/electra/{dataset_name}/traced.pt').to(device)
# Switch the module to evaluation mode.
model = model.eval()
# Tokenizer matching the ELECTRA checkpoint.
# NOTE(review): a later cell re-creates this tokenizer with use_fast=False;
# confirm which variant a top-to-bottom run is supposed to use.
tokenizer = AutoTokenizer.from_pretrained('google/electra-small-discriminator')
# Publish model, tokenizer and text parser as module-level attributes of myUtils.
myUtils.model = model
myUtils.distilTokenizer = tokenizer
myUtils.text_parser = text_parser

In [146]:
def predict_sentences_bert(sentences):
    """Predict a class index for each pre-tokenized sentence.

    Relies on the notebook-level globals ``tokenizer``, ``model`` and
    ``device``.

    Args:
        sentences: A sequence of token sequences (one per sentence). The
            tokens are looked up directly in the tokenizer vocabulary; no
            further tokenization is applied. All sentences must contain the
            same number of tokens, because the ids are stacked into a single
            tensor without padding.

    Returns:
        A NumPy array with one predicted class index per sentence (the
        argmax over dimension 1 of the first model output). An empty input
        yields an empty int64 array.
    """
    # An empty batch cannot be turned into a 2-D id tensor; answer it directly.
    if len(sentences) == 0:
        return torch.empty(0, dtype=torch.long).numpy()

    # Take the special-token ids from the tokenizer instead of hard-coding
    # them. For the BERT-vocabulary ELECTRA tokenizer loaded in this notebook
    # they are 101 ([CLS]) and 102 ([SEP]), the values used previously.
    cls_id = tokenizer.cls_token_id
    sep_id = tokenizer.sep_token_id

    # Public per-token vocabulary lookup; list() keeps the original
    # element-by-element iteration for any iterable of tokens.
    encoded = [[cls_id] + tokenizer.convert_tokens_to_ids(list(tokens)) + [sep_id]
               for tokens in sentences]
    to_pred = torch.tensor(encoded, device=device)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(to_pred)[0]
    return torch.argmax(outputs, dim=1).cpu().numpy()

In [148]:
#torch._C._jit_set_texpr_fuser_enabled(False)
# Bundle the examples, the explainer and the ELECTRA prediction function.
# NOTE(review): "profile.pickle" is presumably an output path — confirm in TextUtils.
my_utils = TextUtils(anchor_examples, test, explainer, predict_sentences_bert, ignored, f"profile.pickle", optimize=optimize)
# Re-publish the current model and parser on myUtils before the run.
myUtils.model = model
myUtils.text_parser = text_parser
# Reset the random state right before profiling (set_seed arrives via a star import).
set_seed()
# Profile the explanation of every anchor example; the report is sorted by
# cumulative time and also written to profile.txt.
%prun -s cumtime -T profile.txt my_utils.compute_explanations(list(range(len(anchor_examples))))

number 0
[0.2727272727272727]
[0.09090909090909091]
[0.09090909090909091]
[0.5454545454545454]
[0.2727272727272727]
[0.5454545454545454]
[0.18181818181818182]
[0.18181818181818182]
[0.09090909090909091]
[0.18181818181818182]
[0.2727272727272727]
[0.18181818181818182]
[0.2727272727272727]
[0.2727272727272727]
 
*** Profile printout saved to text file 'profile.txt'. 


         1270552 function calls (1270452 primitive calls) in 0.947 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.948    0.948 {built-in method builtins.exec}
        1    0.000    0.000    0.948    0.948 <string>:1(<module>)
        1    0.000    0.000    0.948    0.948 myUtils.py:314(compute_explanations)
        1    0.002    0.002    0.847    0.847 myUtils.py:291(get_exp)
        1    0.000    0.000    0.846    0.846 anchor_text.py:219(explain_instance)
        1    0.000    0.000    0.774    0.774 anchor_base.py:255(anchor_beam)
       29    0.002    0.000    0.767    0.026 anchor_text.py:178(sample_fn)
       27    0.000    0.000    0.753    0.028 anchor_base.py:217(<lambda>)
       27    0.001    0.000    0.753    0.028 anchor_base.py:186(complete_sample_fn)
      155    0.003    0.000    0.643    0.004 anchor_text.py:97(sample)
      169    0.004    0.000    0.639    0.004 anchor_

In [13]:
###### my_utils = TextUtils(anchor_examples, test, explainer, predict_sentences, ignored,f"profile.pickle")
#%lprun -s -m modified_anchor.anchor_text -m modified_anchor.anchor_base -m myUtils -T profile.txt  my_utils.compute_explanations(list(range(len(anchor_examples))))

In [14]:
# Print the current local date and time as a record of when this cell was run.
print(datetime.datetime.now())

2022-08-02 21:00:39.480614


In [91]:
# Re-create the tokenizer, explicitly requesting the non-fast implementation.
# This rebinds the global `tokenizer` that predict_sentences_bert reads.
# NOTE(review): an earlier cell loads the same tokenizer without use_fast, so
# results depend on which of the two cells ran last — consolidate into one cell.
tokenizer = AutoTokenizer.from_pretrained('google/electra-small-discriminator', use_fast=False)

In [57]:
# Check which tokenizer implementation is currently bound to `tokenizer`.
tokenizer.is_fast

False