Imports

In [1]:
import sys
sys.path.append("..")

from lstm import ExperimentRunner
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Run on the GPU when torch reports one as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

Load the tokenizer and the finetuned model from HuggingFace

In [2]:
# Checkpoint identifiers passed to `from_pretrained`: the tokenizer comes from
# the base checkpoint, the classifier from the IMDB fine-tuned one.
tokenizer_checkpoint = "bert-base-uncased"
classifier_checkpoint = "artemis13fowl/bert-base-uncased-imdb"

tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(classifier_checkpoint)

Load the trained tokenizer. It has only been trained for 2 epochs on 64k sentences, so it's not mind-blowing.

For now you need to download it manually and place both files in the root of the resilient_nlp repo, sorry. You can get the model here: https://resilient-nlp.s3.us-west-2.amazonaws.com/tok_bert_base_uncased_64k_sentences_2_epochs.pth and the vocab here: https://resilient-nlp.s3.us-west-2.amazonaws.com/tok_bert_base_uncased_64k_sentences_2_epochs_vocab.json

In [3]:
# Locations of the manually downloaded files, relative to this notebook's
# directory (one level up).
trained_weights_path = "../tok_bert_base_uncased_64k_sentences_2_epochs.pth"
trained_vocab_path = "../tok_bert_base_uncased_64k_sentences_2_epochs_vocab.json"

# Build the runner for the same base checkpoint and on the selected device.
runner = ExperimentRunner(device, model_name="bert-base-uncased")

# Presumably these restore the trained weights and the character-level
# vocabulary from disk -- confirm against ExperimentRunner's implementation.
runner.model.load(trained_weights_path, device)
runner.char_tokenizer.load_vocab(trained_vocab_path)

Some simple test sentences for sanity checking

In [4]:
# Two unambiguous reviews, one clearly positive and one clearly negative.
clear_cut_reviews = [
    "I really enjoyed this movie",
    "Worst two hours I have spent in my life",
]

# One sentence frame with three different verb phrases.
worries_reviews = [
    "My worries and fears about this movie were swept away within the first fifteen minutes",
    "My worries and fears about this movie were fully confirmed within the first fifteen minutes",
    "My worries and fears about this movie were rendered meaningless within the first fifteen minutes",
]

# Very short reviews.
short_reviews = [
    "Movie was very good",
    "It was enjoyable",
    "Very boring unfortunately",
]

# A character-level garbled spelling of "It was enjoyable".
perturbed_reviews = [
    "lt w@s 3njo yabl3",
]

# Order matters: every later cell indexes results by position in this list.
sentences = clear_cut_reviews + worries_reviews + short_reviews + perturbed_reviews

First, use the regular bert-base-uncased tokenizer and model

In [5]:
# Tokenize the whole batch in one call, requesting padded PyTorch tensors.
tokenizer_result = tokenizer(sentences, return_tensors='pt', padding=True)

# Inference only: disable autograd so the forward pass does not build a graph.
# With output_hidden_states=True every layer's activations are returned, and
# they would otherwise all stay attached to that graph.
with torch.no_grad():
    model_result = model(
        input_ids=tokenizer_result['input_ids'],
        attention_mask=tokenizer_result['attention_mask'],
        output_hidden_states=True,
    )

# Reference values for the comparisons below: first entry of hidden_states
# (per the Transformers docs, the embedding-layer output), first sentence,
# token position 1, first 16 dimensions.
print(model_result['hidden_states'][0][0,1,:16].tolist())

[0.010627184063196182, 0.5175461769104004, -0.2631071209907532, 0.04238038882613182, 0.40291687846183777, 0.19839873909950256, -0.7841276526451111, 0.3104897141456604, 0.25195378065109253, 0.1873702108860016, -0.5451260209083557, 0.27750512957572937, 0.20487289130687714, -0.6549283266067505, -0.551720142364502, -0.5405393242835999]


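Next, manually embed the token ids using BERT's word embedding layer and pass the result to the model via `inputs_embeds` instead of `input_ids`. This is to verify that the model behaves identically either way (i.e. it turns out that positional embeddings, etc. are still added automatically by the model).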

In [6]:
# BERT's word-embedding lookup table; later cells reuse it for the special tokens.
cf_embedding = model.base_model.embeddings.word_embeddings

# Inference only: disable autograd for both the lookup and the forward pass so
# no graph is built and the returned hidden states are plain tensors.
with torch.no_grad():
    cf_embedding_result = cf_embedding(tokenizer_result['input_ids'])

    # Same call as the vanilla run, but feeding embeddings instead of token ids.
    model_result_2 = model(
        inputs_embeds=cf_embedding_result,
        attention_mask=tokenizer_result['attention_mask'],
        output_hidden_states=True,
    )

# Same slice as printed for the vanilla run, so the two lists can be compared directly.
print(model_result_2['hidden_states'][0][0,1,:16].tolist())

[0.010627184063196182, 0.5175461769104004, -0.2631071209907532, 0.04238038882613182, 0.40291687846183777, 0.19839873909950256, -0.7841276526451111, 0.3104897141456604, 0.25195378065109253, 0.1873702108860016, -0.5451260209083557, 0.27750512957572937, 0.20487289130687714, -0.6549283266067505, -0.551720142364502, -0.5405393242835999]


Now the fun part :). Let's use our trained embeddings. First, though, let's get the representations of the [CLS], [SEP] and [PAD] tokens (these are not predicted by the model).

In [7]:
# Ask the tokenizer for its special-token ids instead of looking up literal
# "[CLS]"/"[SEP]"/"[PAD]" strings, so the cell does not depend on how a
# particular checkpoint spells them.
cls_token_id = tokenizer.cls_token_id
sep_token_id = tokenizer.sep_token_id
pad_token_id = tokenizer.pad_token_id

# Take the vector width from the embedding layer rather than hard-coding 768,
# which only holds for base-sized BERT checkpoints.
embedding_dim = cf_embedding.embedding_dim

# Each lookup returns a (1, embedding_dim) tensor; flatten it to a 1-D vector.
cls_embedding = cf_embedding(torch.tensor([cls_token_id])).view(embedding_dim)
sep_embedding = cf_embedding(torch.tensor([sep_token_id])).view(embedding_dim)
pad_embedding = cf_embedding(torch.tensor([pad_token_id])).view(embedding_dim)

Now actually embed the input sentences

In [8]:
# The runner is fed lower-cased text (presumably because the base checkpoint
# is uncased -- confirm against ExperimentRunner.embed).
lowercased_sentences = [text.lower() for text in sentences]

# BERT's own special-token vectors are supplied for the sequence start, the
# sequence end and the padding positions.
our_embedding = runner.embed(
    lowercased_sentences,
    start_token=cls_embedding,
    end_token=sep_embedding,
    pad_token=pad_embedding,
)

and run the transformer stack

In [9]:
# Inference only: disable autograd so the forward pass does not build a graph.
# With output_hidden_states=True every layer's activations are returned, and
# they would otherwise all stay attached to that graph.
with torch.no_grad():
    # The embeddings and attention mask both come from our_embedding,
    # not from the BERT tokenizer.
    model_result_3 = model(
        inputs_embeds=our_embedding['inputs_embeds'],
        attention_mask=our_embedding['attention_mask'],
        output_hidden_states=True,
    )

# Same slice as printed for the two earlier runs; values close to those
# indicate the trained embeddings approximate BERT's own.
print(model_result_3['hidden_states'][0][0,1,:16].tolist())

[-0.033297598361968994, 0.5032650232315063, -0.23978081345558167, 0.024146944284439087, 0.40309470891952515, 0.21676689386367798, -0.7777954936027527, 0.3020963668823242, 0.20578446984291077, 0.2136598527431488, -0.5167685151100159, 0.2698840796947479, 0.2521650791168213, -0.6605544686317444, -0.5743234157562256, -0.5343013405799866]


Finally let's print out the predictions in a readable format

In [10]:
# Predicted class index per sentence (argmax over the class dimension) for
# each of the three runs.
results = model_result.logits.argmax(dim=1).tolist()
results_2 = model_result_2.logits.argmax(dim=1).tolist()
results_3 = model_result_3.logits.argmax(dim=1).tolist()

# Walk the sentences alongside the vanilla and MockingBERT predictions.
# The class index is shown as a boolean (1 -> True, 0 -> False).
for sentence, vanilla_label, mocking_label in zip(sentences, results, results_3):
    print("Sentence                        : {}".format(sentence))
    print(" Vanilla tokenization/embedding : {}".format(bool(vanilla_label)))
    print(" MockingBERT                    : {}".format(bool(mocking_label)))
    # sanitize takes a list and returns a list; pass one sentence at a time.
    sanitized = runner.sanitize([sentence])[0]
    print(" Sanitized                      : {}".format(sanitized))
    print("")

Sentence                        : I really enjoyed this movie
 Vanilla tokenization/embedding : True
 MockingBERT                    : True
 Sanitized                      : i really enjoyed this movie

Sentence                        : Worst two hours I have spent in my life
 Vanilla tokenization/embedding : False
 MockingBERT                    : False
 Sanitized                      : 1879 two hours i have spent in my life

Sentence                        : My worries and fears about this movie were swept away within the first fifteen minutes
 Vanilla tokenization/embedding : True
 MockingBERT                    : True
 Sanitized                      : john worries and fears about this movie were swept away within the first fifteen minutes

Sentence                        : My worries and fears about this movie were fully confirmed within the first fifteen minutes
 Vanilla tokenization/embedding : True
 MockingBERT                    : True
 Sanitized                      : john wor