In [62]:
import typing as tp
import numpy as np
import pandas as pd
import os

from collections import defaultdict
from tqdm import tqdm
import torch.optim as optim
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

if device == 'cpu':
    print("Fine-tuning BERT without an accelerator is not party-approved.")

In [36]:
from transformers import BertTokenizer, BertForMaskedLM

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#bert_mlm_positive = BertForMaskedLM.from_pretrained('bert-base-uncased', return_dict=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Load MLMS

In [178]:
bert_mlm_positive = BertForMaskedLM.from_pretrained('bert-base-uncased', return_dict=True)

bert_mlm_positive.load_state_dict(torch.load('weights/bert_positive.pth'))     

bert_mlm_positive.to('cuda:0')

print()

bert_mlm_negative = BertForMaskedLM.from_pretrained('bert-base-uncased', return_dict=True)

bert_mlm_negative.load_state_dict(torch.load('weights/bert_negative.pth'))     

bert_mlm_negative.to('cuda:0')

print()



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).





Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).





In [63]:
tokenizer("hello how are y=u", return_tensors='pt')

{'input_ids': tensor([[ 101, 7592, 2129, 2024, 1061, 1027, 1057,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}

In [52]:
kk = tokenizer("hello how are y=u")
kk['input_ids'] = torch.tensor(kk['input_ids'])
kk['token_type_ids'] = torch.tensor(kk['token_type_ids'])
kk['attention_mask'] = torch.tensor(kk['attention_mask'])
kk['input_ids'].shape[0]

8

In [58]:
ll = tokenizer("hello how are y=u", return_tensors='pt')

In [59]:
#bert_mlm_positive(input_ids=kk['input_ids'], attention_mask=kk['attention_mask'], )
bert_mlm_positive(**ll)

MaskedLMOutput(loss=None, logits=tensor([[[ -0.6051, -15.2065, -15.2381,  ..., -16.1269, -15.7733, -14.8123],
         [  0.6833,  -1.9597,  -2.0821,  ...,  -2.1163,  -2.1506,  -2.3511],
         [ -4.2548, -12.1528, -12.2661,  ..., -11.9906, -11.9359, -12.0329],
         ...,
         [  0.7712,  -9.6940,  -9.7155,  ...,  -9.5029, -10.1179,  -8.0905],
         [ -1.4938,  -4.2537,  -4.3743,  ...,  -4.2071,  -4.7285,  -4.5678],
         [ -2.1274, -17.9070, -17.9082,  ..., -17.8705, -17.7919, -17.2441]]],
       grad_fn=<AddBackward0>), hidden_states=None, attentions=None)

In [194]:
from copy import deepcopy 
from transformers import pipeline
import random

def get_replacements(sentence: str, num_tokens, k_best, epsilon=1e-3, device='cuda:0'):
    
    bert_mlm_positive.to(device)
    bert_mlm_negative.to(device)
   
    bert_mlm_positive.eval()
    bert_mlm_negative.eval()
    
    """
    - split the sentence into tokens using the INGSOC-approved BERT tokenizer
    - find :num_tokens: tokens with the highest ratio (see above)
    - replace them with :k_best: words according to bert_mlm_positive
    :return: a list of all possible strings (up to k_best * num_tokens)
    """

    example = tokenizer(sentence, return_tensors='pt').to(device)
    sample_dict = {}
    for i in range(1, example['input_ids'].shape[1] - 1):
        masked_example = deepcopy(example)
        
        masked_example['input_ids'][0][i] = 103
        label = example['input_ids'][0][i].cpu()
        
        with torch.no_grad():
            positive_outputs = bert_mlm_positive(input_ids=masked_example['input_ids'],
                        attention_mask=masked_example['attention_mask'], 
                       ).logits.cpu()
            
            negative_outputs = bert_mlm_negative(input_ids=masked_example['input_ids'],
                                                 attention_mask=example['attention_mask'], 
                                                ).logits.cpu()
        
        print("\n", tokenizer.decode(label))   
        positive_score = positive_outputs[0][i][label]
        
        negative_score = negative_outputs[0][i][label]
        
        score = (positive_score + epsilon) / (negative_score + epsilon)
        print("score:", score, "\n")             
        sample_dict[label] = [score, i]
          
    score_dict = dict(sorted(sample_dict.items(), key=lambda item: item[1], reverse=False))  # sort dict
    
    #print(score_dict)
    
    token_dict = {}
    token_list = list(score_dict)[: num_tokens]
    
    with torch.no_grad():
        unmasker = pipeline('fill-mask', model=bert_mlm_positive.cpu(), tokenizer=tokenizer)
    
    for token in token_list:
        modified = example['input_ids'].clone()
        modified = torch.where(modified==token,torch.tensor(103).to(device),modified).cpu()
        modified = list(modified[0])
        modified = tokenizer.decode(modified)
        new_sentences = unmasker(modified)[:k_best]
        for i in range(k_best):
            val_arr = []
            for i, val in enumerate(new_sentences):
                val_arr.append(val['token'])
            token_dict[token] = val_arr
            
    #print(token_dict)
    
    modified = example['input_ids'].clone().cpu()
    for key in token_dict:
        elem = random.choice(token_dict[key])
        modified = torch.where(modified==key,torch.tensor(elem), modified).cpu()
    
    modified = list(modified[0])
    modified = tokenizer.decode(modified, skip_special_tokens=True) 
    
    return modified

get_replacements("great wings and decent drinks but the wait staff is horrible !", 4, 2)
#get_replacements("food is bad and quite horrible.", 4, 3)


 g r e a t
score: tensor(1.3592) 


 w i n g s
score: tensor(1.0392) 


 a n d
score: tensor(0.9051) 


 d e c e n t
score: tensor(1.5033) 


 d r i n k s
score: tensor(1.3830) 


 b u t
score: tensor(0.5739) 


 t h e
score: tensor(1.0714) 


 w a i t
score: tensor(1.2563) 


 s t a f f
score: tensor(1.0368) 


 i s
score: tensor(0.9056) 


 h o r r i b l e
score: tensor(-0.0260) 


 !
score: tensor(1.0639) 



'great wings and decent drinks and the wait staff is friendly!'

In [136]:
from transformers import pipeline
unmasker = pipeline('fill-mask', model=bert_mlm_positive.cpu(), tokenizer=tokenizer)
kk = unmasker("[CLS] food is [MASK] and shit [SEP]")[:2]
for i, val in enumerate(kk):
    print (i, ",",val['token'])
#unmasker("great wings and decent drinks but the wait staff is [MASK] !")[:2]

0 , 2307
1 , 2204


In [112]:
kk = tokenizer("hello how are u")
print(kk['input_ids'])
tokenizer.decode(kk['input_ids'],skip_special_tokens = True )

[101, 7592, 2129, 2024, 1057, 102]


'hello how are u'

In [111]:
unmasker("[CLS] food is [MASK] and shit [SEP]")

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking arugment for argument index in method wrapper_index_select)