In [1]:
'''
Prepare the model and tokenizer
'''
from transformers import RobertaTokenizer, RobertaForMaskedLM
import torch

tokenizer = RobertaTokenizer.from_pretrained("roberta-large")
model = RobertaForMaskedLM.from_pretrained("roberta-large")



  from .autonotebook import tqdm as notebook_tqdm


In [8]:
'''
roberta tokenizer id to continous
Download the roberta-large embeddings
'''
import pickle
import os
embeddings = []
for i in range(tokenizer.decoder.__len__()):
    embeddings.append(model.roberta.embeddings.word_embeddings(torch.tensor(i)))
embeddings = torch.stack(embeddings)

with open(f'/model_file/roberta_large_embeddings.pkl', 'wb') as f:
    pickle.dump(embeddings, f)
embeddings

FileNotFoundError: [Errno 2] No such file or directory: '/model_file/roberta_large_embeddings.pkl'

In [16]:
with open('/workspace/dataset/roberta_large_embeddings.pkl', "rb") as f:
    data = pickle.load(f)

In [9]:
with open(f'/dataset/roberta_large_embeddings.pkl', 'wb') as f:
    pickle.dump(embeddings, f)

FileNotFoundError: [Errno 2] No such file or directory: '/dataset/roberta_large_embeddings.pkl'

In [6]:
'''
load the sst2 dataset
'''

import os
import pickle
dataset_name =  'sst2'

data_file_path = '%s/dataset/%s_dataset.pkl'%(os.getcwd(), dataset_name)
with open(data_file_path, "rb") as f:
    sst2 = pickle.load(f)

sst2_train = sst2['train']
sst2_train_sentence = sst2_train['sentence']
sst2_train_sentence[:10]

['hide new secretions from the parental units ',
 'contains no wit , only labored gags ',
 'that loves its characters and communicates something rather beautiful about human nature ',
 'remains utterly satisfied to remain the same throughout ',
 'on the worst revenge-of-the-nerds clichés the filmmakers could dredge up ',
 "that 's far too tragic to merit such superficial treatment ",
 'demonstrates that the director of such hollywood blockbusters as patriot games can still turn out a small , personal film with an emotional wallop . ',
 'of saucy ',
 "a depressed fifteen-year-old 's suicidal poetry ",
 "are more deeply thought through than in most ` right-thinking ' films "]

In [7]:
'''
Load the VQ model and the total embeddings
'''

import pickle
with open('cache_VQ_model/cache_for_sst2_renewal/roberta-large_sst2_2e-05_1024_42.pkl', 'rb') as f:
    vq_model = pickle.load(f).to('cuda')    
with open(f'{os.getcwd()}/model_file/roberta_large_embeddings.pkl', 'rb') as f:
    total_embeddings = pickle.load(f)


In [9]:
'''
Function to convert embedding to token


total_embeddings : The total embeddings of the roberta-large
embedded_outputs : The output of the model
inputs_token : To check the original token decoding
'''

from sklearn.metrics.pairwise import cosine_similarity

def get_embedding_to_token(total_embeddings, embedded_outputs, inputs_token):
    for i in range(len(embedded_outputs)):
        word_embedding = embedded_outputs[i].detach()
        print(tokenizer.decode(inputs_token['input_ids'][i]))
        similarity = cosine_similarity(total_embeddings, word_embedding)
        for i in range(similarity.shape[1]):
            print(tokenizer.decode(similarity[:,i].argmax()), end = ' ')
        print('\n')


In [10]:
'''
Set the datasets


sentences_data : The sentences to be converted
outputs_embedd : The output of the model
'''
total_embeddings = total_embeddings.detach()
sentences_data = sst2_train_sentence[:20]
outputs_embedd = []

inputs_token = tokenizer(sentences_data, 
                    add_special_tokens=True, 
                    padding= 'max_length',  # 'max_length'
                    max_length= 128,  # args.max_seq_length
                    truncation=True,
                    return_attention_mask=True, 
                    return_tensors='pt')

with torch.no_grad():
    for i in range(len(sentences_data)):
        temp = model.roberta.embeddings.word_embeddings(inputs_token['input_ids'][i])
        outputs_embedd.append(temp)
len(outputs_embedd)

20

In [11]:
'''
Check the embedding to token worked well
'''

get_embedding_to_token(total_embeddings, outputs_embedd, inputs_token)

<s>hide new secretions from the parental units </s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
<s> hide  new  secret ions  from  the  parental  units   </s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <

### Analysis the data made by VQ_model

In [12]:
'''
VQ_Augmentation Input Data setting
'''

vq_input = outputs_embedd
labels = sst2_train['label'][:len(vq_input)]

# vq_input_ = [torch.tensor(i) for i in vq_input]
labels_ = [torch.tensor(i) for i in labels]

  


In [13]:
'''
Process VQ_Augmentation 
outputs_vq : VQ_Augmentation output
'''

vq_model.eval()
augmented_data = []
outputs_vq = []
for embs, label in zip(vq_input, labels_):
    embs = embs.to(device = 'cuda:0')
    label = label.to(device = 'cuda:0')
    decoded_sentence = []
    for j in range(embs.shape[0]):
        text = embs[j,:].view(1,-1)
        label = label.view(-1)
        decoded_text = vq_model(text, label)[0]
        decoded_sentence.append(decoded_text)
        # break
    augmented_data = torch.cat(decoded_sentence, dim=0)
    outputs_vq.append(augmented_data)
outputs_vq = torch.stack(outputs_vq, dim=0)
# attention_mask = inputs_token['attention_mask']

In [14]:
'''
VQ_Augmentation output to word
'''

outputs_vq = outputs_vq.to(device = 'cpu')
get_embedding_to_token(total_embeddings, outputs_vq, inputs_token)

<s>hide new secretions from the parental units </s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
<pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pa

In [20]:
for i in range(5):
    word_embedding = outputs_vq[i].detach()
    print(sentences_data[i])
    similarity = cosine_similarity(total_embeddings, word_embedding)
    for i in range(similarity.shape[1]):
        print(tokenizer.decode(similarity[:,i].argmax()), end = ' ')    
        # print(tokenizer.decode(similarity[:,i].argmax()), end = ' ')
    print('\n')

hide new secretions from the parental units 
<pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> 

contains no wit , only labored gags 
<pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad

In [31]:
for i in range(5):
    word_embedding = total_augmented_data[i].detach()
    print(sentences_data[i])
    similarity = cosine_similarity(embeddings, word_embedding)
    for i in range(similarity.shape[1]):
        print(tokenizer.decode(similarity[:,i].argmax()), end = ' ')    
        # print(tokenizer.decode(similarity[:,i].argmax()), end = ' ')
    print('\n')

hide new secretions from the parental units 
 in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in 

contains no wit , only labored gags 
 in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in  in 

In [27]:
outputs_vq[0]

tensor([[  20.9179, -282.2273,  118.6781,  ...,  126.8360,  -90.4233,
          160.1697],
        [  22.5677, -282.0784,  120.0120,  ...,  125.6524,  -88.9789,
          159.9776],
        [  20.8950, -282.3316,  118.0566,  ...,  126.3345,  -90.3176,
          160.0466],
        ...,
        [  21.1212, -282.0103,  117.8725,  ...,  126.2891,  -90.3391,
          159.7347],
        [  21.1212, -282.0103,  117.8725,  ...,  126.2891,  -90.3391,
          159.7347],
        [  21.1212, -282.0103,  117.8725,  ...,  126.2891,  -90.3391,
          159.7347]], grad_fn=<SelectBackward0>)

In [21]:
model = model.to(device = 'cuda:0')

In [29]:
outputs_vq.shape

torch.Size([20, 128, 1024])

In [32]:
vq_input[0].shape

torch.Size([128, 1024])

In [31]:
len(vq_input)

20

In [23]:
# vq_input과 vq_output 간의 cosine similarity

for j in range(outputs_vq.shape[0]):
    for i in range(outputs_vq.shape[1]):
        print(cosine_similarity(outputs_vq[j][i].detach().numpy().reshape(1,-1),vq_input[j][i].detach().numpy().reshape(1,-1))[0], end = ' ')
    print('\n')

[0.00379766] [-0.02158549] [-0.01386246] [-0.01153744] [-0.03962187] [-0.03730913] [-0.02265387] [-0.05428257] [-0.05692041] [-0.00336626] [-0.01846707] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0.17039952] [0