In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
import warnings
warnings.filterwarnings('ignore')

In [4]:
sentences = ['The cat sits outside',
             'I love pasta',
             'The cat plays in the garden',
             'Do you like pizza?']
labels = [0,1,0,1]

In [5]:
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
model = AutoModelForMaskedLM.from_pretrained("xlm-roberta-base")

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
embeddings = []
for i in range(len(sentences)):
    embeddings.append(tokenizer.encode(sentences[i], add_special_tokens=True))

In [7]:
embeddings

[[0, 581, 7515, 1661, 7, 50782, 2],
 [0, 87, 5161, 14324, 2],
 [0, 581, 7515, 11301, 7, 23, 70, 80583, 2],
 [0, 984, 398, 1884, 32960, 32, 2]]

In [8]:
for i in range(len(sentences)):
    print(embeddings[i])
    for j in range(len(embeddings[i])):
        print(embeddings[i][j], " - ",tokenizer.convert_ids_to_tokens(embeddings[i][j]))
    print(" ")

[0, 581, 7515, 1661, 7, 50782, 2]
0  -  <s>
581  -  ▁The
7515  -  ▁cat
1661  -  ▁sit
7  -  s
50782  -  ▁outside
2  -  </s>
 
[0, 87, 5161, 14324, 2]
0  -  <s>
87  -  ▁I
5161  -  ▁love
14324  -  ▁pasta
2  -  </s>
 
[0, 581, 7515, 11301, 7, 23, 70, 80583, 2]
0  -  <s>
581  -  ▁The
7515  -  ▁cat
11301  -  ▁play
7  -  s
23  -  ▁in
70  -  ▁the
80583  -  ▁garden
2  -  </s>
 
[0, 984, 398, 1884, 32960, 32, 2]
0  -  <s>
984  -  ▁Do
398  -  ▁you
1884  -  ▁like
32960  -  ▁pizza
32  -  ?
2  -  </s>
 


In [9]:
max_len=0
for i in embeddings:
    if len(i) > max_len:
        max_len = len(i)
print('max size: ', max_len)

max size:  9


In [10]:
padded = np.array([i + [0]*(max_len - len(i)) for i in embeddings])

In [11]:
padded

array([[    0,   581,  7515,  1661,     7, 50782,     2,     0,     0],
       [    0,    87,  5161, 14324,     2,     0,     0,     0,     0],
       [    0,   581,  7515, 11301,     7,    23,    70, 80583,     2],
       [    0,   984,   398,  1884, 32960,    32,     2,     0,     0]])

In [12]:
attention_mask = np.where(padded != 0, 1, 0)

In [13]:
attention_mask

array([[0, 1, 1, 1, 1, 1, 1, 0, 0],
       [0, 1, 1, 1, 1, 0, 0, 0, 0],
       [0, 1, 1, 1, 1, 1, 1, 1, 1],
       [0, 1, 1, 1, 1, 1, 1, 0, 0]])

In [14]:
attention_mask.shape

(4, 9)

In [15]:
input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)

In [16]:
# Pegando resultados da ultima camada oculta
with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [17]:
last_hidden_states[0][:,:,:].numpy().shape

(4, 9, 250002)

In [70]:
xml_roberta_base_embeddings = last_hidden_states[0][:,0,:].numpy()

In [71]:
xml_roberta_base_embeddings.shape

(4, 250002)

In [72]:
xml_roberta_base_embeddings

array([[53.679016  , -0.396734  , 47.879303  , ..., 27.64236   ,
        16.092344  , 24.076544  ],
       [52.8779    , -0.26781884, 45.571747  , ..., 25.977713  ,
        18.595173  , 23.0229    ],
       [55.4995    , -0.48153916, 45.48795   , ..., 28.58248   ,
        14.803785  , 24.128199  ],
       [62.54553   , -0.24973875, 48.278133  , ..., 30.388872  ,
        18.933002  , 25.117058  ]], dtype=float32)

In [75]:
mean_cls_embedding = np.mean(xml_roberta_base_embeddings, axis=1)
mean_values = np.mean(xml_roberta_base_embeddings, axis=0)
std_values = np.std(xml_roberta_base_embeddings, axis=0)

In [76]:
print(mean_cls_embedding)
print(mean_values)
print(std_values)

[21.005106 18.167103 20.553898 20.723627]
[56.150486   -0.34895766 46.804283   ... 28.147856   17.106075
 24.086174  ]
[3.812425   0.09524748 1.2825544  ... 1.5949998  1.7236055  0.740799  ]


In [None]:
concatenated_features = torch.cat((mean_cls_embedding, mean_values, std_values), dim=1)