In [1]:
import transformers

In [2]:
import torch
from transformers import BertTokenizer, BertModel

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
#logging.basicConfig(level=logging.INFO)

import matplotlib.pyplot as plt

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [3]:
import pandas as pd

In [4]:
df_vocab = pd.read_csv("vocab.csv")

In [5]:
df_vocab["Word"][0]

'a'

In [6]:
text = "[CLS] " + df_vocab["Word"][0] + " [SEP]"

In [7]:
tokenizer_text = tokenizer.tokenize(text)

In [8]:
print(tokenizer_text)

['[CLS]', 'a', '[SEP]']


In [9]:
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenizer_text)

In [10]:
indexed_tokens

[101, 1037, 102]

In [11]:
list(tokenizer.vocab.keys())[1037]

'a'

In [12]:
for text, index in zip(tokenizer_text, indexed_tokens):
    print(text,index)

[CLS] 101
a 1037
[SEP] 102


In [13]:
segments_ids = [1] * len(tokenizer_text)

In [14]:
segments_ids

[1, 1, 1]

In [15]:
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

In [16]:
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [17]:
with torch.no_grad():

    outputs = model(tokens_tensor, segments_tensors)

    # Evaluating the model will return a different number of objects based on 
    # how it's  configured in the `from_pretrained` call earlier. In this case, 
    # becase we set `output_hidden_states = True`, the third item will be the 
    # hidden states from all layers. See the documentation for more details:
    # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
    hidden_states = outputs[2]

In [18]:
hidden_states

(tensor([[[ 0.1686, -0.2858, -0.3261,  ..., -0.0276,  0.0383,  0.1640],
          [ 0.5956,  0.5420,  0.0412,  ...,  0.4376,  0.5639,  0.3365],
          [-0.4815, -0.0189,  0.0092,  ..., -0.2806,  0.3895, -0.2815]]]),
 tensor([[[ 0.1383, -0.0017, -0.1345,  ...,  0.0213, -0.0027,  0.0546],
          [ 0.7541,  0.5487,  0.1515,  ...,  0.2371,  0.1910,  0.0636],
          [-0.3558,  0.3080, -0.1537,  ..., -0.4325,  0.8373, -0.1724]]]),
 tensor([[[-0.0382, -0.1733, -0.1631,  ...,  0.0534,  0.0795,  0.0761],
          [ 0.3919,  0.5963,  0.4106,  ...,  0.5559,  0.2573,  0.3806],
          [-0.3174,  0.0915,  0.0514,  ..., -0.2584,  0.7475, -0.0926]]]),
 tensor([[[ 0.0025, -0.2923, -0.0254,  ...,  0.2050,  0.1381,  0.2798],
          [ 0.2570,  0.4708,  0.6375,  ...,  0.5920,  0.1055, -0.1956],
          [-0.0878, -0.0809,  0.0926,  ..., -0.0043,  0.1657,  0.0037]]]),
 tensor([[[ 0.0535, -0.4204, -0.5296,  ...,  0.3501, -0.0072,  0.6119],
          [ 0.1004,  0.4128, -0.0307,  ..., -0.1951,

In [19]:
cat_vec = []
sum_vec = []
for index, row in df_vocab.iterrows():
    text = row["Word"]
    marked_text = "[CLS] " + text + " [SEP]"
    
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    
    segments_ids = [1] * len(tokenized_text)
    
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    
    with torch.no_grad():

        outputs = model(tokens_tensor, segments_tensors)  
        hidden_states = outputs[2]
        
        token_embeddings = torch.stack(hidden_states, dim=0)
        
        token_embeddings = torch.squeeze(token_embeddings, dim=1)
        
        token_embeddings = token_embeddings.permute(1,0,2)
        
    token_vecs_cat = [torch.cat((token[-1], token[-2], token[-3], token[-4]), dim=0) for token in token_embeddings]
    
    token_vecs_sum = [torch.sum(token[-4:], dim=0) for token in token_embeddings]
    
    main_cat_vec = token_vecs_cat[-2]
    
    main_sum_vec = token_vecs_sum[-2]
    
    cat_vec.append(main_cat_vec)
    sum_vec.append(main_sum_vec)

In [62]:
import pickle
file_name_1 = "cat_vec.pkl"
file_name_2 = "sum_vec.pkl"
open_file_1 = open(file_name_1, "wb")
open_file_2 = open(file_name_2, "wb")
pickle.dump(cat_vec, open_file_1)
pickle.dump(sum_vec, open_file_2)
open_file_1.close()
open_file_2.close()

In [20]:
df_word_pair = pd.read_csv("connector_wordpairs_boards.csv")

In [54]:
min_cat_diff = 10000000000.0
min_sum_diff = 10000000000.0
similar_word_indices_cat = []
similar_word_indices_sum = []
for index,row in df_word_pair.iterrows():
    word1, word2 = row["Word1"],row["Word2"]
    avg_cat = torch.divide(torch.add(cat_vec[list(df_vocab["Word"]).index(word1) + 1],cat_vec[list(df_vocab["Word"]).index(word2) + 1]),2)
    avg_sum = torch.divide(torch.add(sum_vec[list(df_vocab["Word"]).index(word1) + 1],sum_vec[list(df_vocab["Word"]).index(word2) + 1]),2)
    for idx, (word_cat, word_sum) in enumerate(zip(cat_vec, sum_vec)):
        if torch.abs(cat_vec[idx] - avg_cat).sum().item() < min_cat_diff:
            min_cat_diff = torch.abs(cat_vec[idx] - avg_cat).sum().item()
            min_cat_idx = idx
        elif torch.abs(sum_vec[idx] - avg_sum).sum().item() < min_sum_diff:
            min_sum_diff = torch.abs(sum_vec[idx] - avg_sum).sum().item()
            min_sum_idx = idx
    similar_word_indices_cat.append(min_cat_idx)
    similar_word_indices_sum.append(min_sum_idx)

In [55]:
similar_words_cat = []
similar_words_sum = []
for idx1,idx2 in zip(similar_word_indices_cat, similar_word_indices_sum):
    similar_words_cat.append(df_vocab["Word"][idx1])
    similar_words_sum.append(df_vocab["Word"][idx2])

In [56]:
similar_words_cat

['cougar',
 'cougar',
 'algorithm',
 'algorithm',
 'algorithm',
 'algorithm',
 'algorithm',
 'algorithm',
 'algorithm',
 'algorithm',
 'algorithm',
 'algorithm',
 'algorithm',
 'algorithm',
 'eggnog',
 'eggnog',
 'eggnog',
 'eggnog',
 'eggnog',
 'eggnog',
 'eggnog',
 'eggnog',
 'eggnog',
 'eggnog',
 'eggnog',
 'eggnog',
 'eggnog',
 'eggnog',
 'eggnog',
 'eggnog',
 'eggnog',
 'eggnog',
 'eggnog',
 'eggnog',
 'eggnog',
 'eggnog',
 'eggnog',
 'eggnog',
 'eggnog',
 'eggnog',
 'eggnog',
 'eggnog',
 'eggnog',
 'eggnog',
 'eggnog',
 'eggnog',
 'eggnog',
 'eggnog',
 'eggnog',
 'eggnog',
 'eggnog',
 'Easter',
 'Easter',
 'Easter',
 'Easter',
 'Easter',
 'Easter',
 'Easter',
 'Easter',
 'Easter']

In [57]:
similar_words_sum

['volcano',
 'volcano',
 'examination',
 'examination',
 'examination',
 'examination',
 'examination',
 'examination',
 'examination',
 'examination',
 'examination',
 'examination',
 'examination',
 'examination',
 'kite',
 'kite',
 'kite',
 'kite',
 'kite',
 'kite',
 'kite',
 'kite',
 'kite',
 'kite',
 'kite',
 'kite',
 'kite',
 'kite',
 'kite',
 'kite',
 'kite',
 'kite',
 'kite',
 'kite',
 'kite',
 'kite',
 'kite',
 'kite',
 'kite',
 'kite',
 'kite',
 'kite',
 'kite',
 'kite',
 'kite',
 'kite',
 'kite',
 'kite',
 'kite',
 'kite',
 'kite',
 'shortage',
 'shortage',
 'shortage',
 'shortage',
 'shortage',
 'shortage',
 'shortage',
 'shortage',
 'shortage']

In [58]:
df_word_pair["Similar Word Cat"] = similar_words_cat
df_word_pair["Similar Word Sum"] = similar_words_sum

In [60]:
df_word_pair.head(60)

Unnamed: 0,Word1,Word2,Experiment,boardnames,Similar Word Cat,Similar Word Sum
0,void,couch,E1,e1_board1_words,cougar,volcano
1,giggle,abnormal,E1,e1_board1_words,cougar,volcano
2,exam,algebra,E1,e1_board1_words,algorithm,examination
3,tea,bean,E1,e1_board10_words,algorithm,examination
4,tourist,comedy,E1,e1_board10_words,algorithm,examination
5,pendulum,dusk,E1,e1_board10_words,algorithm,examination
6,beginning,brake,E1,e1_board2_words,algorithm,examination
7,birds,aircraft,E1,e1_board2_words,algorithm,examination
8,school,stop,E1,e1_board2_words,algorithm,examination
9,circle,dance,E1,e1_board3_words,algorithm,examination
