## 1 - BERT Embeddings

## Importing Libraries

In [29]:
import transformers
import numpy as np
from scipy.spatial.distance import cosine

In [2]:
import torch
from transformers import BertTokenizer, BertModel

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
#logging.basicConfig(level=logging.INFO)

import matplotlib.pyplot as plt

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [3]:
import pandas as pd

## Loading the Vocabulary File

In [4]:
df_vocab = pd.read_csv("vocab.csv")

In [5]:
df_vocab["Word"][0]

'a'

## Adding the CLS and SEP token to word in the vocab list

In [6]:
text = "[CLS] " + df_vocab["Word"][0] + " [SEP]"

## Tokenize the Text

In [7]:
tokenizer_text = tokenizer.tokenize(text)

In [8]:
print(tokenizer_text)

['[CLS]', 'a', '[SEP]']


## Getting indexes of the items in the tokenized text list

In [9]:
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenizer_text)

In [10]:
indexed_tokens

[101, 1037, 102]

In [11]:
list(tokenizer.vocab.keys())[1037]

'a'

## Returning pair of tokenized text item list and their corresponding index in the vocab

In [12]:
for text, index in zip(tokenizer_text, indexed_tokens):
    print(text,index)

[CLS] 101
a 1037
[SEP] 102


## Creating Segmentation IDs for the tokenzied text, since we all have single words, it'll be set to 1

In [13]:
segments_ids = [1] * len(tokenizer_text)

In [14]:
segments_ids

[1, 1, 1]

## Converting list of tokens and segments to their corresponding tensor to pass through the BERT model

In [15]:
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

## Loading the pre-trained BERT Model

In [16]:
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

## Testing single word embedding output from the BERT model

In [17]:
with torch.no_grad():

    outputs = model(tokens_tensor, segments_tensors)

    # Evaluating the model will return a different number of objects based on 
    # how it's  configured in the `from_pretrained` call earlier. In this case, 
    # becase we set `output_hidden_states = True`, the third item will be the 
    # hidden states from all layers. See the documentation for more details:
    # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
    hidden_states = outputs[2] 

## Printing hidden state tensor

In [18]:
hidden_states

(tensor([[[ 0.1686, -0.2858, -0.3261,  ..., -0.0276,  0.0383,  0.1640],
          [ 0.5956,  0.5420,  0.0412,  ...,  0.4376,  0.5639,  0.3365],
          [-0.4815, -0.0189,  0.0092,  ..., -0.2806,  0.3895, -0.2815]]]),
 tensor([[[ 0.1383, -0.0017, -0.1345,  ...,  0.0213, -0.0027,  0.0546],
          [ 0.7541,  0.5487,  0.1515,  ...,  0.2371,  0.1910,  0.0636],
          [-0.3558,  0.3080, -0.1537,  ..., -0.4325,  0.8373, -0.1724]]]),
 tensor([[[-0.0382, -0.1733, -0.1631,  ...,  0.0534,  0.0795,  0.0761],
          [ 0.3919,  0.5963,  0.4106,  ...,  0.5559,  0.2573,  0.3806],
          [-0.3174,  0.0915,  0.0514,  ..., -0.2584,  0.7475, -0.0926]]]),
 tensor([[[ 0.0025, -0.2923, -0.0254,  ...,  0.2050,  0.1381,  0.2798],
          [ 0.2570,  0.4708,  0.6375,  ...,  0.5920,  0.1055, -0.1956],
          [-0.0878, -0.0809,  0.0926,  ..., -0.0043,  0.1657,  0.0037]]]),
 tensor([[[ 0.0535, -0.4204, -0.5296,  ...,  0.3501, -0.0072,  0.6119],
          [ 0.1004,  0.4128, -0.0307,  ..., -0.1951,

## Creating the concatenation Vector and the Sum Vector for every word in the vocab

In [19]:
# cat_vec = []
# sum_vec = []
# for index, row in df_vocab.iterrows():
#     text = row["Word"]
#     marked_text = "[CLS] " + text + " [SEP]"
    
#     tokenized_text = tokenizer.tokenize(marked_text)
#     indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    
#     segments_ids = [1] * len(tokenized_text)
    
#     tokens_tensor = torch.tensor([indexed_tokens])
#     segments_tensors = torch.tensor([segments_ids])
    
#     with torch.no_grad():

#         outputs = model(tokens_tensor, segments_tensors)  
#         hidden_states = outputs[2]
        
#         token_embeddings = torch.stack(hidden_states, dim=0)
        
#         token_embeddings = torch.squeeze(token_embeddings, dim=1)
        
#         token_embeddings = token_embeddings.permute(1,0,2)
        
#     token_vecs_cat = [torch.cat((token[-1], token[-2], token[-3], token[-4]), dim=0) for token in token_embeddings]
    
#     token_vecs_sum = [torch.sum(token[-4:], dim=0) for token in token_embeddings]
    
#     main_cat_vec = token_vecs_cat[-2]
    
#     main_sum_vec = token_vecs_sum[-2]
    
#     cat_vec.append(main_cat_vec)
#     sum_vec.append(main_sum_vec)

## Saving the concatenation and sum vectors as pkl file for easy access

In [20]:
# import pickle
# file_name_1 = "cat_vec.pkl"
# file_name_2 = "sum_vec.pkl"
# open_file_1 = open(file_name_1, "wb")
# open_file_2 = open(file_name_2, "wb")
# pickle.dump(cat_vec, open_file_1)
# pickle.dump(sum_vec, open_file_2)
# open_file_1.close()
# open_file_2.close()

## Loading the saved pkl files of sum and concatenation vector

In [21]:
import pickle
f1 = open("cat_vec.pkl","rb")
f2 = open("sum_vec.pkl","rb")
cat_vec  = pickle.load(f1)
sum_vec = pickle.load(f2)

## Task 1: Create a function to compute cosine similarity between any two word vectors:

In [37]:
def cosine_similarity(word1, word2, vec_type):
    return 1 - cosine(vec_type[list(df_vocab["Word"]).index(word1)], vec_type[list(df_vocab["Word"]).index(word2)])

## Example

In [39]:
print(cosine_similarity("brain","skull",cat_vec))
print(cosine_similarity("brain","skull",sum_vec))

0.816037654876709
0.8221496939659119


## Task 2: Create a function to find the word closest to another word in the vocab using cosines:

In [42]:
def find_closest(word, vec_type):
  sims = [cosine_similarity(word,x,vec_type) for x in list(df_vocab["Word"])[:len(vec_type)]]
  y = np.array(sims)
  y_sorted = np.argsort(-y) ## gives sorted indices
  top20_indices = y_sorted[:5]
  w1 = [list(df_vocab["Word"])[i] for i in top20_indices]
  return w1

## Examples

In [43]:
print(find_closest('brain', cat_vec))
print(find_closest('cat', cat_vec))
print(find_closest('apple', cat_vec))
print(find_closest('exam', cat_vec))

print(find_closest('brain', sum_vec))
print(find_closest('cat', sum_vec))
print(find_closest('apple', sum_vec))
print(find_closest('exam', sum_vec))

['brain', 'brains', 'skull', 'consciousness', 'liver']
['cat', 'kitten', 'cats', 'squirrel', 'doll']
['apple', 'banana', 'tomato', 'iPad', 'chocolate']
['exam', 'examination', 'exams', 'quiz', 'degree']
['brain', 'brains', 'skull', 'consciousness', 'liver']
['cat', 'kitten', 'cats', 'squirrel', 'rat']
['apple', 'banana', 'tomato', 'iPad', 'chocolate']
['exam', 'examination', 'exams', 'quiz', 'degree']


## Task 3: Create a function to compute the average vector of any two vectors:

In [60]:
def avg_vector(word1, word2, vec_type):
    return torch.divide(torch.add(vec_type[list(df_vocab["Word"]).index(word1)],vec_type[list(df_vocab["Word"]).index(word2)]),2)

## Example:

In [61]:
print(avg_vector("brain","skull",cat_vec).size())
print(avg_vector("brain","skull",sum_vec).size())

torch.Size([3072])
torch.Size([768])


## Task 4: Create a function to find the word in the vocab closest to an average vector for each wordpair:

In [64]:
def closest_word(word1, word2, vec_type):
    avg_vector_tensor = avg_vector(word1, word2, vec_type)
    sims_cosine = [1 - cosine(avg_vector_tensor,vec_type[list(df_vocab["Word"]).index(x)]) for x in list(df_vocab["Word"])[:len(vec_type)]]
    y = np.array(sims_cosine)
    y_sorted = np.argsort(-y) ## gives sorted indices
    top20_indices = y_sorted[:5]
    w1 = [list(df_vocab["Word"])[i] for i in top20_indices]
    return w1

## Examples

In [68]:
print(closest_word("lion","tiger",cat_vec))
print(closest_word("lion","tiger",sum_vec))

['lion', 'tiger', 'monkey', 'panther', 'squirrel']
['lion', 'tiger', 'monkey', 'panther', 'leopard']


## Testing function on a csv file with pair of words:

In [69]:
df_word_pair = pd.read_csv("connector_wordpairs_boards.csv")

In [70]:
df_word_pair.head(5)

Unnamed: 0,Word1,Word2,Experiment,boardnames
0,void,couch,E1,e1_board1_words
1,giggle,abnormal,E1,e1_board1_words
2,exam,algebra,E1,e1_board1_words
3,tea,bean,E1,e1_board10_words
4,tourist,comedy,E1,e1_board10_words


In [71]:
len(df_word_pair)

60

In [83]:
cat_similarity = []
sum_similarity = []
for index, row in df_word_pair.iterrows():
    word1 = row["Word1"]
    word2 = row["Word2"]
    cat_similarity.append(closest_word(word1, word2, cat_vec)[2:])
    sum_similarity.append(closest_word(word1, word2, sum_vec)[2:])

In [84]:
df_word_pair["top 3 similar words(sum)"] = sum_similarity
df_word_pair["top 3 similar words(cat)"] = cat_similarity

## Results

In [85]:
df_word_pair.head(60)

Unnamed: 0,Word1,Word2,Experiment,boardnames,top 3 similar words(sum),top 3 similar words(cat)
0,void,couch,E1,e1_board1_words,"[sofa, tile, furniture]","[sofa, tile, furniture]"
1,giggle,abnormal,E1,e1_board1_words,"[confuse, giggle, clown]","[giggle, confuse, clown]"
2,exam,algebra,E1,e1_board1_words,"[examination, exams, analysis]","[examination, analysis, calculus]"
3,tea,bean,E1,e1_board10_words,"[spaghetti, stink, doll]","[spaghetti, doll, stink]"
4,tourist,comedy,E1,e1_board10_words,"[sitcom, tourists, nursing]","[sitcom, tourists, nursing]"
5,pendulum,dusk,E1,e1_board10_words,"[midnight, sunrise, handcuffs]","[midnight, sunrise, handcuffs]"
6,beginning,brake,E1,e1_board2_words,"[steer, fireplace, harness]","[steer, harness, fireplace]"
7,birds,aircraft,E1,e1_board2_words,"[animals, planes, vehicles]","[animals, planes, vehicles]"
8,school,stop,E1,e1_board2_words,"[garage, stopping, trouble]","[garage, stopping, dodge]"
9,circle,dance,E1,e1_board3_words,"[jump, rhythm, jumping]","[jump, jumping, rhythm]"


In [86]:
df_word_pair.to_csv("word_pair_results.csv")