## Anticausative Verbs Embedding Extraction

In [6]:
import pandas as pd
import numpy as np
import os
import re
import pickle
import torch

from tqdm import tqdm

import spacy

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to None, which
# the extraction helpers in this notebook treat as "do not move anything".
# (An assert would be stripped under `python -O` and would abort the whole
# notebook on CPU-only machines.)
device = "cuda" if torch.cuda.is_available() else None

In [22]:
# Build the path with os.path.join so the notebook also runs on POSIX systems
# (the hard-coded "..\\Data\\..." form only resolves on Windows).
# The "Sent.id" column becomes the dataframe index.
anticausative_df = pd.read_csv(
    os.path.join("..", "Data", "df_50_causative_alt.csv"), index_col="Sent.id"
)

In [23]:
# Bare expression: shows the loaded dataframe as the cell output.
anticausative_df

Sent.id,Sent.,Lemma,Token.id,Use,Token.Form
0,Tony broke the window.,break,1,tr,broke
1,The window broke.,break,2,intr,broke
2,Tony bent the rod.,bend,1,tr,bent
3,The trees bend during windy days.,bend,2,intr,bend
4,Jennifer baked the potatoes.,bake,1,tr,baked
...,...,...,...,...,...
95,Many revolve around the latest trendy silver b...,revolve,1,intr,revolve
96,I smash it into boulders.,smash,1,tr,smash
97,A tentacle smashes into Koopark's head.,smash,2,intr,smashes
98,We macerated for about 4 hours and then presse...,macerate,1,tr,macerated


In [None]:
#word pieces combination based on tokens ids
def combine_subwords(ids, words):
    """Group sub-word pieces by word id and rebuild the surface words.

    Both arguments are expected to come from a Hugging Face fast tokenizer
    (``word_ids()`` and ``tokens()`` of the same encoding).

    Args:
        ids: one word id per piece; pieces of the same word share an id and
            ``None`` marks pieces that belong to no word (special tokens).
        words: the sub-word pieces, aligned with ``ids``.

    Returns:
        A tuple ``(id_word_dict, result)``: ``id_word_dict`` maps every
        non-``None`` id to the list of its cleaned pieces, and ``result`` is
        a list of ``(id, word)`` tuples with the pieces joined, ordered by
        first appearance of each id.

    Raises:
        ValueError: if ``ids`` and ``words`` differ in length.
    """
    # Ensure both input lists have the same length
    if len(ids) != len(words):
        raise ValueError("Input lists must have the same length")

    pieces_by_id = {}
    for word_id, piece in zip(ids, words):
        # remove the special char used by byte-level BPE tokenizers
        piece = piece.replace("Ġ", "").strip()
        # remove only the "##" continuation prefix of BERT-like tokenizers;
        # stripping every "#" would erase literal "#" characters of the text
        if piece.startswith("##"):
            piece = piece[2:]
        pieces_by_id.setdefault(word_id, []).append(piece.strip())

    # The None key, if present, collects the special tokens: drop it
    pieces_by_id.pop(None, None)

    # Join the pieces of each word back into a single string
    result = [(word_id, "".join(pieces)) for word_id, pieces in pieces_by_id.items()]

    return pieces_by_id, result

#word pieces embedding combination based on tokens ids
def combine_subembeddings(ids, embeddings, device = None):
    """Group sub-word embeddings by word id and average them per word.

    Args:
        ids: one word id per embedding; ``None`` marks embeddings that belong
            to no word (special tokens).
        embeddings: torch tensors aligned with ``ids`` (for instance the rows
            of a 2-D tensor).
        device: accepted for backward compatibility only. Every tensor is
            detached and copied to host memory regardless of this value, so
            GPU tensors work even when the caller did not pass a device.

    Returns:
        A tuple ``(id_emb_dict, result)``: ``id_emb_dict`` maps every
        non-``None`` id to the list of its sub-word embeddings (float numpy
        arrays), and ``result`` is a list of ``(id, mean_embedding)`` tuples
        ordered by first appearance of each id.

    Raises:
        ValueError: if ``ids`` and ``embeddings`` differ in length.
    """
    # Ensure both input lists have the same length
    if len(ids) != len(embeddings):
        raise ValueError("Input lists must have the same length")

    embs_by_id = {}
    for word_id, sub_emb in zip(ids, embeddings):
        # detach() and cpu() leave plain CPU tensors untouched, so a single
        # code path serves CPU, GPU and grad-tracking tensors alike
        vector = sub_emb.detach().cpu().numpy().astype(float)
        embs_by_id.setdefault(word_id, []).append(vector)

    # The None key, if present, collects the special tokens: drop it
    embs_by_id.pop(None, None)

    # Average the sub-word embeddings of each word
    result = [(word_id, np.mean(vectors, axis=0)) for word_id, vectors in embs_by_id.items()]

    return embs_by_id, result

# helper function to extract representations with a given model
def feature_extractor(sent, token, tokenizer, model, device = None):    #token
    """Encode one sentence and locate the target word inside it.

    Args:
        sent: the sentence to encode.
        token: surface form of the target word.
        tokenizer: callable returning an encoding that exposes ``word_ids()``,
            ``tokens()`` and ``to()`` (a fast tokenizer is assumed).
        model: callable whose output can be indexed with
            ``"last_hidden_state"``.
        device: when truthy, the encoded inputs are moved to this device
            before the forward pass.

    Returns:
        A tuple ``(embs_dict, encoded_sent_fw, tokid)``: the two values
        returned by ``combine_subembeddings`` for the sentence, plus the
        position of the first word matching ``token``.

    Raises:
        ValueError: if ``token`` matches none of the reconstructed words,
            neither lower-cased nor as given.
    """
    tokenized_sent = tokenizer(sent, return_tensors = "pt", truncation = True)
    word_ids = tokenized_sent.word_ids()
    #dynamically get the target token id
    _, combined_words = combine_subwords(word_ids, tokenized_sent.tokens())
    combined_words = [word for _, word in combined_words]
    #ensure to get both lower cased and non-lower cased tokens (different between tokenizers)
    try:
        tokid = combined_words.index(token.lower())
    except ValueError:
        try:
            tokid = combined_words.index(token)
        except ValueError:
            raise ValueError(
                f"Target token {token!r} not found among the reconstructed words {combined_words}"
            ) from None

    # move the inputs only when a device was requested; the forward pass
    # itself is identical in both cases
    inputs = tokenized_sent.to(device) if device else tokenized_sent
    with torch.no_grad():
        output = model(**inputs)
    # first entry along the leading axis: only one sentence is encoded per call
    embeddings = output["last_hidden_state"][0,:]
    embs_dict, encoded_sent_fw = combine_subembeddings(word_ids, embeddings, device = device)

    return  embs_dict, encoded_sent_fw, tokid

#helper function to select the target embeddings
def extract_target_embs(encoded_sent_fw, tokid, embs_dict):
    """Pick the target word's entries out of a sentence's embeddings.

    Args:
        encoded_sent_fw: sequence of ``(id, embedding)`` pairs; ``tokid`` is
            used as a position in this sequence.
        tokid: index of the target word; also used as a key of ``embs_dict``.
        embs_dict: mapping from word id to that word's sub-word embeddings.

    Returns:
        A tuple ``(target, target_sub_embs)``: the second element of the pair
        at position ``tokid`` and the ``embs_dict`` entry stored under
        ``tokid``.
    """
    word_entry = encoded_sent_fw[tokid]
    return word_entry[1], embs_dict[tokid]

#main function to loop over all the sentences and get the target representations
def get_target_embeddings(sents, tokens, sent_ids, lemmas, tokenizer, model, device = None):
    """Extract the target-word representation of every sentence.

    The four data arguments are parallel sequences: position ``i`` of each
    one describes the same sentence.

    Args:
        sents: the sentences.
        tokens: surface form of the target word of each sentence.
        sent_ids: identifier of each sentence (converted with ``str``).
        lemmas: lemma of the target word of each sentence.
        tokenizer: forwarded to ``feature_extractor``.
        model: forwarded to ``feature_extractor``.
        device: forwarded to ``feature_extractor``.

    Returns:
        A tuple ``(target_embeddings, total_sub_embs)`` of dictionaries keyed
        by ``"<lemma>.<sent_id>"``: the first holds the target embedding, the
        second the list of its sub-word embeddings.

    Raises:
        ValueError: if the four data sequences differ in length.
    """
    # same contract as the combine_* helpers: misaligned inputs are an error
    if not (len(sents) == len(tokens) == len(sent_ids) == len(lemmas)):
        raise ValueError("Input lists must have the same length")

    target_embeddings = {}
    total_sub_embs = {}
    #loop over the sentences to extract each representation
    rows = zip(sents, tokens, sent_ids, lemmas)
    # zip has no length, so tqdm needs the total to draw the progress bar
    for sent, token, sent_id, lemma in tqdm(rows, total=len(sents)):
        #extract the features for the whole sentence
        embs_dict, encoded_sent_fw, target_tokid = feature_extractor(sent, token, tokenizer, model, device=device)

        #extract the target embeddings from the given sentence
        target, target_sub_embs = extract_target_embs(encoded_sent_fw, target_tokid, embs_dict)
        #join the lemma and sent id to create a key for the dict
        key = lemma + "." + str(sent_id)
        #add value to the key
        target_embeddings[key] = target
        #store the sub embs in a dictionary with k=lemma.sent_id: [e1...en]
        total_sub_embs[key] = target_sub_embs

    return target_embeddings, total_sub_embs

In [None]:
#helper func to write the results
def serialize_embs(embs, file_name:str, model_ckp:str, output_path = ""):
    """Pickle ``embs`` to ``<output_path>/<file_name><model name>.pkl``.

    Args:
        embs: any picklable object.
        file_name: prefix of the output file name.
        model_ckp: model identifier; only the part after the last ``/`` is
            appended to ``file_name``.
        output_path: destination directory, created (with any missing
            parents) when absent. The default empty string writes to the
            current working directory.

    Returns:
        None. Prints ``Done`` once the file has been written.
    """
    # os.mkdir("") raises, so the default empty path must skip the creation;
    # makedirs also copes with nested paths and already existing directories
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    # keep only the last component of identifiers such as "org/model"
    model_name = model_ckp.split("/")[-1]

    target_file = os.path.join(output_path, file_name) + model_name + ".pkl"
    with open(target_file, "wb") as outfile:
        pickle.dump(embs, outfile)
    print("Done")

In [25]:
#pull the sentences, target forms and lemmas out of the dataframe as plain lists
full_sents, tokens, lemmas = (
    anticausative_df[column].tolist() for column in ("Sent.", "Token.Form", "Lemma")
)
#the sentence ids are the dataframe index
sent_ids = anticausative_df.index.tolist()

## BabyBERTa

In [None]:
from transformers import RobertaModel, RobertaTokenizerFast
# Model initialization: the BabyBERTa checkpoint is loaded through the RoBERTa classes
model_ckp = "phueb/BabyBERTa-2"
# NOTE(review): truncation / max_length are given at load time here; confirm they
# have any effect, since truncation is requested again when the tokenizer is called
tokenizer = RobertaTokenizerFast.from_pretrained(model_ckp, add_prefix_space = True, truncation = True, max_length = 130)
model = RobertaModel.from_pretrained(model_ckp)
# Set the tokenizer's maximum length to 128 (note: differs from max_length = 130 above)
tokenizer.model_max_length= 128

# Extract one embedding per target word (averaging its sub-word pieces) and
# separately keep the individual sub-word embeddings of each target.
# No device is passed, so the encoded inputs are not moved to another device.
target_embeddings, total_sub_embs = get_target_embeddings(full_sents, tokens, sent_ids,lemmas,tokenizer,model)

# Pickle both dictionaries into Extracted_Embeddings_AC; the model name
# (the part of model_ckp after "/") is appended to each file prefix
serialize_embs(target_embeddings, "target_AC_embeddings_", model_ckp, output_path = "Extracted_Embeddings_AC")
serialize_embs(total_sub_embs, "total_AC_subembs_",model_ckp, output_path = "Extracted_Embeddings_AC")

## Pythia 70m

In [None]:
from transformers import AutoModel, AutoTokenizer
# Pythia 70M (deduped) checkpoint
model_ckp = "EleutherAI/pythia-70m-deduped"

# Model and tokenizer are pinned to the same "step3000" revision and share one
# local cache directory (presumably an intermediate training step — TODO confirm)
model = AutoModel.from_pretrained(model_ckp, revision="step3000", cache_dir="./pythia-70m-deduped/step3000")
tokenizer = AutoTokenizer.from_pretrained(model_ckp, revision="step3000", cache_dir="./pythia-70m-deduped/step3000")

# Extract one embedding per target word (averaging its sub-word pieces) and
# separately keep the individual sub-word embeddings of each target.
# No device is passed, so the encoded inputs are not moved to another device.
target_embeddings, total_sub_embs = get_target_embeddings(full_sents, tokens, sent_ids,lemmas,tokenizer,model)

# Pickle both dictionaries into Extracted_Embeddings_AC; the model name
# (the part of model_ckp after "/") is appended to each file prefix
serialize_embs(target_embeddings, "target_AC_embeddings_", model_ckp, output_path = "Extracted_Embeddings_AC")
serialize_embs(total_sub_embs, "total_AC_subembs_",model_ckp, output_path = "Extracted_Embeddings_AC")

## GPT2-XL

In [None]:
from transformers import GPT2TokenizerFast, GPT2Model
# GPT2-XL checkpoint (no "/" in the name, so it is used as-is in the output file names)
model_ckp = 'gpt2-xl'
# NOTE(review): return_tensors is normally given when calling the tokenizer
# (feature_extractor already does so); confirm it is needed at load time
tokenizer = GPT2TokenizerFast.from_pretrained(model_ckp, return_tensors = "pt")
# device_map = "auto" leaves device placement to the library — TODO confirm the
# model ends up on the same device as the inputs
model = GPT2Model.from_pretrained(model_ckp, device_map = "auto")
# Unlike the other model cells, the device is forwarded here, so the encoded
# inputs are moved to it before the forward pass
target_embeddings, total_sub_embs = get_target_embeddings(full_sents, tokens, sent_ids,lemmas,tokenizer,model, device = device)

# Pickle both dictionaries into Extracted_Embeddings_AC with the model name
# appended to each file prefix
serialize_embs(target_embeddings, "target_AC_embeddings_", model_ckp, output_path = "Extracted_Embeddings_AC")
serialize_embs(total_sub_embs, "total_AC_subembs_",model_ckp, output_path = "Extracted_Embeddings_AC")

## DistilBERT

In [None]:
from transformers import DistilBertTokenizerFast, DistilBertModel
# Model initialization: uncased DistilBERT checkpoint
model_ckp = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_ckp)
model = DistilBertModel.from_pretrained(model_ckp) # moving the model with .to(device) is left disabled

# Extract one embedding per target word (averaging its sub-word pieces) and
# separately keep the individual sub-word embeddings of each target.
target_embeddings, total_sub_embs = get_target_embeddings(full_sents, tokens,sent_ids,lemmas,tokenizer,model) # the device argument is left disabled as well

# Pickle both dictionaries into Extracted_Embeddings_AC with the model name
# appended to each file prefix
serialize_embs(target_embeddings, "target_AC_embeddings_", model_ckp, output_path = "Extracted_Embeddings_AC")
serialize_embs(total_sub_embs, "total_AC_subembs_",model_ckp, output_path = "Extracted_Embeddings_AC")