In [88]:
import pandas as pd
import numpy as np
import os 
import re
import pickle
import torch

from transformers import RobertaModel, RobertaTokenizerFast

from tqdm import tqdm

In [2]:
# Build the path with os.path.join so the notebook is not tied to Windows
# path separators (on Windows this yields the same "..\Data\complete_df.pkl").
data_path = os.path.join("..", "Data", "complete_df.pkl")

# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this project, never pickles from an untrusted source.
with open(data_path, "rb") as infile:
    complete_df = pickle.load(infile)

In [3]:
# Display the first three rows of the loaded frame.
complete_df.head(3)

Unnamed: 0_level_0,Sent._x,Split,Pred.Token,Pred.Lemma,Sent._y,hf_tknzd_sents,Token,hf_tnzd_ids
Idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
12,The police commander of Ninevah Province annou...,train,6,announce,"[The, police, commander, of, Ninevah, Province...","[the, police, commander, of, ninevah, province...",announced,6
18,"Guerrillas killed an engineer, Asi Ali, from T...",train,1,kill,"[Guerrillas, killed, an, engineer, ,, Asi, Ali...","[guerrillas, killed, an, engineer, ,, asi, ali...",killed,1
22,Guerrillas near Hawijah launched an attack tha...,train,3,launch,"[Guerrillas, near, Hawijah, launched, an, atta...","[guerrillas, near, hawijah, launched, an, atta...",launched,3


In [68]:
# Pull everything the extraction loop needs out of the frame as plain lists:
# the "Sent._x", "hf_tnzd_ids" and "Token" columns, plus the frame index,
# which is used as the sentence id.
full_sents, ids2extract, tokens = (
    complete_df[column].tolist()
    for column in ("Sent._x", "hf_tnzd_ids", "Token")
)
sent_ids = complete_df.index.tolist()

In [5]:
#word pieces combination based on tokens ids
def combine_subembeddings(ids, embeddings):
    """Group sub-word embeddings by word id and average them per word.

    Parameters
    ----------
    ids : sequence
        One word id per sub-word piece. ``None`` marks a piece that belongs
        to no word; such pieces are ignored.
    embeddings : sequence of torch.Tensor
        One embedding per sub-word piece, aligned with ``ids`` (a 2-D tensor
        iterated row by row also works).

    Returns
    -------
    id_emb_dict : dict
        Maps each word id to the list of its piece embeddings, each converted
        to a float64 numpy array. Never contains a ``None`` key.
    result : list of tuple
        ``(word_id, mean_embedding)`` pairs, in order of first appearance of
        each word id.

    Raises
    ------
    ValueError
        If ``ids`` and ``embeddings`` do not have the same length.
    """
    # Ensure both inputs describe the same number of pieces
    if len(ids) != len(embeddings):
        raise ValueError("Input lists must have the same length")

    id_emb_dict = {}
    for word_id, sub_emb in zip(ids, embeddings):
        # Pieces without a word id are skipped up front instead of being
        # collected under a None key and deleted afterwards.
        if word_id is None:
            continue
        # detach()/cpu() make the conversion work for tensors that still
        # track gradients or live on an accelerator; a bare .numpy() raises
        # in both cases.
        piece = sub_emb.detach().cpu().numpy().astype(float)
        id_emb_dict.setdefault(word_id, []).append(piece)

    # One averaged vector per word
    result = [
        (word_id, np.mean(pieces, axis=0))
        for word_id, pieces in id_emb_dict.items()
    ]

    return id_emb_dict, result

In [89]:
# run one sentence through the model and pool its word-piece vectors per word
def feature_extractor(sent, tokenizer, model):
    """Encode ``sent`` with ``model`` and merge word-piece embeddings per word.

    The sentence is tokenized into PyTorch tensors and passed through the
    model with gradient tracking disabled. The first batch item of the
    model's ``last_hidden_state`` is then handed, together with the
    tokenizer's word ids, to ``combine_subembeddings``.

    Returns
    -------
    tuple
        The two values returned by ``combine_subembeddings``: the per-word
        dictionary of piece embeddings and the list of
        ``(word_id, mean_embedding)`` pairs.
    """
    encoding = tokenizer(sent, return_tensors="pt")

    with torch.no_grad():
        model_out = model(**encoding)

    # keep only the first item of the batch dimension
    piece_vectors = model_out["last_hidden_state"][0, :]

    return combine_subembeddings(encoding.word_ids(), piece_vectors)

#helper function to select the target embeddings
def extract_target_embs(encoded_sent_fw, tokid, embs_dict):
    """Return the averaged embedding and the piece embeddings of word ``tokid``.

    Both values are looked up by word id. Indexing ``encoded_sent_fw`` by
    list position is only correct when position and word id coincide;
    otherwise the averaged vector and the piece vectors would silently
    belong to different words.

    Parameters
    ----------
    encoded_sent_fw : list of tuple
        ``(word_id, mean_embedding)`` pairs.
    tokid : hashable
        Word id of the target word.
    embs_dict : dict
        Maps word ids to the list of their piece embeddings.

    Returns
    -------
    tuple
        ``(mean_embedding, piece_embeddings)`` for ``tokid``.

    Raises
    ------
    IndexError
        If no pair in ``encoded_sent_fw`` carries ``tokid`` (same exception
        type the positional lookup raised for an out-of-range id).
    KeyError
        If ``embs_dict`` has no entry for ``tokid``.
    """
    for word_id, mean_emb in encoded_sent_fw:
        if word_id == tokid:
            return mean_emb, embs_dict[tokid]

    raise IndexError(f"word id {tokid!r} not found in the encoded sentence")

#main function to loop over all the sentences and get the target representations
def get_target_embeddings(sents, tokens, sent_ids, ids2extract, tokenizer, model):
    """Extract the target-word representation for every sentence.

    For each position ``i`` the sentence ``sents[i]`` is encoded with
    ``feature_extractor`` and the word with id ``ids2extract[i]`` is selected
    with ``extract_target_embs``. Results are stored under the key
    ``"<tokens[i]>.<sent_ids[i]>"``.

    The loop is best-effort: a sentence that fails at any step is reported
    with ``print`` and skipped, and nothing is stored for it. Previously a
    failed encoding fell through, so the embeddings of the preceding
    sentence were stored under the failing sentence's key.

    Parameters
    ----------
    sents : sequence of str
        Sentences to encode.
    tokens : sequence of str
        Target word form per sentence (used only to build the key).
    sent_ids : sequence
        Sentence identifier per sentence (converted with ``str``).
    ids2extract : sequence
        Word id of the target word per sentence.
    tokenizer, model
        Passed through unchanged to ``feature_extractor``.

    Returns
    -------
    target_embeddings : dict
        key -> averaged embedding of the target word.
    total_sub_embs : dict
        key -> list of the target word's piece embeddings.
    """
    target_embeddings = {}
    total_sub_embs = {}

    #loop over the sentences to extract each representation
    for i in tqdm(range(len(sents))):
        # Fallback label so error messages never use an unbound or stale id
        sent_id = f"#{i}"

        try:
            sent_id = str(sent_ids[i])
            token = tokens[i]
            sent = sents[i]
            target_tokid = ids2extract[i]
        except Exception as err:
            print(f"something went wrong for sent {sent_id} ({type(err).__name__}: {err})")
            continue

        try:
            #extract the features for the whole sentence
            embs_dict, encoded_sent_fw = feature_extractor(sent, tokenizer, model)
        except Exception as err:
            # `except Exception` (not a bare except) lets KeyboardInterrupt
            # stop the loop; `continue` prevents reuse of the previous
            # sentence's features.
            print(f"Tensor problem for sent {sent_id} ({type(err).__name__}: {err})")
            continue

        try:
            #extract the target embeddings from the given sentence
            target, target_sub_embs = extract_target_embs(encoded_sent_fw, target_tokid, embs_dict)
            #join the token and sent id to create a key for the dict
            key = token + "." + sent_id
        except Exception as err:
            print(f"something went wrong for sent {sent_id} ({type(err).__name__}: {err})")
            continue

        # store both results only once every step has succeeded
        target_embeddings[key] = target
        total_sub_embs[key] = target_sub_embs

    return target_embeddings, total_sub_embs

## BabyBERTa

In [90]:
#Model initialization: load the tokenizer and the encoder from the same checkpoint
model_ckp = "phueb/BabyBERTa-2"
# NOTE(review): `truncation` is normally an argument of the tokenizer *call*,
# not of from_pretrained — confirm that passing it here has any effect.
tokenizer = RobertaTokenizerFast.from_pretrained(model_ckp, add_prefix_space = True, truncation = True)
model = RobertaModel.from_pretrained(model_ckp)

#extract the embeddings for each target word, by averaging when needed, and
#separately store the sub-word embeddings for each target token
#(both dicts are keyed by "<token>.<sentence id>")
target_embeddings, total_sub_embs = get_target_embeddings(full_sents, tokens, sent_ids,ids2extract,tokenizer,model)

Some weights of RobertaModel were not initialized from the model checkpoint at phueb/BabyBERTa-2 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 52%|█████▏    | 1012/1955 [00:25<00:26, 35.84it/s]

Tensor problem for sent 6780


100%|██████████| 1955/1955 [00:53<00:00, 36.83it/s]


In [103]:
# Show the "<token>.<sentence id>" keys under which the sub-word embeddings were stored.
total_sub_embs.keys()

dict_keys(['announced.12', 'killed.18', 'launched.22', 'said.27', 'reports.38', 'managed.39', 'were.46', 'is.48', 'are.52', 'guess.56', 'said.63', 'mentioned.64', 'added.65', 'know.67', 'is.74', 'published.78', 'continues.81', 'is.85', 'are.87', 'present.89', 'had.118', 'offers.122', 'have.125', 'are.132', 'held.135', 'attempted.145', 'generates.146', 'owes.153', 'have.159', 'need.160', 'is.163', 'alleged.164', 'is.165', 'fails.167', 'faltered.172', 'asked.174', 'asked.178', 'asked.179', 'replied.180', 'released.183', 'said.184', 'said.187', 'seems.188', 'is.200', 'is.202', 'had.223', 'created.226', 'chose.228', 'publishes.248', 'feel.249', 'are.250', 'explains.257', 'said.259', 'told.261', 'held.262', 'preceded.264', 'said.265', 'lay.268', 'said.269', 'said.270', 'told.271', 'had.276', 'failed.277', 'complained.279', 'added.284', 'pointed.285', 'calls.293', 'is.298', 'understand.300', 'understood.303', 'remains.317', 'is.319', 'know.329', 'flew.332', 'saw.333', 'follows.342', 'joined.

In [100]:
#write the output in the dedicated directory
# os.path.join keeps the path portable; hard-coded "\\" separators only
# resolve on Windows.
output_path = os.path.join("..", "Data", "Extracted_Embeddings")
# makedirs creates missing parent directories too, and exist_ok avoids the
# check-then-create race of os.path.exists + os.mkdir.
os.makedirs(output_path, exist_ok=True)

# Output files are suffixed with the checkpoint name minus its organisation prefix
model_name = model_ckp.split("/")[-1]

with open(os.path.join(output_path, f"target_embs_{model_name}.pkl"), "wb") as outfile:
    pickle.dump(target_embeddings, outfile)

with open(os.path.join(output_path, f"target_subembs_{model_name}.pkl"), "wb") as outfile:
    pickle.dump(total_sub_embs, outfile)

## Pythia

In [None]:

# TODO: the Pythia extraction is not implemented yet; only the checkpoint name is set.
model_ckp = "EleutherAI/pythia-70m-deduped"
# Placeholder for the unfinished `model = ` assignment, which was a syntax
# error as written. None makes any accidental use fail loudly instead of
# silently reusing the previously loaded model under the Pythia name.
model = None

## GPT-2

In [101]:
# Smoke test: load GPT-2 and run a single forward pass on a sample sentence.
# NOTE(review): feature_extractor calls `word_ids()` on the tokenizer output;
# this presumably requires a fast tokenizer (e.g. GPT2TokenizerFast) — confirm
# before plugging GPT-2 into get_target_embeddings.
from transformers import GPT2Tokenizer, GPT2Model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
# Inference only: disable gradient tracking, as feature_extractor does
with torch.no_grad():
    output = model(**encoded_input)


Downloading (…)olve/main/vocab.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 1.46MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 1.41MB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 665/665 [00:00<00:00, 339kB/s]
Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

KeyboardInterrupt: 