In [1]:
from transformers import BertTokenizer, BertForTokenClassification
import os, pickle, torch
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Tokenizer and model are both loaded from the same pretrained checkpoint so
# that the vocabulary used for tokenization matches the model's weights.
_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(_checkpoint)
model = BertForTokenClassification.from_pretrained(_checkpoint)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

In [3]:
# Locations of the pre-processed dataset files (inputs) and of the pickled
# embeddings (outputs), both relative to the current working directory.
#
# The paths are assembled with os.path.join instead of hard-coded "\\"
# separators so they resolve on POSIX systems as well as on Windows. The
# trailing "" component keeps the trailing separator the original strings had,
# so on Windows the resulting values are identical to the previous literals.
_preprocessed_root = os.path.join("..", "Data_Preprocessing", "PreProcessed_Data")
dynahate_dataset_path = os.path.join(_preprocessed_root, "DynaHate", "")
latenthatred_dataset_path = os.path.join(_preprocessed_root, "Latent_Hatred", "")
olid_dataset_path = os.path.join(_preprocessed_root, "OLID", "")

_embeddings_root = "Model_Embeddings"
dynahate_embeddings_path = os.path.join(_embeddings_root, "DynaHate", "")
latenthatred_embeddings_path = os.path.join(_embeddings_root, "Latent_Hatred", "")
olid_embeddings_path = os.path.join(_embeddings_root, "OLID", "")

In [6]:
def dump_embeddings(dataset = "dynahate", task = "train"):
    """Embed every text of one dataset split with BERT and pickle the result.

    Reads ``<dataset dir>/<Prefix>_<Split>.txt``, skips its first line, and
    splits every remaining line on whitespace. The text of a row is the tokens
    from the dataset's text column up to, but excluding, the last token (the
    last token is the row's label and is not used here). Each text is passed
    through the module-level ``tokenizer`` and ``model``; the final-layer
    hidden state at position 0 (the [CLS] token) is kept as its embedding.
    The stacked embeddings are printed by shape and pickled to
    ``<embeddings dir>/<Prefix>_<Split>``.

    Args:
        dataset: "dynahate", "latenthatred" or "olid". Any other value is
            treated as "olid" (this fallback is kept from the original code).
        task: "train", "dev" or "test". Any other value is treated as "test"
            (this fallback is kept from the original code).

    Returns:
        None. The result is written to disk.

    Raises:
        OSError: if the dataset file cannot be read or the output cannot be
            written.
    """
    # dataset name -> (input dir, output dir, file-name prefix, first text token index)
    dataset_configs = {
        "dynahate": (dynahate_dataset_path, dynahate_embeddings_path, "DynaHate", 0),
        "latenthatred": (latenthatred_dataset_path, latenthatred_embeddings_path, "LatentHatred", 0),
        "olid": (olid_dataset_path, olid_embeddings_path, "OLID", 1),
    }
    task_suffixes = {"train": "Training", "dev": "Val", "test": "Test"}

    source_dir, target_dir, prefix, text_column = dataset_configs.get(
        dataset, dataset_configs["olid"]
    )
    curr_task = prefix + "_" + task_suffixes.get(task, "Test")
    dataset_path = os.path.join(source_dir, curr_task + ".txt")
    embedding_path = os.path.join(target_dir, curr_task)

    # The with-statement closes the file; no explicit close() is needed.
    with open(dataset_path, "r", encoding="utf8") as file:
        rows = file.readlines()

    text = []
    for row in rows[1:]:  # rows[0] is skipped, as in the original code
        fields = row.split()
        if not fields:
            # Blank lines (e.g. a trailing newline) carry no sample.
            continue
        text.append(' '.join(fields[text_column:-1]))

    embeddings = []
    # Inference only: without no_grad every stored vector would keep the whole
    # autograd graph of its forward pass alive.
    with torch.no_grad():
        for each in tqdm(text):
            # truncation=True caps the input at the tokenizer's maximum model
            # length so that over-long texts do not crash the forward pass.
            tokenized_text = tokenizer(each, return_tensors = "pt", truncation = True)
            outputs = model(**tokenized_text, output_hidden_states=True)
            hidden_states = outputs.hidden_states
            # hidden_states[0] is the output of the embedding layer, where the
            # vector at the [CLS] position does not depend on the rest of the
            # sentence. The contextual sentence representation is the [CLS]
            # vector of the LAST layer.
            cls_state = hidden_states[-1][0, 0, :]
            embeddings.append(cls_state)

    if embeddings:
        embeddings = torch.stack(embeddings)
    else:
        # torch.stack rejects an empty list; keep a well-formed (0, hidden) tensor.
        embeddings = torch.empty((0, model.config.hidden_size))
    print(embeddings.shape)

    # Make sure the output folder exists before writing into it.
    target_parent = os.path.dirname(embedding_path)
    if target_parent:
        os.makedirs(target_parent, exist_ok=True)
    with open(embedding_path, "wb") as file:
        pickle.dump(embeddings, file)

In [7]:
# Produce the embedding files for every split of every dataset, one dataset
# at a time (all splits of a dataset are processed before the next dataset).
dataset_names = ("dynahate", "latenthatred", "olid")
split_names = ("train", "dev", "test")
run_order = [(name, split) for name in dataset_names for split in split_names]
for dataset, task in run_order:
    dump_embeddings(dataset = dataset, task = task)

100%|██████████| 32924/32924 [1:23:53<00:00,  6.54it/s]  


torch.Size([32924, 768])


100%|██████████| 4100/4100 [07:47<00:00,  8.77it/s]


torch.Size([4100, 768])


100%|██████████| 4120/4120 [07:42<00:00,  8.90it/s]


torch.Size([4120, 768])


100%|██████████| 12082/12082 [19:32<00:00, 10.31it/s]


torch.Size([12082, 768])


100%|██████████| 4028/4028 [05:49<00:00, 11.53it/s]


torch.Size([4028, 768])


100%|██████████| 5370/5370 [07:50<00:00, 11.42it/s]


torch.Size([5370, 768])


100%|██████████| 10592/10592 [23:54<00:00,  7.38it/s]


torch.Size([10592, 768])


100%|██████████| 2648/2648 [05:31<00:00,  7.99it/s]


torch.Size([2648, 768])


100%|██████████| 860/860 [01:58<00:00,  7.25it/s]


torch.Size([860, 768])
