In [1]:
from transformers import BertTokenizer, BertForTokenClassification
import os, pickle, torch
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# The tokenizer and the model are both loaded from the same Hugging Face
# checkpoint, so the identifier is spelled out once.
HATEBERT_CHECKPOINT = 'GroNLP/hateBERT'

tokenizer = BertTokenizer.from_pretrained(HATEBERT_CHECKPOINT)
model = BertForTokenClassification.from_pretrained(HATEBERT_CHECKPOINT)

Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:01<00:00, 202kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 41.2kB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 151/151 [00:00<00:00, 74.8kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 1.24k/1.24k [00:00<00:00, 624kB/s]
Downloading (…)"pytorch_model.bin";: 100%|██████████| 440M/440M [00:32<00:00, 13.7MB/s] 
Some weights of the model checkpoint at GroNLP/hateBERT were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.

In [3]:
# Directories holding the pre-processed text splits, one per corpus.
# The paths are assembled with os.path.join instead of hard-coded "\\"
# separators so the notebook also runs on Linux/macOS. The trailing ""
# component keeps the trailing separator the original literals had, so on
# Windows the resulting strings are unchanged.
_PREPROCESSED_ROOT = os.path.join("..", "Data_Preprocessing", "PreProcessed_Data")
dynahate_dataset_path = os.path.join(_PREPROCESSED_ROOT, "DynaHate", "")
latenthatred_dataset_path = os.path.join(_PREPROCESSED_ROOT, "Latent_Hatred", "")
olid_dataset_path = os.path.join(_PREPROCESSED_ROOT, "OLID", "")

# Directories the HateBERT embedding pickles are written to, one per corpus.
_EMBEDDINGS_ROOT = os.path.join("Model_Embeddings", "HateBERT")
dynahate_embeddings_path = os.path.join(_EMBEDDINGS_ROOT, "DynaHate", "")
latenthatred_embeddings_path = os.path.join(_EMBEDDINGS_ROOT, "Latent_Hatred", "")
olid_embeddings_path = os.path.join(_EMBEDDINGS_ROOT, "OLID", "")

In [4]:
def _resolve_split_paths(dataset, task):
    """Return (input text file, output pickle file, first text column) for a split."""
    if dataset == "dynahate":
        prefix = "DynaHate"
        source_dir, target_dir = dynahate_dataset_path, dynahate_embeddings_path
        text_column = 0
    elif dataset == "latenthatred":
        prefix = "LatentHatred"
        source_dir, target_dir = latenthatred_dataset_path, latenthatred_embeddings_path
        text_column = 0
    else:
        # Any other value is treated as OLID (kept for backward compatibility).
        prefix = "OLID"
        source_dir, target_dir = olid_dataset_path, olid_embeddings_path
        # OLID rows start the text at the second whitespace-separated field.
        text_column = 1

    # Any task other than "train"/"dev" selects the test split, as before.
    suffix = {"train": "Training", "dev": "Val"}.get(task, "Test")
    split_name = prefix + "_" + suffix
    return (
        os.path.join(source_dir, split_name + ".txt"),
        os.path.join(target_dir, split_name),
        text_column,
    )


def _read_texts(dataset_path, text_column):
    """Read the text field of every data row in a pre-processed split file."""
    texts = []
    with open(dataset_path, "r", encoding="utf8") as file:
        next(file, None)  # the first line is skipped (treated as a header)
        for line in file:
            fields = line.split()
            if not fields:
                # Blank lines carry no text or label; skip them.
                continue
            # The last field is the label; everything from text_column up to
            # it is re-joined with single spaces to form the text.
            texts.append(' '.join(fields[text_column:-1]))
    return texts


def dump_embeddings(dataset = "dynahate", task = "train"):
    """Embed every text of one dataset split with the global model and pickle the result.

    The split file is read, each text is tokenized with the module-level
    ``tokenizer`` and run through the module-level ``model``, and the
    last-layer hidden state of the first token ([CLS]) is collected. The
    stacked tensor is printed by shape and written with ``pickle`` to the
    matching embeddings directory (created if missing).

    Args:
        dataset: "dynahate", "latenthatred", or anything else for OLID.
        task: "train", "dev", or anything else for the test split.

    Returns:
        None. The embeddings are written to disk as a side effect.

    Raises:
        OSError: if the split file cannot be read or the output cannot be written.
    """
    dataset_path, embedding_path, text_column = _resolve_split_paths(dataset, task)
    texts = _read_texts(dataset_path, text_column)

    max_tokens = model.config.max_position_embeddings
    embeddings = []
    # Inference only: without no_grad every stored vector would keep its whole
    # autograd graph alive for the duration of the run.
    with torch.no_grad():
        for each in tqdm(texts):
            # Truncate so over-long texts cannot exceed the position embeddings.
            inputs = tokenizer(
                each,
                return_tensors = "pt",
                truncation = True,
                max_length = max_tokens,
            )
            outputs = model(**inputs, output_hidden_states=True)
            # With hidden states requested (and no attentions), the last
            # output element is the tuple of per-layer hidden states.
            hidden_states = outputs[-1]
            # Index 0 of that tuple is the embedding-layer output, whose [CLS]
            # vector does not depend on the input text; the contextual
            # sentence representation lives in the final layer (index -1).
            cls_state = hidden_states[-1][0, 0, :]
            # clone() so the full (1, seq_len, hidden) tensor can be freed.
            embeddings.append(cls_state.clone())

    if embeddings:
        embeddings = torch.stack(embeddings)
    else:
        # torch.stack rejects an empty list; emit an empty matrix instead.
        embeddings = torch.empty((0, model.config.hidden_size))
    print(embeddings.shape)

    output_dir = os.path.dirname(embedding_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    # NOTE: the file is a pickle; only load it back from trusted locations.
    with open(embedding_path, "wb") as file:
        pickle.dump(embeddings, file)

In [5]:
# Produce one embedding pickle per (corpus, split) pair, iterating over the
# corpora in the outer position and the splits in the inner position.
runs = [
    (dataset, task)
    for dataset in ("dynahate", "latenthatred", "olid")
    for task in ("train", "dev", "test")
]
for dataset, task in runs:
    dump_embeddings(dataset=dataset, task=task)

100%|██████████| 32924/32924 [1:27:45<00:00,  6.25it/s]  


torch.Size([32924, 768])


100%|██████████| 4100/4100 [08:09<00:00,  8.38it/s]


torch.Size([4100, 768])


100%|██████████| 4120/4120 [08:09<00:00,  8.41it/s]


torch.Size([4120, 768])


100%|██████████| 12082/12082 [21:46<00:00,  9.24it/s]


torch.Size([12082, 768])


100%|██████████| 4028/4028 [06:35<00:00, 10.18it/s]


torch.Size([4028, 768])


100%|██████████| 5370/5370 [08:39<00:00, 10.34it/s]


torch.Size([5370, 768])


100%|██████████| 10592/10592 [25:36<00:00,  6.89it/s]


torch.Size([10592, 768])


100%|██████████| 2648/2648 [06:16<00:00,  7.03it/s]


torch.Size([2648, 768])


100%|██████████| 860/860 [02:24<00:00,  5.96it/s]


torch.Size([860, 768])
