In [1]:
from transformers import BertTokenizer, AutoModel
import os, pickle, torch
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# The tokenizer and the encoder are loaded from the same checkpoint identifier,
# so it is spelled out once and shared by both calls.
hatebert_checkpoint = 'GroNLP/hateBERT'
tokenizer = BertTokenizer.from_pretrained(hatebert_checkpoint)
model = AutoModel.from_pretrained(hatebert_checkpoint)

Some weights of the model checkpoint at GroNLP/hateBERT were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Move the model's parameters to the current CUDA device; the call returns the
# model itself, so leaving it as the last expression displays the module tree.
model.to("cuda")

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [4]:
# Directories containing the preprocessed text splits, one per dataset.
# The paths are assembled with os.path.join instead of hard-coded backslashes so
# they resolve on any operating system; the trailing "" component keeps the
# final path separator, so the values are unchanged on Windows.
dynahate_dataset_path = os.path.join("..", "Data_Preprocessing", "PreProcessed_Data", "DynaHate", "")
latenthatred_dataset_path = os.path.join("..", "Data_Preprocessing", "PreProcessed_Data", "Latent_Hatred", "")
olid_dataset_path = os.path.join("..", "Data_Preprocessing", "PreProcessed_Data", "OLID", "")

# Directories the pickled HateBERT embeddings are written to, one per dataset.
dynahate_embeddings_path = os.path.join("Model_Embeddings", "HateBERT", "DynaHate", "")
latenthatred_embeddings_path = os.path.join("Model_Embeddings", "HateBERT", "Latent_Hatred", "")
olid_embeddings_path = os.path.join("Model_Embeddings", "HateBERT", "OLID", "")

In [5]:
def _split_locations(dataset, task):
    """Return ``(source_path, embedding_path, text_column)`` for one dataset split.

    Unrecognised ``dataset`` values fall back to OLID and unrecognised ``task``
    values fall back to the test split.
    """
    if dataset == "dynahate":
        prefix, source_dir, target_dir, text_column = (
            "DynaHate", dynahate_dataset_path, dynahate_embeddings_path, 0)
    elif dataset == "latenthatred":
        prefix, source_dir, target_dir, text_column = (
            "LatentHatred", latenthatred_dataset_path, latenthatred_embeddings_path, 0)
    else:
        # OLID text starts at the second token of each line; the first token is
        # presumably a record id -- confirm against the preprocessing step.
        prefix, source_dir, target_dir, text_column = (
            "OLID", olid_dataset_path, olid_embeddings_path, 1)

    if task == "train":
        suffix = "Training"
    elif task == "dev":
        suffix = "Val"
    else:
        suffix = "Test"

    split_name = prefix + "_" + suffix
    source_path = os.path.join(source_dir, split_name + ".txt")
    embedding_path = os.path.join(target_dir, split_name)
    return source_path, embedding_path, text_column


def _read_split_texts(source_path, text_column):
    """Read one split file and return the list of texts it contains.

    The first line is skipped (presumably a header row). On every other line the
    last whitespace-separated token is dropped (presumably the label) and the
    tokens from ``text_column`` onward are re-joined with single spaces.
    """
    texts = []
    with open(source_path, "r", encoding="utf8") as file:
        next(file, None)  # skip the first line without failing on an empty file
        for line in file:
            tokens = line.split()
            if not tokens:
                # A blank line (e.g. a trailing newline) carries no record.
                continue
            texts.append(' '.join(tokens[text_column:-1]))
    return texts


def dump_embeddings(dataset = "dynahate", task = "train"):
    """Embed every text of one dataset split and pickle the stacked result.

    The split is read from ``<dataset dir>/<split name>.txt``, each text is
    tokenized with the module-level ``tokenizer`` and passed through the
    module-level ``model`` with gradients disabled, and the ``pooler_output`` of
    every forward pass is collected. The collected tensors are stacked along a
    new leading dimension and pickled to ``<embeddings dir>/<split name>``.

    Args:
        dataset: "dynahate" or "latenthatred"; any other value selects OLID.
        task: "train" or "dev"; any other value selects the test split.

    Returns:
        None. The embeddings are written to disk as a side effect.

    Raises:
        OSError: if the split file cannot be read or the output cannot be written.
        RuntimeError: if the split contains no texts (``torch.stack`` rejects an
            empty list).
    """
    source_path, embedding_path, text_column = _split_locations(dataset, task)
    texts = _read_split_texts(source_path, text_column)

    # Follow whatever device the model currently lives on instead of assuming CUDA.
    device = model.device
    # Inputs longer than the position-embedding table cannot be encoded, so they
    # are truncated to that length rather than crashing the forward pass.
    max_length = model.config.max_position_embeddings

    embeddings = []
    with torch.no_grad():
        for text in tqdm(texts):
            encoded = tokenizer(
                text,
                return_tensors = "pt",
                truncation = True,
                max_length = max_length,
            )
            inputs = {key: value.to(device) for key, value in encoded.items()}
            embeddings.append(model(**inputs).pooler_output)

    embeddings = torch.stack(embeddings)

    # Create the output directory on first use so a fresh checkout does not fail.
    output_dir = os.path.dirname(embedding_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # NOTE(review): the tensor is pickled on the device it was computed on, so a
    # file produced on a GPU presumably needs CUDA to be unpickled -- confirm
    # whether downstream loaders rely on that before moving it to the CPU here.
    with open(embedding_path, "wb") as file:
        pickle.dump(embeddings, file)

In [6]:
# Embed every split of every dataset, datasets in the outer order and splits in
# the inner order; each call pickles one embeddings file.
for dataset, task in [
    (dataset_name, split_name)
    for dataset_name in ("dynahate", "latenthatred", "olid")
    for split_name in ("train", "dev", "test")
]:
    dump_embeddings(dataset = dataset, task = task)

100%|██████████| 32924/32924 [11:26<00:00, 47.99it/s]
100%|██████████| 4100/4100 [01:19<00:00, 51.60it/s]
100%|██████████| 4120/4120 [01:21<00:00, 50.66it/s]
100%|██████████| 12082/12082 [03:52<00:00, 52.03it/s]
100%|██████████| 4028/4028 [01:15<00:00, 53.10it/s]
100%|██████████| 5370/5370 [01:41<00:00, 52.81it/s]
100%|██████████| 10592/10592 [03:39<00:00, 48.36it/s]
100%|██████████| 2648/2648 [00:52<00:00, 50.33it/s]
100%|██████████| 860/860 [00:17<00:00, 49.91it/s]
