In [1]:
#Library used for fine tuning
from torch.utils.data import DataLoader, Dataset
from transformers import Trainer, TrainingArguments
# Pandas Dataframe Library
import json
import pandas as pd
# HateBert Libarary
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("GroNLP/hateBERT")


def load_data():
    # Open train jsonl file
    with open('train.jsonl', 'r') as f:
        train_data = [json.loads(line) for line in f.readlines()]

    # Create a pandas DataFrame from the parsed json data
    train_df = pd.DataFrame(train_data)

    # Open validation jsonl file
    with open('val.jsonl', 'r') as f:
        val_data = [json.loads(line) for line in f.readlines()]

    # Create a pandas DataFrame from the parsed json data
    val_df = pd.DataFrame(val_data)

    # Open test jsonl file
    with open('test.jsonl', 'r') as f:
        test_data = [json.loads(line) for line in f.readlines()]

    # Create a pandas DataFrame from the parsed json data
    test_df = pd.DataFrame(test_data)


    return train_data, val_data, test_data


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def tokenize_data(data):
    tokenized_data = tokenizer(
        data["context"],
        data["target"],
        padding="max_length",
        max_length=20,
        truncation=True
    )
    tokenized_data["label"] = int(data["label"])
    return tokenized_data

def list_of_dicts_to_dict_of_lists(d):
    dic = d[0]
    keys = dic.keys()
    values = [dic.values() for dic in d]
    return {k: list(v) for k, v in zip(keys, zip(*values))}

def load_tokenized_data():
    train_data, val_data, test_data = load_data()
    tokenized_train = list_of_dicts_to_dict_of_lists([tokenize_data(data) for data in train_data])
    tokenized_val = list_of_dicts_to_dict_of_lists([tokenize_data(data) for data in val_data])
    tokenized_test = list_of_dicts_to_dict_of_lists([tokenize_data(data) for data in test_data])
    
    
    return tokenized_train, tokenized_val, tokenized_test






tokenized_train_data, tokenized_val_data, tokenized_test_data = load_tokenized_data()

# load pre-trained HateBert
model = AutoModelForSequenceClassification.from_pretrained("GroNLP/hateBERT" ,num_labels=5)


Some weights of the model checkpoint at GroNLP/hateBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/hateBERT and are newly

In [3]:
from torch.utils.data import Dataset, DataLoader

class TokenizedDataset(Dataset):
    def __init__(self, data):
        self.data = data
        

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.data.items()}
        print(type(item))
        print(item)
        print(item['input_ids'].size())
        print(item['token_type_ids'].size())
        print(item['attention_mask'].size())
        return item

# Define the training and validation datasets using DataLoader

train_dataset = TokenizedDataset(tokenized_train_data)
val_dataset = TokenizedDataset(tokenized_val_data)



# Define the training arguments for the Trainer
training_args = TrainingArguments(
    output_dir='./results',           # output directory
    num_train_epochs=3,               # total number of training epochs
    per_device_train_batch_size=4,   # batch size per device during training
    per_device_eval_batch_size=4,    # batch size for evaluation
    weight_decay=0.01,                # strength of weight decay
    logging_dir='./logs',             # directory for storing logs
    logging_steps=10,
    evaluation_strategy='epoch',      # evaluation strategy to adopt during training
)




trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()


***** Running training *****
  Num examples = 4
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 3
  Number of trainable parameters = 109486085


<class 'dict'>
{'input_ids': tensor([  101,  1996,  2866,  2003, 21746,  1012,   102,  1028,  1996,  1066,
         1066,  2866,  1066,  1066,  2088,  2003, 21746,  3027, 12031,   102]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), 'label': tensor(2)}
torch.Size([20])
torch.Size([20])
torch.Size([20])
<class 'dict'>
{'input_ids': tensor([  101,  1000,  2008,  1005,  1055,  2367,  1012,  1000,   102,  2017,
         1005,  2128,  2157,  1012,  1996, 27312,  2758,  2009,  1005,   102]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), 'label': tensor(1)}
torch.Size([20])
torch.Size([20])
torch.Size([20])
<class 'dict'>
{'input_ids': tensor([ 101, 4952, 2000, 2023, 9866, 1012,  102, 2073, 1996, 6616, 2106, 2017,
        2131, 2008, 203

Epoch,Training Loss,Validation Loss
1,No log,1.413535
2,No log,1.393437
3,No log,1.383625


***** Running Evaluation *****
  Num examples = 4
  Batch size = 4


<class 'dict'>
{'input_ids': tensor([ 101, 1996, 2755, 2008, 2017, 2228, 2008, 2003, 7182, 2005,  102, 2025,
        2108, 2583, 2000, 2424, 1037, 3105, 2005,  102]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), 'label': tensor(2)}
torch.Size([20])
torch.Size([20])
torch.Size([20])
<class 'dict'>
{'input_ids': tensor([  101,  2138,  2009,  1005,  1055,  2025,  2995,  2017,  8239, 16374,
          102,  2064,  1005,  1056,  5047,  1996,  3606, 17012,  7382,   102]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), 'label': tensor(2)}
torch.Size([20])
torch.Size([20])
torch.Size([20])
<class 'dict'>
{'input_ids': tensor([ 101, 2054, 2055, 1996, 2033, 4168, 1000, 8840, 2140, 3287,  102, 3287,
        4000, 2003, 7455, 2243, 1048, 2213,

***** Running Evaluation *****
  Num examples = 4
  Batch size = 4


<class 'dict'>
{'input_ids': tensor([ 101, 1996, 2755, 2008, 2017, 2228, 2008, 2003, 7182, 2005,  102, 2025,
        2108, 2583, 2000, 2424, 1037, 3105, 2005,  102]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), 'label': tensor(2)}
torch.Size([20])
torch.Size([20])
torch.Size([20])
<class 'dict'>
{'input_ids': tensor([  101,  2138,  2009,  1005,  1055,  2025,  2995,  2017,  8239, 16374,
          102,  2064,  1005,  1056,  5047,  1996,  3606, 17012,  7382,   102]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), 'label': tensor(2)}
torch.Size([20])
torch.Size([20])
torch.Size([20])
<class 'dict'>
{'input_ids': tensor([ 101, 2054, 2055, 1996, 2033, 4168, 1000, 8840, 2140, 3287,  102, 3287,
        4000, 2003, 7455, 2243, 1048, 2213,

***** Running Evaluation *****
  Num examples = 4
  Batch size = 4


<class 'dict'>
{'input_ids': tensor([ 101, 1996, 2755, 2008, 2017, 2228, 2008, 2003, 7182, 2005,  102, 2025,
        2108, 2583, 2000, 2424, 1037, 3105, 2005,  102]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), 'label': tensor(2)}
torch.Size([20])
torch.Size([20])
torch.Size([20])
<class 'dict'>
{'input_ids': tensor([  101,  2138,  2009,  1005,  1055,  2025,  2995,  2017,  8239, 16374,
          102,  2064,  1005,  1056,  5047,  1996,  3606, 17012,  7382,   102]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), 'label': tensor(2)}
torch.Size([20])
torch.Size([20])
torch.Size([20])
<class 'dict'>
{'input_ids': tensor([ 101, 2054, 2055, 1996, 2033, 4168, 1000, 8840, 2140, 3287,  102, 3287,
        4000, 2003, 7455, 2243, 1048, 2213,



Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=3, training_loss=1.396104335784912, metrics={'train_runtime': 109.0168, 'train_samples_per_second': 0.11, 'train_steps_per_second': 0.028, 'total_flos': 123336629280.0, 'train_loss': 1.396104335784912, 'epoch': 3.0})