In [None]:
# Libraries used for fine-tuning
from torch.utils.data import DataLoader, Dataset
from transformers import Trainer, TrainingArguments
# JSON parsing and pandas DataFrame libraries
import json
import pandas as pd
# HateBERT libraries (PyTorch, scikit-learn metrics, Hugging Face tokenizer/model)
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("GroNLP/hateBERT")



def _read_jsonl(path):
    """Parse a JSON Lines file into a list of records (one per non-blank line)."""
    with open(path, 'r', encoding='utf-8') as f:
        # Iterate the handle directly instead of materialising readlines(), and
        # skip blank lines (e.g. a trailing newline) that json.loads would reject.
        return [json.loads(line) for line in f if line.strip()]


def load_data():
    """Load the train, validation and test splits from the working directory.

    Reads 'train.json', 'val.json' and 'test.json', each expected to hold one
    JSON document per line.

    Returns:
        A ``(train_data, val_data, test_data)`` tuple; each element is the list
        of parsed records from the corresponding file, in file order.

    Raises:
        FileNotFoundError: if any of the three files is missing.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    train_data = _read_jsonl('train.json')
    val_data = _read_jsonl('val.json')
    test_data = _read_jsonl('test.json')

    return train_data, val_data, test_data

def tokenize_data(data, max_length=50):
    """Tokenize one record as a (context, target) text pair and attach its label.

    Args:
        data: Mapping with "context", "target" and "label" entries. The label
            must be convertible with ``int()``.
        max_length: Length every encoded pair is padded or truncated to.
            Defaults to 50, the value previously hard-coded here.

    Returns:
        The output of the module-level ``tokenizer`` for the pair, with an
        extra "label" entry holding the integer label.
    """
    tokenized_data = tokenizer(
        data["context"],
        data["target"],
        padding="max_length",
        max_length=max_length,
        truncation=True
    )
    tokenized_data["label"] = int(data["label"])
    return tokenized_data

def list_of_dicts_to_dict_of_lists(d):
    """Transpose a sequence of mappings into a single dict of lists.

    The keys (and their order) are taken from the first record. Each value is
    looked up by key, so records whose keys are stored in a different order
    are still aligned correctly.

    Args:
        d: Sequence of mappings that share the same keys.

    Returns:
        A dict mapping each key of the first record to the list of that key's
        values across all records, in input order. Empty input gives ``{}``.

    Raises:
        KeyError: if a later record lacks a key present in the first record.
    """
    if not d:
        return {}
    keys = list(d[0].keys())
    return {k: [record[k] for record in d] for k in keys}
    

def load_tokenized_data():
    """Load the three data splits and tokenize every record in each.

    Returns:
        A ``(tokenized_train, tokenized_val, tokenized_test)`` tuple. Each
        element is a dict of lists produced by transposing the per-record
        tokenizer outputs with ``list_of_dicts_to_dict_of_lists``.
    """
    def _tokenize_split(records):
        # Tokenize record by record, then pivot to one list per feature.
        return list_of_dicts_to_dict_of_lists([tokenize_data(record) for record in records])

    train_data, val_data, test_data = load_data()
    # The previous debug print of the entire training set was removed: it
    # flooded the cell output without affecting the returned values.
    tokenized_train = _tokenize_split(train_data)
    tokenized_val = _tokenize_split(val_data)
    tokenized_test = _tokenize_split(test_data)

    return tokenized_train, tokenized_val, tokenized_test


# Tokenize all three splits when this cell runs; the resulting module-level
# names are consumed by the dataset-building code further down.
tokenized_train_data, tokenized_val_data, tokenized_test_data = load_tokenized_data()


In [None]:

# Load the pre-trained HateBERT checkpoint with a 3-label classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    "GroNLP/hateBERT",
    num_labels=3,
)


In [None]:
from torch.utils.data import Dataset




class TokenizedDataset(Dataset):
    """Map-style dataset over column-oriented tokenized data.

    Args:
        data: Dict mapping a feature name to a list holding that feature's
            value for every sample; all lists are expected to have the same
            length.
    """

    def __init__(self, data):
        self.data = data

    def __len__(self):
        """Return the number of samples (not the number of feature columns)."""
        # len(self.data) would count the dict's keys; the sample count is the
        # length of any one column. An empty dict is an empty dataset.
        first_column = next(iter(self.data.values()), None)
        return 0 if first_column is None else len(first_column)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, one per feature."""
        return {key: torch.tensor(val[idx]) for key, val in self.data.items()}

# Wrap the tokenized train and validation splits as datasets for the Trainer.
train_dataset = TokenizedDataset(tokenized_train_data)
val_dataset = TokenizedDataset(tokenized_val_data)


def compute_metrics(eval_pred):
    """Compute validation accuracy from the Trainer's evaluation output.

    Args:
        eval_pred: The (logits, labels) pair the Trainer passes after an
            evaluation pass.

    Returns:
        A dict with a single 'accuracy' entry; this is the metric that
        ``metric_for_best_model`` below refers to.
    """
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return {'accuracy': accuracy_score(labels, predictions)}


# Define the training arguments for the Trainer
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    # load_best_model_at_end needs checkpoints saved on the same schedule as
    # evaluation; the default save strategy does not match 'epoch'.
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    greater_is_better=True
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Without this no 'accuracy' metric exists for best-model selection.
    compute_metrics=compute_metrics
)

trainer.train()


In [None]:
def evaluate_model(model, data_loader):
    """Run the model over a data loader and report classification metrics.

    Args:
        model: Sequence-classification model whose output exposes ``.logits``.
        data_loader: Iterable of batches; each batch is a dict with
            'input_ids', 'attention_mask', 'token_type_ids' and 'label'
            tensors.

    Returns:
        A dict with 'accuracy', 'precision', 'recall' and 'f1'. Precision,
        recall and F1 are weighted averages over the classes. The same four
        values are also printed.
    """
    # Set the model to evaluation mode
    model.eval()

    # Run on the device the model's weights already live on. Picking CUDA just
    # because it is available fails when the model itself is still on the CPU.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # Accumulate true and predicted labels across all batches
    true_labels = []
    pred_labels = []

    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        # Labels are only compared on the host, so they stay off the device
        # and are not fed to the model (no loss is needed here).
        labels = batch['label']

        # Disable gradient computation to speed up inference
        with torch.no_grad():
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids
            )

        # Predicted class = index of the largest logit for each sample
        preds = outputs.logits.argmax(dim=1)
        true_labels.extend(labels.tolist())
        pred_labels.extend(preds.cpu().tolist())

    # Calculate the evaluation metrics
    accuracy = accuracy_score(true_labels, pred_labels)
    precision = precision_score(true_labels, pred_labels, average='weighted')
    recall = recall_score(true_labels, pred_labels, average='weighted')
    f1 = f1_score(true_labels, pred_labels, average='weighted')

    # Print the evaluation results
    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1-Score: {f1:.4f}')

    # Return the evaluation metrics
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }




# Wrap the tokenized test split in the same Dataset class used for training.
test_dataset = TokenizedDataset(tokenized_test_data)

# Batch the test set four samples at a time, keeping its stored order.
test_data_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

# Score the model on the test batches; the metrics dict is the cell's output.
evaluate_model(model, test_data_loader)

