<a href="https://colab.research.google.com/github/andreunilux/BSP_6/blob/main/Bert_with_trainer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#install missing dependancies
!pip install transformers accelerate


#Library used for fine tuning

from torch.utils.data import DataLoader, Dataset
from transformers import Trainer, TrainingArguments

# Pandas Dataframe Library
import json
import pandas as pd
import os
from IPython.display import display
import numpy as np 
import datetime


# HuggingFace Libarary
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import AutoTokenizer, BertForSequenceClassification, AutoModelForSequenceClassification
from transformers import AutoTokenizer, BertConfig, BertTokenizer
from transformers import EarlyStoppingCallback


# Model HyperParameters
current_model="roberta-base"

#All tested models 
bert="bert-base-uncased"
HateBert="GroNLP/hateBERT"
DistilBert="distilbert-base-uncased"
RoBERTa="roberta-base" #'eval_f1': 0.5518856032046312 on gold
HateRoBERTa="facebook/roberta-hate-speech-dynabench-r4-target"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from google.colab import drive
drive.mount('/content/drive')
os.getcwd()
gold='/content/drive/MyDrive/BSP6/init_dataset_gold'
gold_silver='/content/drive/MyDrive/BSP6/init_dataset_gold_silver'
gold_silver_train_gold_val='/content/drive/MyDrive/BSP6/init_dataset_gold_silver_train_gold_val'
current_dataset=gold_silver_train_gold_val
os.chdir(current_dataset) 
now = str(datetime.datetime.now())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
train_data = pd.read_json('train.json', lines=True)
val_data = pd.read_json('val.json', lines=True)
test_data = pd.read_json('test.json', lines=True)

In [4]:
# load pre-trained HateBert
if(current_model=="roberta-base"):
  model = AutoModelForSequenceClassification.from_pretrained('roberta-base',num_labels=3)
  tokenizer = AutoTokenizer.from_pretrained('roberta-base')
elif(current_model=="facebook/roberta-hate-speech-dynabench-r4-target"):
  tokenizer = AutoTokenizer.from_pretrained("facebook/roberta-hate-speech-dynabench-r4-target")
  model = AutoModelForSequenceClassification.from_pretrained("facebook/roberta-hate-speech-dynabench-r4-target",num_labels=3, ignore_mismatched_sizes=True,hidden_dropout_prob=0.3,attention_probs_dropout_prob=0.3)
else:
  model = BertForSequenceClassification.from_pretrained(current_model,num_labels=3 )
  tokenizer = BertTokenizer.from_pretrained(current_model)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

In [5]:
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [6]:
#Train Data
max_length_X=50
max_length_V=50
max_length_T=50


X = []
for context, target in zip(train_data.context, train_data.target):
  X.append(tokenizer( target, padding='max_length', truncation='longest_first', max_length=max_length_X))
y_train = list(train_data.label)
print(X)
X_train_tokenized = X


#val Data
V = []
for context, target in zip(val_data.context, val_data.target):
   V.append(tokenizer( context,target, padding='max_length', truncation='longest_first', max_length=max_length_V))
y_val = list(val_data.label)
V_val_tokenized = V

#test Data
T = []
for context, target in zip(test_data.context, test_data.target):
   T.append(tokenizer(context, target, padding='max_length', truncation='longest_first', max_length=max_length_T))
y_test = list(test_data.label)
T_test_tokenized = T





[{'input_ids': [0, 15698, 133, 1437, 48256, 10494, 48256, 232, 16, 42647, 1437, 274, 20249, 975, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}, {'input_ids': [0, 13841, 5, 43774, 222, 47, 120, 14, 62, 27899, 116, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}, {'input_ids': [0, 1185, 214, 235, 4, 20, 29976, 161, 24, 18, 10, 1816, 4, 7698, 52, 847, 69, 3741, 405, 160, 116, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [7]:
#print the token
print(tokenizer.decode(X_train_tokenized[0].input_ids))
print(X_train_tokenized[0])
print(len(X_train_tokenized[0].input_ids))
print(y_train[0])


print(tokenizer.decode(X_train_tokenized[1].input_ids))
print(X_train_tokenized[1])
print(len(X_train_tokenized[1].input_ids))
print(y_train[1])


<s>>The ~~UK~~ world is fucked  FTFY</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
{'input_ids': [0, 15698, 133, 1437, 48256, 10494, 48256, 232, 16, 42647, 1437, 274, 20249, 975, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
50
2
<s>Where the Fuck did you get that up arrow?</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
{'input_ids': [0, 13841, 5, 43774, 222, 47, 120, 14, 62, 27899, 116, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [8]:
from torch.utils.data import DataLoader, RandomSampler

# Create torch dataset
#need a dictionary
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {'input_ids':torch.tensor(self.encodings[idx].input_ids),
                'attention_mask':torch.tensor(self.encodings[idx].attention_mask),
                'labels':torch.tensor(self.labels[idx])}
        
        return item

    def __len__(self):
        return len(self.encodings)

train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(V_val_tokenized, y_val)
test_dataset = Dataset(T_test_tokenized, y_test)


In [9]:
train_dataset[0]

{'input_ids': tensor([    0, 15698,   133,  1437, 48256, 10494, 48256,   232,    16, 42647,
          1437,   274, 20249,   975,     2,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]),
 'labels': tensor(2)}

In [16]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
#importing confusion matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
f = open(current_dataset+"/Report/output"+now+".txt","w")

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = accuracy_score(labels, preds)
    confusion = confusion_matrix(labels, preds)
    print('Confusion Matrix\n')
    print(confusion)
    print('\nClassification Report\n')
    print(classification_report(labels, preds, target_names=['Hate', 'Neutral', 'Counter']))
    f.write('Confusion Matrix\n')
    f.write(str(confusion))
    f.write('\nClassification Report\n')
    f.write(str(classification_report(labels, preds, target_names=['Hate', 'Neutral', 'Counter'])))
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}



In [17]:

# Define Trainer
args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,               # total number of training epochs
    per_device_train_batch_size=32,   # batch size per device during training
    per_device_eval_batch_size=32,    # batch size for evaluation
    learning_rate = 1e-5,
    evaluation_strategy="steps",
    logging_steps = 100,
    weight_decay=0.2,
    load_best_model_at_end=True,
    overwrite_output_dir=True
)            



trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    #callbacks=[EarlyStoppingCallback(early_stopping_patience=4)]
)


In [18]:


f.write("model"+ current_model)
f.write("dataset: "+ current_dataset)
f.write("\n")
f.write("structure of the model: \n")
f.write(str(trainer.model))
f.write("\n")
f.write("Tokenizer max length train:"+str(max_length_X))
f.write("Tokenizer max length val:"+str(max_length_V))
f.write("Tokenizer max length test:"+str(max_length_T))
f.write("\n")
f.write(str(trainer.args))
f.write("\n")

1

In [19]:
trainer.train()
f.write(str(trainer.state.log_history))
eval=trainer.evaluate(test_dataset)



Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,0.8262,1.196609,0.396914,0.511484,0.396914,0.412278
200,0.8522,1.101896,0.489481,0.524919,0.489481,0.499826
300,0.8109,1.189251,0.429173,0.531236,0.429173,0.445554
400,0.7199,1.241385,0.462833,0.508389,0.462833,0.468713
500,0.6873,1.401675,0.43338,0.515305,0.43338,0.43908
600,0.6469,1.32006,0.43338,0.529407,0.43338,0.45118
700,0.5988,1.496605,0.391304,0.497784,0.391304,0.405045
800,0.5307,1.592043,0.399719,0.517385,0.399719,0.41255
900,0.4941,1.727605,0.389902,0.500256,0.389902,0.403692
1000,0.4407,1.723505,0.430575,0.50801,0.430575,0.441371


Confusion Matrix

[[ 38  28 136]
 [ 29 157 170]
 [ 27  40  88]]

Classification Report

              precision    recall  f1-score   support

        Hate       0.40      0.19      0.26       202
     Neutral       0.70      0.44      0.54       356
     Counter       0.22      0.57      0.32       155

    accuracy                           0.40       713
   macro avg       0.44      0.40      0.37       713
weighted avg       0.51      0.40      0.41       713

Confusion Matrix

[[109  44  49]
 [ 84 187  85]
 [ 59  43  53]]

Classification Report

              precision    recall  f1-score   support

        Hate       0.43      0.54      0.48       202
     Neutral       0.68      0.53      0.59       356
     Counter       0.28      0.34      0.31       155

    accuracy                           0.49       713
   macro avg       0.47      0.47      0.46       713
weighted avg       0.52      0.49      0.50       713

Confusion Matrix

[[101  23  78]
 [ 89 134 133]
 [ 55  29  71]

Confusion Matrix

[[126  21  37]
 [133 134  94]
 [ 79  18  71]]

Classification Report

              precision    recall  f1-score   support

        Hate       0.37      0.68      0.48       184
     Neutral       0.77      0.37      0.50       361
     Counter       0.35      0.42      0.38       168

    accuracy                           0.46       713
   macro avg       0.50      0.49      0.46       713
weighted avg       0.57      0.46      0.47       713



In [20]:
f.close()