<a href="https://colab.research.google.com/github/andreunilux/BSP-6/blob/master/BSP6_(version_1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#install missing dependancies
!pip install transformers accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
#Library used for fine tuning

from torch.utils.data import DataLoader, Dataset
from transformers import Trainer, TrainingArguments

# Pandas Dataframe Library
import json
import pandas as pd
import os
from IPython.display import display
import numpy as np 

# HateBert Libarary
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import AutoTokenizer, BertForSequenceClassification
from transformers import AutoTokenizer, BertConfig, BertTokenizer

In [3]:

from google.colab import drive
drive.mount('/content/drive')
os.getcwd()
os.chdir('/content/drive/MyDrive/BSP6')   

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
train_data = pd.read_json('train.json', lines=True)
val_data = pd.read_json('val.json', lines=True)
test_data = pd.read_json('test.json', lines=True)

In [5]:
print("Look at example representative entries of the dataset")
display(train_data.head(2))
print("")
print("Is the dataset complete or are some values missing?")
display(train_data.count())
print("")
print("How balanced is the dataset?")
display(train_data.label.value_counts())
print("")
print("How long are the context sentences?")
display(train_data.context.apply(len).describe())
print("Notice there a high  standard diviation (std)\n")
print("How long are the target sentences?")
display(train_data.target.apply(len).describe())

Look at example representative entries of the dataset


Unnamed: 0,idx,label,context,target
0,0,2,The UK is fucked.,>The ~~UK~~ world is fucked FTFY
1,1,0,Listen to this wisdom.,Where the Fuck did you get that up arrow?



Is the dataset complete or are some values missing?


idx        3325
label      3325
context    3325
target     3325
dtype: int64


How balanced is the dataset?


1    1627
0     922
2     776
Name: label, dtype: int64


How long are the context sentences?


count    3325.000000
mean      236.940752
std       203.202595
min        13.000000
25%        92.000000
50%       170.000000
75%       319.000000
max      1821.000000
Name: context, dtype: float64

Notice there a high  standard diviation (std)

How long are the target sentences?


count    3325.000000
mean       57.216842
std        24.528212
min        12.000000
25%        39.000000
50%        56.000000
75%        74.000000
max       326.000000
Name: target, dtype: float64

In [6]:
print("Look at example representative entries of the dataset")
display(val_data.head(2))
print("")
print("Is the dataset complete or are some values missing?")
display(val_data.count())
print("")
print("How balanced is the dataset?")
display(val_data.label.value_counts())
print("")
print("How long are the context sentences?")
display(val_data.context.apply(len).describe())
print("Notice there a high  standard diviation (std)\n")
print("How long are the target sentences?")
display(val_data.target.apply(len).describe())

Look at example representative entries of the dataset


Unnamed: 0,idx,label,context,target
0,0,2,The fact that you think that is sufficient for...,Not being able to find a job for 20 years soun...
1,1,2,Because it's not true you fucking liar. Not ev...,Can't handle the truth hmmmm?



Is the dataset complete or are some values missing?


idx        713
label      713
context    713
target     713
dtype: int64


How balanced is the dataset?


1    356
0    202
2    155
Name: label, dtype: int64


How long are the context sentences?


count    713.000000
mean     238.929874
std      191.393634
min       19.000000
25%       98.000000
50%      174.000000
75%      330.000000
max      989.000000
Name: context, dtype: float64

Notice there a high  standard diviation (std)

How long are the target sentences?


count    713.000000
mean      56.067321
std       22.943163
min       10.000000
25%       39.000000
50%       54.000000
75%       71.000000
max      202.000000
Name: target, dtype: float64

In [7]:
print("Look at example representative entries of the dataset")
display(test_data.head(2))
print("")
print("Is the dataset complete or are some values missing?")
display(test_data.count())
print("")
print("How balanced is the dataset?")
display(test_data.label.value_counts())
print("")
print("How long are the context sentences?")
display(test_data.context.apply(len).describe())
print("Notice there a high  standard diviation (std)\n")
print("How long are the target sentences?")
display(test_data.target.apply(len).describe())

Look at example representative entries of the dataset


Unnamed: 0,idx,label,context,target
0,0,2,Someone on Tumblr actually complied a list and...,Can I get a link to that?
1,1,1,"She has no trouble using Feminism to help her,...",Rand Paul thinks so too. Maybe you should vote...



Is the dataset complete or are some values missing?


idx        713
label      713
context    713
target     713
dtype: int64


How balanced is the dataset?


1    361
0    184
2    168
Name: label, dtype: int64


How long are the context sentences?


count     713.000000
mean      240.396914
std       205.402724
min        13.000000
25%        93.000000
50%       173.000000
75%       323.000000
max      1066.000000
Name: context, dtype: float64

Notice there a high  standard diviation (std)

How long are the target sentences?


count    713.000000
mean      54.382889
std       21.071323
min       13.000000
25%       37.000000
50%       53.000000
75%       70.000000
max      147.000000
Name: target, dtype: float64

In [8]:
# load pre-trained HateBert
config = BertConfig.from_pretrained("GroNLP/hateBERT", num_labels=3)
model = BertForSequenceClassification.from_pretrained("GroNLP/hateBERT", config=config)
tokenizer = BertTokenizer.from_pretrained("GroNLP/hateBERT")

Some weights of the model checkpoint at GroNLP/hateBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/hateBERT and are newly

In [9]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [36]:
#lock the parameter on the pre-trained Hate Bert model
for para in model.parameters():
  pre_last_layer = model.get_parameter("classifier.bias")
  last_layer = model.get_parameter("classifier.weight")
  if(torch.equal(para,pre_last_layer)):
    print("second last layer was not forzen.")
  elif(torch.equal(para,last_layer)):
    print("last layer was not forzen.")
  else:
    para.requires_grad = False

for name, module in model.named_modules():
    if 'classifier' in name:
        print('found')

torch.nn.Sequential(model, torch.nn.Softmax(3))


last layer was not forzen.
second last layer was not forzen.
found


Sequential(
  (0): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=Tr

In [11]:
#Train Data

X = []
for context, target in zip(train_data.context, train_data.target):
  X.append(tokenizer.cls_token +context + tokenizer.sep_token + target)
y_train = list(train_data.label)
X_train_tokenized = tokenizer(X, padding=True, truncation=True, max_length=250)


#val Data
X = []
for context, target in zip(val_data.context, val_data.target):
   X.append(tokenizer.cls_token +context + tokenizer.sep_token + target)
y_val = list(val_data.label)
X_val_tokenized = tokenizer(X, padding=True, truncation=True, max_length=250)

#test Data
X = []
for context, target in zip(test_data.context, test_data.target):
   X.append(tokenizer.cls_token +context + tokenizer.sep_token + target)
y_test = list(test_data.label)
X_test_tokenized = tokenizer(X, padding=True, truncation=True, max_length=250)


In [12]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_val_tokenized, y_val)
test_dataset = Dataset(X_test_tokenized, y_test)


In [13]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}



In [37]:


X_train_tokenized.keys()


dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [38]:
# Define Trainer
args = TrainingArguments(
    output_dir='./results',           # output directory
    num_train_epochs=16,               # total number of training epochs
    per_device_train_batch_size=32,   # batch size per device during training
    per_device_eval_batch_size=32,    # batch size for evaluation
    learning_rate = 5e-4,
    logging_steps = 50
)            
                  
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)



In [39]:
trainer.train()
trainer.save_model('/content/drive/MyDrive/BSP6/model6')



Step,Training Loss
50,0.9979
100,0.9995
150,0.9813


In [17]:
eval=trainer.evaluate(val_dataset)

  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
eval


{'eval_loss': 1.0171899795532227,
 'eval_accuracy': 0.4992987377279102,
 'eval_precision': 0.24929922949668448,
 'eval_recall': 0.4992987377279102,
 'eval_f1': 0.3325544445858485,
 'eval_runtime': 11.0144,
 'eval_samples_per_second': 64.733,
 'eval_steps_per_second': 4.086,
 'epoch': 16.0}