# BERT

### Import Dataset

In [9]:
from sklearn.preprocessing import LabelEncoder

import pandas as pd
import numpy as np

import torch
from tqdm.notebook import tqdm
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
from transformers import BertForSequenceClassification

In [10]:
train_df = pd.read_csv('../../data/train_tweets_Transformers.csv')
eval_df = pd.read_csv('../../data/eval_tweets_Transformers.csv')
test_df = pd.read_csv('../../data/test_tweets_Transformers.csv')

### Labels encoding

In [16]:
possible_labels = train_df.cyberbullying_type.unique()

le = LabelEncoder()
label_dict = le.fit_transform(possible_labels)

Sostituiamo nel dataset

In [27]:
train_df['label'] = le.fit_transform(train_df['cyberbullying_type'])
eval_df['label'] = le.fit_transform(eval_df['cyberbullying_type'])
test_df['label'] = le.fit_transform(test_df['cyberbullying_type'])


In [28]:
train_df.head(3)

Unnamed: 0,tweet_text,cyberbullying_type,label
0,not true but when world colored by bigotry/rac...,ethnicity,1
1,u bully one white kid in ur school's christian...,age,0
2,its not a gay rape joke which i often complain...,gender,2


In [29]:
eval_df.head(3)

Unnamed: 0,tweet_text,cyberbullying_type,label
0,"hey, do you have a good way to consume multipl...",not_cyberbullying,3
1,argg need sleep xx fuckin school today and foc...,not_cyberbullying,3
2,they fired vic for a jellybean joke while maki...,gender,2


In [30]:
test_df.head(3)

Unnamed: 0,tweet_text,cyberbullying_type,label
0,2much blacks died for da right 2 vote go vote....,ethnicity,1
1,"mitt is gonna win , cant wait to shackle up so...",ethnicity,1
2,it's nikki's hot pot but she has katie make th...,not_cyberbullying,3


### BertTokenizer and Encoding the Data

**We have to perform _Tokenization_ --> take raw texts and split into tokens, which are numeric data to represent words**

In [31]:
# Inizializzazione del tokenizer BERT basato su WordPiece, 
# instanziando una configurazione bert-base (12 layer) e uncased, dato che durante il preprocessing abbiamo eliminato le lettere maiuscole
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [32]:
MAX_LEN = 150

In [36]:
def encode_data(df):
    encoded_data = tokenizer.batch_encode_plus(
        df.tweet_text.values, 
        add_special_tokens = True,         # Add [CLS] and [SEP] special tokens
        return_attention_mask = True,      # it will return the attention mask according to the specific tokenizer defined by the max_length attribute
        max_length = MAX_LEN,
        padding = 'max_length', 
        truncation = True,
        return_tensors = 'pt'              # return pytorch, i tensori servono a rappresentare e manipolare dati multidimensionali in modo efficiente
    )
    return encoded_data

In [37]:
# Codifica i dati
encoded_data_train = encode_data(train_df)
encoded_data_val = encode_data(eval_df)

Split the data into input_ids, attention_masks and labels.

In [15]:
'''input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type == ENCODE_DATA_TYPES[0]].label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type == ENCODE_DATA_TYPES[1]].label.values)'''

In [46]:
encoded_data_train.set_format("torch")

AttributeError: 

Finally, after we get encoded data set, we can create training data and validation data.

In [17]:
# TensorDataset consente di creare un dataset basato su tensori, 
# utile soprattutto quando si lavora con dati che possono essere rappresentati come tensori
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [18]:
len(dataset_train), len(dataset_val)

(33303, 5878)

### BERT Pre-trained Model

_bert-base-uncased_ is a smaller pre-trained model

In [19]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

# gli ultimi due non sono necessari + settando a False riduciamo il peso computazionale

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


- DataLoader combines a dataset and a sampler, and provides an iterable over the given dataset.
- We use RandomSampler for training and SequentialSampler for validation.
- Given the limited memory in my environment, I set batch_size=3

In [20]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 64

dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)

##### Optimizer & Scheduler

- To construct an optimizer, we have to give it an iterable containing the parameters to optimize. Then, we can specify optimizer-specific options such as the learning rate, epsilon, etc
- Search for epochs=X which works well for this dataset
- Create a schedule with a learning rate that decreases linearly from the initial learning rate set in the optimizer to 0, after a warmup period during which it increases linearly from 0 to the initial learning rate set in the optimizer

In [21]:
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(),
                  lr=2e-5,   #learning rate
                  eps=1e-8)
                  
epochs = 3 # CONTROLLARE VALORE (5)

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)



**nota**: epsilon è un piccolo valore aggiunto al denominatore per evitare la divisione per zero o instabilità numerica durante il training

### Performance 

We will use f1 score and accuracy per class as performance metrics.

In [22]:
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [23]:
from sklearn.metrics import accuracy_score

def calculate_accuracy(predictions, true_vals):
    preds_flat = np.argmax(predictions, axis=1).flatten()
    labels_flat = true_vals.flatten()
    return accuracy_score(labels_flat, preds_flat)

### Training

In [24]:
import random

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [25]:
print(torch.cuda.is_available())
print(torch.backends.mps.is_available())

True
False


In [26]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

print(device)

cuda:0


In [27]:
def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [28]:
for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Average Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    val_accuracy = calculate_accuracy(predictions, true_vals)  # Calcolo dell'accuratezza
        
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')
    tqdm.write(f'Validation Accuracy: {val_accuracy}')

torch.save(model.state_dict(), f'../../data/BERT/finetuned_BERT.model')


  0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/521 [00:00<?, ?it/s]


Epoch 1
Average Training loss: 0.32646875974370054
Validation loss: 0.1982333490361824
F1 Score (Weighted): 0.933089944182388
Validation Accuracy: 0.93297039809459


Epoch 2:   0%|          | 0/521 [00:00<?, ?it/s]


Epoch 2
Average Training loss: 0.1465318143839685
Validation loss: 0.17283179407873514
F1 Score (Weighted): 0.9438388515838645
Validation Accuracy: 0.9436883293637292


Epoch 3:   0%|          | 0/521 [00:00<?, ?it/s]


Epoch 3
Average Training loss: 0.10174632099887888
Validation loss: 0.18258128317614572
F1 Score (Weighted): 0.9426537458830463
Validation Accuracy: 0.9424974481116026


In [29]:

model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

model.to(device)

model.load_state_dict(torch.load('../../data/BERT/finetuned_BERT.model', map_location=device))

_, predictions, true_vals = evaluate(dataloader_validation)
accuracy_per_class(predictions, true_vals)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Class: not_cyberbullying
Accuracy: 1046/1183

Class: gender
Accuracy: 1043/1160

Class: religion
Accuracy: 1144/1185

Class: age
Accuracy: 1175/1199

Class: ethnicity
Accuracy: 1132/1151



In [33]:
# Confusion Matrix and Classification Report with checks

preds_flat = np.argmax(predictions, axis=1).flatten()
labels_flat = true_vals.flatten()

# Confusion Matrix
print(confusion_matrix(labels_flat, preds_flat))

# Classification Report
print(classification_report(labels_flat, preds_flat))

[[1046   69   45   12   11]
 [ 110 1043    5    1    1]
 [  33    4 1144    1    3]
 [  20    4    0 1175    0]
 [  10    4    4    1 1132]]
              precision    recall  f1-score   support

           0       0.86      0.88      0.87      1183
           1       0.93      0.90      0.91      1160
           2       0.95      0.97      0.96      1185
           3       0.99      0.98      0.98      1199
           4       0.99      0.98      0.99      1151

    accuracy                           0.94      5878
   macro avg       0.94      0.94      0.94      5878
weighted avg       0.94      0.94      0.94      5878

