# BERT

In [41]:
import sys
sys.path.append('..')

import import_ipynb
from data_preparation import Preprocessing

In [42]:
#from sklearn.preprocessing import LabelEncoder

import pandas as pd
import numpy as np

import torch
from tqdm.notebook import tqdm
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
from transformers import BertForSequenceClassification

In [43]:
df = pd.read_csv('../../data/updated_tweets.csv')

### Normalize

In [44]:
def normalize_tweet_BERT(tweet):
    tweet = Preprocessing.remove_links_mentions(tweet)
    tweet = tweet.lower()
    tweet = Preprocessing.remove_hashtag(tweet)
    tweet = Preprocessing.remove_special_characters(tweet)
 
    tweet = Preprocessing.remove_spaces(tweet)
    tweet = Preprocessing.remove_textual_emojis(tweet)
    tweet = Preprocessing.remove_not_ASCII(tweet)

    return tweet

In [45]:
df['tweet_text'] = df['tweet_text'].apply(normalize_tweet_BERT)
df = Preprocessing.clean_normalized_df(df)

In [46]:
df['cyberbullying_type'].value_counts()

cyberbullying_type
religion             7963
age                  7949
ethnicity            7893
not_cyberbullying    7711
gender               7665
Name: count, dtype: int64

In seguito a questo risultato, confermiamo che i dati sono bilanciati

### Labels encoding

In [47]:
possible_labels = df.cyberbullying_type.unique()

label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index
label_dict

{'not_cyberbullying': 0, 'gender': 1, 'religion': 2, 'age': 3, 'ethnicity': 4}

Sostituiamo nel dataset

In [48]:
df['label'] = df.cyberbullying_type.replace(label_dict)
df.head(3)

  df['label'] = df.cyberbullying_type.replace(label_dict)


Unnamed: 0,tweet_text,cyberbullying_type,label
0,"in other words katandandre, your food was crap...",not_cyberbullying,0
1,why is aussietv so white?,not_cyberbullying,0
2,a classy whore? or more red velvet cupcakes?,not_cyberbullying,0


### Dataset split

In [49]:
from sklearn.model_selection import train_test_split

# Divido il dataset in training e validation set mantenendo la distribuzione delle classi

X_train, X_val, y_train, y_val = train_test_split(df.index.values, 
                                                  df.label.values, 
                                                  test_size=0.15, 
                                                  random_state=42, 
                                                  stratify=None)

# Creare una colonna per segnare il tipo di dato
df['data_type'] = ['not_set']*df.shape[0]

# Assegna 'train' ai dati di addestramento e 'val' ai dati di validazione nel DataFrame
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

# Conta i valori per ogni combinazione di categoria, etichetta e tipo di dato
df.groupby(['cyberbullying_type', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,tweet_text
cyberbullying_type,label,data_type,Unnamed: 3_level_1
age,3,train,6750
age,3,val,1199
ethnicity,4,train,6742
ethnicity,4,val,1151
gender,1,train,6505
gender,1,val,1160
not_cyberbullying,0,train,6528
not_cyberbullying,0,val,1183
religion,2,train,6778
religion,2,val,1185


### BertTokenizer and Encoding the Data

**We have to perform _Tokenization_ --> take raw texts and split into tokens, which are numeric data to represent words**

In [50]:
model_name = 'bert-large-uncased-whole-word-masking'
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True)

In [51]:
ENCODE_DATA_TYPES = ['train', 'val']
MAX_LEN = 256             #CONTROLLARE il valore

In [52]:
'''encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type=='train'].tweet_text.values, 
    add_special_tokens=True,         # Add [CLS] and [SEP] special tokens
    return_attention_mask=True,      # it will return the attention mask according to the specific tokenizer defined by the max_length attribute
    padding='max_length', 
    max_length=256,             #CONTROLLARE il valore
    truncation=True,
    return_tensors='pt'              # return pytorch
)

encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type=='val'].tweet_text.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    padding='max_length', 
    max_length=256, 
    truncation=True,
    return_tensors='pt'
)
'''

"encoded_data_train = tokenizer.batch_encode_plus(\n    df[df.data_type=='train'].tweet_text.values, \n    add_special_tokens=True,         # Add [CLS] and [SEP] special tokens\n    return_attention_mask=True,      # it will return the attention mask according to the specific tokenizer defined by the max_length attribute\n    padding='max_length', \n    max_length=256,             #CONTROLLARE il valore\n    truncation=True,\n    return_tensors='pt'              # return pytorch\n)\n\nencoded_data_val = tokenizer.batch_encode_plus(\n    df[df.data_type=='val'].tweet_text.values, \n    add_special_tokens=True, \n    return_attention_mask=True, \n    padding='max_length', \n    max_length=256, \n    truncation=True,\n    return_tensors='pt'\n)\n"

In [53]:
def encode_data(data_type):
    encoded_data = tokenizer.batch_encode_plus(
        df[df.data_type == data_type].tweet_text.values, 
        add_special_tokens = True,         # Add [CLS] and [SEP] special tokens
        return_attention_mask = True,      # it will return the attention mask according to the specific tokenizer defined by the max_length attribute
        max_length = MAX_LEN,
        padding = 'max_length', 
        truncation = True,
        return_tensors = 'pt'              # return pytorch, i tensori servono a rappresentare e manipolare dati multidimensionali in modo efficiente
    )
    return encoded_data

In [54]:
# Codifica i dati
encoded_data_train = encode_data(ENCODE_DATA_TYPES[0])
encoded_data_val = encode_data(ENCODE_DATA_TYPES[1])

Split the data into input_ids, attention_masks and labels.

In [55]:
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type == ENCODE_DATA_TYPES[0]].label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type == ENCODE_DATA_TYPES[1]].label.values)

In [56]:
'''if encoded_data_train2.equal(encoded_data_train):
    print('OKAY')
else:
    print('NO')'''

"if encoded_data_train2.equal(encoded_data_train):\n    print('OKAY')\nelse:\n    print('NO')"

Finally, after we get encoded data set, we can create training data and validation data.

In [57]:
# TensorDataset consente di creare un dataset basato su tensori, 
# utile soprattutto quando si lavora con dati che possono essere rappresentati come tensori
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [58]:
len(dataset_train), len(dataset_val)

(33303, 5878)

### BERT Pre-trained Model

In [59]:
model = BertForSequenceClassification.from_pretrained(model_name,
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

# gli ultimi due non sono necessari + settando a False riduciamo il peso computazionale

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased-whole-word-masking and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


- DataLoader combines a dataset and a sampler, and provides an iterable over the given dataset.
- We use RandomSampler for training and SequentialSampler for validation.
- Given the limited memory in my environment, I set batch_size=3

In [60]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 32          # CONTROLLARE VALORE

dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)

##### Optimizer & Scheduler

- To construct an optimizer, we have to give it an iterable containing the parameters to optimize. Then, we can specify optimizer-specific options such as the learning rate, epsilon, etc
- Search for epochs=X which works well for this dataset
- Create a schedule with a learning rate that decreases linearly from the initial learning rate set in the optimizer to 0, after a warmup period during which it increases linearly from 0 to the initial learning rate set in the optimizer

In [61]:
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(),
                  lr=1e-5,   #learning rate
                  eps=1e-8)
                  
epochs = 2 # CONTROLLARE VALORE (5)

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)



**nota**: epsilon è un piccolo valore aggiunto al denominatore per evitare la divisione per zero o instabilità numerica durante il training

### Performance 

We will use f1 score and accuracy per class as performance metrics.

In [62]:
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [63]:
from sklearn.metrics import accuracy_score

def calculate_accuracy(predictions, true_vals):
    preds_flat = np.argmax(predictions, axis=1).flatten()
    labels_flat = true_vals.flatten()
    return accuracy_score(labels_flat, preds_flat)

### Training

In [64]:
import random

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [65]:
print(torch.cuda.is_available())
print(torch.backends.mps.is_available())

True
False


In [66]:
device = torch.device("cuda:1" if torch.backends.mps.is_available() else "cpu")
model.to(device)

print(device)

cpu


In [67]:
def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [68]:
'''pip install --upgrade tqdm ipywidgets
jupyter nbextension enable --py widgetsnbextension --sys-prefix
jupyter nbextension enable --py ipywidgets --sys-prefix
pip install --upgrade notebook
'''

'pip install --upgrade tqdm ipywidgets\njupyter nbextension enable --py widgetsnbextension --sys-prefix\njupyter nbextension enable --py ipywidgets --sys-prefix\npip install --upgrade notebook\n'

In [33]:
for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Average Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    val_accuracy = calculate_accuracy(predictions, true_vals)  # Calcolo dell'accuratezza
        
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')
    tqdm.write(f'Validation Accuracy: {val_accuracy}')

torch.save(model.state_dict(), f'../../data/BERT/finetuned_BERT_wholewordmasking.model')


  0%|          | 0/2 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/1041 [00:00<?, ?it/s]


Epoch 1
Average Training loss: 0.2552556678175024
Validation loss: 0.1925794691557277
F1 Score (Weighted): 0.9376462571378381
Validation Accuracy: 0.9372235454236135


Epoch 2:   0%|          | 0/1041 [00:00<?, ?it/s]


Epoch 2
Average Training loss: 0.13177899518842012
Validation loss: 0.19764518248896068
F1 Score (Weighted): 0.9389116922703608
Validation Accuracy: 0.938754678462062


In [69]:

model1 = BertForSequenceClassification.from_pretrained(model_name,
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

model1.to(device)

model1.load_state_dict(torch.load('../../data/BERT/finetuned_BERT_wholewordmasking.model', map_location=device))

_, predictions, true_vals = evaluate(dataloader_validation)
accuracy_per_class(predictions, true_vals)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased-whole-word-masking and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Class: not_cyberbullying
Accuracy: 26/1183

Class: gender
Accuracy: 13/1160

Class: religion
Accuracy: 738/1185

Class: age
Accuracy: 0/1199

Class: ethnicity
Accuracy: 529/1151



In [70]:
preds_flat = np.argmax(predictions, axis=1).flatten()
labels_flat = true_vals.flatten()

# Confusion Matrix
print(confusion_matrix(labels_flat, preds_flat))

# Classification Report
print(classification_report(labels_flat, preds_flat))

NameError: name 'confusion_matrix' is not defined

----

-------------