# BERT

- bert-base-uncased (model_1)
- bert-large-whole-word-masking (model_2)

### Import libraries

In [11]:
import pandas as pd
import numpy as np
import matplotlib as plt

from sklearn.preprocessing import LabelEncoder

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from transformers import BertTokenizer, BertForSequenceClassification
#from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW

from tqdm.notebook import tqdm
from sklearn.metrics import confusion_matrix, classification_report

In [12]:
MAX_LEN = 150

# Dizionario per bert-base-uncased
model_1_info = {
    "NAME": "bert-base-uncased",
    "BATCH_SIZE": 64,
    "EPOCHS": 3,
    "LR": 2e-5
}

# Dizionario per bert-large-whole-word-masking
model_2_info = {
    "NAME": "bert-large-whole-word-masking",
    "BATCH_SIZE": 32,
    "EPOCHS": 4,
    "LR": 1e-5
}

In [13]:
print(torch.cuda.is_available())
print(torch.backends.mps.is_available())
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = "cpu"
print(device)

True
False
cpu


In [14]:
#pip install iProgress
#pip install ipywidgets
#!pip install ipywidgets --upgrade
#!pip install ipywidgets tqdm --upgrade

### Import Dataset

In [15]:
train_df = pd.read_csv('../../data/train_tweets_Transformers.csv')
eval_df = pd.read_csv('../../data/eval_tweets_Transformers.csv')
test_df = pd.read_csv('../../data/test_tweets_Transformers.csv')

### Labels encoding

In [16]:
'''possible_labels = train_df.cyberbullying_type.unique()

le = LabelEncoder()
label_dict = le.fit_transform(possible_labels)

print(label_dict)
label_dict_inverse = le.inverse_transform(possible_labels)
label_dict_inverse'''

possible_labels = train_df.cyberbullying_type.unique()

label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index
label_dict

{'ethnicity': 0, 'age': 1, 'gender': 2, 'not_cyberbullying': 3, 'religion': 4}

Sostituiamo nel dataset

In [17]:
'''train_df['label'] = le.fit_transform(train_df['cyberbullying_type'])
eval_df['label'] = le.fit_transform(eval_df['cyberbullying_type'])
test_df['label'] = le.fit_transform(test_df['cyberbullying_type'])'''

train_df['label'] = train_df.cyberbullying_type.replace(label_dict)
eval_df['label'] = eval_df.cyberbullying_type.replace(label_dict)
test_df['label'] = test_df.cyberbullying_type.replace(label_dict)


train_dff = train_df
train_df = train_df[:1000]


  train_df['label'] = train_df.cyberbullying_type.replace(label_dict)
  eval_df['label'] = eval_df.cyberbullying_type.replace(label_dict)
  test_df['label'] = test_df.cyberbullying_type.replace(label_dict)


In [18]:
def encode_data(df, tokenizer):
    encoded_data = tokenizer.batch_encode_plus(
        df.tweet_text.values, 
        add_special_tokens = True,         # Add [CLS] and [SEP] special tokens
        return_attention_mask = True,      # it will return the attention mask according to the specific tokenizer defined by the max_length attribute
        max_length = MAX_LEN,
        padding = 'max_length', 
        truncation = True,
        return_tensors = 'pt'              # return pytorch, i tensori servono a rappresentare e manipolare dati multidimensionali in modo efficiente
    )
    return encoded_data

In [19]:
def get_dataloaders(tokenizer, batch_size):
    # Codifica i dati
    encoded_data_train = encode_data(train_df, tokenizer)
    encoded_data_val = encode_data(eval_df, tokenizer)

    input_ids_train = encoded_data_train['input_ids']
    attention_masks_train = encoded_data_train['attention_mask']
    labels_train = torch.tensor(train_df.label.values)

    input_ids_val = encoded_data_val['input_ids']
    attention_masks_val = encoded_data_val['attention_mask']
    labels_val = torch.tensor(eval_df.label.values) #convertire le etichette in tensori

    # TensorDataset consente di creare un dataset basato su tensori, 
    # utile soprattutto quando si lavora con dati che possono essere rappresentati come tensori

    #Combines the input IDs, attention masks, and labels for the training set into a TensorDataset.
    # This allows the data to be easily accessed and used by PyTorch's DataLoader.
    dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
    dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

    dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

    dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)

    return dataloader_train, dataloader_validation

We will use f1 score and accuracy per class as performance metrics.

In [20]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

def calculate_accuracy(predictions, true_vals):
    preds_flat = np.argmax(predictions, axis=1).flatten()
    labels_flat = true_vals.flatten()
    return accuracy_score(labels_flat, preds_flat)

In [21]:
def evaluate(dataloader_val, model):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [22]:
def train_model(model, dataloader_train, dataloader_validation, 
                epochs, optimizer, scheduler):
    
    # Initialize history dictionary
    history = {
        'train_loss': [],
        'val_loss': [],
        'train_acc': [],
        'val_acc': []
    }

    for epoch in tqdm(range(1, epochs+1)):
        
        model.train()
        
        loss_train_total = 0

        progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
        correct_train, total_train = 0, 0
        
        for batch in progress_bar:

            model.zero_grad()
            
            batch = tuple(b.to(device) for b in batch)
            
            inputs = {'input_ids':    batch[0],
                    'attention_mask': batch[1],
                    'labels':         batch[2].long(),
                    }       

            outputs = model(**inputs)
            
            loss = outputs[0]
            loss_train_total += loss.item()
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()
            
            progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
            
            _, preds = torch.max(outputs[1], dim=1)
            correct_train += torch.sum(preds == inputs['labels'])
            total_train += len(inputs['labels'])
            
        tqdm.write(f'\nEpoch {epoch}')
        
        loss_train_avg = loss_train_total / len(dataloader_train)
        train_acc = correct_train.double() / total_train
        tqdm.write(f'Average Training loss: {loss_train_avg}')
        
        val_loss, predictions, true_vals = evaluate(dataloader_validation, model)
        val_f1 = f1_score_func(predictions, true_vals)
        val_accuracy = calculate_accuracy(predictions, true_vals)
            
        tqdm.write(f'Validation loss: {val_loss}')
        tqdm.write(f'F1 Score (Weighted): {val_f1}')
        tqdm.write(f'Validation Accuracy: {val_accuracy}')

        # Append metrics to history
        history['train_loss'].append(loss_train_avg)
        history['val_loss'].append(val_loss)
        history['train_acc'].append(train_acc)
        history['val_acc'].append(val_accuracy)

    model_name = model.config.name_or_path
    torch.save(model.state_dict(), f'../../data/Transformers/finetuned_{model_name}.model')
    
    return history

'''def show_plots(history):
    # Plot training & validation loss values
    plt.figure(figsize=(12, 4))

    plt.subplot(1, 2, 1)
    plt.plot(history['train_loss'], label='Training Loss')
    plt.plot(history['val_loss'], label='Validation Loss')
    plt.title('Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    # Plot training & validation accuracy values
    plt.subplot(1, 2, 2)
    plt.plot(history['train_acc'], label='Training Accuracy')
    plt.plot(history['val_acc'], label='Validation Accuracy')
    plt.title('Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()

    plt.show()
'''


"def show_plots(history):\n    # Plot training & validation loss values\n    plt.figure(figsize=(12, 4))\n\n    plt.subplot(1, 2, 1)\n    plt.plot(history['train_loss'], label='Training Loss')\n    plt.plot(history['val_loss'], label='Validation Loss')\n    plt.title('Loss')\n    plt.xlabel('Epochs')\n    plt.ylabel('Loss')\n    plt.legend()\n\n    # Plot training & validation accuracy values\n    plt.subplot(1, 2, 2)\n    plt.plot(history['train_acc'], label='Training Accuracy')\n    plt.plot(history['val_acc'], label='Validation Accuracy')\n    plt.title('Accuracy')\n    plt.xlabel('Epochs')\n    plt.ylabel('Accuracy')\n    plt.legend()\n\n    plt.show()\n"

In [23]:
def execute(model_info):
    name = model_info["NAME"]
    batch_size = model_info["BATCH_SIZE"]
    epochs = model_info["EPOCHS"]
    lr = model_info["LR"]

    # Inizializzazione del tokenizer BERT basato su WordPiece, 
    # instanziando una configurazione bert-base (12 layer) e uncased, dato che durante il preprocessing abbiamo eliminato le lettere maiuscole
    tokenizer = BertTokenizer.from_pretrained(name, do_lower_case=True)

    dataloader_train, dataloader_validation = get_dataloaders(tokenizer, batch_size)
    
    model = BertForSequenceClassification.from_pretrained(name,
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

    # gli ultimi due non sono necessari + settando a False riduciamo il peso computazionale

    optimizer = AdamW(model.parameters(), lr) #, eps=1e-8
                  
    scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)
    
    model.to(device)
    
    print("\n*** Avvio TRAINING ***")

    history = train_model(model, dataloader_train, dataloader_validation, epochs, optimizer, scheduler) #
    print("\n*** Fine TRAINING ***")
    print("\n -------- \n")

    _, predictions, true_vals = evaluate(dataloader_validation, model)
    accuracy_per_class(predictions, true_vals)

    preds_flat = np.argmax(predictions, axis=1).flatten()
    labels_flat = true_vals.flatten()

    # Confusion Matrix
    print(confusion_matrix(labels_flat, preds_flat))
    print("\n\n")

    # Classification Report
    print(classification_report(labels_flat, preds_flat))

    return model, tokenizer, history

In [24]:
model_1, tokenizer_1, history_1 = execute(model_1_info)
#model_2, tokenizer_2 = execute(model_2_info)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



*** Avvio TRAINING ***


  0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/16 [00:00<?, ?it/s]


Epoch 1
Average Training loss: 1.5312456786632538


In [None]:
import matplotlib.pyplot as plt
def show_plots(history):
    # Plot training & validation loss values
    plt.figure(figsize=(12, 4))

    plt.subplot(1, 2, 1)
    plt.plot(history['train_loss'], label='Training Loss')
    plt.plot(history['val_loss'], label='Validation Loss')
    plt.title('Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    # Plot training & validation accuracy values
    plt.subplot(1, 2, 2)
    plt.plot(history['train_acc'], label='Training Accuracy')
    plt.plot(history['val_acc'], label='Validation Accuracy')
    plt.title('Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()

    plt.show()

In [None]:
show_plots(history_1)

NameError: name 'history_1' is not defined

In [None]:
history = {
        'train_loss': [0.3620856624607205, 0.1482452382688141, 0.10468212381267665],
        'val_loss': [0.22305859933898906, 0.2112742636089373, 0.21650002296600077],
        'train_acc': [],
        'val_acc': [0.9263241106719368, 0.9302766798418972, 0.9282213438735177]
    }
show_plots(history)

In [None]:
model_2, tokenizer_2, history_2 = execute(model_2_info)
show_plots(history_2)

## Evaluate test set

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(model, dataloader_test):
    model.eval()
    predictions, true_labels = []

    for batch in dataloader_test:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        
        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits
        logits = torch.argmax(logits, dim=1).flatten().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        
        predictions.extend(logits)
        true_labels.extend(label_ids)

    accuracy = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions, average='weighted')
    recall = recall_score(true_labels, predictions, average='weighted')
    f1 = f1_score(true_labels, predictions, average='weighted')

    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1-Score: {f1}')

# Codifica i dati di test e crea un dataloader
tokenizer = BertTokenizer.from_pretrained(model_1_info["NAME"])
encoded_data_test = encode_data(test_df, tokenizer)
input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']
labels_test = torch.tensor(test_df.label.values)

dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)
dataloader_test = DataLoader(dataset_test, sampler=SequentialSampler(dataset_test), batch_size=model_1_info["BATCH_SIZE"])

# Esegui la valutazione del modello
#model = BertForSequenceClassification.from_pretrained(model_1_info["NAME"], num_labels=len(label_dict))
#model.to(device)
evaluate_model(model_1, dataloader_test)


## Predictions

In [None]:
from colorama import Fore, Style

def predict(text, model, tokenizer, device):
    model.eval()
    encoded_dict = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoded_dict['input_ids'].to(device)
    attention_mask = encoded_dict['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    prediction = torch.argmax(logits, dim=1).cpu().numpy()[0]

    prediction_type = next((chiave for chiave, valore in label_dict.items() 
                           if valore == prediction), None)
    return prediction_type
    #return invert_label(prediction)  

#TODO: aggiungere normalizzazione al tweet in ingresso   
def print_category(sentence):
    predicted_category = predict(sentence, model_1, tokenizer_1, device)
    print(f"Text: {sentence} \nPredicted Cyberbullying Category: ", end="")
    print(Fore.BLUE + Style.BRIGHT + f"{predicted_category}"+ Style.RESET_ALL)
    print("----------------")
    return
    
print_category("Example of a new tweet that could be cyberbullying.")
print_category("fuck you black")
print_category("i will rape you")
print_category("muslim idiot")
print_category("muslim idiot")
print_category("hello how are you")
print_category("Can anyone else said to this nigger that the dress is blue?")
print_category("I'm really happy for your birthday")
print_category("In my opinion Allah is not a real god")
print_category("I fucking hate Allah")
print_category("I appreciate Allah")
print_category("Men are better than women")
print_category("Bro, you are a Nigga!!!")
print_category("You are a shit!!!")
print_category("dickhead!")