# Grid Search for BERT Models

# Importing libraries

In [35]:
import pandas as pd
import numpy as np
import torch

from sklearn.preprocessing import LabelEncoder
from tqdm.notebook import tqdm

from transformers import BertTokenizer, BertForSequenceClassification
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import ElectraTokenizer, ElectraForSequenceClassification

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import accuracy_score

In [36]:
# Token budget used when padding / truncating each tweet during tokenization.
MAX_LEN = 150

# Prefer the first CUDA GPU when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Data Preprocessing

In [37]:
# Read the three pre-split tweet sets (train / validation / test), in that order.
train_df, eval_df, test_df = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/train_tweets_Transformers.csv',
        '../../data/eval_tweets_Transformers.csv',
        '../../data/test_tweets_Transformers.csv',
    )
]

### Labels encoding

In [38]:
# Distinct class names, in order of first appearance in the training set.
possible_labels = train_df.cyberbullying_type.unique()

# Fit the encoder ONCE, on the training labels, and reuse that single mapping
# for every split. Re-fitting on each DataFrame (fit_transform per split) only
# yields the same class -> integer mapping when all splits contain exactly the
# same set of labels; otherwise the integer ids silently mean different classes
# in train, eval and test.
le = LabelEncoder()
le.fit(train_df['cyberbullying_type'])

# NOTE: despite its name this is an integer array (the encoded id of each entry
# of `possible_labels`), not a dict. Kept as-is for backward compatibility.
label_dict = le.transform(possible_labels)

# `transform` raises ValueError if eval/test contain a class that never occurs
# in the training data, which is preferable to a silently shifted encoding.
train_df['label'] = le.transform(train_df['cyberbullying_type'])
eval_df['label'] = le.transform(eval_df['cyberbullying_type'])
test_df['label'] = le.transform(test_df['cyberbullying_type'])

### Data Preparation

In [39]:
def encode_data(df, checkpoint):
    """Tokenize the ``tweet_text`` column of ``df`` for the given checkpoint.

    The tokenizer class is picked from a substring of ``checkpoint``
    ("roberta", "bert" or "electra") and loaded with ``from_pretrained``.

    Args:
        df: DataFrame exposing a ``tweet_text`` column.
        checkpoint: Name or path of the pretrained checkpoint.

    Returns:
        The ``batch_encode_plus`` output, requested as PyTorch tensors with
        attention masks, every sequence padded / truncated to ``MAX_LEN``.

    Raises:
        ValueError: If ``checkpoint`` matches none of the supported families.
    """
    # Guard clause: reject unknown model families before loading anything.
    if not any(family in checkpoint for family in ("roberta", "bert", "electra")):
        raise ValueError("Unsupported model type")

    # "roberta" contains "bert", so the RoBERTa test has to come first.
    if "roberta" in checkpoint:
        tokenizer_cls = RobertaTokenizer
    elif "bert" in checkpoint:
        tokenizer_cls = BertTokenizer
    else:
        tokenizer_cls = ElectraTokenizer

    tokenizer = tokenizer_cls.from_pretrained(checkpoint)

    return tokenizer.batch_encode_plus(
        df.tweet_text.values,
        add_special_tokens=True,       # add the tokenizer's special tokens
        return_attention_mask=True,    # also return the attention mask
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_tensors='pt',           # return PyTorch tensors
    )
    

In [40]:
def get_dataloaders(checkpoint, batch_size=64):
    """Build the training and validation DataLoaders for ``checkpoint``.

    ``train_df`` and ``eval_df`` are tokenized with ``encode_data`` and each
    is wrapped in a ``TensorDataset`` of (input ids, attention mask, label).

    Args:
        checkpoint: Name or path of the pretrained checkpoint; selects the tokenizer.
        batch_size: Batch size used by both loaders.

    Returns:
        ``(dataloader_train, dataloader_validation)``. The training loader
        samples randomly, the validation loader iterates sequentially.
    """
    def _as_tensor_dataset(frame):
        # Pack token ids, attention masks and the integer labels (converted
        # to a tensor) so that a DataLoader can index them together.
        encoded = encode_data(frame, checkpoint)
        return TensorDataset(
            encoded['input_ids'],
            encoded['attention_mask'],
            torch.tensor(frame.label.values),
        )

    train_set = _as_tensor_dataset(train_df)
    val_set = _as_tensor_dataset(eval_df)

    train_loader = DataLoader(
        train_set,
        sampler=RandomSampler(train_set),
        batch_size=batch_size,
    )
    val_loader = DataLoader(
        val_set,
        sampler=SequentialSampler(val_set),
        batch_size=batch_size,
    )
    return train_loader, val_loader

# Grid search preparation

In [41]:
def train_model(model, train_dataloader, val_dataloader, epochs, optimizer, device):
    """Train ``model`` for ``epochs`` epochs and return its validation accuracy.

    Args:
        model: Callable module accepting ``input_ids``, ``token_type_ids``,
            ``attention_mask`` and (for training) ``labels``; its output must
            expose ``.loss`` (when labels are given) and ``.logits``.
        train_dataloader: Iterable of ``(input_ids, attention_mask, labels)`` batches.
        val_dataloader: Iterable of batches with the same layout.
        epochs: Number of passes over ``train_dataloader``.
        optimizer: Optimizer bound to ``model``'s parameters.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Fraction of validation samples classified correctly, as a float in
        [0, 1]. Returns 0.0 when the validation set is empty.
    """
    model.train()
    for _ in range(epochs):
        for input_ids, attention_masks, labels in train_dataloader:
            input_ids = input_ids.to(device)
            attention_masks = attention_masks.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, token_type_ids=None,
                            attention_mask=attention_masks, labels=labels)
            outputs.loss.backward()
            optimizer.step()

    model.eval()
    # Accumulate counts over the whole validation set. Averaging per-batch
    # accuracies over-weights a smaller final batch and so does not equal the
    # true accuracy.
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for input_ids, attention_masks, labels in val_dataloader:
            input_ids = input_ids.to(device)
            attention_masks = attention_masks.to(device)
            labels = labels.to(device)

            # Labels are not passed here: only the logits are needed, so the
            # loss computation is skipped.
            outputs = model(input_ids, token_type_ids=None,
                            attention_mask=attention_masks)
            predictions = torch.argmax(outputs.logits, dim=-1)
            n_correct += (predictions == labels).sum().item()
            n_seen += labels.numel()

    if n_seen == 0:
        # Empty validation set: avoid a ZeroDivisionError.
        return 0.0
    return n_correct / n_seen

In [42]:
# Hyper-parameter values explored by the grid search.
param_grid = dict(
    learning_rate=[1e-5, 2e-5, 3e-5],
    batch_size=[16, 32, 64],
    num_train_epochs=list(range(2, 11)),
)

def _rebatch(dataloader, batch_size, shuffle):
    """Return a DataLoader over ``dataloader.dataset`` with the given batch size."""
    dataset = getattr(dataloader, 'dataset', None)
    if dataset is None:
        # Not a DataLoader-like object: nothing to rebuild from, use it as given.
        return dataloader
    sampler = RandomSampler(dataset) if shuffle else SequentialSampler(dataset)
    return DataLoader(dataset, sampler=sampler, batch_size=batch_size)


def grid_search(param_grid, model, train_dataloader, val_dataloader, device):
    """Exhaustively evaluate every hyper-parameter combination in ``param_grid``.

    Every combination starts from the same initial weights (the state of
    ``model`` when this function is called) and is trained with loaders
    rebuilt for the batch size under test, so the scores are comparable.

    Args:
        param_grid: Dict with the keys ``'learning_rate'``, ``'batch_size'``
            and ``'num_train_epochs'``, each mapping to a list of values.
        model: Model to fine-tune; it is modified in place and, on return,
            holds the weights of the last combination that was trained.
        train_dataloader: Training DataLoader; only its ``dataset`` is reused,
            the batch size comes from the grid.
        val_dataloader: Validation DataLoader, handled the same way.
        device: Device forwarded to ``train_model``.

    Returns:
        ``(best_params, best_score)``: the best combination as a dict and its
        validation score. ``best_params`` is ``None`` only for an empty grid.
    """
    best_params = None
    best_score = 0

    # Snapshot the starting weights on the CPU. Without restoring them, each
    # combination would continue training the model left by the previous one.
    initial_state = {
        name: tensor.detach().cpu().clone()
        for name, tensor in model.state_dict().items()
    }

    total_combinations = (
        len(param_grid['learning_rate'])
        * len(param_grid['batch_size'])
        * len(param_grid['num_train_epochs'])
    )
    with tqdm(total=total_combinations, desc="Grid Search Progress") as pbar:
        for lr in param_grid['learning_rate']:
            for bs in param_grid['batch_size']:
                # Apply the batch size under test; previously it was only
                # recorded in `best_params` and never used.
                train_loader = _rebatch(train_dataloader, bs, shuffle=True)
                val_loader = _rebatch(val_dataloader, bs, shuffle=False)
                for epochs in param_grid['num_train_epochs']:
                    model.load_state_dict(initial_state)
                    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
                    score = train_model(model, train_loader, val_loader, epochs, optimizer, device)
                    # Accept the first result unconditionally so a score of 0
                    # still yields a parameter set.
                    if best_params is None or score > best_score:
                        best_score = score
                        best_params = {'learning_rate': lr, 'batch_size': bs, 'num_train_epochs': epochs}
                    pbar.update(1)
    return best_params, best_score

In [43]:
def execute_grid(checkpoint, num_labels=None):
    """Run the hyper-parameter grid search for one pretrained checkpoint.

    Args:
        checkpoint: Name or path of the pretrained checkpoint. The model
            family is chosen from a substring of it ("roberta", "bert" or
            "electra").
        num_labels: Size of the classification head. When ``None`` (default)
            it is taken from the number of distinct values in
            ``train_df['label']`` instead of being hard-coded.

    Raises:
        ValueError: If ``checkpoint`` matches none of the supported families.

    The best parameters and score are printed; nothing is returned.
    """
    # Resolve the model class first so an unsupported checkpoint fails before
    # the data is tokenized. "roberta" contains "bert", so it is tested first.
    if "roberta" in checkpoint:
        model_cls = RobertaForSequenceClassification
    elif "bert" in checkpoint:
        model_cls = BertForSequenceClassification
    elif "electra" in checkpoint:
        model_cls = ElectraForSequenceClassification
    else:
        raise ValueError("Unsupported model type")

    if num_labels is None:
        # One output unit per class present in the encoded training labels.
        num_labels = int(train_df['label'].nunique())

    train_dataloader, val_dataloader = get_dataloaders(checkpoint)

    # Model initialization
    model = model_cls.from_pretrained(checkpoint, num_labels=num_labels)
    model.to(device)

    print(f"\nGrid Search {checkpoint}...")
    best_params, best_score = grid_search(param_grid, model, train_dataloader, val_dataloader, device)

    print(f'I migliori parametri trovati sono: {best_params} con un punteggio di: {best_score}\n')

# Run the grid search

In [44]:
# Run the grid search for one checkpoint. The commented-out calls below are
# the alternative checkpoints; uncomment one to run it.
execute_grid("bert-base-uncased")
#execute_grid("bert-large-uncased-whole-word-masking")
#execute_grid("roberta-base")
#execute_grid("google/electra-base-discriminator")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Grid Search bert-base-uncased...


Grid Search Progress:   0%|          | 0/27 [00:00<?, ?it/s]

KeyboardInterrupt: 