# Grid Search for BERT Model

In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import LabelEncoder
from tqdm.notebook import tqdm
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import accuracy_score

import sys
sys.path.append('..')

import import_ipynb
from data_preparation import Preprocessing

# Caricamento e preparazione dei dati
df = pd.read_csv('../../data/updated_tweets.csv')

def normalize_tweet_BERT(tweet):
    tweet = Preprocessing.remove_links_mentions(tweet)
    tweet = tweet.lower()
    tweet = Preprocessing.remove_hashtag(tweet)
    tweet = Preprocessing.remove_special_characters(tweet)
 
    tweet = Preprocessing.remove_spaces(tweet)
    tweet = Preprocessing.remove_textual_emojis(tweet)
    tweet = Preprocessing.remove_not_ASCII(tweet)

    return tweet

df['tweet_text'] = df['tweet_text'].apply(normalize_tweet_BERT)
df = Preprocessing.clean_normalized_df(df)


possible_labels = df.cyberbullying_type.unique()

label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index
print(label_dict)
df['label'] = df.cyberbullying_type.replace(label_dict)


def preprocess_data(df):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    input_ids = []
    attention_masks = []

    for tweet in df.tweet_text:
        encoded_dict = tokenizer.encode_plus(
                            tweet,
                            add_special_tokens = True,
                            max_length = 64,
                            padding = 'max_length', 
                            return_attention_mask = True,
                            truncation = True,
                            return_tensors = 'pt',
                       )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(df.label.values)
    
    return input_ids, attention_masks, labels

input_ids, attention_masks, labels = preprocess_data(df)

batch_size = 32
dataset = TensorDataset(input_ids, attention_masks, labels)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

def train_model(model, train_dataloader, val_dataloader, epochs, optimizer, device):
    model.train()
    for epoch in range(epochs):
        for batch in train_dataloader:
            optimizer.zero_grad()
            input_ids, attention_masks, labels = batch
            input_ids = input_ids.to(device)
            attention_masks = attention_masks.to(device)
            labels = labels.to(device)
            outputs = model(input_ids, token_type_ids=None, attention_mask=attention_masks, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
    
    model.eval()
    val_accuracy = 0
    for batch in val_dataloader:
        input_ids, attention_masks, labels = batch
        input_ids = input_ids.to(device)
        attention_masks = attention_masks.to(device)
        labels = labels.to(device)
        with torch.no_grad():
            outputs = model(input_ids, token_type_ids=None, attention_mask=attention_masks, labels=labels)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        val_accuracy += accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
    return val_accuracy / len(val_dataloader)

param_grid = {
    'learning_rate': [1e-5, 2e-5, 3e-5],
    'batch_size': [16, 32, 64],
    'num_train_epochs': [2, 3, 4]
}

'''def grid_search(param_grid, model, train_dataloader, val_dataloader, device):
    best_params = None
    best_score = 0
    for lr in param_grid['learning_rate']:
        for bs in param_grid['batch_size']:
            for epochs in param_grid['num_train_epochs']:
                optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
                score = train_model(model, train_dataloader, val_dataloader, epochs, optimizer, device)
                if score > best_score:
                    best_score = score
                    best_params = {'learning_rate': lr, 'batch_size': bs, 'num_train_epochs': epochs}
    return best_params, best_score
'''
def grid_search(param_grid, model, train_dataloader, val_dataloader, device):
    best_params = None
    best_score = 0
    total_combinations = len(param_grid['learning_rate']) * len(param_grid['batch_size']) * len(param_grid['num_train_epochs'])
    with tqdm(total=total_combinations, desc="Grid Search Progress") as pbar:
        for lr in param_grid['learning_rate']:
            for bs in param_grid['batch_size']:
                for epochs in param_grid['num_train_epochs']:
                    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
                    score = train_model(model, train_dataloader, val_dataloader, epochs, optimizer, device)
                    if score > best_score:
                        best_score = score
                        best_params = {'learning_rate': lr, 'batch_size': bs, 'num_train_epochs': epochs}
                    pbar.update(1)
    return best_params, best_score


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
print("\nGrid Search...")
best_params, best_score = grid_search(param_grid, model, train_dataloader, val_dataloader, device)

print(f'I migliori parametri trovati sono: {best_params} con un punteggio di: {best_score}')


importing Jupyter notebook from /home/g.russo55/Progetto/src/classifiers/../data_preparation/Preprocessing.ipynb
{'not_cyberbullying': 0, 'gender': 1, 'religion': 2, 'age': 3, 'ethnicity': 4}


  df['label'] = df.cyberbullying_type.replace(label_dict)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Grid Search...


Grid Search Progress:   0%|          | 0/27 [00:00<?, ?it/s]

I migliori parametri trovati sono: {'learning_rate': 2e-05, 'batch_size': 64, 'num_train_epochs': 3} con un punteggio di: 0.9443613652357495
