# Experiments - Fine tune Bert

The goal of this notebook is to fine-tune a BERT model for sentiment classification using the corpus from [Building a Sentiment Corpus of Tweets in Brazilian Portuguese](https://arxiv.org/abs/1712.08917).

## Libraries and Settings

Third-party libraries

In [1]:
# General
import os
import gc
import sys
import time
import shutil
import funcy as fp
import numpy as np
import pandas as pd

# Visualization / Presentation
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from IPython.core.display import HTML, display

# Model Training and Evaluation
import mlflow
import torch
import torch.nn as nn
import torchvision
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.cuda.amp import GradScaler, autocast
from sklearn import metrics

Internal libraries

In [2]:
sys.path.append(os.path.abspath(os.path.pardir))

from src import settings
from src.pipeline.resources import load_corpus
from src.models.transformer import preprocess, initialize_model, set_seed, get_device, evaluate, predict
from src.utils import format_nested_parameters

Presentation settings

In [3]:
%matplotlib inline 
# Let text columns show up to 150 characters when frames are displayed.
pd.set_option('display.max_colwidth', 150)

Experiment settings

In [4]:
# Run name and experiment id passed to `mlflow.start_run` in the training cell.
EXPERIMENT_RUN_NAME = 'TrackingParameters'
EXPERIMENT_ID = 3

## Load and Prepare Dataset

In [5]:
# Load the corpus through the project helper. The following cells read the
# 'sentiment', 'text' and 'group' columns of the returned frame.
frame = load_corpus()

In [6]:
# Encode the string sentiment codes as class indices: '-1' -> 0, '0' -> 1, '1' -> 2.
# A sentiment value outside this mapping ends up as NaN in `label`.
sentiment_to_label = {'-1': 0, '0': 1, '1': 2}
frame = frame.assign(label=frame['sentiment'].map(sentiment_to_label))
# Keep only the columns used by the rest of the notebook.
frame = frame[['text', 'label', 'group']]
frame.sample(6)

Unnamed: 0,text,label,group
12193,as irmãs galvão arrasando no salto elas são tão simpáticas #Encontro,2.0,train
13325,merda ! esqueci que sandy estaria no . perdi muita coisa #Encontro,0.0,train
13578,todo bebê de NUMBER ano e pouco eu lembro do meu sobrinho 😍,1.0,train
11369,fiquei tão nervosa e chorei tanto com o USERNAME no,0.0,train
3153,adorei a roupa da sophie abrão ❤ ️ ❤ ️,2.0,test
2833,eu odeio esss leo do,0.0,train


In [7]:
# Split the corpus using its predefined 'group' column.
train_frame = frame.loc[lambda f: f['group'] == 'train']
test_frame = frame.loc[lambda f: f['group'] == 'test']
del frame

# Fail fast on sentiments that were not covered by the label mapping: they
# show up as NaN, and a later cast to an integer tensor would silently turn
# them into meaningless class ids.
assert train_frame['label'].notna().all(), 'train split contains rows without a mapped label'
assert test_frame['label'].notna().all(), 'test split contains rows without a mapped label'

X_test = test_frame.text.values
# Labels are class indices; store them as integers rather than floats.
y_test = test_frame.label.values.astype('int64')

X_train = train_frame.text.values
y_train = train_frame.label.values.astype('int64')

print(f'Train: {len(X_train)} | Test: {len(X_test)}')

Train: 12990 | Test: 2010


## Check GPU Availability

In [8]:
# Report which hardware PyTorch can see before any training starts.
if not torch.cuda.is_available():
    print('Using the CPU.')
else:
    gpu_count = torch.cuda.device_count()
    gpu_name = torch.cuda.get_device_name(0)
    print(f'GPU(s) available: {gpu_count}. Device name: {gpu_name}')

GPU(s) available: 1. Device name: GeForce RTX 3090


## Fine Tuning Model

### Define Parameters and Settings

In [9]:
from transformers import BertModel,  BertTokenizer

# Pretrained checkpoint plus the model / tokenizer classes used to load it.
MODEL_CLASS = BertModel
MODEL_TOKENIZER = BertTokenizer
#MODEL_NAME = 'neuralmind/bert-base-portuguese-cased'
MODEL_NAME = 'neuralmind/bert-large-portuguese-cased'

# Alternative checkpoint (RoBERTa). To try it, replace the settings above with:
#   from transformers import RobertaModel, RobertaTokenizer
#   MODEL_CLASS = RobertaModel
#   MODEL_TOKENIZER = RobertaTokenizer
#   MODEL_NAME = 'rdenadai/BR_BERTo'

# NOTE(review): not referenced by any other cell visible in this notebook.
model_registry = {
    'BERT-BASE': {}
}

# Forwarded to `initialize_model`; presumably they describe extra classifier
# layers and their dropout (empty = none) -- confirm against `initialize_model`.
MODEL_LAYERS = []
MODEL_DROPOUT_LAYERS = []

# Training hyper-parameters, forwarded to `initialize_model`, the DataLoaders,
# `train` and `set_seed` in later cells.
FREEZE = False
LEARNING_RATE = 3e-5
BATCH_SIZE = 32
EPOCHS = 2
SEED = 42

# Keyword arguments forwarded to `preprocess`.
preprocessing_params = {
    'unify_html_tags': False,
    'unify_urls': True,
    'trim_repeating_spaces': True,
    'unify_hashtags': False,
    'unify_mentions': True,
    'unify_numbers': False,
    'trim_repeating_letters': True,
    'lower_case': True
}

# The tokenizer follows the same lower-casing choice as the preprocessing.
# NOTE(review): MODEL_NAME refers to a "cased" checkpoint while lower-casing
# is enabled here -- confirm this combination is intended.
tokenizer = MODEL_TOKENIZER.from_pretrained(MODEL_NAME, do_lower_case=preprocessing_params['lower_case'])
preprocessing_params['tokenizer'] = tokenizer

Encode all sentences to get the maximum length.

In [10]:
# Length of the longest tokenized tweet (special tokens included) across the
# train and test splits; stored as the `max_len` preprocessing parameter.
all_tweets = np.concatenate([train_frame.text.values, test_frame.text.values])
max_len = max(len(tokenizer.encode(tweet, add_special_tokens=True)) for tweet in all_tweets)
preprocessing_params['max_len'] = max_len

Preprocess and tokenize data.

In [11]:
# Sanity check: show the first training tweet next to its token ids.
sample_tweet = X_train[0]
sample_inputs = preprocess([sample_tweet], **preprocessing_params)[0]
token_ids = list(sample_inputs.squeeze().numpy())
print('Original: ', sample_tweet)
print('Token IDs: ', token_ids)

# Apply `preprocess` to the train and test tweets; each call yields the
# input ids and the attention masks for that split.
train_inputs, train_masks = preprocess(X_train, **preprocessing_params)
test_inputs, test_masks = preprocess(X_test, **preprocessing_params)

Original:  apareceu o índice de morte na minha cidade tô muito assustado #BelemPedePaz
Token IDs:  [101, 4169, 146, 2884, 22279, 125, 1386, 229, 7122, 651, 374, 785, 17154, 487, 108, 4826, 21813, 11237, 321, 22305, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


### Create PyTorch DataLoaders for Train and Test Datasets

In [12]:
def _build_loader(inputs, masks, labels, sampler_cls):
    """Bundle the tensors into a dataset and wrap it in a batched DataLoader.

    Returns the dataset, the sampler and the loader so that all three remain
    available as notebook-level names.
    """
    dataset = TensorDataset(inputs, masks, labels)
    sampler = sampler_cls(dataset)
    loader = DataLoader(dataset, sampler=sampler, batch_size=BATCH_SIZE)
    return dataset, sampler, loader


train_labels = torch.tensor(y_train, dtype=torch.int64)
test_labels = torch.tensor(y_test, dtype=torch.int64)

# Training batches are drawn in random order; test batches keep the original order.
train_data, train_sampler, train_dataloader = _build_loader(train_inputs, train_masks, train_labels, RandomSampler)
test_data, test_sampler, test_dataloader = _build_loader(test_inputs, test_masks, test_labels, SequentialSampler)

## Train

Define the training function.

In [13]:
def train(model, loss_fn, optimizer, scheduler, train_dataloader, val_dataloader=None, epochs=4, evaluation=False,
          steps_to_eval=10, checkpoint_path='../artifacts/models/best-model-parameters.pt'):
    """Fine-tune `model` and return it together with the logged training history.

    Each batch of `train_dataloader` is unpacked as (input ids, attention mask,
    labels) and moved to `get_device()`. The model is called as
    `model(input_ids, attention_mask)` and its output is passed to `loss_fn`
    with the labels. Gradients are clipped to a norm of 1.0, and both the
    optimizer and the scheduler are stepped once per batch.

    When `evaluation` is enabled, `evaluate(model, val_dataloader, loss_fn)` is
    run after every epoch. Whenever the returned F1 improves, the model state
    dict is written to `checkpoint_path`; after the last epoch the best
    checkpoint is loaded back into the model. If no checkpoint was written
    (evaluation disabled, or no epoch ran), the model is returned in its final
    state instead of loading whatever file happens to be at `checkpoint_path`.

    Args:
        model: module to train.
        loss_fn: callable computing the loss from (logits, labels).
        optimizer: optimizer stepped after every batch.
        scheduler: learning-rate scheduler stepped after every batch.
        train_dataloader: iterable of training batches; must support `len()`.
        val_dataloader: batches handed to `evaluate`; required when
            `evaluation` is enabled.
        epochs: number of passes over `train_dataloader`.
        evaluation: whether to evaluate after each epoch and keep the best checkpoint.
        steps_to_eval: the running training loss is logged every this many steps.
        checkpoint_path: file used to store and reload the best state dict.

    Returns:
        (model, train_history), where `train_history` is a list of dicts with
        the keys 'epoch' (0-based), 'step', 'time_elapsed' and 'batch_loss'.

    Raises:
        ValueError: if `evaluation` is enabled without a `val_dataloader`.
    """
    if evaluation and val_dataloader is None:
        raise ValueError('`val_dataloader` is required when `evaluation` is enabled.')

    # Both stay None until a checkpoint has actually been written.
    best_eval = None
    best_epoch = None
    train_history = []

    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val F1':^9} | {'Elapsed':^9}")
        print("-" * 70)

        t0_epoch, t0_batch = time.time(), time.time()
        total_loss, batch_loss, batch_counts = 0, 0, 0
        model.train()

        for step, batch in enumerate(train_dataloader):
            batch_counts += 1
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(get_device()) for t in batch)

            model.zero_grad()

            # Mixed precision is explicitly switched off for the forward pass.
            with autocast(enabled=False):
                logits = model(b_input_ids, b_attn_mask)
                loss = loss_fn(logits, b_labels)

            # Read the scalar once and reuse it for both accumulators.
            loss_value = loss.item()
            batch_loss += loss_value
            total_loss += loss_value

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            # Log the running loss every `steps_to_eval` steps and on the last step of the epoch.
            if (step % steps_to_eval == 0 and step != 0) or (step == len(train_dataloader) - 1):
                time_elapsed = time.time() - t0_batch
                train_history.append({'epoch': epoch_i, 'step': step, 'time_elapsed': time_elapsed, 'batch_loss': batch_loss / batch_counts})

                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        avg_train_loss = total_loss / len(train_dataloader)

        print("-" * 70)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation:
            val_loss, val_f1 = evaluate(model, val_dataloader, loss_fn)
            time_elapsed = time.time() - t0_epoch
            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_f1:^9.2f} | {time_elapsed:^9.2f}")
            print("-" * 70)
            # The first evaluated epoch always produces a checkpoint, so the
            # reload below never depends on a file left over from another run.
            if best_eval is None or val_f1 > best_eval:
                best_eval = val_f1
                # 1-based, matching the epoch numbers printed in the table above.
                best_epoch = epoch_i + 1
                torch.save(model.state_dict(), checkpoint_path)

        print("\n")

    if best_eval is None:
        print("Training complete. No checkpoint was written; returning the model in its final state.")
    else:
        print(f"Training complete. Best result: {best_eval} | epoch {best_epoch}.")
        model.load_state_dict(torch.load(checkpoint_path))
    return model, train_history

Training process.

In [14]:
# Seed first so that model initialization and training are repeatable.
set_seed(SEED)
loss_fn = nn.CrossEntropyLoss()
transformer_classifier, optimizer, scheduler = initialize_model(MODEL_CLASS, MODEL_NAME,
                                                                MODEL_LAYERS, MODEL_DROPOUT_LAYERS,
                                                                len(train_dataloader), epochs=EPOCHS,
                                                                freeze=FREEZE, learning_rate=LEARNING_RATE)

# Everything that describes this run; split below into values logged as MLflow
# params and values written to text files.
execution_params = {
    'model_class':MODEL_CLASS,
    'model_tokenizer': MODEL_TOKENIZER,
    'model_name': MODEL_NAME,
    'model_layers': MODEL_LAYERS,
    'model_dropout_layers': MODEL_DROPOUT_LAYERS,
    'freeze': FREEZE,
    'learning_rate': LEARNING_RATE,
    'batch_size': BATCH_SIZE,
    'epochs': EPOCHS,
    'model_definition': transformer_classifier.classifier,
    'scheduler': scheduler,
    'optimizer': optimizer,
    'seed': SEED,
    'device': get_device()
}


with mlflow.start_run(run_name=EXPERIMENT_RUN_NAME, experiment_id=EXPERIMENT_ID) as main_run:

    start_time = time.time()
    transformer_classifier, train_history = train(transformer_classifier, loss_fn, optimizer, scheduler, train_dataloader,
                                                  test_dataloader, epochs=EPOCHS, evaluation=True)
    training_time = time.time() - start_time

    probs = predict(transformer_classifier, test_dataloader)
    y_pred = probs.argmax(axis=1)

    # Per-class F1 (ordered by class index 0, 1, 2) and the micro-averaged F1.
    eval_metric = metrics.f1_score(y_test, y_pred, average=None)
    global_eval_metric = metrics.f1_score(y_test, y_pred, average='micro')

    # Start from an empty artifacts folder. It may not exist yet on a first
    # run, so only remove it when it is there.
    if os.path.isdir(settings.LOGS_ARTIFACTS_PATH):
        shutil.rmtree(settings.LOGS_ARTIFACTS_PATH)
    os.makedirs(settings.LOGS_ARTIFACTS_PATH, exist_ok=True)

    mlflow.pytorch.log_model(transformer_classifier, "model")
    mlflow.log_metric('training_time', training_time)

    # `format_nested_parameters` returns two collections: the first is logged
    # as MLflow params, the second is written to one text file per entry below.
    simple_preprocessing_params, complex_preprocessing_params = format_nested_parameters(preprocessing_params, 'preprocessing')
    mlflow.log_params(simple_preprocessing_params)

    simple_execution_params, complex_execution_params = format_nested_parameters(execution_params, 'execution')
    mlflow.log_params(simple_execution_params)

    mlflow.log_param('X_training', X_train.shape)
    mlflow.log_param('X_test', X_test.shape)

    evaluation_summary_frame = (pd.DataFrame([eval_metric], columns=['F1-Neg', 'F1-Neu', 'F1-Pos'])
                                .assign(F1=global_eval_metric)
                                [['F1-Pos', 'F1-Neu', 'F1-Neg', 'F1']]
                               )
    for metric in ['F1-Neg', 'F1-Neu', 'F1-Pos', 'F1']:
        mlflow.log_metric(metric, evaluation_summary_frame.loc[0, metric])

    evaluation_summary_frame.to_csv(f'{settings.LOGS_ARTIFACTS_PATH}/experiment_runs_summary.csv')
    evaluation_summary_frame.to_html(f'{settings.LOGS_ARTIFACTS_PATH}/experiment_runs_summary.html')

    train_history_frame = pd.DataFrame(train_history)
    train_history_frame.to_csv(f'{settings.LOGS_ARTIFACTS_PATH}/experiment_train_history.csv')
    train_history_frame.to_html(f'{settings.LOGS_ARTIFACTS_PATH}/experiment_train_history.html')

    for param_name, param_value in {**complex_preprocessing_params, **complex_execution_params}.items():
        with open(f'{settings.LOGS_ARTIFACTS_PATH}/{param_name}.txt', 'w') as file:
            file.write(param_value)

    mlflow.log_artifact(settings.LOGS_ARTIFACTS_PATH)

    # Drop every reference to the model and the data so the memory can be
    # reclaimed. Note that the cells defining these names must be re-run
    # before this cell can be executed again.
    del train_inputs, train_masks, train_data, train_labels, train_sampler, train_dataloader
    del test_inputs, test_masks, test_data, test_labels, test_sampler, test_dataloader
    del optimizer, scheduler, loss_fn
    del transformer_classifier, tokenizer, token_ids, probs
    del execution_params, preprocessing_params
    del MODEL_CLASS, MODEL_TOKENIZER

    # Collect garbage first so the freed tensors can be released from the CUDA
    # cache; the CUDA calls are skipped on CPU-only hosts.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val F1   |  Elapsed 
----------------------------------------------------------------------
   1    |   10    |   1.082325   |     -      |     -     |   2.36   
   1    |   20    |   0.953956   |     -      |     -     |   2.13   
   1    |   30    |   0.882444   |     -      |     -     |   2.17   
   1    |   40    |   0.746143   |     -      |     -     |   2.25   
   1    |   50    |   0.817672   |     -      |     -     |   2.16   
   1    |   60    |   0.804885   |     -      |     -     |   2.16   
   1    |   70    |   0.797636   |     -      |     -     |   2.15   
   1    |   80    |   0.865540   |     -      |     -     |   2.15   
   1    |   90    |   0.745033   |     -      |     -     |   2.15   
   1    |   100   |   0.739163   |     -      |     -     |   2.16   
   1    |   110   |   0.707805   |     -      |     -     |   2.16   
   1    |   120   |   0.644881   |     -      |     -     |   2.20   
   1    |   130   |

## Experiments Results

In [15]:
# Show a heading followed by the F1 summary table computed on the test split.
display(HTML('<h3>Test</h3>'), evaluation_summary_frame)

Unnamed: 0,F1-Pos,F1-Neu,F1-Neg,F1
0,0.854673,0.658052,0.7773,0.783085
