In [1]:
from datasets import load_dataset
from model import NERClassifier
from preprocess_dataset import NERDataset 
from trainner import Trainner
from transformers import BertTokenizerFast
from preprocess_dataset import remove_empty_entries

import numpy as np
import torch

## Checkpoints to be used

In [2]:
pt_base_cased    = 'neuralmind/bert-base-portuguese-cased'
pt_large_cased   = 'neuralmind/bert-large-portuguese-cased'
en_base_uncased = 'bert-base-uncased'
en_large_uncased = 'bert-large-uncased'
en_base_cased = 'bert-base-cased'
en_large_cased = 'bert-large-cased'

pt = [pt_base_cased, pt_large_cased]
en = [en_base_cased, en_large_cased, en_base_uncased, en_large_uncased]
cased = [pt_base_cased, en_base_cased, pt_large_cased, en_large_cased]
uncased = [en_base_uncased, en_large_uncased] 

In [3]:
models = [pt, en]

In [4]:
flat_models = [check for checkpoints in models for check in checkpoints]

In [5]:
portuguese_flat = list(filter(lambda x: x.find('/') != -1, flat_models))

In [6]:
en_flat = list(filter(lambda x: x.find('/') == -1, flat_models))

In [7]:
en_flat

['bert-base-cased',
 'bert-large-cased',
 'bert-base-uncased',
 'bert-large-uncased']

In [8]:
portuguese_flat

['neuralmind/bert-base-portuguese-cased',
 'neuralmind/bert-large-portuguese-cased']

In [9]:
flat_models

['neuralmind/bert-base-portuguese-cased',
 'neuralmind/bert-large-portuguese-cased',
 'bert-base-cased',
 'bert-large-cased',
 'bert-base-uncased',
 'bert-large-uncased']

In [10]:
len(flat_models)

6

## Dataset

In [11]:
data = "lener_br"
dataset = load_dataset(data)
dataset = remove_empty_entries(dataset)

Reusing dataset lener_br (/home/caiotulio/.cache/huggingface/datasets/lener_br/lener_br/1.0.0/4a8c97e6813b5c2d85a50faf0a3e6c24ea82f4a9044e6e9e8b24997d27399382)
Loading cached processed dataset at /home/caiotulio/.cache/huggingface/datasets/lener_br/lener_br/1.0.0/4a8c97e6813b5c2d85a50faf0a3e6c24ea82f4a9044e6e9e8b24997d27399382/cache-5e59bc59f25f3d7f.arrow
Loading cached processed dataset at /home/caiotulio/.cache/huggingface/datasets/lener_br/lener_br/1.0.0/4a8c97e6813b5c2d85a50faf0a3e6c24ea82f4a9044e6e9e8b24997d27399382/cache-8d0457760cd67ee6.arrow
Loading cached processed dataset at /home/caiotulio/.cache/huggingface/datasets/lener_br/lener_br/1.0.0/4a8c97e6813b5c2d85a50faf0a3e6c24ea82f4a9044e6e9e8b24997d27399382/cache-74e841c1c151996a.arrow


## Hyperparameters

In [12]:
MAX_LEN = 128
LEARNING_RATE=3e-4
n_labels = 13
BATCH_SIZE=8
shuffle=True
NUM_EPOCHS=1
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Training different checkpoints

In [13]:
import pandas as pd
from evaluator import Evaluator
from torch.utils.data import DataLoader
from transformers import AdamW

In [14]:
def compare(checkpoints):

    data = {"f1_t":[], "f1_e":[], "loss_t": [], "loss_e": []}

    for idx, checkpoint in enumerate(checkpoints):
        print(f"Progresso: {idx+1}/{len(checkpoints)}")
        print(f"------Iniciando treino para o checkpoint {checkpoint}---------")

        tokenizer = BertTokenizerFast.from_pretrained(checkpoint)
        print("Tokenizer carregado!")
        pytorch_dataset_train = NERDataset(data=dataset['train'], max_len=MAX_LEN, tokenizer=tokenizer)
        loader = DataLoader(pytorch_dataset_train, batch_size=BATCH_SIZE, shuffle=shuffle)
        print("Dataloader carregado!")

        model = NERClassifier(n_labels=n_labels, checkpoint=checkpoint)
        optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, correct_bias=False)
        evaluator = Evaluator(loader, model, device)
        tr = Trainner(device,\
                      loader,\
                      model,\
                      optimizer,\
                      max_len=MAX_LEN,\
                      num_examples=len(pytorch_dataset_train),\
                      num_epochs=NUM_EPOCHS,\
                      evaluator=evaluator)
        print("Trainner carregado!")
        loss_t, loss_e, f1_e, f1_t = tr.train()
        print(f"Treino finalizado para o checkpoint {checkpoint}\n" + \
              f"loss_t:{loss_t}, loss_e:{loss_e}, f1_e:{f1_e}, f1_t:{f1_t}")
        data["f1_t"].append(f1_t[0]) # We return the f1 score for all epochs. Since we're using 
        data["f1_e"].append(f1_e[0]) # num_epochs=1, we'll just take the first item.
        data["loss_t"].append(loss_t[0])
        data["loss_e"].append(loss_e[0])
        del model
        torch.cuda.empty_cache()
    return data

In [15]:
%%time
df_en_uncased = compare(en_flat[2:])

Progresso: 1/2
------Iniciando treino para o checkpoint bert-base-uncased---------
Tokenizer carregado!
Dataloader carregado!


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

Trainner carregado!
Começando treino! Essa função retorna a media de f1 e loss em cada epoch de treino e avaliação
----------Começando treino da epoch nº 1
Treinando em cuda
Iteração 0 -------- Loss: 2.344785690307617 f1 nas ultimas 100 iterações: 0.004580497266795979 ------ Progresso: 0.00%.
Iteração 100 -------- Loss: 0.8272997140884399 f1 nas ultimas 100 iterações: 0.7546770773516339 ------ Progresso: 10.21%.
Iteração 200 -------- Loss: 1.482980489730835 f1 nas ultimas 100 iterações: 0.8146044723696698 ------ Progresso: 20.43%.
Iteração 300 -------- Loss: 0.2990460991859436 f1 nas ultimas 100 iterações: 0.815000685096214 ------ Progresso: 30.64%.
Iteração 400 -------- Loss: 0.27978137135505676 f1 nas ultimas 100 iterações: 0.8009900016420815 ------ Progresso: 40.86%.
Iteração 500 -------- Loss: 0.9474799633026123 f1 nas ultimas 100 iterações: 0.8213372307469158 ------ Progresso: 51.07%.
Iteração 600 -------- Loss: 0.6051630973815918 f1 nas ultimas 100 iterações: 0.815022591184831 --

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large

Trainner carregado!
Começando treino! Essa função retorna a media de f1 e loss em cada epoch de treino e avaliação
----------Começando treino da epoch nº 1
Treinando em cuda
Iteração 0 -------- Loss: 3.0868489742279053 f1 nas ultimas 100 iterações: 9.977827050997783e-05 ------ Progresso: 0.00%.
Iteração 100 -------- Loss: 0.6433040499687195 f1 nas ultimas 100 iterações: 0.7753721256091479 ------ Progresso: 10.21%.
Iteração 200 -------- Loss: 0.4900451898574829 f1 nas ultimas 100 iterações: 0.7995239944619591 ------ Progresso: 20.43%.
Iteração 300 -------- Loss: 0.46434643864631653 f1 nas ultimas 100 iterações: 0.829660489728704 ------ Progresso: 30.64%.
Iteração 400 -------- Loss: 0.6035807132720947 f1 nas ultimas 100 iterações: 0.8028153807876661 ------ Progresso: 40.86%.
Iteração 500 -------- Loss: 0.22309009730815887 f1 nas ultimas 100 iterações: 0.8141738999863479 ------ Progresso: 51.07%.
Iteração 600 -------- Loss: 0.44416260719299316 f1 nas ultimas 100 iterações: 0.8084743096221

In [16]:
df_en_uncased = pd.DataFrame(df_en_uncased)

In [18]:
df_en_uncased['checkpoint'] = en_flat[2:]
df_en_uncased = df_en_uncased.set_index('checkpoint')

In [19]:
df_en_uncased

Unnamed: 0_level_0,f1_t,f1_e,loss_t,loss_e
checkpoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bert-base-uncased,0.807418,0.812724,0.699376,0.662042
bert-large-uncased,0.807043,0.812558,0.708094,0.664394


In [49]:
df_en_uncased.to_csv('checkpoint_en_uncased.csv')

In [50]:
df_pt = pd.read_csv('checkpoint_pt.csv')

In [51]:
df_en_cased = pd.read_csv('checkpoint_en_cased.csv')

In [32]:
df_final = pd.concat([df_pt, df_en_cased, df_en_uncased])

In [37]:
df_final

Unnamed: 0,checkpoint,f1_t,f1_e,loss_t,loss_e
0,neuralmind/bert-base-portuguese-cased,0.814131,0.815739,0.666314,0.661454
1,neuralmind/bert-large-portuguese-cased,0.812499,0.815978,0.69123,0.659751
0,bert-base-cased,0.811261,0.81364,0.680731,0.668316
1,bert-large-cased,0.811336,0.811681,0.691754,0.660262
bert-base-uncased,,0.807418,0.812724,0.699376,0.662042
bert-large-uncased,,0.807043,0.812558,0.708094,0.664394


In [39]:
df_final.reset_index(drop=True)

Unnamed: 0,checkpoint,f1_t,f1_e,loss_t,loss_e
0,neuralmind/bert-base-portuguese-cased,0.814131,0.815739,0.666314,0.661454
1,neuralmind/bert-large-portuguese-cased,0.812499,0.815978,0.69123,0.659751
2,bert-base-cased,0.811261,0.81364,0.680731,0.668316
3,bert-large-cased,0.811336,0.811681,0.691754,0.660262
4,,0.807418,0.812724,0.699376,0.662042
5,,0.807043,0.812558,0.708094,0.664394


In [42]:
df_final["checkpoint"] = flat_models

In [45]:
df_final=df_final.set_index("checkpoint")

In [46]:
df_final

Unnamed: 0_level_0,f1_t,f1_e,loss_t,loss_e
checkpoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
neuralmind/bert-base-portuguese-cased,0.814131,0.815739,0.666314,0.661454
neuralmind/bert-large-portuguese-cased,0.812499,0.815978,0.69123,0.659751
bert-base-cased,0.811261,0.81364,0.680731,0.668316
bert-large-cased,0.811336,0.811681,0.691754,0.660262
bert-base-uncased,0.807418,0.812724,0.699376,0.662042
bert-large-uncased,0.807043,0.812558,0.708094,0.664394


In [48]:
df_final.to_csv("all_checkpoints.csv")