### Bibliotecas necessárias

- A biblioteca **simpletransformers** é baseada na famosa biblioteca do Hugging Face, conhecida como **transformers**. Como no nome dela já diz, trata-se de uma ferramenta mais simples de usar, onde seu uso é apropriado em tarefas bem definidas, como é o caso deste trabalho. O objetivo é treinar um model para reconhecimento de entidades nomeadas para o Português com base em dados jurídicos.

In [1]:
!pip install transformers -qq

[K     |████████████████████████████████| 2.3MB 8.6MB/s 
[K     |████████████████████████████████| 3.3MB 37.6MB/s 
[K     |████████████████████████████████| 901kB 35.4MB/s 
[?25h

In [2]:
!pip install simpletransformers -qq

[K     |████████████████████████████████| 225kB 8.7MB/s 
[K     |████████████████████████████████| 51kB 6.1MB/s 
[K     |████████████████████████████████| 8.2MB 13.6MB/s 
[K     |████████████████████████████████| 235kB 40.1MB/s 
[K     |████████████████████████████████| 122kB 44.7MB/s 
[K     |████████████████████████████████| 1.2MB 32.4MB/s 
[K     |████████████████████████████████| 1.8MB 29.8MB/s 
[K     |████████████████████████████████| 81kB 7.4MB/s 
[K     |████████████████████████████████| 81kB 8.7MB/s 
[K     |████████████████████████████████| 174kB 45.7MB/s 
[K     |████████████████████████████████| 112kB 46.7MB/s 
[K     |████████████████████████████████| 4.2MB 22.0MB/s 
[K     |████████████████████████████████| 245kB 35.8MB/s 
[K     |████████████████████████████████| 112kB 44.1MB/s 
[K     |████████████████████████████████| 102kB 10.7MB/s 
[K     |████████████████████████████████| 133kB 43.6MB/s 
[K     |████████████████████████████████| 71kB 7.9MB/s 
[K   

In [3]:
import gc
import pandas as pd
import numpy as np
import random
import csv
import matplotlib.pyplot as plt
import re
import string
from collections import defaultdict
import torch
from torch import nn, optim
from torch.optim import Adam
from torch.autograd import Variable
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences #install
from simpletransformers.ner import NERModel, NERArgs
from transformers import BertTokenizer, BertConfig
from transformers import BertForTokenClassification
from transformers import AdamW, get_linear_schedule_with_warmup
import seaborn as sns
from sklearn.metrics import confusion_matrix
from joblib import dump, load
import time
import warnings
warnings.filterwarnings('ignore')

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Leitura e tratamento dos dados

Vamos baixar o conjunto de dados de treino e teste. Cada linha do arquivo contém (1) uma **palavra** e **tag** separadas por uma **tabulação** ou (2) uma linha em **branco** indicando o final de um documento. Vamos escrever uma função para ler isso. Vamos pegar o caminho do arquivo e retornar **token_docs** que é uma lista de listas de strings de token e **token_tags** que é uma lista de listas de strings de tag.

In [5]:
import pandas as pd
path_train = '/content/drive/My Drive/NLP/Competition2/Data/train.conll'
path_test  = '/content/drive/My Drive/NLP/Competition2/Data/test.conll'

In [6]:
train_data = pd.read_csv(path_train, names=['text','tag'], sep='\s+')
test_data = pd.read_csv(path_test, names=['text'], sep='\s+', engine='python')

def load_words(data_file, tipo_dado):
    #data_file = train_file #debug
    lista = []
    #reader = csv.reader(data_file, delimiter='\t')
    reader = open(data_file).readlines()
    k = 0
    for i, row in enumerate(reader):
        row = row.replace('\n','')
        row = row.split()

        if row != []:
            row = [k+1] + row
            lista.append(row)
            k += 1
        else:
            k += 1

        # Imprime 2500 exemplos de treino
        if tipo_dado:
            if i < 80:
                print(i, row)
        # Imprime 2500 exemplos de teste
        else:
            if i < 80:
                print(i, row)

    return lista

train_list = load_words(path_train, True)
#dev_words, dev_tags = load_words(dev_file)
test_list = load_words(path_test, False)

0 [1, 'E', 'O']
1 [2, 'M', 'O']
2 [3, 'E', 'O']
3 [4, 'N', 'O']
4 [5, 'T', 'O']
5 [6, 'A', 'O']
6 [7, 'Órgão', 'O']
7 [8, ':', 'O']
8 [9, '8ª', 'B-ORGANIZACAO']
9 [10, 'TURMA', 'I-ORGANIZACAO']
10 [11, 'CÍVEL', 'I-ORGANIZACAO']
11 [12, 'Classe', 'O']
12 [13, ':', 'O']
13 [14, 'APELAÇÃO', 'O']
14 [15, 'CÍVEL', 'O']
15 [16, 'N', 'O']
16 [17, '.', 'O']
17 [18, 'Processo', 'O']
18 [19, ':', 'O']
19 [20, '20150110436469APC', 'B-JURISPRUDENCIA']
20 [21, '(', 'O']
21 [22, '0012843-03.2015.8.07.0001', 'B-JURISPRUDENCIA']
22 [23, ')', 'O']
23 [24, 'Apelante', 'O']
24 [25, '(', 'O']
25 [26, 's', 'O']
26 [27, ')', 'O']
27 [28, ':', 'O']
28 [29, 'BRASILIA', 'B-ORGANIZACAO']
29 [30, 'CURSOS', 'I-ORGANIZACAO']
30 [31, 'E', 'I-ORGANIZACAO']
31 [32, 'CONCURSOS', 'I-ORGANIZACAO']
32 [33, 'LTDA', 'I-ORGANIZACAO']
33 [34, 'GRANCURSOS', 'I-ORGANIZACAO']
34 [35, 'ESCOLA', 'I-ORGANIZACAO']
35 [36, 'PARA', 'I-ORGANIZACAO']
36 [37, 'CONCURSOS', 'I-ORGANIZACAO']
37 [38, 'PUBLICOS', 'I-ORGANIZACAO']
38 [39, 'LT

In [7]:
train_df = pd.DataFrame(train_list, columns=["sentence_id", "words", "labels"])
train_df['words'] = train_df['words'].str.lower()
train_df.head(10)

Unnamed: 0,sentence_id,words,labels
0,1,e,O
1,2,m,O
2,3,e,O
3,4,n,O
4,5,t,O
5,6,a,O
6,7,órgão,O
7,8,:,O
8,9,8ª,B-ORGANIZACAO
9,10,turma,I-ORGANIZACAO


In [8]:
test_df = pd.DataFrame(test_list, columns=["sentence_id", "words"])
test_df['words'] = test_df['words'].str.lower()
test_df.head(10)

Unnamed: 0,sentence_id,words
0,1,ação
1,2,monitória
2,3,.
3,5,prescrição
4,6,da
5,7,pretensão
6,8,monitória
7,9,.
8,11,reconhecimento
9,12,.


In [9]:
#unique_tags = set(train_tags)
#unique_tags.discard('')
#unique_tags.add('X')
unique_tags = list(train_data.tag.unique())
tag_index = dict((tag, i) for i, tag in enumerate(unique_tags))
index_tag = dict((i, tag) for i, tag in enumerate(unique_tags))

print(unique_tags)
print()
print(tag_index)
print()
print(index_tag)

['O', 'B-ORGANIZACAO', 'I-ORGANIZACAO', 'B-JURISPRUDENCIA', 'B-PESSOA', 'I-PESSOA', 'I-JURISPRUDENCIA', 'B-LEGISLACAO', 'I-LEGISLACAO', 'B-TEMPO', 'B-LOCAL', 'I-LOCAL', 'I-TEMPO']

{'O': 0, 'B-ORGANIZACAO': 1, 'I-ORGANIZACAO': 2, 'B-JURISPRUDENCIA': 3, 'B-PESSOA': 4, 'I-PESSOA': 5, 'I-JURISPRUDENCIA': 6, 'B-LEGISLACAO': 7, 'I-LEGISLACAO': 8, 'B-TEMPO': 9, 'B-LOCAL': 10, 'I-LOCAL': 11, 'I-TEMPO': 12}

{0: 'O', 1: 'B-ORGANIZACAO', 2: 'I-ORGANIZACAO', 3: 'B-JURISPRUDENCIA', 4: 'B-PESSOA', 5: 'I-PESSOA', 6: 'I-JURISPRUDENCIA', 7: 'B-LEGISLACAO', 8: 'I-LEGISLACAO', 9: 'B-TEMPO', 10: 'B-LOCAL', 11: 'I-LOCAL', 12: 'I-TEMPO'}


In [13]:
print(f"Size train file: {data_train.shape[0]}")
print(f"Size test file: {test_df.shape[0]}")

Size train file: 230125
Size test file: 11239


In [14]:
from sklearn.model_selection import train_test_split

data_train, data_dev = train_test_split(train_df, test_size=.30, random_state=42, stratify=train_df.labels)

print(f"Train shape tokens: {data_train.shape}")
print(f"Dev shape: {data_dev.shape}")
print(f"Test shape: {test_df.shape}")

Train shape tokens: (214783, 3)
Dev shape: (92051, 3)
Test shape: (11239, 2)


In [15]:
data_train.groupby(['labels']).count()

Unnamed: 0_level_0,sentence_id,words
labels,Unnamed: 1_level_1,Unnamed: 2_level_1
B-JURISPRUDENCIA,965,965
B-LEGISLACAO,1751,1751
B-LOCAL,500,500
B-ORGANIZACAO,2344,2344
B-PESSOA,1368,1368
B-TEMPO,1168,1168
I-JURISPRUDENCIA,2472,2472
I-LEGISLACAO,10178,10178
I-LOCAL,702,702
I-ORGANIZACAO,4220,4220


In [16]:
data_dev.groupby(['labels']).count()

Unnamed: 0_level_0,sentence_id,words
labels,Unnamed: 1_level_1,Unnamed: 2_level_1
B-JURISPRUDENCIA,413,413
B-LEGISLACAO,750,750
B-LOCAL,214,214
B-ORGANIZACAO,1005,1005
B-PESSOA,587,587
B-TEMPO,500,500
I-JURISPRUDENCIA,1059,1059
I-LEGISLACAO,4362,4362
I-LOCAL,301,301
I-ORGANIZACAO,1809,1809


### Configuração do modelo para treinamento

In [17]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [18]:
import logging

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [19]:
bert_args = {
    "num_train_epochs": 2,
    "train_batch_size": 32,
    "eval_batch_size": 32,
    "learning_rate": 5e-5, 
    'max_seq_length': 50,
    'evaluate_during_training': False,
    'wandb_project': 'ner-competition2',
    'output_dir': '/content/drive/My Drive/NLP/Competition2/outputs/bestModel',
    'overwrite_output_dir': True
}

In [20]:
model = NERModel(
    "bert",
    "monilouise/ner_pt_br",
    args=bert_args,
    labels=unique_tags,
    use_cuda=True
)

INFO:filelock:Lock 139913447704528 acquired on /root/.cache/huggingface/transformers/e352afbb1d9ea4b80e97e657a74f30c51bcc9da455da8306fdcaaf0cabc6937e.08868e7914995036411ed3728b7e905ee94d69938262ad30ffe4e5dac473f090.lock


Downloading:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

INFO:filelock:Lock 139913447704528 released on /root/.cache/huggingface/transformers/e352afbb1d9ea4b80e97e657a74f30c51bcc9da455da8306fdcaaf0cabc6937e.08868e7914995036411ed3728b7e905ee94d69938262ad30ffe4e5dac473f090.lock
INFO:filelock:Lock 139913342129232 acquired on /root/.cache/huggingface/transformers/cd1568faa5c78984243139ac6d2643b4ad1f337895f9f6c046ed9bc7271f8d02.d38ca2e5cf614b94008f1c5e374f16ff65c9317c3b981e2cafe0390b585cf7b2.lock


Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

INFO:filelock:Lock 139913342129232 released on /root/.cache/huggingface/transformers/cd1568faa5c78984243139ac6d2643b4ad1f337895f9f6c046ed9bc7271f8d02.d38ca2e5cf614b94008f1c5e374f16ff65c9317c3b981e2cafe0390b585cf7b2.lock
INFO:filelock:Lock 139913192092432 acquired on /root/.cache/huggingface/transformers/8ab71a31d8c7aaa1a67b8f388ac2cae42a643a5aeb9c75021f9232d090d47418.af25fb1e29ad0175300146695fd80069be69b211c52fa5486fa8aae2754cc814.lock


Downloading:   0%|          | 0.00/210k [00:00<?, ?B/s]

INFO:filelock:Lock 139913192092432 released on /root/.cache/huggingface/transformers/8ab71a31d8c7aaa1a67b8f388ac2cae42a643a5aeb9c75021f9232d090d47418.af25fb1e29ad0175300146695fd80069be69b211c52fa5486fa8aae2754cc814.lock
INFO:filelock:Lock 139913192153040 acquired on /root/.cache/huggingface/transformers/fa7e3da05ce53aade05d92ce2c035ec3de9f65f97a868a67b21e2721858d1984.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d.lock


Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

INFO:filelock:Lock 139913192153040 released on /root/.cache/huggingface/transformers/fa7e3da05ce53aade05d92ce2c035ec3de9f65f97a868a67b21e2721858d1984.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d.lock
INFO:filelock:Lock 139913191734288 acquired on /root/.cache/huggingface/transformers/e1fccf979f3fddc8e448aae3be82676bfbfda4eaa29a984c0b39f4852eea399f.c8e627eef86d8864ea5006f2bddab4afa35fd3470a139cda6261c1b3cc594c5b.lock


Downloading:   0%|          | 0.00/529 [00:00<?, ?B/s]

INFO:filelock:Lock 139913191734288 released on /root/.cache/huggingface/transformers/e1fccf979f3fddc8e448aae3be82676bfbfda4eaa29a984c0b39f4852eea399f.c8e627eef86d8864ea5006f2bddab4afa35fd3470a139cda6261c1b3cc594c5b.lock


In [21]:
import gc

gc.collect()                # Coleta lixo.
torch.cuda.empty_cache()    # Limpa o chache.

model.train_model(data_train)

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/3 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Running Epoch 0 of 2:   0%|          | 0/6712 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/6712 [00:00<?, ?it/s]

INFO:simpletransformers.ner.ner_model: Training of bert model complete. Saved to /content/drive/My Drive/NLP/Competition2/outputs/bestModel.


(13424, 0.31738707793394216)

In [22]:
result, model_outputs, wrong_preds = model.eval_model(data_dev)

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/2877 [00:00<?, ?it/s]

VBox(children=(Label(value=' 0.03MB of 0.03MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Training loss,0.2747
lr,0.0
global_step,13400.0
_runtime,2756.0
_timestamp,1622993405.0
_step,267.0


0,1
Training loss,█▂▃▄▂▃▂▆▅▆▂▂▆▃▂▄▁▃▂▇▅▃▂▄▁▂▆▂▅▂▁▅▄▂▃▂▆▂▃▄
lr,▃▄███▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▁▁▁
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_runtime,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


INFO:simpletransformers.ner.ner_model:{'eval_loss': 0.25872209067898294, 'precision': 0.7687046195938125, 'recall': 0.5835130601485742, 'f1_score': 0.6634274816092998}


In [23]:
result

{'eval_loss': 0.25872209067898294,
 'f1_score': 0.6634274816092998,
 'precision': 0.7687046195938125,
 'recall': 0.5835130601485742}

In [24]:
predictions, raw_outputs = model.predict(list(test_df['words']))

INFO:simpletransformers.ner.ner_model: Converting to features started.


  0%|          | 0/3 [00:00<?, ?it/s]

Exception ignored in: <finalize object at 0x7f3f88836da0; dead>
Traceback (most recent call last):
  File "/usr/lib/python3.7/weakref.py", line 572, in __call__
    return info.func(*info.args, **(info.kwargs or {}))
  File "/usr/lib/python3.7/tempfile.py", line 936, in _cleanup
    _rmtree(name)
  File "/usr/lib/python3.7/shutil.py", line 485, in rmtree
    onerror(os.lstat, path, sys.exc_info())
  File "/usr/lib/python3.7/shutil.py", line 483, in rmtree
    orig_st = os.lstat(path)
FileNotFoundError: [Errno 2] No such file or directory: '/tmp/tmp42pzx5dk'


Running Prediction:   0%|          | 0/352 [00:00<?, ?it/s]

In [25]:
print(predictions[:5])
print(len(predictions))

[[{'ação': 'O'}], [{'monitória': 'O'}], [{'.': 'O'}], [{'prescrição': 'O'}], [{'da': 'O'}]]
11239


In [26]:
test_tags = []
for lista in predictions:
    value = list(lista[0].values())[0]
    test_tags.append(value)
test_tags[:10]

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

In [27]:
test_df['labels'] = test_tags
test_df.head(10)

Unnamed: 0,sentence_id,words,labels
0,1,ação,O
1,2,monitória,O
2,3,.,O
3,5,prescrição,O
4,6,da,O
5,7,pretensão,O
6,8,monitória,O
7,9,.,O
8,11,reconhecimento,O
9,12,.,O


In [28]:
test_df.shape

(11239, 3)

In [29]:
preds = pd.DataFrame(None, columns=['Id', 'Tag'])

preds['Id'] = test_df.sentence_id
preds['Tag'] = test_df.labels
preds

Unnamed: 0,Id,Tag
0,1,O
1,2,O
2,3,O
3,5,O
4,6,O
...,...,...
11234,14073,I-TEMPO
11235,14074,O
11236,14076,I-LEGISLACAO
11237,14077,I-JURISPRUDENCIA


In [30]:
preds.to_csv('/content/drive/My Drive/NLP/Competition2/Predictions/submissions.csv', index=False, sep=',')