In [502]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, TreebankWordTokenizer, WhitespaceTokenizer
import numpy as np
import pandas as pd
import textwrap
from pprint import pprint
import os
import re
import data_processing
from datasets import load_dataset
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\arthu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [503]:
path_to_test_file = "acerpi_dataset/test/"
path_to_train_file = "acerpi_dataset/train/"

#### Open Unannotated Train Documents
We get the unnanotatted train texts and later we'll store each document as a key, value pair in a dictionary

In [504]:
train_documents = {}
for filename in os.listdir(path_to_train_file):
   if filename.endswith(".txt"):
      with open(os.path.join(path_to_train_file, filename), 'r', encoding="utf8") as f:
         key = int(filename.replace('.txt', ''))
         value = f.read()
         train_documents[key] = value

#### Open Annotated Sentences
We get the sentences from 'prep_data_labeling.ipynb' that we annotated manually with deccano.

In [505]:
annotated_sentences = pd.read_json(os.path.join(path_to_test_file, 'annotated_sentences.jsonl'), orient='record', lines=True)

In [506]:
print(annotated_sentences.shape)
annotated_sentences.head()

(84, 6)


Unnamed: 0,id,text,document,sentence_id,duplicates,label
0,115,Documento gerado sob autenticação Nº LKB.506.4...,105798,0,[0],"[[233, 238, ORGANIZATION]]"
1,116,1/1 PORTARIA Nº 1955 de 05/03/2020 O PRÓ-REITO...,105798,1,[1],"[[72, 113, ORGANIZATION], [448, 475, MISCELLAN..."
2,117,MAURÍCIO VIÉGAS DA SILVA Pró-Reitor de Gestão ...,105798,2,"[2, 5, 8, 11, 17, 19, 21, 23, 25, 41, 56]","[[0, 24, PERSON], [25, 56, MISCELLANEOUS]]"
3,118,Documento gerado sob autenticação Nº BOA.507.6...,105799,3,[3],"[[233, 238, ORGANIZATION]]"
4,119,1/1 PORTARIA Nº 1956 de 05/03/2020 O PRÓ-REITO...,105799,4,[4],"[[72, 113, ORGANIZATION], [229, 235, MISCELLAN..."


In [507]:
annotated_sentences.iloc[2]['text']

'MAURÍCIO VIÉGAS DA SILVA Pró-Reitor de Gestão de Pessoas'

Next we'll explode our dataframe so that duplicate sentences can become separate rows. We'll also drop a few columns that will have inconsistent information after the explode and will also not be useful for us.

In [508]:
pd.options.display.max_rows = 999
annotated_sentences = annotated_sentences.explode("duplicates")
annotated_sentences.drop(columns=['id', 'document', 'sentence_id'], inplace=True)
annotated_sentences.columns = ['text', 'sentence_id', 'label']
annotated_sentences = annotated_sentences.set_index(['sentence_id'], verify_integrity=True)
annotated_sentences.index.name = None
annotated_sentences = annotated_sentences.sort_index()

  return Index(sequences[0], name=names)


In [509]:
print(annotated_sentences.shape)
annotated_sentences.head(20)

(106, 2)


Unnamed: 0,text,label
0,Documento gerado sob autenticação Nº LKB.506.4...,"[[233, 238, ORGANIZATION]]"
1,1/1 PORTARIA Nº 1955 de 05/03/2020 O PRÓ-REITO...,"[[72, 113, ORGANIZATION], [448, 475, MISCELLAN..."
2,MAURÍCIO VIÉGAS DA SILVA Pró-Reitor de Gestão ...,"[[0, 24, PERSON], [25, 56, MISCELLANEOUS]]"
3,Documento gerado sob autenticação Nº BOA.507.6...,"[[233, 238, ORGANIZATION]]"
4,1/1 PORTARIA Nº 1956 de 05/03/2020 O PRÓ-REITO...,"[[72, 113, ORGANIZATION], [229, 235, MISCELLAN..."
5,MAURÍCIO VIÉGAS DA SILVA Pró-Reitor de Gestão ...,"[[0, 24, PERSON], [25, 56, MISCELLANEOUS]]"
6,Documento gerado sob autenticação Nº KMT.508.8...,"[[233, 238, ORGANIZATION]]"
7,1/1 PORTARIA Nº 1957 de 05/03/2020 O PRÓ-REITO...,"[[37, 68, MISCELLANEOUS], [72, 113, ORGANIZATI..."
8,MAURÍCIO VIÉGAS DA SILVA Pró-Reitor de Gestão ...,"[[0, 24, PERSON], [25, 56, MISCELLANEOUS]]"
9,Documento gerado sob autenticação Nº DYG.509.9...,"[[233, 238, ORGANIZATION]]"


In [510]:
for idx, row in annotated_sentences.iterrows():
    if len(row['label']) == 0 or len(row['label']) == 1:
        print(len(row['label']), row['label'])
        annotated_sentences.drop(idx, inplace=True)
annotated_sentences.reset_index(drop=True, inplace=True)

1 [[233, 238, 'ORGANIZATION']]
1 [[233, 238, 'ORGANIZATION']]
1 [[233, 238, 'ORGANIZATION']]
1 [[233, 238, 'ORGANIZATION']]
1 [[233, 238, 'ORGANIZATION']]
0 []
1 [[233, 238, 'ORGANIZATION']]
0 []
1 [[233, 238, 'ORGANIZATION']]
0 []
1 [[233, 238, 'ORGANIZATION']]
0 []
1 [[233, 238, 'ORGANIZATION']]
0 []
1 [[233, 238, 'ORGANIZATION']]
1 [[43, 91, 'MISCELLANEOUS']]
1 [[12, 53, 'ORGANIZATION']]
1 [[39, 55, 'MISCELLANEOUS']]
0 []
1 [[42, 67, 'MISCELLANEOUS']]
0 []
0 []
1 [[0, 21, 'PERSON']]
1 [[233, 238, 'ORGANIZATION']]
1 [[233, 238, 'ORGANIZATION']]
1 [[0, 21, 'PERSON']]
1 [[233, 238, 'ORGANIZATION']]
0 []
1 [[233, 238, 'ORGANIZATION']]
1 [[0, 21, 'PERSON']]
1 [[233, 238, 'ORGANIZATION']]
0 []
0 []
1 [[3, 41, 'ORGANIZATION']]
0 []
1 [[233, 238, 'ORGANIZATION']]
0 []
1 [[233, 238, 'ORGANIZATION']]
0 []
1 [[233, 238, 'ORGANIZATION']]
0 []
1 [[233, 238, 'ORGANIZATION']]
0 []
1 [[233, 238, 'ORGANIZATION']]
1 [[233, 238, 'ORGANIZATION']]
1 [[233, 238, 'ORGANIZATION']]
1 [[233, 238, 'ORGANIZATI

In [511]:
annotated_sentences

Unnamed: 0,text,label
0,1/1 PORTARIA Nº 1955 de 05/03/2020 O PRÓ-REITO...,"[[72, 113, ORGANIZATION], [448, 475, MISCELLAN..."
1,MAURÍCIO VIÉGAS DA SILVA Pró-Reitor de Gestão ...,"[[0, 24, PERSON], [25, 56, MISCELLANEOUS]]"
2,1/1 PORTARIA Nº 1956 de 05/03/2020 O PRÓ-REITO...,"[[72, 113, ORGANIZATION], [229, 235, MISCELLAN..."
3,MAURÍCIO VIÉGAS DA SILVA Pró-Reitor de Gestão ...,"[[0, 24, PERSON], [25, 56, MISCELLANEOUS]]"
4,1/1 PORTARIA Nº 1957 de 05/03/2020 O PRÓ-REITO...,"[[37, 68, MISCELLANEOUS], [72, 113, ORGANIZATI..."
5,MAURÍCIO VIÉGAS DA SILVA Pró-Reitor de Gestão ...,"[[0, 24, PERSON], [25, 56, MISCELLANEOUS]]"
6,1/1 PORTARIA Nº 1958 de 05/03/2020 O PRÓ-REITO...,"[[72, 113, ORGANIZATION], [229, 235, MISCELLAN..."
7,MAURÍCIO VIÉGAS DA SILVA Pró-Reitor de Gestão ...,"[[0, 24, PERSON], [25, 56, MISCELLANEOUS]]"
8,1/1 PORTARIA Nº 1706 de 27/02/2020 A VICE-REIT...,"[[53, 94, ORGANIZATION], [284, 320, MISCELLANE..."
9,JANE FRAGA TUTIKIAN Vice-Reitora.,"[[0, 19, PERSON], [20, 32, MISCELLANEOUS]]"


Let's tokenize the sentences and tokens using the TreebankWordTokenizer
The first method 'tokenize' will split our sentence and return a list of words which we'll organize in a 'tokenized_sentences' list.
The second method 'span_tokenize' will return the start and end position of each token resulting from the split. We'll organize them in a 'token_positions' list.

In [512]:
tokenized_sentences = []
token_positions = []
for idx, sentence in annotated_sentences['text'].items():
    tokenized_sentences.append(TreebankWordTokenizer().tokenize(sentence))
    token_positions.append(list(TreebankWordTokenizer().span_tokenize(sentence)))


print(tokenized_sentences[0])
print(token_positions[0])

['1/1', 'PORTARIA', 'Nº', '1955', 'de', '05/03/2020', 'O', 'PRÓ-REITOR', 'DE', 'GESTÃO', 'DE', 'PESSOAS', 'DA', 'UNIVERSIDADE', 'FEDERAL', 'DO', 'RIO', 'GRANDE', 'DO', 'SUL', ',', 'no', 'uso', 'de', 'suas', 'atribuições', 'que', 'lhe', 'foram', 'conferidas', 'pela', 'Portaria', 'nº', '7684', ',', 'de', '03', 'de', 'outubro', 'de', '2016', ',', 'do', 'Magnífico', 'Reitor', ',', 'e', 'conforme', 'o', 'Laudo', 'Médico', 'n°60131', ',', 'RESOLVE', 'Designar', ',', 'temporariamente', ',', 'nos', 'termos', 'da', 'Lei', 'nº', '8.112', ',', 'de', '11', 'de', 'dezembro', 'de', '1990', ',', 'com', 'redação', 'dada', 'pela', 'Lei', 'nº', '9.527', ',', 'de', '10', 'de', 'dezembro', 'de', '1997', ',', 'a', 'ocupante', 'do', 'cargo', 'de', 'ASSISTENTE', 'EM', 'ADMINISTRAÇÃO', ',', 'do', 'Quadro', 'de', 'Pessoal', 'desta', 'Universidade', ',', 'FRANCIELE', 'MARQUES', 'ZIQUINATTI', '(', 'Siape', ':', '1091092', ')', ',', 'para', 'substituir', 'TURENE', 'ANDRADE', 'E', 'SILVA', 'NETO', '(', 'Siape', ':

In [513]:
tokenized_sentence_position = []
for sentence, positions in zip(tokenized_sentences, token_positions):
    tokenized_sentence_position.append(list(zip(sentence, positions)))

In [514]:
print(annotated_sentences['label'].loc[43])
tokenized_sentence_position[43]

[[37, 49, 'MISCELLANEOUS'], [79, 120, 'ORGANIZATION'], [214, 230, 'PERSON'], [232, 264, 'MISCELLANEOUS'], [276, 309, 'ORGANIZATION'], [329, 370, 'ORGANIZATION'], [410, 431, 'ORGANIZATION'], [436, 441, 'LOCATION'], [443, 451, 'LOCATION'], [536, 541, 'ORGANIZATION']]


[('1/1', (0, 3)),
 ('PORTARIA', (4, 12)),
 ('Nº', (13, 15)),
 ('9641', (16, 20)),
 ('de', (21, 23)),
 ('29/11/2018', (24, 34)),
 ('A', (35, 36)),
 ('VICE-REITORA', (37, 49)),
 (',', (49, 50)),
 ('NO', (51, 53)),
 ('EXERCÍCIO', (54, 63)),
 ('DA', (64, 66)),
 ('REITORIA', (67, 75)),
 ('DA', (76, 78)),
 ('UNIVERSIDADE', (79, 91)),
 ('FEDERAL', (92, 99)),
 ('DO', (100, 102)),
 ('RIO', (103, 106)),
 ('GRANDE', (107, 113)),
 ('DO', (114, 116)),
 ('SUL', (117, 120)),
 (',', (120, 121)),
 ('no', (122, 124)),
 ('uso', (125, 128)),
 ('de', (129, 131)),
 ('suas', (132, 136)),
 ('atribuições', (137, 148)),
 ('legais', (149, 155)),
 ('e', (156, 157)),
 ('estatutárias', (158, 170)),
 ('RESOLVE', (171, 178)),
 ('Autorizar', (179, 188)),
 ('o', (189, 190)),
 ('afastamento', (191, 202)),
 ('do', (203, 205)),
 ('País', (206, 210)),
 ('de', (211, 213)),
 ('LUCIANE', (214, 221)),
 ('INES', (222, 226)),
 ('ELY', (227, 230)),
 (',', (230, 231)),
 ('Técnico', (232, 239)),
 ('em', (240, 242)),
 ('Assuntos', (

Next we create a list of dataframes. Each dataframe is a sentence and will have the columns 'token', 'start' and 'end.

In [515]:
list_of_token_df = []
for list_of_words in tokenized_sentence_position:
    tokens_df = pd.DataFrame(list_of_words, columns = ['token', 'pos'])
    tokens_df[['start', 'end']] = tokens_df['pos'].to_list()
    tokens_df = tokens_df.drop(columns='pos')
    list_of_token_df.append(tokens_df)

In [516]:
list_of_token_df[43]

Unnamed: 0,token,start,end
0,1/1,0,3
1,PORTARIA,4,12
2,Nº,13,15
3,9641,16,20
4,de,21,23
5,29/11/2018,24,34
6,A,35,36
7,VICE-REITORA,37,49
8,",",49,50
9,NO,51,53


We now have, for each sentences, a dataframe of all the word and a list with all the labels.
What needs to be done is assign the correct label to each word of the tokenized sentence.

In [517]:
for index, token_df in enumerate(list_of_token_df):
    labels = []
    is_entity = False
    for idx, token in token_df.iterrows():
        for label in annotated_sentences['label'].loc[index]:
            if token['start'] == label[0]:
                labels.append('B-' + label[2])
                is_entity = True
                break
            if token['start'] >= label[0] and token['end'] <= label[1]:
                labels.append('I-' + label[2])
                is_entity = True
                break
            is_entity = False
        if is_entity == False:
            labels.append('O')
    token_df['label'] = labels

In [518]:
print(annotated_sentences.shape,'\n',
      len(list_of_token_df))

(57, 2) 
 57


In [519]:
# Validate that the tokens are correctly assigned to each label.
#for index in range(0, 89): 
    #print(annotated_sentences['label'].loc[index])
    #print(list_of_token_df[index][list_of_token_df[index]['label'] != 'O'])
    #print(list_of_token_df[index]['label'].value_counts())
    #print(index)

In [520]:
target_labels = ['O', 'B-PERSON', 'I-PERSON', 'B-ORGANIZATION', 'I-ORGANIZATION', 'B-LOCATION', 'I-LOCATION', 'B-MISCELLANEOUS', 'I-MISCELLANEOUS']

# Convert label from name to id
label_to_id = {
    'O': 0, 
    'B-PERSON': 1, 
    'I-PERSON': 2, 
    'B-ORGANIZATION': 3, 
    'I-ORGANIZATION': 4, 
    'B-LOCATION': 5, 
    'I-LOCATION': 6, 
    'B-MISCELLANEOUS': 7, 
    'I-MISCELLANEOUS': 8
}

id_to_label = {
    0: 'O', 
    1: 'B-PERSON', 
    2: 'I-PERSON', 
    3: 'B-ORGANIZATION', 
    4: 'I-ORGANIZATION', 
    5: 'B-LOCATION', 
    6: 'I-LOCATION', 
    7: 'B-MISCELLANEOUS', 
    8: 'I-MISCELLANEOUS'
}
for sentence in list_of_token_df:
    sentence['label'] = sentence['label'].map(label_to_id)


In [521]:
id_to_label

{0: 'O',
 1: 'B-PERSON',
 2: 'I-PERSON',
 3: 'B-ORGANIZATION',
 4: 'I-ORGANIZATION',
 5: 'B-LOCATION',
 6: 'I-LOCATION',
 7: 'B-MISCELLANEOUS',
 8: 'I-MISCELLANEOUS'}

In [522]:
list_of_token_df[6]

Unnamed: 0,token,start,end,label
0,1/1,0,3,0
1,PORTARIA,4,12,0
2,Nº,13,15,0
3,1958,16,20,0
4,de,21,23,0
5,05/03/2020,24,34,0
6,O,35,36,0
7,PRÓ-REITOR,37,47,0
8,DE,48,50,0
9,GESTÃO,51,57,0


### Exploring a dataset from the datasets library
The easiest way to use our data with HuggingFace is to use the datasets library. It allows us to import our own data and it will format it into a Dataset Object that is ready to be used by the NER model.

To understand how our data need to be formatted let's explore a sample dataset that already exists inside the datasets library.

In [523]:
sample_data = load_dataset('conll2003')
sample_data

Found cached dataset conll2003 (C:/Users/arthu/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

The format of DatasetDict, which we'll want to use, is a list of three arrow Datasets: train, test and validation. 
Each Dataset is composed of two main object: features and num_rows. We need to make sure our JSON has the features 'tokens' and 'ner_tags'
The sample data uses the following dictionary to convert each label to an int:

**{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}**

Since we are using the exact same labels we can utilize this dictionary as well.

In [524]:
sample_data['train']['tokens'][:3]

[['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'],
 ['Peter', 'Blackburn'],
 ['BRUSSELS', '1996-08-22']]

In [525]:
sample_data['train']['ner_tags'][:3]

[[3, 0, 7, 0, 0, 0, 7, 0, 0], [1, 2], [5, 0]]

### Splitting dataset into train and test
We'll split our DataFrame into to lists of lists. One for the input tokens and another for the labels.
After that we'll use the scklearn train_test_split method to get both our train and test data.

In [526]:
# Divide inputs and tokens into X and y lists.
sample_X = []
sample_y = []
for sentence in list_of_token_df:
    sample_X.append(list(sentence['token']))
    sample_y.append(list(sentence['label']))

# Split X and y into train and test.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(sample_X, sample_y, test_size=0.33, random_state=43)

In [527]:
def get_label_distribution(seq_labels):
    label_count = {}
    for label in target_labels:
        label_count[label] = 0
    for seq in seq_labels:
        for target_id in seq:
            label = id_to_label[target_id]
            label_count[label] += 1
    return label_count

In [528]:
get_label_distribution(y_train)

{'O': 2495,
 'B-PERSON': 52,
 'I-PERSON': 138,
 'B-ORGANIZATION': 68,
 'I-ORGANIZATION': 233,
 'B-LOCATION': 7,
 'I-LOCATION': 4,
 'B-MISCELLANEOUS': 78,
 'I-MISCELLANEOUS': 132}

In [529]:
get_label_distribution(y_test)

{'O': 1089,
 'B-PERSON': 22,
 'I-PERSON': 53,
 'B-ORGANIZATION': 30,
 'I-ORGANIZATION': 126,
 'B-LOCATION': 2,
 'I-LOCATION': 0,
 'B-MISCELLANEOUS': 33,
 'I-MISCELLANEOUS': 52}

In [530]:
# Create a train and test dictionary
train_data = {'inputs': X_train, 'targets': y_train}
test_data = {'inputs': X_test, 'targets': y_test}

#Convert dictionary into DataFrame
#Needed as intermediary step because DataFrames support convertion into the json record format we need.
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

#Convert DataFrame into json
train_json = train_df.to_json(orient='records')
test_json = test_df.to_json(orient='records')

In [531]:
#Save train and test jsons in the auto_data directory.
#This directory will serve as the repository of our auto labeled data and we'll use it to import the data with the datasets library.
import os
os.path
file_path = "C:\\Users\\arthu\\Desktop\\ner-using-bert\BERT_Experiment\\auto_data\\"

with open(file_path+'train.json', 'w') as outfile:
    outfile.write(train_json)

with open(file_path+'test.json', 'w') as outfile:
    outfile.write(test_json)

### Load and validate
We load the dataset that we saved previously. We'll use the load_dataset method from the datasets library, which will allow us to easily use hugging face models with our data.

In [532]:
ufrgs_data = load_dataset('json', data_dir = file_path)

Downloading and preparing dataset json/default to C:/Users/arthu/.cache/huggingface/datasets/json/default-b4d3e01485de334b/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to C:/Users/arthu/.cache/huggingface/datasets/json/default-b4d3e01485de334b/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [533]:
# we get a DatasetDict object with two Datasets, one for train and one for test.
ufrgs_data

DatasetDict({
    train: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 38
    })
    test: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 19
    })
})

In [534]:
print(ufrgs_data['train'][12]['inputs'])
print(ufrgs_data['train'][12]['targets'])

['Documento', 'gerado', 'sob', 'autenticação', 'Nº', 'QON.500.984.BHA', ',', 'disponível', 'no', 'endereço', 'http', ':', '//www.ufrgs.br/autenticacao', '1/1', 'PORTARIA', 'Nº', '1184', 'de', '18/02/2016', 'O', 'PRÓ-REITOR', 'DE', 'GESTÃO', 'DE', 'PESSOAS', 'DA', 'UNIVERSIDADE', 'FEDERAL', 'DO', 'RIO', 'GRANDE', 'DO', 'SUL', ',', 'no', 'uso', 'de', 'suas', 'atribuições', 'que', 'lhe', 'foram', 'conferidas', 'pela', 'Portaria', 'nº', '5469', ',', 'de', '04', 'de', 'outubro', 'de', '2012', ',', 'do', 'Magnífico', 'Reitor', ',', 'e', 'conforme', 'o', 'Laudo', 'Médico', 'n°37308', ',', 'RESOLVE', ':', 'Designar', ',', 'temporariamente', ',', 'nos', 'termos', 'da', 'Lei', 'nº', '8.112', ',', 'de', '11', 'de', 'dezembro', 'de', '1990', ',', 'com', 'redação', 'dada', 'pela', 'Lei', 'nº', '9.527', ',', 'de', '10', 'de', 'dezembro', 'de', '1997', ',', 'a', 'ocupante', 'do', 'cargo', 'de', 'PORTEIRO', ',', 'do', 'Quadro', 'de', 'Pessoal', 'desta', 'Universidade', ',', 'ELIANE', 'RICARDO', 'IRANC

### Tokenizer
We create a tokenizer to convert our inputs into sub-word ids. We need to use a tokenizer that is compatible with the model we'll use.
HuggingFace makes that easy through the AutoTokenizer, which allows us to specify which model will be used and it already makes sure that our tokenizer will work with it.


In [535]:
from transformers import AutoTokenizer

In [536]:
checkpoint = 'neuralmind/bert-base-portuguese-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

loading configuration file config.json from cache at C:\Users\arthu/.cache\huggingface\hub\models--neuralmind--bert-base-portuguese-cased\snapshots\94d69c95f98f7d5b2a8700c420230ae10def0baa\config.json
Model config BertConfig {
  "_name_or_path": "neuralmind/bert-base-portuguese-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_v

In [537]:
t = tokenizer(ufrgs_data['train'][12]['inputs'], is_split_into_words=True)
t

{'input_ids': [101, 19816, 310, 14928, 425, 16782, 874, 100, 5226, 12234, 119, 5047, 119, 15405, 22336, 119, 241, 18394, 117, 5656, 202, 14441, 14305, 131, 120, 120, 2702, 11740, 119, 169, 2527, 6891, 119, 235, 22282, 120, 16782, 232, 304, 22280, 205, 120, 205, 212, 8718, 5118, 21748, 22301, 100, 17263, 22336, 125, 542, 120, 16956, 120, 4284, 231, 11635, 22369, 118, 257, 18469, 15349, 22322, 10836, 278, 3341, 22321, 16484, 10836, 212, 3341, 19715, 4089, 250, 22301, 7281, 9846, 5054, 22308, 6392, 11836, 22309, 263, 12002, 5054, 9369, 15040, 257, 15749, 278, 5650, 22320, 7545, 15040, 200, 18199, 117, 202, 1700, 125, 675, 20215, 179, 2036, 506, 7940, 649, 412, 14120, 322, 100, 11365, 10852, 117, 125, 16720, 125, 1511, 125, 3470, 117, 171, 13128, 3313, 3501, 428, 117, 122, 4762, 146, 11706, 243, 21491, 149, 22359, 9330, 3708, 22330, 117, 257, 3341, 7918, 22339, 22309, 131, 15945, 159, 117, 12885, 117, 538, 3401, 180, 2502, 100, 1015, 119, 21950, 117, 125, 1433, 125, 1512, 125, 5737, 117, 1

In [538]:
#The tokenizer return looks like a dictionary but it is actually an object called BatchEncoding
type(t)

transformers.tokenization_utils_base.BatchEncoding

In [539]:
#The object has a tokens method that returns the original tokens before transforming them into integers
t.tokens()

['[CLS]',
 'Docu',
 '##mento',
 'gerado',
 'sob',
 'autent',
 '##icação',
 '[UNK]',
 'Q',
 '##ON',
 '.',
 '500',
 '.',
 '98',
 '##4',
 '.',
 'B',
 '##HA',
 ',',
 'disponível',
 'no',
 'endereço',
 'http',
 ':',
 '/',
 '/',
 'w',
 '##ww',
 '.',
 'u',
 '##fr',
 '##gs',
 '.',
 'b',
 '##r',
 '/',
 'autent',
 '##ica',
 '##ca',
 '##o',
 '1',
 '/',
 '1',
 'P',
 '##OR',
 '##TA',
 '##RI',
 '##A',
 '[UNK]',
 '118',
 '##4',
 'de',
 '18',
 '/',
 '02',
 '/',
 '2016',
 'O',
 'PR',
 '##Ó',
 '-',
 'R',
 '##EI',
 '##TO',
 '##R',
 'DE',
 'G',
 '##ES',
 '##T',
 '##ÃO',
 'DE',
 'P',
 '##ES',
 '##SO',
 '##AS',
 'D',
 '##A',
 'UN',
 '##IV',
 '##ER',
 '##S',
 '##ID',
 '##AD',
 '##E',
 'F',
 '##ED',
 '##ER',
 '##AL',
 'DO',
 'R',
 '##IO',
 'G',
 '##RA',
 '##N',
 '##DE',
 'DO',
 'S',
 '##UL',
 ',',
 'no',
 'uso',
 'de',
 'suas',
 'atribuições',
 'que',
 'lhe',
 'foram',
 'confer',
 '##idas',
 'pela',
 'Porta',
 '##ria',
 '[UNK]',
 '54',
 '##69',
 ',',
 'de',
 '04',
 'de',
 'outubro',
 'de',
 '2012',
 ',',
 'do

In [540]:
# The word_ids methods returns a list with the ID that maps each sub-word to the original word it was tokenized from.
t.word_ids()

[None,
 0,
 0,
 1,
 2,
 3,
 3,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 13,
 13,
 13,
 14,
 14,
 14,
 14,
 14,
 15,
 16,
 16,
 17,
 18,
 18,
 18,
 18,
 18,
 19,
 20,
 20,
 20,
 20,
 20,
 20,
 20,
 21,
 22,
 22,
 22,
 22,
 23,
 24,
 24,
 24,
 24,
 25,
 25,
 26,
 26,
 26,
 26,
 26,
 26,
 26,
 27,
 27,
 27,
 27,
 28,
 29,
 29,
 30,
 30,
 30,
 30,
 31,
 32,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 42,
 43,
 44,
 44,
 45,
 46,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 56,
 57,
 57,
 58,
 59,
 60,
 61,
 62,
 62,
 63,
 64,
 64,
 64,
 64,
 64,
 65,
 66,
 66,
 66,
 66,
 66,
 67,
 68,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 77,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 92,
 92,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 102,
 103,
 104,
 105,
 106,
 106,
 106,
 106,
 106,
 107,
 108,
 109,
 109

### Target Alignment
Now that our input is composed of sub-words, we need to make sure that we have one target per sub-word. To do this we will use the align_targets function and map targets from each word to its sub-words.

In [541]:
#Define relationship between B and I tags
#['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
begin2inside = {
    1:2,
    3:4,
    5:6,
    7:8
}

# Function that aligns the labels to be correctly associated with each sub-word.
def align_targets(labels, word_ids):
    aligned_labels = []
    previous_word = None

    for word in word_ids:
        if word is None:
            # Tokens like [CLS] and [SEP]
            label = -100 #This value is used by Hugging Face to ignore the tokens during training
        elif word != previous_word:
            # New word in the list
            label = labels[word]
        else:
            #Repeated word (Would be the next sub-word)
            if labels[word] in begin2inside:
                #Change B- to I-
                label = begin2inside[labels[word]]
            else:
                # Sub-word of a word classified as "O" gets the same label "O"
                label = labels[word]

        aligned_labels.append(label)
        previous_word = word #update last word
 
    return aligned_labels

In [542]:
## Label-Token Alignment Test
idx = 6
test_data = tokenizer(ufrgs_data['train'][idx]['inputs'], is_split_into_words=True)
print("Tokenized Data:", test_data)
print("Word Tokens:", ufrgs_data['train'][idx]['inputs'])
test_labels = ufrgs_data['train'][idx]['targets']
print("Word Labels:", test_labels)
print("Word IDs:", test_data.word_ids())
aligned_targets = align_targets(test_labels, test_data.word_ids())
print("Sub-Word Labels:", aligned_targets)
print("Sub-Word Tokens:", test_data.tokens())

Tokenized Data: {'input_ids': [101, 205, 120, 205, 212, 8718, 5118, 21748, 22301, 100, 8103, 5752, 125, 16720, 120, 16899, 120, 5096, 177, 5427, 9617, 118, 257, 18469, 15349, 5650, 250, 22301, 7281, 9846, 5054, 22308, 6392, 11836, 22309, 263, 12002, 5054, 9369, 15040, 257, 15749, 278, 5650, 22320, 7545, 15040, 200, 18199, 117, 202, 1700, 125, 675, 20215, 117, 9319, 146, 16620, 229, 14120, 322, 100, 16444, 8510, 117, 125, 2939, 125, 1544, 125, 4284, 257, 3341, 7918, 22339, 22309, 3928, 15802, 6939, 150, 8346, 117, 240, 7887, 125, 4576, 117, 202, 12106, 157, 1014, 1582, 117, 320, 8922, 299, 6072, 22309, 17715, 22322, 22055, 22308, 278, 18178, 3341, 15040, 22308, 16288, 22333, 6072, 117, 12600, 14188, 200, 5234, 8214, 149, 22359, 22222, 4649, 22330, 22330, 117, 1340, 487, 122, 173, 8750, 202, 5985, 125, 8153, 1262, 171, 2900, 125, 9682, 122, 5503, 7023, 138, 117, 180, 3383, 250, 125, 8922, 8688, 243, 117, 2270, 19148, 117, 221, 123, 3383, 250, 125, 8922, 8688, 243, 117, 2270, 16720, 117, 

In [543]:
aligned_labels = [target_labels[t] if t>=0 else None for t in aligned_targets]
for x, y in zip(test_data.tokens(), aligned_labels):
    print(f"{x}\t{y}")
    

[CLS]	None
1	O
/	O
1	O
P	O
##OR	O
##TA	O
##RI	O
##A	O
[UNK]	O
38	O
##12	O
de	O
04	O
/	O
05	O
/	O
2017	O
A	O
VI	O
##CE	O
-	O
R	O
##EI	O
##TO	O
##RA	O
D	O
##A	O
UN	B-ORGANIZATION
##IV	I-ORGANIZATION
##ER	I-ORGANIZATION
##S	I-ORGANIZATION
##ID	I-ORGANIZATION
##AD	I-ORGANIZATION
##E	I-ORGANIZATION
F	I-ORGANIZATION
##ED	I-ORGANIZATION
##ER	I-ORGANIZATION
##AL	I-ORGANIZATION
DO	I-ORGANIZATION
R	I-ORGANIZATION
##IO	I-ORGANIZATION
G	I-ORGANIZATION
##RA	I-ORGANIZATION
##N	I-ORGANIZATION
##DE	I-ORGANIZATION
DO	I-ORGANIZATION
S	I-ORGANIZATION
##UL	I-ORGANIZATION
,	O
no	O
uso	O
de	O
suas	O
atribuições	O
,	O
considerando	O
o	O
disposto	O
na	O
Porta	O
##ria	O
[UNK]	O
76	O
##24	O
,	O
de	O
29	O
de	O
setembro	O
de	O
2016	O
R	O
##ES	O
##OL	O
##V	O
##E	O
Conc	O
##eder	O
progress	O
##ão	O
funcional	O
,	O
por	O
avaliação	O
de	O
desempenho	O
,	O
no	O
Quad	O
##ro	O
desta	O
Universidade	O
,	O
ao	O
Professor	B-MISCELLANEOUS
J	B-PERSON
##OS	I-PERSON
##E	I-PERSON
CA	I-PERSON
##R	I-PERSON
##LO	I-PERSON
##S	I-PERS

### Tokenize inputs
We take the sub-word inputs and labels and pass it to the 'tokenize_fn' function to generate the tokens we'll feed to the model.

In [544]:
# Function to tokenize both inputs and targets
def tokenize_fn(batch):
    # Tokenize the input sequence first
    tokenized_inputs = tokenizer(batch['inputs'], truncation=True, is_split_into_words=True)
    labels_batch = batch['targets'] # The original targets word-by-word
    aligned_labels_batch = [] # The aligned targets sub-word by sub-word
    # Loop through each label sequence in the batch
    for i, labels in enumerate(labels_batch):
        word_ids = tokenized_inputs.word_ids(i) # Get word IDs for the sequence
        aligned_labels_batch.append(align_targets(labels, word_ids)) # Align sequence labels
    
    # Save final aligned labels in a column called 'labels' which is the required name for the hugging face models
    tokenized_inputs['labels'] = aligned_labels_batch
    
    return tokenized_inputs

In [545]:
ufrgs_data['train']

Dataset({
    features: ['inputs', 'targets'],
    num_rows: 38
})

In [546]:
# Use the datasets method 'map' to apply the tokenize function to the train and test datasets.
# We'll use the batched parameter to improve the eficiency of the tokenization.
tokenized_datasets = ufrgs_data.map(
	tokenize_fn,
	batched=True,
	remove_columns=ufrgs_data["train"].column_names
)

Map:   0%|          | 0/38 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/19 [00:00<?, ? examples/s]

In [547]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 38
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 19
    })
})

### Test the DataCollator
There are several required steps before passing the text into the model: padding, truncate, converting to tensors, etc. When we use the tokenizer method we are not doing most of these steps because the ‘data collator’ in the trainer is taking care of it implicitly when we train the model.

The Data Collator is built into the trainer and is defined as such:

> • **data_collator** (`DataCollator`, *optional*) — The function to use to form a batch from a list of elements of `train_dataset` or `eval_dataset`. Will default to **[default_data_collator()](https://huggingface.co/docs/transformers/v4.22.1/en/main_classes/data_collator#transformers.default_data_collator)** if no `tokenizer` is provided, an instance of **[DataCollatorWithPadding](https://huggingface.co/docs/transformers/v4.22.1/en/main_classes/data_collator#transformers.DataCollatorWithPadding)** otherwise.
> 

#### For Token Classification

The Trainer object in Hugging Face is not capable of recognizing which task we are trying to execute and therefore is not able to automatically select the correct Data Collator for us. Since the default DataCollatorWithPadding does not support tasks of the token classification type, we’ll need to manually define the one we want by importing the token classification data collator.

In [548]:
from transformers import DataCollatorForTokenClassification

In [549]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
data_collator

DataCollatorForTokenClassification(tokenizer=PreTrainedTokenizerFast(name_or_path='neuralmind/bert-base-portuguese-cased', vocab_size=29794, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}), padding=True, max_length=None, pad_to_multiple_of=None, label_pad_token_id=-100, return_tensors='pt')

In [550]:
# Test the data collator. It should return both inputs as tensor of the same size (including the padding).
collator_testset = [tokenized_datasets["train"][i] for i in range(2)]
batch = data_collator(collator_testset)
batch["labels"]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    3,    4,    4,    4,    4,    4,    4,
            4,    4,    4,    4,    4,    4,    4,    4,    4,    4,    4,    4,
            4,    4,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    7,    8,    8,    8,    8,
            8,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            1,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
            0,    0,    0,  

### Metrics for validation
Computing metrics isn’t as straight forward when we deal with multiple targets per sample(sentence). Usually when calculating accuracy with a single target we can do #correct_samples/#total_samples. This is not possible when each sample has several targets.
One solution would be flattening all predictions to calculate #correct_targets/#total_targets.
#### Seqeval
This library is the standard method to calculate metrics in hugging face as its sole purpose is to compute metrics for NLP tasks with sequence targets.\
https://huggingface.co/spaces/evaluate-metric/seqeval

In [551]:
from datasets import load_metric
metric = load_metric("seqeval")

In [552]:
# Seqeval will not work with a single input single label task. It will only be usable in tasks that require mulitple labels for multiple inputs
# Single input single label example:
#metric.compute(predictions=[0, 0, 0], references=[0, 0, 1])

In [553]:
# Multilple inputs multiple labels example:
metric.compute(
    predictions=[[0, 0, 0], [1, 0, 1]], 
    references=[[0, 0, 1], [1, 0, 1]])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


{'overall_precision': 0.0,
 'overall_recall': 0.0,
 'overall_f1': 0.0,
 'overall_accuracy': 0.8333333333333334}

In [554]:
# Seqeval supports sequence labeling evalutation with the IOB formats. So to get rid of the warning and correctly compute the metrics we need to follow this formatting standard.
metric.compute(
    predictions=[['O', 'I-ORG', 'B-ORG', 'B-ORG', 'B-LOC'], ['B-MISC', 'O', 'B-PER', 'I-PER', "I-MISC"]], 
    references=[['O', 'B-LOC', 'B-ORG', 'I-ORG', 'I-ORG'], ['B-MISC', 'I-MISC', 'B-PER', 'I-PER', 'O']])

{'LOC': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1},
 'MISC': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1},
 'ORG': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1},
 'PER': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 0.14285714285714285,
 'overall_recall': 0.25,
 'overall_f1': 0.18181818181818182,
 'overall_accuracy': 0.5}

In [555]:
def compute_metrics(logits_and_labels):
    logits, labels = logits_and_labels
    preds = np.argmax(logits, axis=-1)

    # remove -100, convert the label ids to label names
    str_labels = [[target_labels[t] for t in label if t != -100] for label in labels]

    # do the same for predictions whenever true label is -100
    str_preds = [[target_labels[p] for p, t in zip(pred, targ) if t != -100] for pred, targ in zip(preds, labels)]

    the_metrics = metric.compute(predictions=str_preds, references=str_labels)
    return {
        'precision': the_metrics['overall_precision'],
        'recall': the_metrics['overall_recall'],
        'f1': the_metrics['overall_f1'],
        'accuracy': the_metrics['overall_accuracy']
        }

In [556]:
id2label = {k: v for k, v in enumerate(target_labels)} #Get label IDs
label2id = {v: k for k, v in id2label.items()} #Get label names from IDs
id2label

{0: 'O',
 1: 'B-PERSON',
 2: 'I-PERSON',
 3: 'B-ORGANIZATION',
 4: 'I-ORGANIZATION',
 5: 'B-LOCATION',
 6: 'I-LOCATION',
 7: 'B-MISCELLANEOUS',
 8: 'I-MISCELLANEOUS'}

#### Load the pre-trained model

We use the AutoModelForTokenClassification.from_pretrained method to load the BERT model from huggingface.\
The model loades will be the one defined by 'checkpoint'.\
We pass the id2label and label2id parameters for the model to understand the targets we are using for the prediction and training.

In [557]:
from transformers import AutoModelForTokenClassification

In [558]:
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    id2label=id2label,
    label2id=label2id
)

loading configuration file config.json from cache at C:\Users\arthu/.cache\huggingface\hub\models--neuralmind--bert-base-portuguese-cased\snapshots\94d69c95f98f7d5b2a8700c420230ae10def0baa\config.json
Model config BertConfig {
  "_name_or_path": "neuralmind/bert-base-portuguese-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-PERSON",
    "2": "I-PERSON",
    "3": "B-ORGANIZATION",
    "4": "I-ORGANIZATION",
    "5": "B-LOCATION",
    "6": "I-LOCATION",
    "7": "B-MISCELLANEOUS",
    "8": "I-MISCELLANEOUS"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-LOCATION": 5,
    "B-MISCELLANEOUS": 7,
    "B-ORGANIZATION": 3,
    "B-PERSON": 1,
    "I-LOCATION": 6,
    "I-MISCELLANEOUS": 8,
    "I-ORGANIZATION": 4,
    "I-PERSON": 2,
    

#### Create the Training Arguments
The training arguments define several parameters of how the training of the model will happen. Some argument define where the outputs will be save, how often during training we want to compute metric, how many epochs we will use for training, define the learning rate and many others. There are a several arguments which can all be found in the documentation of the function:

[https://huggingface.co/docs/transformers/v4.21.1/en/main_classes/trainer#transformers.TrainingArguments](https://huggingface.co/docs/transformers/v4.21.1/en/main_classes/trainer#transformers.TrainingArguments)

The trainer uses AdamW for the backpropagation optimization. We can use a custom one if necessary.

In [559]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    "bert-base-portuguese-cased-ner-finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=5,
    weight_decay=0.01
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


#### Trainer

The trainer object is what we’ll use to run the training process. The arguments are fairly simple:

- The pre-trained model we will use
- The training arguments
- The train dataset (already tokenized)
- The validation dataset
- The data collator
- The metrics we will use for validation
- The tokenizer

The training is done by calling the train method as trainer.train()

In [560]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

In [561]:
from pynvml import *


def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

In [562]:
print(trainer.args.device)
print_gpu_utilization()
!nvidia-smi

cuda:0
GPU memory occupied: 8076 MB.
Mon Apr  3 04:31:11 2023       

In [563]:
result = trainer.train()
print_summary(result)

***** Running training *****
  Num examples = 38
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 25
  Number of trainable parameters = 108339465



+-----------------------------------------------------------------------------+
| NVIDIA-SMI 528.02       Driver Version: 528.02       CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:01:00.0  On |                  N/A |
|  0%   37C    P5    14W / 180W |   7957MiB /  8192MiB |      8%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                           

  0%|          | 0/25 [00:00<?, ?it/s]

***** Running Evaluation *****
  Num examples = 19
  Batch size = 8


  0%|          | 0/3 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to bert-base-portuguese-cased-ner-finetuned\checkpoint-5
Configuration saved in bert-base-portuguese-cased-ner-finetuned\checkpoint-5\config.json


{'eval_loss': 1.4058964252471924, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.7182601880877743, 'eval_runtime': 0.6931, 'eval_samples_per_second': 27.411, 'eval_steps_per_second': 4.328, 'epoch': 1.0}


Model weights saved in bert-base-portuguese-cased-ner-finetuned\checkpoint-5\pytorch_model.bin
tokenizer config file saved in bert-base-portuguese-cased-ner-finetuned\checkpoint-5\tokenizer_config.json
Special tokens file saved in bert-base-portuguese-cased-ner-finetuned\checkpoint-5\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 19
  Batch size = 8


  0%|          | 0/3 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to bert-base-portuguese-cased-ner-finetuned\checkpoint-10
Configuration saved in bert-base-portuguese-cased-ner-finetuned\checkpoint-10\config.json


{'eval_loss': 0.9357545971870422, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.7394200626959248, 'eval_runtime': 0.6373, 'eval_samples_per_second': 29.812, 'eval_steps_per_second': 4.707, 'epoch': 2.0}


Model weights saved in bert-base-portuguese-cased-ner-finetuned\checkpoint-10\pytorch_model.bin
tokenizer config file saved in bert-base-portuguese-cased-ner-finetuned\checkpoint-10\tokenizer_config.json
Special tokens file saved in bert-base-portuguese-cased-ner-finetuned\checkpoint-10\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 19
  Batch size = 8


  0%|          | 0/3 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to bert-base-portuguese-cased-ner-finetuned\checkpoint-15
Configuration saved in bert-base-portuguese-cased-ner-finetuned\checkpoint-15\config.json


{'eval_loss': 0.7065246105194092, 'eval_precision': 0.1, 'eval_recall': 0.04597701149425287, 'eval_f1': 0.06299212598425197, 'eval_accuracy': 0.832680250783699, 'eval_runtime': 0.7271, 'eval_samples_per_second': 26.133, 'eval_steps_per_second': 4.126, 'epoch': 3.0}


Model weights saved in bert-base-portuguese-cased-ner-finetuned\checkpoint-15\pytorch_model.bin
tokenizer config file saved in bert-base-portuguese-cased-ner-finetuned\checkpoint-15\tokenizer_config.json
Special tokens file saved in bert-base-portuguese-cased-ner-finetuned\checkpoint-15\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 19
  Batch size = 8


  0%|          | 0/3 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to bert-base-portuguese-cased-ner-finetuned\checkpoint-20


{'eval_loss': 0.5992717146873474, 'eval_precision': 0.8285714285714286, 'eval_recall': 0.3333333333333333, 'eval_f1': 0.47540983606557374, 'eval_accuracy': 0.850705329153605, 'eval_runtime': 0.6647, 'eval_samples_per_second': 28.584, 'eval_steps_per_second': 4.513, 'epoch': 4.0}


Configuration saved in bert-base-portuguese-cased-ner-finetuned\checkpoint-20\config.json
Model weights saved in bert-base-portuguese-cased-ner-finetuned\checkpoint-20\pytorch_model.bin
tokenizer config file saved in bert-base-portuguese-cased-ner-finetuned\checkpoint-20\tokenizer_config.json
Special tokens file saved in bert-base-portuguese-cased-ner-finetuned\checkpoint-20\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 19
  Batch size = 8


  0%|          | 0/3 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to bert-base-portuguese-cased-ner-finetuned\checkpoint-25
Configuration saved in bert-base-portuguese-cased-ner-finetuned\checkpoint-25\config.json


{'eval_loss': 0.5647721290588379, 'eval_precision': 0.7111111111111111, 'eval_recall': 0.367816091954023, 'eval_f1': 0.48484848484848486, 'eval_accuracy': 0.8557993730407524, 'eval_runtime': 0.6702, 'eval_samples_per_second': 28.351, 'eval_steps_per_second': 4.476, 'epoch': 5.0}


Model weights saved in bert-base-portuguese-cased-ner-finetuned\checkpoint-25\pytorch_model.bin
tokenizer config file saved in bert-base-portuguese-cased-ner-finetuned\checkpoint-25\tokenizer_config.json
Special tokens file saved in bert-base-portuguese-cased-ner-finetuned\checkpoint-25\special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 131.4418, 'train_samples_per_second': 1.446, 'train_steps_per_second': 0.19, 'train_loss': 1.0373165130615234, 'epoch': 5.0}
Time: 131.44
Samples/second: 1.45
GPU memory occupied: 8079 MB.


In [564]:
print_gpu_utilization()

GPU memory occupied: 8082 MB.


In [565]:
type(trainer.train_dataset)

datasets.arrow_dataset.Dataset

In [566]:
trainer.save_model('auto_tagger')

Saving model checkpoint to auto_tagger
Configuration saved in auto_tagger\config.json
Model weights saved in auto_tagger\pytorch_model.bin
tokenizer config file saved in auto_tagger\tokenizer_config.json
Special tokens file saved in auto_tagger\special_tokens_map.json


In [567]:

from transformers import pipeline

ner = pipeline(
    "token-classification",
    model='auto_tagger',
    aggregation_strategy="average",
    ignore_labels=[""],
    device=0
)

loading configuration file auto_tagger\config.json
Model config BertConfig {
  "_name_or_path": "auto_tagger",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-PERSON",
    "2": "I-PERSON",
    "3": "B-ORGANIZATION",
    "4": "I-ORGANIZATION",
    "5": "B-LOCATION",
    "6": "I-LOCATION",
    "7": "B-MISCELLANEOUS",
    "8": "I-MISCELLANEOUS"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-LOCATION": 5,
    "B-MISCELLANEOUS": 7,
    "B-ORGANIZATION": 3,
    "B-PERSON": 1,
    "I-LOCATION": 6,
    "I-MISCELLANEOUS": 8,
    "I-ORGANIZATION": 4,
    "I-PERSON": 2,
    "O": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_pa

In [583]:
test_sample = """Documento gerado sob autenticação Nº NIK.843.557.VM6, disponível no endereço http://www.ufrgs.br/autenticacao
1/1
PORTARIA Nº             1181                  de  18/02/2016
O PRÓ-REITOR DE GESTÃO DE PESSOAS DA UNIVERSIDADE FEDERAL DO RIO GRANDE DO SUL, no
uso de suas atribuições que lhe foram conferidas pela Portaria nº.5469, de 04 de outubro de 2012, do
Magnífico Reitor, e conforme o Laudo Médico n°37564,
RESOLVE:
Designar, temporariamente, nos termos da Lei nº. 8.112, de 11 de dezembro de 1990, com redação
dada  pela  Lei  nº.9.527,  de  10  de  dezembro  de  1997,  a  ocupante  do  cargo  de  ASSISTENTE  EM
ADMINISTRAÇÃO, do Quadro de Pessoal desta Universidade, DENISE SCHROEDER (Siape: 0358763 ),  para
substituir   MARILDA SANTOS DA ROCHA (Siape: 1044125 ), Secretário do Depto de Plantas Forrageiras e
Agrometeorologia da Faculdade de Agronomia, Código FG-7, em seu afastamento por motivo de Laudo
Médico do titular da Função, no período de 03/02/2016 a 10/02/2016, com o decorrente pagamento das
vantagens por 8 dias.
MAURÍCIO VIÉGAS DA SILVA
Pró-Reitor de Gestão de Pessoas""" 
clean_sample = data_processing.clear_text(test_sample)
print(clean_sample)
token_sample = tokenizer(clean_sample)
print(token_sample.tokens())

Documento gerado sob autenticação Nº NIK.843.557.VM6, disponível no endereço http://www.ufrgs.br/autenticacao 1/1 PORTARIA Nº 1181 de 18/02/2016 O PRÓ-REITOR DE GESTÃO DE PESSOAS DA UNIVERSIDADE FEDERAL DO RIO GRANDE DO SUL, no uso de suas atribuições que lhe foram conferidas pela Portaria nº 5469, de 04 de outubro de 2012, do Magnífico Reitor, e conforme o Laudo Médico n°37564, RESOLVE: Designar, temporariamente, nos termos da Lei nº 8.112, de 11 de dezembro de 1990, com redação dada pela Lei nº 9.527, de 10 de dezembro de 1997, a ocupante do cargo de ASSISTENTE EM ADMINISTRAÇÃO, do Quadro de Pessoal desta Universidade, DENISE SCHROEDER (Siape: 0358763 ), para substituir MARILDA SANTOS DA ROCHA (Siape: 1044125 ), Secretário do Depto de Plantas Forrageiras e Agrometeorologia da Faculdade de Agronomia, Código FG-7, em seu afastamento por motivo de Laudo Médico do titular da Função, no período de 03/02/2016 a 10/02/2016, com o decorrente pagamento das vantagens por 8 dias. MAURÍCIO VIÉGA

In [584]:
type(ner)

transformers.pipelines.token_classification.TokenClassificationPipeline

In [606]:
results = ner(clean_sample)
results

[{'entity_group': 'O',
  'score': 0.82746357,
  'word': 'Documento gerado sob autenticação Nº NIK. 843. 557. VM6, disponível no endereço http : / / www. ufrgs. br / autenticacao 1 / 1 PORTARIA Nº 1181 de 18 / 02 / 2016 O PRÓ - REITOR DE',
  'start': 0,
  'end': 160},
 {'entity_group': 'MISCELLANEOUS',
  'score': 0.3277286,
  'word': 'GESTÃO',
  'start': 161,
  'end': 167},
 {'entity_group': 'O',
  'score': 0.4662776,
  'word': 'DE PESSOAS DA',
  'start': 168,
  'end': 181},
 {'entity_group': 'ORGANIZATION',
  'score': 0.669568,
  'word': 'UNIVERSIDADE FEDERAL DO RIO GRANDE DO SUL',
  'start': 182,
  'end': 223},
 {'entity_group': 'O',
  'score': 0.8813147,
  'word': ', no uso de suas atribuições que lhe foram conferidas pela Portaria nº 5469, de 04 de outubro de 2012, do Magnífico Reitor, e conforme o Laudo Médico n°37564, RESOLVE : Designar, temporariamente, nos termos da Lei nº 8. 112, de 11 de dezembro de 1990, com redação dada pela Lei nº 9. 527, de 10 de dezembro de 1997, a ocupan

In [626]:
def reformat_sentence(original_sentence, tagged_results):
    og_sentence_pos = 0
    tagged_sentence_pos = 0
    formatted_results = []
    for entity in tagged_results:
        formatted_entity = ""
        #print(entity['word'])
        for index_char, char in enumerate(entity['word']):
            #print(char, original_sentence[og_sentence_pos])
            if char == original_sentence[og_sentence_pos]:
                og_sentence_pos += 1
                formatted_entity = formatted_entity + char
            else:
                #print("Bad char:", char)
                # #Look ahead and see if next char is equal to original
                if entity['word'][index_char+1] == original_sentence[og_sentence_pos]:
                    #print("Found char in next pos")
                    pass
                # Else add the character to the formatted entity
                else:
                    og_sentence_pos += 2
                    formatted_entity = formatted_entity + char

        #print("Formatted:", formatted_entity)
        formatted_result = entity.copy()
        formatted_result['word'] = formatted_entity
        formatted_results.append(formatted_result)
    return formatted_results

In [622]:
reformatted_sentence = reformat_sentence(clean_sample, results)
sentence_for_relabelling = ""
for entity in reformatted_sentence:
    sentence_for_relabelling = sentence_for_relabelling + entity['word'] + ' '
sentence_for_relabelling = re.sub(' ,', ',', sentence_for_relabelling)

In [623]:
print(sentence_for_relabelling)
print(clean_sample)

Documento gerado sob autenticação Nº NIK.843.557.VM6, disponível no endereço http://www.ufrgs.br/autenticacao 1/1 PORTARIA Nº 1181 de 18/02/2016 O PRÓ-REITOR DE GESTÃO DE PESSOAS DA UNIVERSIDADE FEDERAL DO RIO GRANDE DO SUL, no uso de suas atribuições que lhe foram conferidas pela Portaria nº 5469, de 04 de outubro de 2012, do Magnífico Reitor, e conforme o Laudo Médico n°37564, RESOLVE: Designar, temporariamente, nos termos da Lei nº 8.112, de 11 de dezembro de 1990, com redação dada pela Lei nº 9.527, de 10 de dezembro de 1997, a ocupante do cargo de ASSISTENTE EM ADMINISTRAÇÃO, do Quadro de Pessoal desta Universidade, DENISE SCHROEDER  Siape: 0358763 ), para substituir MARILDA SANTOS DA ROCHA  Siape: 1044125 ), Secretário do Depto de Plantas Forrageiras e Agrometeorologia da Faculdade de Agronomia, Código FG-7, em seu afastamento por motivo de Laudo Médico do titular da Função, no período de 03/02/2016 a 10/02/2016, com o decorrente pagamento das vantagens por 8 dias. MAURÍCIO VIÉGA

In [None]:
def join_sentence(tagged_sentence):
    list_of_words = []
    for token in tokens:
        