# Create the dataset

In [1]:
#External
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, TreebankWordTokenizer, WhitespaceTokenizer
import numpy as np
import pandas as pd
from pprint import pprint
import os
nltk.download('punkt')
from transformers import AutoTokenizer
import textwrap
import re
from datasets import load_dataset

#Internal
import data_cleaning.data_cleaning as dc

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\arthu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\arthu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Open Annotated Sentences
We get the sentences from 'prep_data_labeling.ipynb' that we annotated manually with deccano.

In [2]:
annotated_df = pd.read_json('../data/annotated/doccano-extraction-09-12-24.jsonl', lines=True)

In [4]:
annotated_df.head()

Unnamed: 0,id,text,tagged_entities,document_id,sentence_id,doc__sentence_id,duplicate_ids,label
0,2227,"Conceder à servidora LISIANE RAMOS VILK, ocupa...","[{'entity_group': 'O', 'score': 0.7762932777, ...",25644,2,25644-2,[25644-2],"[[21, 39, PERSON], [62, 88, OCCUPATION], [109,..."
1,2228,MAURÍCIO VIÉGAS DA SILVA Pró-Reitor de Gestão ...,"[{'entity_group': 'PERSON', 'score': 0.6592996...",105798,3,105798-3,"[105798-3, 105799-3, 105801-3, 105802-3, 18001...","[[0, 24, PERSON], [25, 56, OCCUPATION]]"
2,2229,Autorizar o afastamento do país de CRISTINE MA...,"[{'entity_group': 'O', 'score': 0.6819901466, ...",25645,2,25645-2,[25645-2],"[[35, 58, PERSON], [60, 92, OCCUPATION], [170,..."
3,2230,CARLOS ALEXANDRE NETTO Reitor,"[{'entity_group': 'PERSON', 'score': 0.6323550...",25645,4,25645-4,"[25645-4, 25646-4, 25647-4, 25648-4, 25649-4, ...","[[0, 22, PERSON], [23, 29, OCCUPATION]]"
4,2231,Autorizar o afastamento do país de ANDRE DIAS ...,"[{'entity_group': 'O', 'score': 0.6808940768, ...",25646,2,25646-2,[25646-2],"[[35, 53, PERSON], [55, 82, OCCUPATION], [109,..."


### Separate duplicate sentences [OPTIONAL STEP]
(Having repeated sentences might not help training. Ideal solution is using data augmentation if more data is necessary)
Next we'll explode our dataframe so that duplicate sentences can become separate rows. We'll also drop a few columns that will have inconsistent information after the explode and will also not be useful for us.

In [4]:
#annotated_df = annotated_df.explode("duplicate_ids")

### Clean document so only required data is kept
All the document and sentences ids are not needed for training. Their use would be necessary only when trying to map sentences back to the origin.

In [5]:
sentences = annotated_df[['text', 'label']].copy()
print(sentences.shape)
sentences.head() 

(768, 2)


Unnamed: 0,text,label
0,"Conceder à servidora LISIANE RAMOS VILK, ocupa...","[[21, 39, PERSON], [62, 88, OCCUPATION], [109,..."
1,MAURÍCIO VIÉGAS DA SILVA Pró-Reitor de Gestão ...,"[[0, 24, PERSON], [25, 56, OCCUPATION]]"
2,Autorizar o afastamento do país de CRISTINE MA...,"[[35, 58, PERSON], [60, 92, OCCUPATION], [170,..."
3,CARLOS ALEXANDRE NETTO Reitor,"[[0, 22, PERSON], [23, 29, OCCUPATION]]"
4,Autorizar o afastamento do país de ANDRE DIAS ...,"[[35, 53, PERSON], [55, 82, OCCUPATION], [109,..."


Drop all sentences that only contain 1 or less labels. This is useful so that we don't fine tune our model with sentences that have almost no information about named entities. [Optional]

In [6]:
#Drop all sentences that contain 1 or less labels
print('Shape before filter:', sentences.shape)

for idx, row in sentences.iterrows():
    if len(row['label']) <= 1 :
        sentences.drop(idx, inplace=True)
sentences.reset_index(drop=True, inplace=True)

print('Shape after filter:', sentences.shape)

Shape before filter: (768, 2)
Shape after filter: (676, 2)


In [7]:
# Check list of sentences with a specific length
#sentences[sentences['label'].map(len) == 1]

## Splitting dataset from sentences to list of words

Let's tokenize the sentences and tokens using the TreebankWordTokenizer
The first method 'tokenize' will split our sentence and return a list of words which we'll organize in a 'tokenized_sentences' list.
The second method 'span_tokenize' will return the start and end position of each token resulting from the split. We'll organize them in a 'token_positions' list.

In [8]:
# Split sentences into list of words. Tokenized_sentences has all the lists of words and token_positions has the start and end position of each word in the original sentence.
tokenized_sentences = []
token_positions = []
for idx, sentence in sentences['text'].items():
    tokenized_sentences.append(TreebankWordTokenizer().tokenize(sentence))
    token_positions.append(list(TreebankWordTokenizer().span_tokenize(sentence)))


print(tokenized_sentences[0])
print(token_positions[0])

['Conceder', 'à', 'servidora', 'LISIANE', 'RAMOS', 'VILK', ',', 'ocupante', 'do', 'cargo', 'de', 'Administrador', 'de', 'Edifícios', '-', '701400', ',', 'lotada', 'na', 'Faculdade', 'de', 'Arquitetura', ',', 'SIAPE', '2325261', ',', 'o', 'percentual', 'de', '25', '%', '(', 'vinte', 'e', 'cinco', 'por', 'cento', ')', 'de', 'Incentivo', 'à', 'Qualificação', ',', 'a', 'contar', 'de', '15/07/2016', ',', 'tendo', 'em', 'vista', 'a', 'conclusão', 'do', 'curso', 'de', 'Graduação', 'em', 'Administração', '-', 'Bacharelado', ',', 'conforme', 'o', 'Processo', 'nº', '23078.015333/2016-19', '.']
[(0, 8), (9, 10), (11, 20), (21, 28), (29, 34), (35, 39), (39, 40), (41, 49), (50, 52), (53, 58), (59, 61), (62, 75), (76, 78), (79, 88), (89, 90), (91, 97), (97, 98), (99, 105), (106, 108), (109, 118), (119, 121), (122, 133), (133, 134), (135, 140), (141, 148), (148, 149), (150, 151), (152, 162), (163, 165), (166, 168), (168, 169), (170, 171), (171, 176), (177, 178), (179, 184), (185, 188), (189, 194), (1

In [9]:
# We'll zip both the words and their positions together in the same list.
tokenized_sentence_position = []
for sentence, positions in zip(tokenized_sentences, token_positions):
    tokenized_sentence_position.append(list(zip(sentence, positions)))
    
# Check the zip
print(sentences['label'].loc[1])
tokenized_sentence_position[1]

[[0, 24, 'PERSON'], [25, 56, 'OCCUPATION']]


[('MAURÍCIO', (0, 8)),
 ('VIÉGAS', (9, 15)),
 ('DA', (16, 18)),
 ('SILVA', (19, 24)),
 ('Pró-Reitor', (25, 35)),
 ('de', (36, 38)),
 ('Gestão', (39, 45)),
 ('de', (46, 48)),
 ('Pessoas', (49, 56))]

In [10]:
# Intermediary step to convert the list of words into a dataframe (Used only to facilitate the next steps)
list_of_tokenized_sentences = []
for list_of_words in tokenized_sentence_position:
    tokens_df = pd.DataFrame(list_of_words, columns = ['token', 'pos'])
    tokens_df[['start', 'end']] = tokens_df['pos'].to_list()
    tokens_df = tokens_df.drop(columns='pos')
    list_of_tokenized_sentences.append(tokens_df)

# Check the dataframe
list_of_tokenized_sentences[1]

Unnamed: 0,token,start,end
0,MAURÍCIO,0,8
1,VIÉGAS,9,15
2,DA,16,18
3,SILVA,19,24
4,Pró-Reitor,25,35
5,de,36,38
6,Gestão,39,45
7,de,46,48
8,Pessoas,49,56


### Assigning the labels to each word
We now have, for each sentences, a dataframe of all the word and a list with all the labels.
What needs to be done is assign the correct label to each word of the tokenized sentence.

In [11]:
for index, token_df in enumerate(list_of_tokenized_sentences):
    labels = []
    is_entity = False
    for idx, token in token_df.iterrows():
        for label in sentences['label'].loc[index]:
            if token['start'] == label[0]: #Check if the word starts at the same position as the label. (Start is in position 0)
                labels.append('B-' + label[2]) #Puts the label name. (Name is in position 2)
                is_entity = True
                break
            if token['start'] >= label[0] and token['end'] <= label[1]: #Check if the word starts after the label and ends before it. (Start is in position 0 and end is in position 1)
                labels.append('I-' + label[2])
                is_entity = True
                break
            is_entity = False
        if is_entity == False:
            labels.append('O')
    token_df['label'] = labels

In [12]:
print(sentences.shape,'\n',
      len(list_of_tokenized_sentences))

(676, 2) 
 676


In [15]:
# Validate that the tokens are correctly assigned to each label.
for index in range(0, 5):
    print('SENTENCE',index, '\n') 
    print(sentences['label'].loc[index], '\n')
    print(list_of_tokenized_sentences[index][list_of_tokenized_sentences[index]['label'] != 'O'], '\n')
    print(list_of_tokenized_sentences[index]['label'].value_counts(), '\n')

SENTENCE 0 

[[21, 39, 'PERSON'], [62, 88, 'OCCUPATION'], [109, 133, 'ORGANIZATION'], [237, 247, 'DATE']] 

            token  start  end           label
3         LISIANE     21   28        B-PERSON
4           RAMOS     29   34        I-PERSON
5            VILK     35   39        I-PERSON
11  Administrador     62   75    B-OCCUPATION
12             de     76   78    I-OCCUPATION
13      Edifícios     79   88    I-OCCUPATION
19      Faculdade    109  118  B-ORGANIZATION
20             de    119  121  I-ORGANIZATION
21    Arquitetura    122  133  I-ORGANIZATION
46     15/07/2016    237  247          B-DATE 

O                 58
I-PERSON           2
I-OCCUPATION       2
I-ORGANIZATION     2
B-PERSON           1
B-OCCUPATION       1
B-ORGANIZATION     1
B-DATE             1
Name: label, dtype: int64 

SENTENCE 1 

[[0, 24, 'PERSON'], [25, 56, 'OCCUPATION']] 

        token  start  end         label
0    MAURÍCIO      0    8      B-PERSON
1      VIÉGAS      9   15      I-PERSON
2        

In [None]:
target_labels = ['O', 'B-PERSON', 'I-PERSON', 'B-ORGANIZATION', 'I-ORGANIZATION', 'B-LOCATION', 'I-LOCATION', 'B-MISCELLANEOUS', 'I-MISCELLANEOUS']

# Convert label from name to id
label_to_id = {
    'O': 0, 
    'B-PERSON': 1, 
    'I-PERSON': 2, 
    'B-ORGANIZATION': 3, 
    'I-ORGANIZATION': 4, 
    'B-LOCATION': 5, 
    'I-LOCATION': 6, 
    'B-MISCELLANEOUS': 7, 
    'I-MISCELLANEOUS': 8
}

id_to_label = {
    0: 'O', 
    1: 'B-PERSON', 
    2: 'I-PERSON', 
    3: 'B-ORGANIZATION', 
    4: 'I-ORGANIZATION', 
    5: 'B-LOCATION', 
    6: 'I-LOCATION', 
    7: 'B-MISCELLANEOUS', 
    8: 'I-MISCELLANEOUS'
}
for sentence in list_of_tokenized_sentences:
    sentence['label'] = sentence['label'].map(label_to_id)


In [None]:
id_to_label

{0: 'O',
 1: 'B-PERSON',
 2: 'I-PERSON',
 3: 'B-ORGANIZATION',
 4: 'I-ORGANIZATION',
 5: 'B-LOCATION',
 6: 'I-LOCATION',
 7: 'B-MISCELLANEOUS',
 8: 'I-MISCELLANEOUS'}

The format of DatasetDict, which we'll want to use, is a list of three arrow Datasets: train, test and validation. 
Each Dataset is composed of two main object: features and num_rows. We need to make sure our JSON has the features 'tokens' and 'ner_tags'
The sample data uses the following dictionary to convert each label to an int:

**{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}**

Since we are using the exact same labels we can utilize this dictionary as well.

### Splitting dataset into train and test
We'll split our DataFrame into to lists of lists. One for the input tokens and another for the labels.
After that we'll use the scklearn train_test_split method to get both our train and test data.

In [None]:
# Divide inputs and tokens into X and y lists.
sample_X = []
sample_y = []
for sentence in list_of_token_df:
    sample_X.append(list(sentence['token']))
    sample_y.append(list(sentence['label']))

# Split X and y into train and test.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(sample_X, sample_y, test_size=0.33, random_state=43)

In [None]:
def get_label_distribution(seq_labels):
    label_count = {}
    for label in target_labels:
        label_count[label] = 0
    for seq in seq_labels:
        for target_id in seq:
            label = id_to_label[target_id]
            label_count[label] += 1
    return label_count

In [None]:
get_label_distribution(y_train)

{'O': 2495,
 'B-PERSON': 52,
 'I-PERSON': 138,
 'B-ORGANIZATION': 68,
 'I-ORGANIZATION': 233,
 'B-LOCATION': 7,
 'I-LOCATION': 4,
 'B-MISCELLANEOUS': 78,
 'I-MISCELLANEOUS': 132}

In [None]:
get_label_distribution(y_test)

{'O': 1089,
 'B-PERSON': 22,
 'I-PERSON': 53,
 'B-ORGANIZATION': 30,
 'I-ORGANIZATION': 126,
 'B-LOCATION': 2,
 'I-LOCATION': 0,
 'B-MISCELLANEOUS': 33,
 'I-MISCELLANEOUS': 52}

In [None]:
# Create a train and test dictionary
train_data = {'inputs': X_train, 'targets': y_train}
test_data = {'inputs': X_test, 'targets': y_test}

#Convert dictionary into DataFrame
#Needed as intermediary step because DataFrames support convertion into the json record format we need.
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

#Convert DataFrame into json
train_json = train_df.to_json(orient='records')
test_json = test_df.to_json(orient='records')

In [None]:
#Save train and test jsons in the auto_data directory.
#This directory will serve as the repository of our auto labeled data and we'll use it to import the data with the datasets library.
import os
os.path
file_path = "C:\\Users\\arthu\\Desktop\\ner-using-bert\BERT_Experiment\\auto_data\\"

with open(file_path+'train.json', 'w') as outfile:
    outfile.write(train_json)

with open(file_path+'test.json', 'w') as outfile:
    outfile.write(test_json)

### Load and validate
We load the dataset that we saved previously. We'll use the load_dataset method from the datasets library, which will allow us to easily use hugging face models with our data.

In [None]:
ufrgs_data = load_dataset('json', data_dir = file_path)

Downloading and preparing dataset json/default to C:/Users/arthu/.cache/huggingface/datasets/json/default-b4d3e01485de334b/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to C:/Users/arthu/.cache/huggingface/datasets/json/default-b4d3e01485de334b/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# we get a DatasetDict object with two Datasets, one for train and one for test.
ufrgs_data

DatasetDict({
    train: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 38
    })
    test: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 19
    })
})

In [None]:
print(ufrgs_data['train'][12]['inputs'])
print(ufrgs_data['train'][12]['targets'])

['Documento', 'gerado', 'sob', 'autenticação', 'Nº', 'QON.500.984.BHA', ',', 'disponível', 'no', 'endereço', 'http', ':', '//www.ufrgs.br/autenticacao', '1/1', 'PORTARIA', 'Nº', '1184', 'de', '18/02/2016', 'O', 'PRÓ-REITOR', 'DE', 'GESTÃO', 'DE', 'PESSOAS', 'DA', 'UNIVERSIDADE', 'FEDERAL', 'DO', 'RIO', 'GRANDE', 'DO', 'SUL', ',', 'no', 'uso', 'de', 'suas', 'atribuições', 'que', 'lhe', 'foram', 'conferidas', 'pela', 'Portaria', 'nº', '5469', ',', 'de', '04', 'de', 'outubro', 'de', '2012', ',', 'do', 'Magnífico', 'Reitor', ',', 'e', 'conforme', 'o', 'Laudo', 'Médico', 'n°37308', ',', 'RESOLVE', ':', 'Designar', ',', 'temporariamente', ',', 'nos', 'termos', 'da', 'Lei', 'nº', '8.112', ',', 'de', '11', 'de', 'dezembro', 'de', '1990', ',', 'com', 'redação', 'dada', 'pela', 'Lei', 'nº', '9.527', ',', 'de', '10', 'de', 'dezembro', 'de', '1997', ',', 'a', 'ocupante', 'do', 'cargo', 'de', 'PORTEIRO', ',', 'do', 'Quadro', 'de', 'Pessoal', 'desta', 'Universidade', ',', 'ELIANE', 'RICARDO', 'IRANC

# NOTEBOOK APPENDIX TO TEST TOKENIZATION METHODS

In [19]:
test_sentence = annotated_df['text'][0].lower()
test_sentence

'conceder à servidora lisiane ramos vilk, ocupante do cargo de administrador de edifícios - 701400, lotada na faculdade de arquitetura, siape 2325261, o percentual de 25% (vinte e cinco por cento) de incentivo à qualificação, a contar de 15/07/2016, tendo em vista a conclusão do curso de graduação em administração - bacharelado, conforme o processo nº 23078.015333/2016-19.'

In [20]:
w_tkn = TreebankWordTokenizer().tokenize(test_sentence)

In [21]:
checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
sw_tkn_1 = tokenizer(test_sentence, is_split_into_words=False)

checkpoint = 'bert-large-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
sw_tkn_2 = tokenizer(test_sentence, is_split_into_words=False)

checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
sw_tkn_3 = tokenizer(test_sentence, is_split_into_words=False)

checkpoint = 'bert-large-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
sw_tkn_4 = tokenizer(test_sentence, is_split_into_words=False)

checkpoint = 'bert-base-multilingual-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
sw_tkn_5 = tokenizer(test_sentence, is_split_into_words=False)

checkpoint = 'bert-large-uncased-whole-word-masking'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
sw_tkn_6 = tokenizer(test_sentence, is_split_into_words=False)

checkpoint = 'bert-large-cased-whole-word-masking'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
sw_tkn_7 = tokenizer(test_sentence, is_split_into_words=False)

checkpoint = 'neuralmind/bert-base-portuguese-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
sw_tkn_8 = tokenizer(test_sentence, is_split_into_words=False)

checkpoint = 'neuralmind/bert-large-portuguese-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
sw_tkn_9 = tokenizer(test_sentence, is_split_into_words=False)

In [22]:
print(w_tkn)

['conceder', 'à', 'servidora', 'lisiane', 'ramos', 'vilk', ',', 'ocupante', 'do', 'cargo', 'de', 'administrador', 'de', 'edifícios', '-', '701400', ',', 'lotada', 'na', 'faculdade', 'de', 'arquitetura', ',', 'siape', '2325261', ',', 'o', 'percentual', 'de', '25', '%', '(', 'vinte', 'e', 'cinco', 'por', 'cento', ')', 'de', 'incentivo', 'à', 'qualificação', ',', 'a', 'contar', 'de', '15/07/2016', ',', 'tendo', 'em', 'vista', 'a', 'conclusão', 'do', 'curso', 'de', 'graduação', 'em', 'administração', '-', 'bacharelado', ',', 'conforme', 'o', 'processo', 'nº', '23078.015333/2016-19', '.']


In [23]:
print(sw_tkn_1.tokens())
print(sw_tkn_2.tokens())
print(sw_tkn_3.tokens())
print(sw_tkn_4.tokens())
print(sw_tkn_5.tokens())
print(sw_tkn_6.tokens())
print(sw_tkn_7.tokens())
print(sw_tkn_8.tokens())
print(sw_tkn_9.tokens())

['[CLS]', 'con', '##cede', '##r', 'a', 'ser', '##vid', '##ora', 'li', '##sian', '##e', 'ramos', 'vi', '##lk', ',', 'o', '##cup', '##ante', 'do', 'cargo', 'de', 'ad', '##mini', '##stra', '##dor', 'de', 'ed', '##ific', '##ios', '-', '70', '##14', '##00', ',', 'lot', '##ada', 'na', 'fa', '##cu', '##lda', '##de', 'de', 'ar', '##qui', '##tet', '##ura', ',', 'si', '##ape', '232', '##52', '##6', '##1', ',', 'o', 'percent', '##ual', 'de', '25', '%', '(', 'vin', '##te', 'e', 'ci', '##nco', 'por', 'cent', '##o', ')', 'de', 'inc', '##ent', '##ivo', 'a', 'qu', '##ali', '##fi', '##ca', '##cao', ',', 'a', 'con', '##tar', 'de', '15', '/', '07', '/', '2016', ',', 'tend', '##o', 'em', 'vista', 'a', 'con', '##cl', '##usa', '##o', 'do', 'cu', '##rso', 'de', 'gr', '##ad', '##ua', '##cao', 'em', 'ad', '##mini', '##stra', '##cao', '-', 'bach', '##are', '##lad', '##o', ',', 'conform', '##e', 'o', 'process', '##o', 'n', '##º', '230', '##7', '##8', '.', '01', '##53', '##33', '/', '2016', '-', '19', '.', '[SEP]