In [82]:
import os
import re
import textwrap
from pprint import pprint

import nltk
import numpy as np
import pandas as pd
from datasets import load_dataset
from nltk.tokenize import sent_tokenize, word_tokenize, TreebankWordTokenizer, WhitespaceTokenizer
from sklearn.model_selection import train_test_split

import data_processing
# Fetch the 'punkt' data package through NLTK's downloader (no-op when it is already up to date).
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\arthu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [43]:
# Directories holding the test and train parts of the dataset.
# Despite the "_file" suffix, both values are directory paths (they are listed / joined with file names below).
path_to_test_file = "acerpi_dataset/test/"
path_to_train_file = "acerpi_dataset/train/"

#### Open Unannotated Train Documents
We read the unannotated train texts and store each document as a key-value pair in a dictionary (key: document id taken from the file name, value: document text).

In [41]:
# Map each numeric document id (the .txt file name without its extension) to the file's full text.
train_documents = {}
txt_names = [name for name in os.listdir(path_to_train_file) if name.endswith(".txt")]
for txt_name in txt_names:
    txt_path = os.path.join(path_to_train_file, txt_name)
    with open(txt_path, 'r', encoding="utf8") as handle:
        doc_id = int(txt_name.replace('.txt', ''))
        train_documents[doc_id] = handle.read()

#### Open Annotated Sentences
We load the sentences from 'prep_data_labeling.ipynb' that we annotated fully manually with doccano.

In [44]:
# Read the JSON Lines export of the manual annotations: one JSON record per line.
# 'records' is the documented orient name; the previous value 'record' is not a valid orient
# and only worked because unknown values fall through to the default record parser.
annotated_sentences = pd.read_json(
    os.path.join(path_to_test_file, 'annotated_sentences.jsonl'),
    orient='records',
    lines=True,
)

In [45]:
# Show the (rows, columns) shape, then preview the first rows of the annotated sentences.
print(annotated_sentences.shape)
annotated_sentences.head()

(84, 6)


Unnamed: 0,id,text,document,sentence_id,duplicates,label
0,115,Documento gerado sob autenticação Nº LKB.506.4...,105798,0,[0],"[[233, 238, ORGANIZATION]]"
1,116,1/1 PORTARIA Nº 1955 de 05/03/2020 O PRÓ-REITO...,105798,1,[1],"[[72, 113, ORGANIZATION], [448, 475, MISCELLAN..."
2,117,MAURÍCIO VIÉGAS DA SILVA Pró-Reitor de Gestão ...,105798,2,"[2, 5, 8, 11, 17, 19, 21, 23, 25, 41, 56]","[[0, 24, PERSON], [25, 56, MISCELLANEOUS]]"
3,118,Documento gerado sob autenticação Nº BOA.507.6...,105799,3,[3],"[[233, 238, ORGANIZATION]]"
4,119,1/1 PORTARIA Nº 1956 de 05/03/2020 O PRÓ-REITO...,105799,4,[4],"[[72, 113, ORGANIZATION], [229, 235, MISCELLAN..."


Next we'll explode our dataframe so that duplicate sentences can become separate rows. We'll also drop a few columns that will have inconsistent information after the explode and will also not be useful for us.

In [46]:
pd.options.display.max_rows = 999

# The guard makes this cell safe to re-run: once the frame has been reshaped the
# 'duplicates' column no longer exists and the transformation is skipped.
if "duplicates" in annotated_sentences.columns:
    annotated_sentences = (
        annotated_sentences
        # One row per entry of the 'duplicates' list, so duplicated sentences get their own row.
        .explode("duplicates")
        # These columns are inconsistent after the explode and are not needed afterwards.
        .drop(columns=['id', 'document', 'sentence_id'])
        # Rename by name (not by position) so a change in column order cannot mislabel columns.
        .rename(columns={'duplicates': 'sentence_id'})
        # verify_integrity raises if two rows end up with the same sentence id.
        .set_index('sentence_id', verify_integrity=True)
        .rename_axis(None)
        .sort_index()
    )

  return Index(sequences[0], name=names)


In [47]:
# Shape and preview after the explode: one row per sentence id, with only 'text' and 'label' left.
print(annotated_sentences.shape)
annotated_sentences.head()

(106, 2)


Unnamed: 0,text,label
0,Documento gerado sob autenticação Nº LKB.506.4...,"[[233, 238, ORGANIZATION]]"
1,1/1 PORTARIA Nº 1955 de 05/03/2020 O PRÓ-REITO...,"[[72, 113, ORGANIZATION], [448, 475, MISCELLAN..."
2,MAURÍCIO VIÉGAS DA SILVA Pró-Reitor de Gestão ...,"[[0, 24, PERSON], [25, 56, MISCELLANEOUS]]"
3,Documento gerado sob autenticação Nº BOA.507.6...,"[[233, 238, ORGANIZATION]]"
4,1/1 PORTARIA Nº 1956 de 05/03/2020 O PRÓ-REITO...,"[[72, 113, ORGANIZATION], [229, 235, MISCELLAN..."


Let's tokenize the sentences and tokens using the TreebankWordTokenizer
The first method 'tokenize' will split our sentence and return a list of words which we'll organize in a 'tokenized_sentences' list.
The second method 'span_tokenize' will return the start and end position of each token resulting from the split. We'll organize them in a 'token_positions' list.

In [54]:
# Build the tokenizer once instead of twice per sentence; it is reused for every sentence.
treebank_tokenizer = TreebankWordTokenizer()

tokenized_sentences = []  # one list of token strings per sentence
token_positions = []      # one list of (start, end) character spans per sentence
for sentence in annotated_sentences['text']:
    tokenized_sentences.append(treebank_tokenizer.tokenize(sentence))
    token_positions.append(list(treebank_tokenizer.span_tokenize(sentence)))


print(tokenized_sentences[0])
print(token_positions[0])

['Documento', 'gerado', 'sob', 'autenticação', 'Nº', 'LKB.506.405.IRF', ',', 'disponível', 'no', 'endereço', 'http', ':', '//www.ufrgs.br/autenticacao', 'Documento', 'certificado', 'eletronicamente', ',', 'conforme', 'Portaria', 'nº', '3362/2016', ',', 'que', 'institui', 'o', 'Sistema', 'de', 'Documentos', 'Eletrônicos', 'da', 'UFRGS', '.']
[(0, 9), (10, 16), (17, 20), (21, 33), (34, 36), (37, 52), (52, 53), (54, 64), (65, 67), (68, 76), (77, 81), (81, 82), (82, 109), (110, 119), (120, 131), (132, 147), (147, 148), (149, 157), (158, 166), (167, 169), (170, 179), (179, 180), (181, 184), (185, 193), (194, 195), (196, 203), (204, 206), (207, 217), (218, 229), (230, 232), (233, 238), (238, 239)]


In [53]:
# Pair every token with its (start, end) span, sentence by sentence.
tokenized_sentence_position = [
    list(zip(words, spans))
    for words, spans in zip(tokenized_sentences, token_positions)
]

In [52]:
# Compare the character-span labels of sentence 43 with its (token, (start, end)) pairs.
print(annotated_sentences['label'].loc[43])
tokenized_sentence_position[43]

[[43, 91, 'MISCELLANEOUS']]


[('1/2', (0, 3)),
 ('PORTARIA', (4, 12)),
 ('Nº', (13, 15)),
 ('3796', (16, 20)),
 ('de', (21, 23)),
 ('03/05/2017', (24, 34)),
 ('Nomeia', (36, 42)),
 ('Coordenadoras', (43, 56)),
 ('do', (57, 59)),
 ('Programa', (60, 68)),
 ('Idiomas', (69, 76)),
 ('sem', (77, 80)),
 ('Fronteiras', (81, 91)),
 ('-', (92, 93)),
 ('IsF', (94, 97)),
 ('.', (97, 98))]

Next we create a list of dataframes. Each dataframe represents one sentence and has the columns 'token', 'start' and 'end'.

In [248]:
def _pairs_to_frame(token_span_pairs):
    """Turn a list of (token, (start, end)) pairs into a frame with columns token/start/end."""
    frame = pd.DataFrame(token_span_pairs, columns=['token', 'pos'])
    # Split the (start, end) tuples into two separate columns, then discard the tuple column.
    frame[['start', 'end']] = frame['pos'].to_list()
    return frame.drop(columns='pos')


# One dataframe per sentence, in the same order as tokenized_sentence_position.
list_of_token_df = [_pairs_to_frame(pairs) for pairs in tokenized_sentence_position]

In [249]:
# Inspect the token dataframe built for sentence 43.
list_of_token_df[43]

Unnamed: 0,token,start,end
0,1/2,0,3
1,PORTARIA,4,12
2,Nº,13,15
3,3796,16,20
4,de,21,23
5,03/05/2017,24,34
6,Nomeia,36,42
7,Coordenadoras,43,56
8,do,57,59
9,Programa,60,68


In [250]:
# `test_label` was never defined in this notebook (it only existed as leftover kernel state),
# so a fresh "Restart & Run All" raised NameError. The displayed spans are those of sentence 1.
test_label = annotated_sentences['label'].loc[1]
test_label

[[72, 113, 'ORGANIZATION'],
 [448, 475, 'MISCELLANEOUS'],
 [518, 546, 'PERSON'],
 [582, 609, 'PERSON'],
 [629, 636, 'MISCELLANEOUS'],
 [640, 671, 'ORGANIZATION'],
 [675, 719, 'ORGANIZATION']]

We now have, for each sentence, a dataframe with all of its words and a list with all of its labels.
What remains is to assign the correct label to each word of the tokenized sentence.

In [251]:
def _bio_tag(start, end, spans):
    """Return the BIO tag of a token located at characters [start, end).

    spans is a list of [span_start, span_end, entity_name] entries. The first span that
    matches wins: a token starting exactly at the span start gets 'B-<entity>', a token
    lying fully inside the span gets 'I-<entity>'. Tokens matching no span get 'O'.
    """
    for span in spans:
        if start == span[0]:
            return 'B-' + span[2]
        if start >= span[0] and end <= span[1]:
            return 'I-' + span[2]
    return 'O'


# Relies on annotated_sentences being indexed 0..n-1 in the same order as list_of_token_df.
for index, token_df in enumerate(list_of_token_df):
    # Look the sentence's spans up once instead of once per token.
    sentence_spans = annotated_sentences['label'].loc[index]
    token_df['label'] = [
        _bio_tag(start, end, sentence_spans)
        for start, end in zip(token_df['start'], token_df['end'])
    ]

In [252]:
# Sanity check: the number of token dataframes should equal the number of annotated sentences.
print(annotated_sentences.shape,'\n',
      len(list_of_token_df))

(106, 2) 
 106


In [253]:
# Validate that the tokens are correctly assigned to each label.
# Iterate over every sentence: the previous range(0, 105) skipped the last of the 106 sentences.
# This cell must run before the labels are converted to integer ids, since it filters on the string 'O'.
for index, token_df in enumerate(list_of_token_df):
    print(annotated_sentences['label'].loc[index])
    print(token_df[token_df['label'] != 'O'])

[[233, 238, 'ORGANIZATION']]
    token  start  end           label
30  UFRGS    233  238  B-ORGANIZATION
[[72, 113, 'ORGANIZATION'], [448, 475, 'MISCELLANEOUS'], [518, 546, 'PERSON'], [582, 609, 'PERSON'], [629, 636, 'MISCELLANEOUS'], [640, 671, 'ORGANIZATION'], [675, 719, 'ORGANIZATION']]
             token  start  end            label
13    UNIVERSIDADE     72   84   B-ORGANIZATION
14         FEDERAL     85   92   I-ORGANIZATION
15              DO     93   95   I-ORGANIZATION
16             RIO     96   99   I-ORGANIZATION
17          GRANDE    100  106   I-ORGANIZATION
18              DO    107  109   I-ORGANIZATION
19             SUL    110  113   I-ORGANIZATION
92      ASSISTENTE    448  458  B-MISCELLANEOUS
93              EM    459  461  I-MISCELLANEOUS
94   ADMINISTRAÇÃO    462  475  I-MISCELLANEOUS
103      FRANCIELE    518  527         B-PERSON
104        MARQUES    528  535         I-PERSON
105     ZIQUINATTI    536  546         I-PERSON
114         TURENE    582  588       

In [254]:
# Convert label from name to id
label_to_id = {
    'O': 0,
    'B-PERSON': 1,
    'I-PERSON': 2,
    'B-ORGANIZATION': 3,
    'I-ORGANIZATION': 4,
    'B-LOCATION': 5,
    'I-LOCATION': 6,
    'B-MISCELLANEOUS': 7,
    'I-MISCELLANEOUS': 8
}
for sentence in list_of_token_df:
    label_ids = sentence['label'].map(label_to_id)
    # Series.map yields NaN for values missing from the dictionary. That happens for an
    # unexpected entity type, and also when this cell is run a second time (the labels are
    # then already integers). Fail loudly instead of silently storing NaN labels.
    unmapped = label_ids.isna()
    if unmapped.any():
        unknown = sorted(set(sentence.loc[unmapped, 'label'].astype(str)))
        raise ValueError(f"Labels without an id (was this cell already run?): {unknown}")
    sentence['label'] = label_ids


In [262]:
# Inspect sentence 6 after its labels were converted to integer ids.
list_of_token_df[6]

Unnamed: 0,token,start,end,label
0,Documento,0,9,0
1,gerado,10,16,0
2,sob,17,20,0
3,autenticação,21,33,0
4,Nº,34,36,0
5,KMT.508.839.IRF,37,52,0
6,",",52,53,0
7,disponível,54,64,0
8,no,65,67,0
9,endereço,68,76,0


### Exploring a dataset from the datasets library
The easiest way to use our data with HuggingFace is to use the datasets library. It allows us to import our own data and it will format it into a Dataset Object that is ready to be used by the NER model.

To understand how our data need to be formatted let's explore a sample dataset that already exists inside the datasets library.

In [263]:
# Load the CoNLL-2003 dataset as a formatting reference and display its structure.
sample_data = load_dataset('conll2003')
sample_data

Found cached dataset conll2003 (C:/Users/arthu/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

The DatasetDict format, which we'll want to use, is a dictionary of three Arrow Datasets: train, validation and test.
Each Dataset is described by two main attributes: features and num_rows. We need to make sure our JSON has the features 'tokens' and 'ner_tags'
The sample data uses the following dictionary to convert each label to an int:

**{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}**

Since we are using the exact same labels we can utilize this dictionary as well.

In [264]:
# Tokens of the first three training sentences of the sample dataset.
sample_data['train']['tokens'][:3]

[['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'],
 ['Peter', 'Blackburn'],
 ['BRUSSELS', '1996-08-22']]

In [265]:
# Integer NER tags of the same three sentences (one tag per token).
sample_data['train']['ner_tags'][:3]

[[3, 0, 7, 0, 0, 0, 7, 0, 0], [1, 2], [5, 0]]

### Splitting dataset into train and test
We'll split our DataFrames into two lists of lists: one for the input tokens and another for the labels.
After that we'll use the scikit-learn train_test_split function to get both our train and test data.

In [266]:
# Divide inputs and labels into X and y lists (one inner list per sentence).
sample_X = [list(sentence['token']) for sentence in list_of_token_df]
sample_y = [list(sentence['label']) for sentence in list_of_token_df]

# Split X and y into train and test.
# train_test_split is imported in the notebook's top import cell; the fixed random_state
# keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(sample_X, sample_y, test_size=0.33, random_state=42)

In [267]:
# Bundle each split into a dictionary keyed by column name.
train_data = dict(inputs=X_train, targets=y_train)
test_data = dict(inputs=X_test, targets=y_test)

# DataFrames are only an intermediate step here: they can be converted to the
# JSON "records" layout we need.
train_df, test_df = pd.DataFrame(train_data), pd.DataFrame(test_data)

# Serialise both splits as JSON strings (a list of {"inputs": ..., "targets": ...} records).
train_json, test_json = (frame.to_json(orient='records') for frame in (train_df, test_df))

In [268]:
# Save the train and test JSON strings in the auto_data directory.
# This directory serves as the repository of our auto labeled data; the datasets library
# loads the data back from it.
# NOTE(review): this is an absolute, machine-specific path; consider a project-relative one.
# Every backslash is now escaped: the previous "\B" was an invalid escape sequence that only
# happened to produce the intended text (the resulting string is unchanged).
file_path = "C:\\Users\\arthu\\Desktop\\ner-using-bert\\BERT_Experiment\\auto_data\\"

for split_file, split_json in (('train.json', train_json), ('test.json', test_json)):
    # Explicit encoding so the files do not depend on the platform's default.
    with open(file_path + split_file, 'w', encoding='utf-8') as outfile:
        outfile.write(split_json)

### Load and validate
We load the dataset that we saved previously. We'll use the load_dataset method from the datasets library, which will allow us to easily use hugging face models with our data.

In [269]:
# Load the JSON files found in file_path through the datasets library's 'json' loader.
ufrgs_data = load_dataset('json', data_dir = file_path)

Downloading and preparing dataset json/default to C:/Users/arthu/.cache/huggingface/datasets/json/default-acb63ead6bdfc0f7/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to C:/Users/arthu/.cache/huggingface/datasets/json/default-acb63ead6bdfc0f7/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [308]:
# We get a DatasetDict object holding two Datasets: one for train and one for test.
ufrgs_data

DatasetDict({
    train: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 71
    })
    test: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 35
    })
})

In [299]:
# Show the word tokens and the integer labels of training example 12.
print(ufrgs_data['train'][12]['inputs'])
print(ufrgs_data['train'][12]['targets'])

['RUI', 'VICENTE', 'OPPERMANN', ',', 'Reitor', '.']
[1, 2, 2, 0, 0, 0]


### Tokenizer
We create a tokenizer to convert our inputs into sub-word ids. We need to use a tokenizer that is compatible with the model we'll use.
HuggingFace makes that easy through the AutoTokenizer, which allows us to specify which model will be used and it already makes sure that our tokenizer will work with it.


In [274]:
from transformers import AutoTokenizer

In [275]:
# Name of the pretrained checkpoint; AutoTokenizer loads the tokenizer that matches it.
checkpoint = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [294]:
# Tokenize training example 12. is_split_into_words=True because the input is already a list of words.
t = tokenizer(ufrgs_data['train'][12]['inputs'], is_split_into_words=True)
t

{'input_ids': [101, 155, 22054, 7118, 10954, 15681, 2036, 152, 20923, 9637, 8271, 2249, 2249, 117, 11336, 15419, 119, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [295]:
# The tokenizer's return value looks like a dictionary, but it is actually a BatchEncoding object.
type(t)

transformers.tokenization_utils_base.BatchEncoding

In [296]:
# The object has a tokens() method that returns the sub-word tokens as strings,
# i.e. what the input looks like before being converted to integer ids.
t.tokens()

['[CLS]',
 'R',
 '##UI',
 'VI',
 '##CE',
 '##NT',
 '##E',
 'O',
 '##PP',
 '##ER',
 '##MA',
 '##N',
 '##N',
 ',',
 'Re',
 '##itor',
 '.',
 '[SEP]']

In [297]:
# The word_ids() method returns, for each sub-word, the index of the original word it was
# tokenized from (None for special tokens such as [CLS] and [SEP]).
t.word_ids()

[None, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 4, 4, 5, None]

### Target Alignment
Now that our input is composed of sub-words, we need to make sure that we have one target per sub-word. To do this we will use the align_targets function and map targets from each word to its sub-words.

In [300]:
# Maps every "begin" tag id to the "inside" tag id of the same entity type.
# Tag order: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
begin2inside = {1: 2, 3: 4, 5: 6, 7: 8}


def align_targets(labels, word_ids):
    """Expand word-level labels to one label per sub-word.

    labels:   label ids, one per original word.
    word_ids: for each sub-word, the index of the word it belongs to, or None
              for special tokens such as [CLS] and [SEP].

    Returns a list as long as word_ids. Special tokens get -100 (the value
    Hugging Face ignores during training). The first sub-word of a word keeps
    the word's label; following sub-words of the same word get the "inside"
    variant of a "begin" label and otherwise repeat the word's label.
    """
    aligned_labels = []
    last_seen = None

    for current in word_ids:
        if current is None:
            aligned_labels.append(-100)
        else:
            word_label = labels[current]
            if current == last_seen:
                # Continuation of the same word: B- becomes I-, anything else is kept.
                word_label = begin2inside.get(word_label, word_label)
            aligned_labels.append(word_label)
        last_seen = current

    return aligned_labels

In [307]:
## Label-Token Alignment Test
# Walk one training example through tokenization and label alignment, printing each stage.
idx = 2
example = ufrgs_data['train'][idx]

test_data = tokenizer(example['inputs'], is_split_into_words=True)
print("Tokenized Data:", test_data)
print("Word Tokens:", example['inputs'])

test_labels = example['targets']
print("Word Labels:", test_labels)

word_ids = test_data.word_ids()
print("Word IDs:", word_ids)

aligned_targets = align_targets(test_labels, word_ids)
print("Sub-Word Labels:", aligned_targets)
print("Sub-Word Tokens:", test_data.tokens())

Tokenized Data: {'input_ids': [101, 9960, 19556, 28188, 19747, 2346, 7118, 28187, 10583, 1708, 141, 1592, 156, 17656, 12152, 153, 1197, 7774, 118, 11336, 15419, 1260, 144, 2556, 9290, 1260, 153, 5800, 12985, 1116, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
Word Tokens: ['MAURÍCIO', 'VIÉGAS', 'DA', 'SILVA', 'Pró-Reitor', 'de', 'Gestão', 'de', 'Pessoas']
Word Labels: [1, 2, 2, 2, 7, 8, 8, 8, 8]
Word IDs: [None, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 6, 6, 6, 7, 8, 8, 8, 8, None]
Sub-Word Labels: [-100, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, -100]
Sub-Word Tokens: ['[CLS]', 'MA', '##UR', '##Í', '##CI', '##O', 'VI', '##É', '##GA', '##S', 'D', '##A', 'S', '##IL', '##VA', 'P', '##r', '##ó', '-', 'Re', '##itor', 'de', 'G', '##est', '##ão', 'de', 'P', '##ess', '##oa', '##s', '[SEP]']
