In [1]:
import data_processing
import os.path
import pandas as pd

# Import NLTK's tokenize module (splits text into sentences and word tokens)
# together with the resource downloader.
from nltk import tokenize, download
# Make sure the 'punkt' resource is available locally; the call is a no-op
# when the package is already up to date.
download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\arthu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\arthu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Show the current working directory: the relative dataset paths used below
# are resolved against it.
# NOTE(review): the rendered output exposes an absolute local path; consider
# clearing it before sharing the notebook.
os.getcwd()

'c:\\Users\\arthu\\Desktop\\TCC\\ner-using-bert\\BERT_Experiment'

# Import SpaCy annotated data

In [4]:
# Directories holding the annotated exports. The trailing slash is kept on
# purpose so that file names can be appended with plain string concatenation.
train_file_path = "acerpi_dataset/train/annotated/"
test_file_path = "acerpi_dataset/test/annotated/"

train_jsonl = train_file_path + "ner_ufrgs_correct_correct.jsonl"
test_jsonl = test_file_path + "ner_ufrgs_test.jsonl"

# lines=True: every line of the file is parsed as one JSON record.
train_data = pd.read_json(train_jsonl, lines=True)
test_data = pd.read_json(test_jsonl, lines=True)

## Extracting SpaCy annotations
The first thing we'll do is extract a list of labeled tokens. When importing the SpaCy annotations, the labels are not organized per token, so we need to go through the *spans* and *tokens* columns to convert the labelling from per-document to per-token.

In [5]:
# Widen the column display so the nested token dictionaries are not truncated.
pd.set_option("max_colwidth", 400)

# First two tokens of the row labelled 1, before the extraction step.
display(train_data['tokens'][1][:2])

# The helper returns the updated frame plus a separate frame of labeled tokens.
# NOTE: `train_data` is rebound here, so this cell is not idempotent -- re-run
# the loading cell before running it a second time.
train_data, spacy_tokens = data_processing.extract_spacy_tokens(train_data)

# Same two tokens afterwards: each one now also carries 'tag' and 'document'.
display(train_data['tokens'][1][:2])

[{'text': 'Documento', 'start': 0, 'end': 9, 'id': 0},
 {'text': 'gerado', 'start': 10, 'end': 16, 'id': 1}]

[{'text': 'Documento',
  'start': 0,
  'end': 9,
  'id': 0,
  'tag': 'O',
  'document': 1},
 {'text': 'gerado',
  'start': 10,
  'end': 16,
  'id': 1,
  'tag': 'O',
  'document': 1}]

In [6]:
# There are instances of tokens with \n and a whitespace, we want to remove those
# because they won't be present in our cleaned text later on.
# Only tokens made up entirely of letters are kept. The character class is spelled
# out explicitly: the range `A-z` would also match `[ \ ] ^ _` and the backtick,
# and `À-ÿ` would also match the `×` and `÷` signs.
# na=False treats a missing text value as "no match" so the mask stays boolean.
print(spacy_tokens.shape)
is_alphabetic = spacy_tokens['text'].str.fullmatch(r'[A-Za-zÀ-ÖØ-öø-ÿ]+', na=False)
spacy_tokens = spacy_tokens[is_alphabetic]
print(spacy_tokens.shape)

(3053, 3)
(2910, 3)


In [7]:
# Find repeated entities in spacy_tokens and remove them to have a more compact dataframe.
# Two entities are duplicates when they have the same sequence of (text, tag) pairs.
# Only the first occurrence (lowest entity_id) is kept.
#
# Groups are visited once and compared through a hashable signature, instead of
# comparing every pair of entities. Iterating over the groups that actually exist
# also avoids a KeyError when an entity_id has no tokens left after the filter above.
token_groups = spacy_tokens.groupby(by='entity_id')
seen_entities = set()
duplicate_entities = []
for _, entity_tokens in token_groups:
    # Ordered (text, tag) pairs identify the entity regardless of its row labels.
    signature = tuple(zip(entity_tokens['text'], entity_tokens['tag']))
    if signature in seen_entities:
        duplicate_entities += list(entity_tokens.index.values)
    else:
        seen_entities.add(signature)

spacy_tokens = spacy_tokens.drop(index=duplicate_entities)

In [13]:
# Sanity check after de-duplication: remaining (rows, columns) and the first
# ten labeled tokens.
print(spacy_tokens.shape)
spacy_tokens.head(10)

(1680, 3)


Unnamed: 0,text,tag,entity_id
94,GLAISON,PER,0.0
95,AUGUSTO,PER,0.0
96,GUERRERO,PER,0.0
210,RUI,PER,1.0
211,VICENTE,PER,1.0
212,OPPERMANN,PER,1.0
312,LUCIOLA,PER,2.0
313,CAMPESTRINI,PER,2.0
425,JANE,PER,3.0
426,FRAGA,PER,3.0


# Text preprocessing
Now that we have a list of all tokens that were classified as a **Named Entity** by SpaCy, we need to clean our text data to get better inputs and a cleaner list of tokens for our neural network.

In [None]:
# Clean the raw document text with the project helper; the result feeds the
# sentence-splitting step in the next cell.
clean_train_text = data_processing.clear_text(train_data['text'])
# Show the type returned by the helper.
type(clean_train_text)

In [None]:
def _tokenize_sentence(row):
    """Return the Portuguese word tokens of the row's 'sentence' value."""
    return tokenize.word_tokenize(row['sentence'], language='portuguese')


# Split the cleaned text into sentences with the project helper.
train_sentences = data_processing.split_text_sentences(clean_train_text)

# Add a 'tokens' column by tokenizing the 'sentence' column, row by row.
train_sentences['tokens'] = train_sentences.apply(_tokenize_sentence, axis=1)
train_sentences.head()

In [None]:
# Reshape towards the final layout, where each row is a single token (the final
# columns will be 'Sentence', 'Word' and 'Tag').
# Step 1: turn every element of the 'tokens' lists into its own row.
exploded_sentences = train_sentences.explode('tokens')

# Step 2: drop the columns that are no longer needed and renumber the rows.
train_tokens = (
    exploded_sentences
    .drop(columns=['document', 'sentence'])
    .reset_index(drop=True)
)
train_tokens.head()

In [None]:
# Match the tokenized text against the labeled SpaCy tokens (project helper).
train_labeled_tokens = data_processing.find_labeled_entitites(train_tokens, spacy_tokens)

# Rename the columns to the names used from here on.
train_labeled_tokens.columns = ['Sentence #', 'Word', 'Tag']

# Spot check: first rows that were tagged as a person.
is_person = train_labeled_tokens['Tag'] == 'PER'
train_labeled_tokens[is_person].head()

In [None]:
# Convert the tags with the project helper's IOB formatting.
# NOTE(review): presumably this yields B-/I- prefixed tags and 'O' for
# non-entities -- confirm against data_processing.apply_iob_format.
train_labeled_tokens = data_processing.apply_iob_format(train_labeled_tokens)

In [None]:
# Capitalize every word that belongs to a named entity.
# This is important since the pre-trained models all have names that follow this
# format; having all letters uppercase gives very bad results with BERT.
#
# `token_final` is built from the IOB-formatted frame of the previous cell, so it
# is defined on a fresh kernel. Working on a copy leaves `train_labeled_tokens`
# untouched, which keeps this cell safe to re-run.
token_final = train_labeled_tokens.copy()
token_final['Word'] = token_final.apply(
    # Only rows whose tag is not 'O' (i.e. part of an entity) are changed.
    lambda row: row['Word'].capitalize() if row['Tag'] != 'O' else row['Word'],
    axis=1,
)

In [None]:
# Persist the final token table next to the annotated training data.
# NOTE(review): the file name has no '.csv' extension and the row index is
# written as the first column (to_csv default) -- confirm downstream readers
# expect both.
token_final.to_csv(train_file_path + 'acerpi_ner_ufrgs_train_data')