#### Import dependencies

In [1]:
#External
import numpy as np
import pandas as pd
import textwrap
from pprint import pprint
import re
import sys
import os
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt', quiet=True)

#Internal
import data_cleaning.data_cleaning as dc

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\arthu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### Open Unannotated Documents
We get the unnanotatted texts as key, value pairs. This will make it easy to convert them into a dataframe.

In [2]:
path_to_raw = "../data/raw_documents/"

In [3]:
documents = {}

# Get documents that require manual labeling
for filename in os.listdir(path_to_raw):
   if filename.endswith(".txt"):
      with open(os.path.join(path_to_raw, filename), 'r', encoding="utf8") as f:
         key = int(filename.replace('.txt', ''))
         value = f.read()
         documents[key] = value

# Transform documents dictionary into a pandas dataframe
df_docs = pd.DataFrame.from_dict(documents, orient='index')
df_docs.columns = ['text']

print(df_docs.shape)

(568, 1)


#### Data Cleaning
We will use the **data_processing.clear_text** function which was built to clean the specific document we are using here. Since we already know they will be used for NER, the objective of the cleaning is getting the text into a format which will be easy to split into sentences. Each sentence will later be used as an input to our neural network.
There are three aspects we are cleaning:
1. Removing line breaks as they make it very hard for the sentence tokenizer to correctly recognize the begining and ending of sentences in the text
2. Removing repeated whitespaces which are used for the visual formatting of the documents but will generate unnecessary tokens for our neural network.
3. There are a lot of law and document numbers in this documents. There isn't a consistent writing of these numbers which can start as "nº.123", "nº. 123" and "nº 123". We will padronize this occurences to appear as "nº 123" since the punctuation after "nº" makes it harder for the sentence tokenizer to correctly separate the sentences.

In [4]:
df_docs['text'] = df_docs['text'].apply(dc.clear_text)

#### Sentence tokenization
Having cleaned our documents we will split them into sentences. The **data_processing.split_text_sentences** function will return a DataFrame with: The sentences, a unique ID for each sentence and the index of the document each sentence is a part of.

In [5]:
df_sentences = dc.split_text_sentences(df_docs['text'])

# We create a new column doc_sentence_id that serves as a unique identifier for each sentence in our dataset
df_sentences['doc_sentence_id'] = df_sentences['document'].astype(str) + '-' + df_sentences['sentence_id'].astype(str)

print(df_sentences.shape)

(2648, 4)


#### Dealing with duplicate sentences
Now that we have our dataset of sentences we'll go to the last step which is identifying and categorizing duplicate sentences. Since we need to manually annotate the dataset for training we can group duplicate sentences to facilitate the anottation process. The document we have are very standardized so there are multiple duplicate sentences in them.

Let's group sentences by their text. Each group will be composed of identical sentences.

In [6]:
duplicate_group = df_sentences.groupby('sentence')

# Loop each group of duplicated sentences and create a dictionary where the key will be the index of the first sentence of the group and the value is the doc_sentence_id of all sentences equal to the first.
duplicates = {}
for i in duplicate_group:
    dup_list = i[1]['doc_sentence_id'].tolist()
    duplicates[dup_list[0]] = dup_list

We create a dataframe which will contain only unique sentences. The sentences that have identic pairs will have all of their indexes in a list in the 'duplicates' column. This will allow us to replicate the labels after the anotattion process.

In [7]:
df_unique_sentences = df_sentences.copy()
df_unique_sentences['duplicates'] = df_unique_sentences['doc_sentence_id'].map(duplicates)
df_unique_sentences = df_unique_sentences.dropna(subset=['duplicates'])
df_unique_sentences['label'] = ""
df_unique_sentences['tagged_entities'] = ''

We can see that the new dataframe has less sentences since each entry is a unique sentence.

In [8]:
pprint(df_sentences.shape)
pprint(df_unique_sentences.shape)

(2648, 4)
(2070, 7)


#### Bad Sentence Filter
Some sentences are not going to be helpful on our final train and test data. They either have just one or no entities we are interested in. Because of that we'll remove them from our dataset.

In [9]:
pprint(df_unique_sentences.shape)
df_unique_sentences = df_unique_sentences[df_unique_sentences['sentence'].str.len() > 20]
df_unique_sentences = df_unique_sentences[~df_unique_sentences['sentence'].str.startswith("Documento gerado sob")]
df_unique_sentences = df_unique_sentences[~df_unique_sentences['sentence'].str.startswith("Solicitação nº")]
df_unique_sentences = df_unique_sentences[~df_unique_sentences['sentence'].str.startswith("Processo")]
df_unique_sentences = df_unique_sentences[~df_unique_sentences['sentence'].str.contains("RESOLVE")]
pprint(df_unique_sentences.shape)

(2070, 7)
(768, 7)


#### Save a clean_documents file with all formatted documents


In [10]:
#pprint(df_unique_sentences.to_json(lines=True, orient = 'records'))
save_path = "../data/unannotated/"
df_docs.to_json(os.path.join(save_path, 'clean_documents_06-12-24.jsonl'),lines=True, orient = 'records')
df_unique_sentences.to_json(os.path.join(save_path, 'unique_sentences-06-12-24.jsonl'),lines=True, orient = 'records')

In [11]:
df_unique_sentences.columns

Index(['document', 'sentence', 'sentence_id', 'doc_sentence_id', 'duplicates',
       'label', 'tagged_entities'],
      dtype='object')

#### Update unannotated dataset
The unique_sentences file is in the format required to start manual anotation in doccano. We'll join it with a dataset of all sentences that have already been annotated so that we can update all of them inside docanno.

In [12]:
# Open annotated dataset to merge it with new unnanotated one.
path_to_annotated = "../data/annotated/"
df_annotated = pd.read_json(os.path.join(path_to_annotated, 'doccano-extraction-27-11-24.jsonl'), lines=True)

#Drop the id column since we don't use it for anything
df_annotated.drop(columns=['id'], inplace=True)

#Rename columns in both datasets so they match
df_annotated.columns = ['text', 'document_id', 'sentence_id', 'duplicate_ids', 'tagged_entities', 'label']
df_unique_sentences.columns = ['document_id', 'text', 'sentence_id', 'doc_sentence_id', 'duplicate_ids', 'label', 'tagged_entities']

### Merge both datasets
We do an outer join to take all rows and columns from both datasets.

We'll keep the **text**, **tagged_entities** and **label** columns from the annotated dataset as they have the most complete information on those (have already been tagged).

The **document_id**, **sentence_id**, **doc_sentence_id** and **duplicate_ids** are updated when new unnanotated documents are added, so we'll keep the unannotated dataset columns for these.

In [13]:
full_unannotated = df = pd.merge(df_annotated, df_unique_sentences, on='text', how='outer')

full_unannotated = full_unannotated[['text', 'tagged_entities_x', 'label_x', 'document_id_y', 'sentence_id_y', 'doc_sentence_id', 'duplicate_ids_y']]
full_unannotated.columns = ['text', 'tagged_entities', 'label', 'document_id', 'sentence_id', 'doc__sentence_id', 'duplicate_ids']

In [17]:
full_unannotated.head()

Unnamed: 0,text,tagged_entities,label,document_id,sentence_id,doc__sentence_id,duplicate_ids
0,"Conceder à servidora LISIANE RAMOS VILK, ocupa...","[{'entity_group': 'O', 'score': 0.7762932777, ...","[[21, 39, PERSON], [62, 88, OCCUPATION], [109,...",25644,2,25644-2,[25644-2]
1,MAURÍCIO VIÉGAS DA SILVA Pró-Reitor de Gestão ...,"[{'entity_group': 'PERSON', 'score': 0.6592996...","[[0, 24, PERSON], [25, 56, OCCUPATION]]",105798,3,105798-3,"[105798-3, 105799-3, 105801-3, 105802-3, 18001..."
2,Autorizar o afastamento do país de CRISTINE MA...,"[{'entity_group': 'O', 'score': 0.6819901466, ...","[[35, 58, PERSON], [60, 92, OCCUPATION], [119,...",25645,2,25645-2,[25645-2]
3,CARLOS ALEXANDRE NETTO Reitor,"[{'entity_group': 'PERSON', 'score': 0.6323550...","[[0, 22, PERSON], [23, 29, OCCUPATION]]",25645,4,25645-4,"[25645-4, 25646-4, 25647-4, 25648-4, 25649-4, ..."
4,Autorizar o afastamento do país de ANDRE DIAS ...,"[{'entity_group': 'O', 'score': 0.6808940768, ...","[[35, 53, PERSON], [55, 82, OCCUPATION], [109,...",25646,2,25646-2,[25646-2]


#### Save a unique_sentences file with all formatted sentences
This file is **ready** to be imported in doccano for manual anotation


In [16]:
full_unannotated.to_json(os.path.join(save_path, 'unique_sentences-09-12-24.jsonl'),lines=True, orient = 'records')