### Import external dependencies

In [2]:
import numpy as np
import pandas as pd
import textwrap
from pprint import pprint
import re
import sys
import os
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt', quiet=True)

True

### Import internal dependencies

In [3]:
# Make the project's `src` package importable from this notebook.
# NOTE(review): assumes the notebook is run from a sub-directory of the
# repository and that `src/` lives in the parent directory (the relative
# "../data/..." paths used further down point the same way) — confirm.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Bind the module itself as well as `clear_text`: other cells call helpers
# through the `data_cleaning.` prefix.
from src.data_cleaning import data_cleaning
from src.data_cleaning.data_cleaning import clear_text

ModuleNotFoundError: No module named 'src'

#### Open Unannotated Documents
We load the unannotated texts as key-value pairs. This makes it easy to convert them into a DataFrame.

In [None]:
# Relative locations of the raw documents.
# NOTE(review): in the cells visible here only `path_to_train` is actually read.
path_to_test = "../data/test/raw_documents/"
path_to_train = "../data/train/raw_documents/"

In [None]:
documents = {}

# Get train documents that require manual labeling.
# Only "*.txt" files are read; the file name without ".txt" must be an integer
# and becomes the document key (int() raises ValueError otherwise).
for filename in os.listdir(path_to_train):
    if not filename.endswith(".txt"):
        continue
    with open(os.path.join(path_to_train, filename), 'r', encoding="utf8") as f:
        key = int(filename.replace('.txt', ''))
        value = f.read()
        documents[key] = value

# Transform documents dictionary into a pandas dataframe indexed by document key.
# The column name is given at construction time so that an empty directory
# yields an empty frame with a 'text' column; assigning `df_docs.columns`
# afterwards raises a length-mismatch ValueError in that case.
df_docs = pd.DataFrame.from_dict(documents, orient='index', columns=['text'])

print(df_docs.shape)

#### Data Cleaning
We will use the **data_cleaning.clear_text** function, which was built to clean the specific documents we are using here. Since we already know they will be used for NER, the objective of the cleaning is to get the text into a format that is easy to split into sentences. Each sentence will later be used as an input to our neural network.
There are three aspects we are cleaning:
1. Removing line breaks, as they make it very hard for the sentence tokenizer to correctly recognize the beginning and end of sentences in the text.
2. Removing repeated whitespace, which is used for the visual formatting of the documents but would generate unnecessary tokens for our neural network.
3. There are a lot of law and document numbers in these documents, and they are not written consistently: a number can appear as "nº.123", "nº. 123" or "nº 123". We will standardize these occurrences to "nº 123", since the punctuation after "nº" makes it harder for the sentence tokenizer to correctly separate the sentences.

In [None]:
# Clean every document in place. `clear_text` is the name bound by
# `from src.data_cleaning.data_cleaning import clear_text` in the import cell,
# so it is called directly instead of through a `data_cleaning.` prefix.
df_docs['text'] = df_docs['text'].apply(clear_text)

#### Sentence tokenization
Having cleaned our documents, we will split them into sentences. The **data_cleaning.split_text_sentences** function returns a DataFrame with the sentences, a unique ID for each sentence, and the index of the document each sentence belongs to.

In [None]:
# Imported from the same module that provides `clear_text`, so this cell does
# not rely on a `data_cleaning` name having been bound elsewhere.
from src.data_cleaning.data_cleaning import split_text_sentences

# One row per sentence; later cells read its 'sentence' and 'sentence_id' columns.
df_sentences = split_text_sentences(df_docs['text'])
# NOTE(review): 0 is presumably meant to stop pandas from truncating long
# sentences on display; pandas documents `None` for "no limit" — confirm.
pd.set_option("display.max_colwidth", 0)
print(df_sentences.shape)

#### Dealing with duplicate sentences
Now that we have our dataset of sentences, we move on to the last step: identifying and categorizing duplicate sentences. Since we need to manually annotate the dataset for training, we can group duplicate sentences to make the annotation process easier. The documents we have are fairly standardized, so we will find a few duplicates.

Let's group sentences by their textual content. Each group will be composed of identical sentences.

In [None]:
# Group rows by exact sentence text: each group holds every row with that same text.
duplicate_group = df_sentences.groupby('sentence')

We loop over each group of duplicated sentences and build a dictionary whose key is the ID of the first sentence in the group and whose value is the list of IDs of all sentences identical to it (including the first one).

In [None]:
# For every group of identical sentences, map the first sentence_id of the
# group to the list of all sentence_ids in that group (the first id included).
duplicates = {
    ids[0]: ids
    for ids in (rows['sentence_id'].tolist() for _, rows in duplicate_group)
}

We create a DataFrame that contains only unique sentences. Sentences that have identical copies keep the IDs of all of those copies in a list in the 'duplicates' column. This will allow us to replicate the labels after the annotation process.

In [None]:
# Work on a copy so df_sentences keeps every sentence, duplicates included.
df_unique_sentences = df_sentences.copy()
# `duplicates` is keyed by sentence_id, so look it up through the
# 'sentence_id' column instead of aligning it against the frame's index
# labels (which only gives the right rows when index and sentence_id coincide).
# Rows that are not the first of their group get NaN here.
df_unique_sentences['duplicates'] = df_unique_sentences['sentence_id'].map(duplicates)
# Keep only the first occurrence of each sentence.
df_unique_sentences = df_unique_sentences.dropna(subset=['duplicates'])
# Empty placeholder for the annotations.
df_unique_sentences['label'] = ""

We can see that the unique DataFrame has fewer sentences; the difference is the number of duplicates we removed.

In [None]:
# Print (rows, columns) of the full and of the de-duplicated sentence frames.
pprint(df_sentences.shape)
pprint(df_unique_sentences.shape)

#### Manual Removal of Bad Sentences
Some sentences are not going to be helpful in our final train and test data: they contain at most one of the entities we are interested in. Because of that, we remove them from our dataset.

In [None]:
# Shape before filtering.
pprint(df_unique_sentences.shape)

# Keep only sentences longer than 20 characters.
is_long_enough = df_unique_sentences['sentence'].str.len() > 20
df_unique_sentences = df_unique_sentences[is_long_enough]

# Drop sentences that open with one of these prefixes, one prefix at a time.
for unwanted_prefix in ("Documento gerado sob", "Solicitação nº", "Processo"):
    has_prefix = df_unique_sentences['sentence'].str.startswith(unwanted_prefix)
    df_unique_sentences = df_unique_sentences[~has_prefix]

# Drop sentences containing "RESOLVE" anywhere in the text.
mentions_resolve = df_unique_sentences['sentence'].str.contains("RESOLVE")
df_unique_sentences = df_unique_sentences[~mentions_resolve]

# Shape after filtering.
pprint(df_unique_sentences.shape)

In [None]:
# Write the cleaned documents and the filtered, de-duplicated sentences as
# JSON Lines files (one record per line) under save_path.
save_path = "../data/unannotated/"
clean_documents_file = os.path.join(save_path, 'clean_raw_documents.jsonl')
unlabeled_sentences_file = os.path.join(save_path, 'unlabeled_sentences.jsonl')
df_docs.to_json(clean_documents_file, orient='records', lines=True)
df_unique_sentences.to_json(unlabeled_sentences_file, orient='records', lines=True)

#### Assistive Auto Labelling
Before we import our sentences into doccano for manual labelling, we'll use the auto_tagger to automatically label them. This will make it easier to do the manual labelling as a few entities will already be correctly or almost correctly identified.

In [None]:
# Load the auto_tagger model
# Builds a Hugging Face token-classification pipeline around the model
# identified by 'auto_tagger'.
from transformers import pipeline
assistive_auto_tagger = pipeline(
    "token-classification",
    model='auto_tagger',
    # Group tokens into entities, averaging the token scores
    # (see the transformers aggregation_strategy documentation).
    aggregation_strategy="average",
    # Replaces the pipeline's default ignore list; the 'O' groups must be kept
    # because the Doccano conversion further down filters them itself.
    ignore_labels=[""],
    # NOTE(review): device index 0 is presumably the first GPU — confirm the
    # notebook is expected to run only on a GPU machine.
    device=0
)

In [None]:
# Run the sentences through our NER tagging model.
# Only the first 512 characters of each sentence (x[:512]) are passed to the
# pipeline; its output is stored per row in the 'tagged_entities' column.
df_unique_sentences['tagged_entities'] = df_unique_sentences['sentence'].apply(lambda x: assistive_auto_tagger(x[:512]))

In [None]:
# Preview the first 10 rows of the 'tagged_entities' column.
df_unique_sentences['tagged_entities'].head(10)

In [None]:
# Run the sentences through our NER tagging model.
# Only the first 512 characters of each sentence (x[:512]) are passed to the
# pipeline; its output is stored per row in the 'tagged_entities' column.
df_unique_sentences['tagged_entities'] = df_unique_sentences['sentence'].apply(lambda x: assistive_auto_tagger(x[:512]))

In [None]:
# Preview the first 10 rows of the 'tagged_entities' column.
df_unique_sentences['tagged_entities'].head(10)

In [None]:
# Run the sentences through our NER tagging model.
# Only the first 512 characters of each sentence (x[:512]) are passed to the
# pipeline; its output is stored per row in the 'tagged_entities' column.
df_unique_sentences['tagged_entities'] = df_unique_sentences['sentence'].apply(lambda x: assistive_auto_tagger(x[:512]))

In [None]:
# Preview the first 10 rows of the 'tagged_entities' column.
df_unique_sentences['tagged_entities'].head(10)

In [None]:
# Preview the first 10 rows of the 'tagged_entities' column.
df_unique_sentences['tagged_entities'].head(10)

In [None]:
# Run the sentences through our NER tagging model.
# Only the first 512 characters of each sentence (x[:512]) are passed to the
# pipeline; its output is stored per row in the 'tagged_entities' column.
df_unique_sentences['tagged_entities'] = df_unique_sentences['sentence'].apply(lambda x: assistive_auto_tagger(x[:512]))

In [None]:
# Preview the first 10 rows of the 'tagged_entities' column.
df_unique_sentences['tagged_entities'].head(10)

In [None]:
# Run the sentences through our NER tagging model.
# Only the first 512 characters of each sentence (x[:512]) are passed to the
# pipeline; its output is stored per row in the 'tagged_entities' column.
df_unique_sentences['tagged_entities'] = df_unique_sentences['sentence'].apply(lambda x: assistive_auto_tagger(x[:512]))

In [None]:
# Preview the first 10 rows of the 'tagged_entities' column.
df_unique_sentences['tagged_entities'].head(10)

In [None]:
# Preview the first 10 rows of the 'tagged_entities' column.
df_unique_sentences['tagged_entities'].head(10)

In [None]:
# Run the sentences through our NER tagging model.
# Only the first 512 characters of each sentence (x[:512]) are passed to the
# pipeline; its output is stored per row in the 'tagged_entities' column.
df_unique_sentences['tagged_entities'] = df_unique_sentences['sentence'].apply(lambda x: assistive_auto_tagger(x[:512]))

In [None]:
# Preview the first 10 rows of the 'tagged_entities' column.
df_unique_sentences['tagged_entities'].head(10)

In [None]:
# Run the sentences through our NER tagging model.
# Only the first 512 characters of each sentence (x[:512]) are passed to the
# pipeline; its output is stored per row in the 'tagged_entities' column.
df_unique_sentences['tagged_entities'] = df_unique_sentences['sentence'].apply(lambda x: assistive_auto_tagger(x[:512]))

In [None]:
# Preview the first 10 rows of the 'tagged_entities' column.
df_unique_sentences['tagged_entities'].head(10)

In [None]:
# Run the sentences through our NER tagging model.
# Only the first 512 characters of each sentence (x[:512]) are passed to the
# pipeline; its output is stored per row in the 'tagged_entities' column.
df_unique_sentences['tagged_entities'] = df_unique_sentences['sentence'].apply(lambda x: assistive_auto_tagger(x[:512]))

In [None]:
# Preview the first 10 rows of the 'tagged_entities' column.
df_unique_sentences['tagged_entities'].head(10)

In [None]:
# Run the sentences through our NER tagging model.
# Only the first 512 characters of each sentence (x[:512]) are passed to the
# pipeline; its output is stored per row in the 'tagged_entities' column.
df_unique_sentences['tagged_entities'] = df_unique_sentences['sentence'].apply(lambda x: assistive_auto_tagger(x[:512]))

In [None]:
# Preview the first 10 rows of the 'tagged_entities' column.
df_unique_sentences['tagged_entities'].head(10)

In [None]:
# Run the sentences through our NER tagging model.
# Only the first 512 characters of each sentence (x[:512]) are passed to the
# pipeline; its output is stored per row in the 'tagged_entities' column.
df_unique_sentences['tagged_entities'] = df_unique_sentences['sentence'].apply(lambda x: assistive_auto_tagger(x[:512]))

In [None]:
# Preview the first 10 rows of the 'tagged_entities' column.
df_unique_sentences['tagged_entities'].head(10)

In [None]:
# Run the sentences through our NER tagging model.
# Only the first 512 characters of each sentence (x[:512]) are passed to the
# pipeline; its output is stored per row in the 'tagged_entities' column.
df_unique_sentences['tagged_entities'] = df_unique_sentences['sentence'].apply(lambda x: assistive_auto_tagger(x[:512]))

In [None]:
# Preview the first 10 rows of the 'tagged_entities' column.
df_unique_sentences['tagged_entities'].head(10)

In [None]:
# Run the sentences through our NER tagging model.
# Only the first 512 characters of each sentence (x[:512]) are passed to the
# pipeline; its output is stored per row in the 'tagged_entities' column.
df_unique_sentences['tagged_entities'] = df_unique_sentences['sentence'].apply(lambda x: assistive_auto_tagger(x[:512]))

In [None]:
# Preview the first 10 rows of the 'tagged_entities' column.
df_unique_sentences['tagged_entities'].head(10)

In [None]:
# Reformat the classified entities text to match the original sentence.
# Applied row by row: each call receives the row's sentence and its tagged
# entities, and the result replaces the 'tagged_entities' value.
# NOTE(review): no cell visible here binds the name `data_processing`, so this
# line raises NameError as written — confirm which module provides
# `reformat_sentence` and import it before this cell.
df_unique_sentences['tagged_entities'] = df_unique_sentences.apply(lambda x: data_processing.reformat_sentence(x['sentence'], x['tagged_entities']), axis=1)

In [None]:
# Preview the first 10 rows of the 'tagged_entities' column.
df_unique_sentences['tagged_entities'].head(10)

In [None]:
def ner_entity_dict_to_doccano_jsonl(tagged_entities):
    """Convert Hugging Face NER pipeline output into Doccano-style text and labels.

    Useful when the NER pipeline serves as an assistive auto-labeller ahead of
    manual annotation in Doccano.

    Args:
        tagged_entities: Iterable of entity-group dicts, each providing the
            keys 'word', 'entity_group', 'start' and 'end'.

    Returns:
        A ``(text, labels)`` tuple. ``text`` is every group's 'word' followed
        by a single space (so a trailing space remains), with each " ,"
        collapsed to ",". ``labels`` holds one ``[start, end, entity_group]``
        list per group whose 'entity_group' is not 'O', in input order.
    """
    words = []
    labels = []
    for group in tagged_entities:
        words.append(group['word'])
        group_type = group['entity_group']
        if group_type == 'O':
            # 'O' groups contribute text only, never a label span.
            continue
        labels.append([group['start'], group['end'], group_type])

    text = ''.join(word + ' ' for word in words)
    # Remove the space the join puts in front of commas.
    text = text.replace(' ,', ',')
    return text, labels

In [None]:
# Preview the first rows of the unique-sentence frame.
df_unique_sentences.head()

In [None]:
# Bare expression: its value is discarded because it is not the last statement of the cell.
df_unique_sentences['tagged_entities'].loc[2]
# Spot-check the converter on the row whose index label is 2
# (label-based lookup: raises KeyError if that row is no longer in the frame).
print(ner_entity_dict_to_doccano_jsonl(df_unique_sentences['tagged_entities'].loc[2]))
# Store only the label spans (element [1] of the returned tuple) for every sentence.
df_unique_sentences['label'] = df_unique_sentences.apply(lambda x: ner_entity_dict_to_doccano_jsonl(x['tagged_entities'])[1],axis=1)

In [None]:
# Write the auto-labelled sentences as a JSON Lines file (one record per line).
# NOTE(review): the target folder is `path_to_train` (the raw-documents
# directory), not the "../data/unannotated/" folder used by the earlier save
# cell — confirm this is intended.
df_unique_sentences.to_json(os.path.join(path_to_train, 'unique_sentences.jsonl'),lines=True, orient = 'records')

In [None]:
# Inspect the row(s) whose sentence_id equals 2507.
df_unique_sentences[df_unique_sentences['sentence_id'] == 2507]