### Replicate Tagged Entities
We'll take a dataset that was partially annotated in doccano and fill the rest of the documents in the dataset with the entities that were already tagged.\
Steps:
1. Load the partially tagged dataset
2. Create a dictionary of the tagged entities
3. Loop through the unannotated documents searching for possible entities in the dictionary.
4. Save the resulting dataset and load it in Doccano to finish manual tagging.



In [318]:
import pandas as pd
import os
import re

#### 1. Load the dataset

In [319]:
path_to_test = "acerpi_dataset/test/annotated"
path_to_train = "acerpi_dataset/train/annotated"

In [320]:
ner_dataset = pd.read_json(os.path.join(path_to_train, '300_annotated_ufrgs.jsonl'), orient='record', lines=True)

In [321]:
ner_dataset.head()

Unnamed: 0,id,text,document,sentence_id,duplicates,tagged_entities,label
0,199,"Conceder à servidora LISIANE RAMOS VILK, ocupa...",25644,2,[2],"[{'entity_group': 'O', 'score': 0.7762932777, ...","[[21, 39, PERSON], [62, 88, OCCUPATION], [109,..."
1,200,MAURÍCIO VIÉGAS DA SILVA Pró-Reitor de Gestão ...,25644,3,"[3, 94, 156, 168, 173, 272, 276, 280, 284, 288...","[{'entity_group': 'PERSON', 'score': 0.6592996...","[[0, 24, PERSON], [25, 56, OCCUPATION]]"
2,201,Autorizar o afastamento do país de CRISTINE MA...,25645,6,[6],"[{'entity_group': 'O', 'score': 0.6819901466, ...","[[35, 58, PERSON], [60, 92, OCCUPATION], [119,..."
3,202,CARLOS ALEXANDRE NETTO Reitor,25645,8,"[8, 13, 18, 23, 28, 38, 43, 63, 72, 81, 99, 10...","[{'entity_group': 'PERSON', 'score': 0.6323550...","[[0, 22, PERSON], [23, 29, OCCUPATION]]"
4,203,Autorizar o afastamento do país de ANDRE DIAS ...,25646,11,[11],"[{'entity_group': 'O', 'score': 0.6808940768, ...","[[35, 53, PERSON], [55, 82, OCCUPATION], [109,..."


In [322]:
tagged_sentences = ner_dataset.copy()
#untagged_sentences = ner_dataset.iloc[80:].copy()

#### 2. Create dictionary of entities

In [323]:
known_entities = {}

for index, row in tagged_sentences.iterrows():
    for entity in row['label']:
        start_pos = int(entity[0])
        end_pos = int(entity[1])
        label = entity[2]
        entity_text = row['text'][start_pos:end_pos]
        known_entities[entity_text] = label


##### 2.1 Use a DataFrame to modify the entities manually

In [324]:
# Create a dataframe of entities
entities_df = pd.DataFrame.from_dict(known_entities, orient= 'index')
entities_df=entities_df.reset_index()
entities_df.columns = ['entity', 'label']

In [325]:
# Regex match of entities to remove
entities_df[~entities_df['entity'].str.match(r'^[A-ZÀ-Ü]')]

Unnamed: 0,entity,label
617,", VÂNIA CRISTINA SANTOS PEREI",PERSON
618,r MAURÍCIO VIÉGAS DA SIL,PERSON
621,MAURÍCIO VIÉGAS DA SILV,PERSON
653,da,PERSON
666,ANAMARIA KURTZ DE SOUZA WELP CARLA KARNOPPI V...,PERSON
691,aroline,PERSON
757,ROSANGELA DALLA NOR,PERSON
852,CELSO GIANNETTI LOUREIRO CHAVE,PERSON
865,de,PERSON
903,CAROLINA RITTE,PERSON


In [326]:
#Remove entities that are incorrectly labeled
print(entities_df.shape)
entities_df = entities_df[entities_df['entity'].str.match(r'^[A-ZÀ-Ü]')]
print(entities_df.shape)


(922, 2)
(897, 2)


In [327]:
# Add date entities
dates = ner_dataset['text'].str.extractall(r'([0-9]{2}/[0-9]{2}/[0-9]+)').dropna().drop_duplicates().reset_index(drop=True)
text_dates = ner_dataset['text'].str.extractall(r'([0-9]+ de [A-Za-z]+ de [0-9]+)').dropna().drop_duplicates().reset_index(drop=True)
dates['label'] = 'DATE'
text_dates['label'] = 'DATE'
dates.columns = ['entity', 'label']
text_dates.columns = ['entity', 'label']
entities_df = pd.concat([entities_df, dates, text_dates])
entities_df = entities_df.drop_duplicates()

In [328]:
dates

Unnamed: 0,entity,label
0,15/07/2016,DATE
1,28/08/2016,DATE
2,01/09/2016,DATE
3,29/08/2016,DATE
4,02/09/2016,DATE
...,...,...
533,06/06/2019,DATE
534,11/06/2019,DATE
535,17/06/2019,DATE
536,05/06/2019,DATE


In [329]:
#Sort DataFrame by entity string length
entities_df = entities_df.sort_values(by=['entity'], key=lambda row: row.str.len(), ascending=False).reset_index(drop = True)

In [330]:
entities_df[entities_df['entity'] == '01/09/2016']

Unnamed: 0,entity,label
1272,01/09/2016,DATE


In [331]:
# Find if a string's position overlaps with other strings in a list of positions
def overlapping_string(start, end, strings_pos: list):
    is_overlapping = False
    for string_start, string_end, _ in strings_pos:
        if (start >= string_start and start <= string_end) or (end >= string_start and end <= string_end):
            #print(start, end, string_start, string_end)
            is_overlapping = True
    return is_overlapping

In [332]:
# Find all substrings that match an entity in our entities list. They are added to the sentence labels.
def find_matching_entities(sentence: str, entities: pd.DataFrame):
    matches = []
    for index, entry in entities.iterrows():
        entity = entry['entity']

        for match in re.finditer(entity, sentence):
            #print('Found: ', entity, match.start(), match.end())
            overlapping = overlapping_string(match.start(), match.end(), matches)

            if not overlapping:
                #print('No overlap')
                matches.append([match.start(), match.end(), entry['label']])
                
    return matches

In [333]:
print(tagged_sentences['text'].iloc[2])
find_matching_entities(tagged_sentences['text'].iloc[2], entities_df)

Autorizar o afastamento do país de CRISTINE MARIA WARMLING, Professor do Magistério Superior, lotada e em exercício no Departamento de Odontologia Preventiva e Social da Faculdade de Odontologia, com a finalidade de participar do "3ème Congrès de la Societé Internationale d'Ergologie", em Aix-en-Provence - França, no período compreendido entre 28/08/2016 e 01/09/2016, com ônus limitado.


[[119, 166, 'ORGANIZATION'],
 [250, 284, 'ORGANIZATION'],
 [60, 92, 'OCCUPATION'],
 [170, 194, 'ORGANIZATION'],
 [35, 58, 'PERSON'],
 [290, 305, 'LOCATION'],
 [346, 356, 'DATE'],
 [359, 369, 'DATE'],
 [308, 314, 'LOCATION']]

In [334]:
replicated_labels = pd.Series()
tagged_sentences['label'] = tagged_sentences['text'].apply(lambda row: find_matching_entities(row, entities_df))

  replicated_labels = pd.Series()


In [335]:
tagged_sentences['label'].iloc[37]

[[122, 179, 'ORGANIZATION'],
 [242, 278, 'ORGANIZATION'],
 [63, 95, 'OCCUPATION'],
 [35, 61, 'PERSON'],
 [183, 202, 'ORGANIZATION'],
 [328, 338, 'DATE'],
 [341, 351, 'DATE'],
 [290, 296, 'LOCATION'],
 [381, 386, 'ORGANIZATION'],
 [283, 288, 'LOCATION']]

In [71]:
# replicated_data = pd.concat([tagged_sentences, untagged_sentences])
# print(
#     tagged_sentences.shape,
#     '+', untagged_sentences.shape,
#     '=', replicated_data.shape)

(80, 7) + (649, 7) = (729, 7)


In [336]:
tagged_sentences.to_json(os.path.join(path_to_train, 'unique_sentences_replicated.jsonl'),lines=True, orient = 'records')