# Preparing the test set with gold annotations

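This test dataset (with gold annotations) was released after the competition was over. This notebook converts it into sentence-level records and writes them to `data/NER/gold_test.json`.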

In [1]:
import helper
import spacy
import glob
from typing import Tuple, List
from pathlib import Path
from tqdm import tqdm
import pandas as pd

from allennlp.data.token_indexers import PretrainedTransformerIndexer, PretrainedTransformerMismatchedIndexer
from allennlp.data import Token, Vocabulary

In [2]:
# Load the spaCy pipeline 'es_core_news_sm'; it is passed to create_df below.
nlp = spacy.load('es_core_news_sm')

In [3]:
# Swap the pipeline's tokenizer for the project-specific one built by helper.custom_tokenizer.
nlp.tokenizer = helper.custom_tokenizer(nlp)

In [13]:
def create_df(txt_files, ann_files, nlp, align_offsets = True):
    """Convert paired brat ``.txt``/``.ann`` files into one frame per document.

    Each pair is parsed with ``helper.brat2doc`` (always with
    ``remove_parents=True`` and ``remove_siblings=True``) and the resulting doc
    is passed to ``helper.doc2df`` together with the text file's name.

    Args:
        txt_files: Sized sequence of paths to the text files.
        ann_files: Paths to the annotation files, in the same order as
            ``txt_files`` (the two are paired by position).
        nlp: Pipeline object forwarded to ``helper.brat2doc``.
        align_offsets: Forwarded unchanged to ``helper.brat2doc``.

    Returns:
        A list holding the result of ``helper.doc2df`` for every pair, in
        input order.

    Raises:
        ValueError: If ``txt_files`` and ``ann_files`` differ in length.
            ``zip`` would otherwise silently drop the surplus files.
    """
    # Materialize so that len() works even if an iterator was passed in.
    ann_files = list(ann_files)
    if len(txt_files) != len(ann_files):
        raise ValueError(
            f"Number of text files ({len(txt_files)}) does not match "
            f"number of annotation files ({len(ann_files)})"
        )

    dfs = []
    for txt, ann in tqdm(zip(txt_files, ann_files), total=len(txt_files)):

        doc = helper.brat2doc(
            Path(txt),
            Path(ann),
            nlp,
            align_offsets=align_offsets,
            remove_parents=True,
            remove_siblings=True
        )
        # The text file's name is stored alongside the document's rows.
        dfs.append(helper.doc2df(doc, Path(txt).name))

    return dfs

def find_misaligned_annotations(df, verbose=True):
    """Return the index labels of all rows whose ``labels`` value contains ``'-'``.

    Presumably ``'-'`` is the placeholder tag produced upstream for tokens
    whose annotation could not be aligned -- TODO confirm against ``helper``.

    Args:
        df: DataFrame with a ``labels`` column.
        verbose: Accepted for interface compatibility; currently unused.

    Returns:
        List of index labels, in row order.
    """
    return [record.Index for record in df.itertuples() if '-' in record.labels]

In [14]:
# Collect the gold-annotated test files. Despite the ``_train`` suffix these
# are the test-set files; the names are kept because later cells use them.
ann_files_train = sorted(glob.glob("data/NER/cantemist-ner-gold/*.ann"))
txt_files_train = sorted(glob.glob("data/NER/cantemist-ner-gold/*.txt"))

# create_df pairs the two lists by position, so every .txt must line up with
# the .ann of the same stem; fail loudly instead of mispairing documents.
assert [Path(p).stem for p in ann_files_train] == [Path(p).stem for p in txt_files_train], (
    "every .txt file needs a matching .ann file (and vice versa)"
)

In [15]:
# Build one frame per document and stack them; ignore_index=True gives the
# combined frame a fresh 0..n-1 index. Note align_offsets=False is passed here,
# overriding create_df's default of True.
df_test = pd.concat(create_df(txt_files_train, ann_files_train, nlp, align_offsets=False), ignore_index=True)

100%|██████████| 300/300 [00:19<00:00, 15.21it/s]


In [19]:
# Remove every sentence flagged by find_misaligned_annotations (rows whose
# labels contain '-') and report how many / which index labels were dropped.
idx = find_misaligned_annotations(df_test)
print(len(idx), idx)
# Rebind rather than drop(..., inplace=True). The remaining rows keep their
# original index labels (the index is not reset).
df_test = df_test.drop(index=idx)

8 [360, 2470, 2727, 3513, 4450, 5133, 6175, 8298]


In [20]:
# fix invalid tag sequences caused by our sentence splitting, that is annotations that contain a '.'
def _repair_boundary_tags(labels):
    """Return a copy of ``labels`` with the tags at both sentence edges repaired.

    An entity cut in two by sentence splitting leaves an opening tag
    (``B-``/``I-``) at the end of one sentence and a continuing or closing tag
    (``I-``/``L-``) at the start of the next. The edge tags are rewritten so
    each sentence is self-contained:

    * last tag  ``B-X`` -> ``U-X``, ``I-X`` -> ``L-X``
    * first tag ``L-X`` -> ``U-X``, ``I-X`` -> ``B-X``

    The entity type ``X`` of the tag is preserved. An empty sequence is
    returned unchanged.
    """
    repaired = list(labels)
    if not repaired:
        return repaired

    last = repaired[-1]
    if last.startswith("B-"):
        repaired[-1] = "U-" + last[2:]
    elif last.startswith("I-"):
        repaired[-1] = "L-" + last[2:]

    # Read the first tag only now: for a single-token sentence it is the tag
    # that was just rewritten above (e.g. "I-X" -> "L-X" -> "U-X").
    first = repaired[0]
    if first.startswith("L-"):
        repaired[0] = "U-" + first[2:]
    elif first.startswith("I-"):
        repaired[0] = "B-" + first[2:]

    return repaired


for row in df_test.itertuples():
    repaired = _repair_boundary_tags(row.labels)
    if repaired != list(row.labels):
        # The namedtuple itself is read-only, but row.labels is the list held
        # in df_test's cell, so slice assignment updates the frame in place.
        row.labels[:] = repaired
        print(row)

In [22]:
# Show the resulting frame (pandas truncates the middle rows in the rendered output).
df_test

Unnamed: 0,text_org,text,labels,file,sentence_offset,entity_text
0,Anamnesis\nSe trata de un paciente de 67 años ...,"[Anamnesis, \n, Se, trata, de, un, paciente, d...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",cc_onco1006.txt,0,[]
1,No ha sido intervenido quirúrgicamente y no ti...,"[No, ha, sido, intervenido, quirúrgicamente, y...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]",cc_onco1006.txt,260,[]
2,Valorado en Consulta de Digestivo en julio de ...,"[Valorado, en, Consulta, de, Digestivo, en, ju...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",cc_onco1006.txt,346,[]
3,"En analítica sanguínea, no se evidencian parám...","[En, analítica, sanguínea, ,, no, se, evidenci...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",cc_onco1006.txt,563,"[tumor, tumor]"
4,/ml (0-37 U/ml).,"[/, ml, (, 0-37, U, /, ml, ), .]","[O, O, O, O, O, O, O, O, O]",cc_onco1006.txt,715,[]
...,...,...,...,...,...,...
10777,15.3 300 U/ml (valores normales menores que 31...,"[15.3, 300, U, /, ml, (, valores, normales, me...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",cc_onco978.txt,5306,[]
10778,"Ante la progresión clínica y analítica, se sus...","[Ante, la, progresión, clínica, y, analítica, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",cc_onco978.txt,5527,[]
10779,"Ante dicha sintomatología, se realiza ecografí...","[Ante, dicha, sintomatología, ,, se, realiza, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",cc_onco978.txt,5672,[]
10780,"Actualmente, a fecha de junio de 2017 y tras 3...","[Actualmente, ,, a, fecha, de, junio, de, 2017...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",cc_onco978.txt,5899,[]


In [21]:
# Persist the test set as line-delimited JSON: one record (row) per line.
df_test.to_json("data/NER/gold_test.json", lines=True, orient="records")