In [2]:
#!pip install biome

In [1]:
import helper
import spacy
import glob
from typing import Tuple, List
from pathlib import Path
from tqdm import tqdm
import pandas as pd

from allennlp.data.token_indexers import PretrainedTransformerIndexer, PretrainedTransformerMismatchedIndexer
from allennlp.data import Token, Vocabulary

In [3]:
def create_df(txt_files, ann_files, nlp, align_offsets = True):
    """Convert paired brat text/annotation files into one result per document.

    ``txt_files`` and ``ann_files`` are paired by position, so both sequences
    must be in the same order. Each pair is parsed with ``helper.brat2doc``
    (``remove_parents`` and ``remove_siblings`` are always switched on) and
    the resulting doc is handed to ``helper.doc2df`` together with the name
    of the text file.

    Returns a list holding whatever ``helper.doc2df`` produced for each pair,
    in input order. A progress bar is shown while the files are processed.
    """
    def _pair_to_frame(txt_file, ann_file):
        # Build the Path once; it is needed for parsing and for the file name.
        txt_path = Path(txt_file)
        doc = helper.brat2doc(
            txt_path,
            Path(ann_file),
            nlp,
            align_offsets=align_offsets,
            remove_parents=True,
            remove_siblings=True,
        )
        return helper.doc2df(doc, txt_path.name)

    file_pairs = tqdm(zip(txt_files, ann_files), total=len(txt_files))
    return [_pair_to_frame(txt_file, ann_file) for txt_file, ann_file in file_pairs]

def find_misaligned_annotations(df, verbose=True):
    """Return the index labels of all rows whose ``labels`` contain ``'-'``.

    The test is plain membership on each row's ``labels`` value, so for a
    list of tags only a tag that is exactly ``'-'`` matches (``'B-X'`` does
    not). Such rows are presumably the misaligned annotations -- the original
    author marked this with a question mark as well.

    ``verbose`` is accepted for interface compatibility but is not used.
    """
    return [record.Index for record in df.itertuples() if '-' in record.labels]

def get_destroyer_tokens(df, model_name="distilbert-base-multilingual-cased") -> Tuple[List, List, List]:
    """Find the tokens that the transformer indexer cannot turn into word pieces.

    Every token of every row's ``text`` is indexed on its own with a
    ``PretrainedTransformerMismatchedIndexer`` for ``model_name``. A token
    counts as a "destroyer" when the indexer returns two or fewer token ids
    for it (presumably only the special tokens wrapped around the input are
    left -- TODO confirm against the indexer).

    Args:
        df: frame whose rows expose ``text`` and ``labels`` such that token
            ``i`` of ``text`` carries label ``i`` of ``labels``.
        model_name: name of the pretrained transformer handed to the indexer.

    Returns:
        A 3-tuple ``(destroyed_tokens, destroyed_tags, offs)``: the distinct
        destroyer tokens in first-seen order, the distinct labels those
        tokens carry in first-seen order, and the indexer's ``"offsets"``
        entry for every destroyer occurrence (not de-duplicated).
    """
    destroyed_tokens = []
    destroyed_tags = []
    offs = []
    indexer = PretrainedTransformerMismatchedIndexer(model_name=model_name)
    vocab = Vocabulary()
    for row in tqdm(df.itertuples(), total=len(df)):
        for position, word in enumerate(row.text):
            # Index each token in isolation so the result can be attributed to it.
            indexed = indexer.tokens_to_indices([Token(word)], vocabulary=vocab)
            if len(indexed["token_ids"]) > 2:
                continue  # the token produced at least one id beyond the first two
            offs.append(indexed["offsets"])
            tag = row.labels[position]
            # Lists (not sets) keep the first-seen order of the reported values.
            if word not in destroyed_tokens:
                destroyed_tokens.append(word)
            if tag not in destroyed_tags:
                destroyed_tags.append(tag)

    return destroyed_tokens, destroyed_tags, offs

def replace_tokens_with_char(df: pd.DataFrame, tokens: List[str], char: str) -> int:
    """Overwrite every occurrence of the given tokens with ``char``.

    Walks over each row's ``text`` and assigns ``char`` to every position
    whose current value is contained in ``tokens``. The assignment is made
    on the ``text`` object taken from the row tuple, so the function relies
    on that being the same mutable list the frame holds.

    Returns the total number of positions that were overwritten.
    """
    n_replaced = 0
    for record in tqdm(df.itertuples(), total=len(df)):
        sentence = record.text
        for position, token in enumerate(sentence):
            if token in tokens:
                sentence[position] = char
                n_replaced += 1
    return n_replaced

In [3]:
#!python -m spacy download es

In [4]:
# Load the spaCy pipeline available under the name 'es'.
nlp = spacy.load('es')

In [5]:
# Replace the pipeline's tokenizer with the one built by helper.custom_tokenizer.
nlp.tokenizer = helper.custom_tokenizer(nlp)

# Create data sets without aligning tokens and offsets

## training set

In [6]:
# Gather the annotation (.ann) and text (.txt) files of the training set.
# Both lists are sorted because create_df pairs them by position; this assumes
# every cc*.txt has a matching cc*.ann -- verify against the data folder.
ann_files_train = sorted(glob.glob("data/train-set-to-publish/cantemist-ner/cc*.ann"))
txt_files_train = sorted(glob.glob("data/train-set-to-publish/cantemist-ner/cc*.txt"))

In [None]:
# Parse every training document with offset alignment switched off and stack
# the per-document results into a single frame with a fresh 0..n-1 index.
df_train = pd.concat(create_df(txt_files_train, ann_files_train, nlp, align_offsets=False), ignore_index=True)

In [None]:
# Locate the rows whose labels contain '-', report them and drop them in place.
idx = find_misaligned_annotations(df_train)
print(len(idx), idx)
# NOTE(review): this expression is not the last one in the cell, so its value
# is computed and then discarded rather than displayed.
df_train.loc[idx, :]
df_train.drop(idx, inplace=True)

In [14]:
# Fix invalid tag sequences caused by our sentence splitting, that is, annotations that contain a '.'.
# A split annotation leaves an entity open at the end of one sentence and/or continued at the
# start of the next one; re-tag the sentence boundaries so each sentence is a valid BILOU
# sequence on its own.
# The checks run in this order on purpose: in a one-token sentence labels[-1] and labels[0]
# are the same element, so a lone "I-" first becomes "L-" and then "U-".
boundary_fixes = (
    (-1, "B-", "U-"),  # entity opens on the last token
    (-1, "I-", "L-"),  # entity is still running on the last token
    (0, "L-", "U-"),   # entity closes on the first token
    (0, "I-", "B-"),   # entity is already running on the first token
)
for row in df_train.itertuples():
    labels = row.labels  # edited in place; relies on this being the list stored in the frame
    if len(labels) == 0:
        continue  # an empty sentence has no boundary tags to repair
    for position, bad_prefix, good_prefix in boundary_fixes:
        if labels[position].startswith(bad_prefix):
            # Swap only the prefix so the label keeps its own entity type.
            labels[position] = good_prefix + labels[position][len(bad_prefix):]
            print(row)

Pandas(Index=4678, text_org='Páncreas y madre fallecida de Ca.', text=['Páncreas', 'y', 'madre', 'fallecida', 'de', 'Ca', '.'], labels=['O', 'O', 'O', 'O', 'O', 'B-MORFOLOGIA_NEOPLASIA', 'L-MORFOLOGIA_NEOPLASIA'], file='cc_onco307.txt', sentence_offset=254)
Pandas(Index=4679, text_org='Microcítico de pulmón.\n', text=['Microcítico', 'de', 'pulmón', '.', '\n'], labels=['U-MORFOLOGIA_NEOPLASIA', 'O', 'O', 'O', 'O'], file='cc_onco307.txt', sentence_offset=288)
Pandas(Index=6099, text_org='-Recaída a distancia con ILE 5.9 años de Ca.', text=['-', 'Recaída', 'a', 'distancia', 'con', 'ILE', '5.9', 'años', 'de', 'Ca', '.'], labels=['O', 'B-MORFOLOGIA_NEOPLASIA', 'I-MORFOLOGIA_NEOPLASIA', 'I-MORFOLOGIA_NEOPLASIA', 'I-MORFOLOGIA_NEOPLASIA', 'I-MORFOLOGIA_NEOPLASIA', 'I-MORFOLOGIA_NEOPLASIA', 'I-MORFOLOGIA_NEOPLASIA', 'I-MORFOLOGIA_NEOPLASIA', 'I-MORFOLOGIA_NEOPLASIA', 'L-MORFOLOGIA_NEOPLASIA'], file='cc_onco364.txt', sentence_offset=5282)
Pandas(Index=6100, text_org='renal de células claras ope

### Prepare input for the BERT tokenizer

For some tokens the BERT tokenizer/indexer/embedder returns an empty vector. We replace these "destroyer tokens" with a "friendly char" that is not looking for a fight. 

In [10]:
# Find the tokens the default indexer (distilbert-base-multilingual-cased)
# cannot represent, and show them together with the tags they carry.
toks, tags, offs = get_destroyer_tokens(df_train)
toks, tags

(['\n',
  '\n\n',
  '\n\n\n',
  '\n\n\n\n',
  '\n  \n',
  ' ',
  '\t',
  '\n \n',
  '\n ',
  '\uf0e8',
  '\uf076'],
 ['O'])

In [11]:
# Overwrite every token listed in toks with "æ"; the cell output is the number of replacements.
replace_tokens_with_char(df_train, toks, "æ")

11888

### Prepare input for the XLM-R tokenizer

For some tokens the XLM-R tokenizer/indexer/embedder returns an empty vector. We replace these "destroyer tokens" with a "friendly char" that is not looking for a fight.

In [15]:
# Same check as for BERT, this time with the xlm-roberta-base indexer.
toks, tags, offs = get_destroyer_tokens(df_train, model_name="xlm-roberta-base")
toks, tags

100%|██████████| 18479/18479 [00:33<00:00, 559.30it/s]


(['\n', '\n\n', '\n\n\n', '\n\n\n\n', '\n  \n', ' ', '\t', '\n \n', '\n '],
 ['O'])

In [11]:
# Overwrite every token currently listed in toks with "æ"; the cell output is the number of replacements.
replace_tokens_with_char(df_train, toks, "æ")

11888

## dev1 set

In [6]:
# Gather the annotation (.ann) and text (.txt) files of the dev1 set.
# Both lists are sorted because create_df pairs them by position; this assumes
# every cc*.txt has a matching cc*.ann -- verify against the data folder.
ann_files_dev1 = sorted(glob.glob("data/dev-set1-to-publish/cantemist-ner/cc*.ann"))
txt_files_dev1 = sorted(glob.glob("data/dev-set1-to-publish/cantemist-ner/cc*.txt"))

In [7]:
# Parse every dev1 document with offset alignment switched off and stack
# the per-document results into a single frame with a fresh 0..n-1 index.
df_dev1 = pd.concat(create_df(txt_files_dev1, ann_files_dev1, nlp, align_offsets=False), ignore_index=True)

100%|██████████| 250/250 [00:23<00:00, 10.85it/s]


In [8]:
# Locate the rows whose labels contain '-', report them and drop them in place.
idx = find_misaligned_annotations(df_dev1)
print(idx, len(idx))
# NOTE(review): this expression is not the last one in the cell, so its value
# is computed and then discarded rather than displayed.
df_dev1.loc[idx, :]
df_dev1.drop(idx, inplace=True)

[2145, 3809] 2


In [9]:
# Fix invalid tag sequences caused by our sentence splitting, that is, annotations that contain a '.'.
# A split annotation leaves an entity open at the end of one sentence and/or continued at the
# start of the next one; re-tag the sentence boundaries so each sentence is a valid BILOU
# sequence on its own.
# The checks run in this order on purpose: in a one-token sentence labels[-1] and labels[0]
# are the same element, so a lone "I-" first becomes "L-" and then "U-".
boundary_fixes = (
    (-1, "B-", "U-"),  # entity opens on the last token
    (-1, "I-", "L-"),  # entity is still running on the last token
    (0, "L-", "U-"),   # entity closes on the first token
    (0, "I-", "B-"),   # entity is already running on the first token
)
for row in df_dev1.itertuples():
    labels = row.labels  # edited in place; relies on this being the list stored in the frame
    if len(labels) == 0:
        continue  # an empty sentence has no boundary tags to repair
    for position, bad_prefix, good_prefix in boundary_fixes:
        if labels[position].startswith(bad_prefix):
            # Swap only the prefix so the label keeps its own entity type.
            labels[position] = good_prefix + labels[position][len(bad_prefix):]
            print(row)

Pandas(Index=8738, text_org='Se realiza RMN cerebral que confirma los hallazgos del PET y se remite a Neurocirugía para extirpación de las lesiones bajo el juicio clínico de posibles metástasis de Ca.', text=['Se', 'realiza', 'RMN', 'cerebral', 'que', 'confirma', 'los', 'hallazgos', 'del', 'PET', 'y', 'se', 'remite', 'a', 'Neurocirugía', 'para', 'extirpación', 'de', 'las', 'lesiones', 'bajo', 'el', 'juicio', 'clínico', 'de', 'posibles', 'metástasis', 'de', 'Ca', '.'], labels=['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MORFOLOGIA_NEOPLASIA', 'I-MORFOLOGIA_NEOPLASIA', 'I-MORFOLOGIA_NEOPLASIA', 'L-MORFOLOGIA_NEOPLASIA'], file='cc_onco970.txt', sentence_offset=180, entity_text=[])
Pandas(Index=8739, text_org='papilar de tiroides.', text=['papilar', 'de', 'tiroides', '.'], labels=['B-MORFOLOGIA_NEOPLASIA', 'I-MORFOLOGIA_NEOPLASIA', 'L-MORFOLOGIA_NEOPLASIA', 'O'], file='cc_onco970.txt', sentence_offset=

### Prepare input for the BERT tokenizer

For some tokens the BERT tokenizer/indexer/embedder returns an empty vector. We replace these "destroyer tokens" with a "friendly char" that is not looking for a fight. 

In [10]:
# Find the tokens the default indexer (distilbert-base-multilingual-cased)
# cannot represent, and show them together with the tags they carry.
toks, tags, offs = get_destroyer_tokens(df_dev1)
toks, tags

100%|██████████| 9073/9073 [00:14<00:00, 604.96it/s]


(['\n', '\n\n', '\n\n\n', ' ', '\uf0a7'], ['O'])

In [11]:
# Overwrite every token listed in toks with "æ"; the cell output is the number of replacements.
replace_tokens_with_char(df_dev1, toks, "æ")

100%|██████████| 9073/9073 [00:00<00:00, 148099.55it/s]


5903

## dev2 set

In [6]:
# Gather the annotation (.ann) and text (.txt) files of the dev2 set.
# Both lists are sorted because create_df pairs them by position; this assumes
# every cc*.txt has a matching cc*.ann -- verify against the data folder.
ann_files_dev2 = sorted(glob.glob("data/dev-set2-to-publish/cantemist-ner/cc*.ann"))
txt_files_dev2 = sorted(glob.glob("data/dev-set2-to-publish/cantemist-ner/cc*.txt"))

In [7]:
# Parse every dev2 document with offset alignment switched off and stack
# the per-document results into a single frame with a fresh 0..n-1 index.
df_dev2 = pd.concat(create_df(txt_files_dev2, ann_files_dev2, nlp, align_offsets=False), ignore_index=True)

100%|██████████| 250/250 [00:54<00:00,  4.62it/s]


In [8]:
# Locate the rows whose labels contain '-', report them and drop them in place.
idx = find_misaligned_annotations(df_dev2)
print(idx, len(idx))
# NOTE(review): this expression is not the last one in the cell, so its value
# is computed and then discarded rather than displayed.
df_dev2.loc[idx, :]
df_dev2.drop(idx, inplace=True)

[807, 2233, 3739, 4934, 6334, 6337, 6682, 6956] 8


In [9]:
# Fix invalid tag sequences caused by our sentence splitting, that is, annotations that contain a '.'.
# A split annotation leaves an entity open at the end of one sentence and/or continued at the
# start of the next one; re-tag the sentence boundaries so each sentence is a valid BILOU
# sequence on its own.
# The checks run in this order on purpose: in a one-token sentence labels[-1] and labels[0]
# are the same element, so a lone "I-" first becomes "L-" and then "U-".
boundary_fixes = (
    (-1, "B-", "U-"),  # entity opens on the last token
    (-1, "I-", "L-"),  # entity is still running on the last token
    (0, "L-", "U-"),   # entity closes on the first token
    (0, "I-", "B-"),   # entity is already running on the first token
)
for row in df_dev2.itertuples():
    labels = row.labels  # edited in place; relies on this being the list stored in the frame
    if len(labels) == 0:
        continue  # an empty sentence has no boundary tags to repair
    for position, bad_prefix, good_prefix in boundary_fixes:
        if labels[position].startswith(bad_prefix):
            # Swap only the prefix so the label keeps its own entity type.
            labels[position] = good_prefix + labels[position][len(bad_prefix):]
            print(row)

### Prepare input for the BERT tokenizer

For some tokens the BERT tokenizer/indexer/embedder returns an empty vector. We replace these "destroyer tokens" with a "friendly char" that is not looking for a fight. 

In [22]:
# Find the tokens the default indexer (distilbert-base-multilingual-cased)
# cannot represent, and show them together with the tags they carry.
toks, tags, offs = get_destroyer_tokens(df_dev2)
toks, tags

(['\n', '\n\n', '\n ', '\n\n\n'], ['O'])

In [23]:
# Overwrite every token listed in toks with "æ"; the cell output is the number of replacements.
replace_tokens_with_char(df_dev2, toks, "æ")

5385

### Save json

In [11]:
# Write the dev2 frame as JSON Lines (one record per line).
df_dev2.to_json("data/NER_david/dev2_wo_align.json", lines=True, orient="records")

## test set (for the language model)

In [25]:
# Gather the text files of the background test set (no annotation files are
# collected for it) and wrap each path in a Path object.
txt_files_test = sorted(glob.glob("data/test-background-set-to-publish/*.txt"))
txt_paths_test = [Path(txt_file) for txt_file in txt_files_test]

In [29]:
# Run the spaCy pipeline over every raw test document and stack the
# per-document results into a single frame with a fresh 0..n-1 index.
dfs = []
for txt_path in tqdm(txt_paths_test):
    # NOTE(review): read_text() is called without an explicit encoding, so the
    # platform default is used -- confirm that it matches the files.
    text = txt_path.read_text()
    doc = nlp(text)
    dfs.append(helper.doc2df(doc, txt_path.name))

df_test = pd.concat(dfs, ignore_index=True)

100%|██████████| 5232/5232 [03:56<00:00, 22.14it/s]


### Prepare input for the BERT tokenizer

For some tokens the BERT tokenizer/indexer/embedder returns an empty vector. We replace these "destroyer tokens" with a "friendly char" that is not looking for a fight. 

In [32]:
# Find the tokens the default indexer (distilbert-base-multilingual-cased)
# cannot represent, and show them together with the labels they carry.
toks, tags, offs = get_destroyer_tokens(df_test)
toks, tags

100%|██████████| 87623/87623 [02:28<00:00, 589.06it/s]


(['\n',
  '\n\n',
  '\n\n\n',
  '\x85',
  '\x85 ',
  '\x99',
  '\n\xa0\n',
  '\n\n\xa0\n',
  '\xad',
  '\n\n\n\n',
  '\n\n\n\n\n\n',
  '�',
  '\n \n',
  ' \n\n',
  '\n\n \n',
  '\n\n \n\n',
  ' ',
  '  ',
  '\n\n ',
  '\n \n ',
  '\n\n \n\n ',
  '  \n\n',
  '\u2028\n\n',
  '\u2028      \n\n',
  '\u2028\u2028\n',
  '                        \n\n',
  '\n\n  ',
  '\n  ',
  '\n  \n  \n',
  '\n \n\n',
  '\n  \n',
  '\n  \n\n',
  '\n \n \n',
  '\n ',
  '\t',
  '\t\t',
  '\t\t\n',
  '\t\t\t'],
 ['_SP', 'PROPN___', 'NUM__NumForm=Digit'])

In [34]:
# Overwrite every token listed in toks with "æ"; the cell output is the number of replacements.
replace_tokens_with_char(df_test, toks, "æ")

100%|██████████| 87623/87623 [00:01<00:00, 65701.51it/s]


33039

## save data frames

In [130]:
# Write the training frame as JSON Lines (one record per line).
df_train.to_json("data/NER/train_for_bert.json", lines=True, orient="records")

In [12]:
# Write the dev1 frame as JSON Lines (one record per line).
df_dev1.to_json("data/NER/dev1_for_bert.json", lines=True, orient="records")

In [35]:
# Write the test frame as JSON Lines (one record per line).
df_test.to_json("data/NER/test_for_bert.json", lines=True, orient="records")