In [None]:
# This notebook converts our document-level predictions into the inline (sentence-level) format used by Dai et al., 2020.
# It then clones Dai et al.'s evaluation script and runs it on the converted predictions to compute the results.

In [None]:
from collections import defaultdict
import csv
import re

def is_header(text):
    """Tell whether ``text`` is a document header line.

    A line counts as a header when, after stripping surrounding whitespace,
    one of the report-type markers (DISCHARGE_SUMMARY, RADIOLOGY_REPORT,
    ECHO_REPORT) occurs with a whitespace character on both sides.

    Returns:
        bool: True for a header line, False otherwise.
    """
    # Pattern for ShaRe14 (kept for reference):
    #header_regex = r'.* DISCHARGE_SUMMARY .*'
    # Pattern for ShaRe13 (active):
    header_regex = r'.*\s(DISCHARGE_SUMMARY|RADIOLOGY_REPORT|ECHO_REPORT)\s.*'
    stripped = text.strip()
    return re.match(header_regex, stripped) is not None

def assemble_documents(csv_file_path):
    """Group the rows of a prediction CSV into documents.

    The first CSV row is treated as a column header and skipped. For every
    remaining row, column 0 is the text and column 2 holds the labels. A row
    whose text satisfies ``is_header`` starts a new document; its id is the
    part of the text before the first ``|``. All following rows belong to
    that document until the next header row.

    Args:
        csv_file_path: Path of the UTF-8 encoded CSV file.

    Returns:
        dict: document id -> ``(text, labels)`` where ``text`` is the
        newline-joined row texts and ``labels`` is the list of per-row label
        strings. Rows seen before the first header row are stored under the
        key ``None``.
    """
    documents = dict()
    current_doc = []
    current_labels = []
    current_doc_id = None
    with open(csv_file_path, 'r', newline='', encoding='utf-8') as file:
        csv_reader = csv.reader(file)
        # Skip the header row; the default keeps an empty file from raising StopIteration.
        next(csv_reader, None)

        for row in csv_reader:
            if len(row) < 1:
                continue

            text = row[0]
            # Rows with fewer than three columns carry no labels; indexing
            # row[2] directly would raise IndexError on them.
            labels = row[2] if len(row) > 2 else ''
            if is_header(text):
                if current_doc:
                    documents[current_doc_id] = ('\n'.join(current_doc), current_labels)
                current_doc = [text]
                current_labels = [labels]
                current_doc_id = text.split('|')[0].strip()
            else:
                current_doc.append(text)
                current_labels.append(labels)

        # Add the last document if it exists
        if current_doc:
            documents[current_doc_id] = ('\n'.join(current_doc), current_labels)

    return documents
# Path of the prediction CSV to convert: provide the DocDiscNER_NER or DocDiscNER_SC output file.
csv_file_path = '/content/ShaRe13-MajVoteOf3.csv'
# Group the CSV rows into documents keyed by document id; later cells read this module-level dict.
assembled_documents = assemble_documents(csv_file_path)
# Number of documents found in the CSV.
print(len(assembled_documents))

99


In [None]:
def process_input_file(input_file_path):
    """Read a gold file made of three-line records and group it by document.

    Each record is: a sentence line, an annotation line (``|``-separated,
    possibly empty) and a blank separator line. A sentence that satisfies
    ``is_header`` starts a new document whose id is the text before the first
    ``||||``. Records that appear before the first header are discarded.

    The predictions of each document are looked up in the module-level
    ``assembled_documents`` dict.

    Args:
        input_file_path: Path of the gold file.

    Returns:
        dict: document id -> ``{'sentences': [...], 'annotations': [...],
        'predictions': assembled_documents[doc_id]}``.

    Raises:
        AssertionError: if the line following an annotation line is not blank.
        KeyError: if a document id is missing from ``assembled_documents``.
    """
    documents = {}
    current_doc_id = None
    current_doc_sentences = []
    current_doc_annotations = []

    def store(doc_id, sentences, annotations):
        # Single place that files a finished document (was duplicated before).
        if doc_id:
            documents[doc_id] = {
                'sentences': sentences,
                'annotations': annotations,
                'predictions': assembled_documents[doc_id]
            }

    # UTF-8 is explicit to match how the prediction CSV is read.
    with open(input_file_path, 'r', encoding='utf-8') as f:
        for sentence_line in f:
            sentence = sentence_line.strip()

            gold_line = next(f, None)
            if gold_line is None:
                # A sentence without an annotation line is an incomplete
                # record; it is dropped.
                break
            gold = gold_line.strip()
            gold = gold.split("|") if len(gold) > 0 else []

            # The last record may not be followed by a blank line. Treating
            # end-of-file as the separator keeps that record instead of
            # silently losing it.
            separator = next(f, '')
            assert len(separator.strip()) == 0, \
                f"expected an empty separator line, got: {separator!r}"

            if is_header(sentence):
                store(current_doc_id, current_doc_sentences, current_doc_annotations)
                current_doc_id = sentence.split('||||')[0].strip()
                current_doc_sentences = [sentence]
                current_doc_annotations = [gold]
            else:
                current_doc_sentences.append(sentence)
                current_doc_annotations.append(gold)

    # End of file reached: file the document that was still being collected.
    store(current_doc_id, current_doc_sentences, current_doc_annotations)

    return documents

In [None]:
def split_disorders(disorder_list, case_sensitive=False):
    """Flatten label strings into a list of tokenization-friendly disorder mentions.

    Every label string is split on ``;``. For each piece, only the text after
    the last ``disorder:`` marker is kept, and punctuation is padded with
    spaces so the mention can be compared against whitespace-tokenized
    sentences. Pieces that are empty after stripping (empty label strings,
    trailing ``;``) are skipped, so callers never receive an empty mention.

    Args:
        disorder_list: Iterable of label strings.
        case_sensitive: When False (default) the mentions are lower-cased.

    Returns:
        list[str]: The mentions, in input order (duplicates are kept).
    """
    # Applied in this order: the curly apostrophe is first mapped to "'" so
    # that the following rule pads it as well.
    replacements = (
        ('’', "'"),
        ("'", " '"),
        ('-', ' - '),
        (',', ' ,'),
        ('.', ' . '),
        ('/', ' / '),
        ('&', ' & '),
    )

    all_disorders = []
    for disorder_string in disorder_list:
        for piece in disorder_string.split(';'):
            # Extract the text after "disorder:" for each item
            disorder = piece.strip().split('disorder:')[-1].strip()
            if not disorder:
                continue
            for old, new in replacements:
                disorder = disorder.replace(old, new)
            if not case_sensitive:
                disorder = disorder.lower()
            all_disorders.append(disorder)
    return all_disorders

In [None]:
def find_disorder_spans_disc(input_tokens, disorders, case_sensitive=False):
    """Find every contiguous token span whose text equals one of ``disorders``.

    All spans ``input_tokens[start:end+1]`` are joined with single spaces and
    compared with the disorder strings. In case-insensitive mode both sides
    are lower-cased and have ``*`` characters removed before comparison.

    Args:
        input_tokens: List of sentence tokens.
        disorders: Iterable of disorder strings to look for.
        case_sensitive: Compare the raw strings when True.

    Returns:
        tuple: ``(spans, mentions)`` where ``spans`` holds strings of the form
        ``"start,end Disorder"`` (inclusive token indices) and ``mentions``
        holds the matching span text as it appears in the sentence.
    """
    def canonical(text):
        if case_sensitive:
            return text
        return text.lower().replace("*", "")

    # Lookup table of the disorder strings in comparison form
    known = {canonical(disorder) for disorder in disorders}

    found_spans, found_mentions = [], []
    n_tokens = len(input_tokens)
    for start in range(n_tokens):
        for end in range(start, n_tokens):
            candidate = ' '.join(input_tokens[start:end + 1])
            # Only an exact match of the whole span counts
            if canonical(candidate) in known:
                found_spans.append(f"{start},{end} Disorder")
                found_mentions.append(candidate)

    return found_spans, found_mentions

In [None]:
def merge_consecutive_spans(spans):
    """Merge adjacent, overlapping and duplicate ``(start, end)`` spans.

    Spans are sorted by start index. A span is folded into the previous one
    when it starts no later than one position after the previous span ends,
    so ``(6, 6), (7, 7)`` becomes ``(6, 7)`` and a repeated ``(1, 1)`` is
    absorbed instead of being emitted twice.

    Args:
        spans: Iterable of ``(start, end)`` pairs with inclusive indices.

    Returns:
        list: The merged spans ordered by start index; ``[]`` for empty input.
    """
    if not spans:
        return []

    # Sort the spans based on the start index
    sorted_spans = sorted(spans, key=lambda span: span[0])

    merged = [sorted_spans[0]]

    for current in sorted_spans[1:]:
        last = merged[-1]

        # Adjacent (start == end + 1), overlapping, or duplicate span:
        # extend the previous span. max() keeps the end from shrinking when
        # the current span is fully contained in the previous one.
        if current[0] <= last[1] + 1:
            merged[-1] = (last[0], max(last[1], current[1]))
        else:
            # If not consecutive, add as a new span
            merged.append(current)

    return merged

# Example usage: (6,6)+(7,7) and (9,9)+(10,10) are adjacent and get merged; (0,0) stays on its own.
spans = [(0,0), (6,6), (7,7), (9,9), (10,10)]
result = merge_consecutive_spans(spans)
print(result)

[(0, 0), (6, 7), (9, 10)]


In [None]:
def find_discontinuous_mentions(sentence, mentions, case_sensitive=True):
    """Locate mentions whose tokens all occur in ``sentence``, possibly non-contiguously.

    The sentence and each mention are split on whitespace. A mention is
    reported only when every one of its tokens occurs in the sentence. Each
    token is mapped to its FIRST occurrence in the sentence, and the resulting
    single-token spans are combined with ``merge_consecutive_spans``.

    Args:
        sentence: Whitespace-tokenized sentence text.
        mentions: Iterable of mention strings.
        case_sensitive: When False, tokens are compared lower-cased.

    Returns:
        dict: mention (as given) -> list of ``(start, end)`` token spans.
        Mentions with a missing token, and mentions that contain no tokens
        at all, are left out.
    """
    tokens = sentence.split()
    if not case_sensitive:
        tokens = [token.lower() for token in tokens]

    # Position of the first occurrence of every token (what list.index
    # would return), built once instead of rescanning the sentence per token.
    first_index = {}
    for position, token in enumerate(tokens):
        first_index.setdefault(token, position)

    results = {}
    for mention in mentions:
        mention_parts = mention.split()
        if not mention_parts:
            # An empty mention would otherwise count as "found" with no
            # spans, which callers would render as a span-less entry.
            continue
        if not case_sensitive:
            mention_parts = [part.lower() for part in mention_parts]

        if any(part not in first_index for part in mention_parts):
            continue

        spans = [(first_index[part], first_index[part]) for part in mention_parts]
        results[mention] = merge_consecutive_spans(spans)

    return results


# Example usage: every mention shares the token "Abd" (index 0) and continues later in the sentence,
# so each result consists of several separate (start, end) spans.
sentence = "Abd : S / NT / ND , + BS , - HSM , multiple surgical scars ."
mentions = ['Abd ND', 'Abd NT', 'Abd S', 'Abd surgical scars']

# Default case_sensitive=True: tokens are compared exactly as written.
result = find_discontinuous_mentions(sentence, mentions)
print(result)

{'Abd ND': [(0, 0), (6, 6)], 'Abd NT': [(0, 0), (4, 4)], 'Abd S': [(0, 0), (2, 2)], 'Abd surgical scars': [(0, 0), (15, 16)]}


In [None]:
# the path here is for the gold test set preprocessed in the Dai et al., 2020 format.
test_file = '/content/test.txt'
processed_documents = process_input_file(test_file)

# Number of predicted mentions that could not be aligned to any sentence.
missing = 0

# Output pieces are collected in a list and joined once at the end
# (repeated string concatenation is quadratic).
out_chunks = []

for doc_id, doc_data in processed_documents.items():
    # Empty mentions (e.g. from rows without labels) cannot be aligned and are dropped up front.
    all_preds = [
        p for p in split_disorders(doc_data['predictions'][1], case_sensitive=True)
        if p.strip()
    ]
    missing_preds = all_preds
    output = []

    # Pass 1: exact, case-sensitive, contiguous matches in every sentence.
    for annotation, sentence in zip(doc_data['annotations'], doc_data['sentences']):
        spans, matches = find_disorder_spans_disc(sentence.split(), all_preds, case_sensitive=True)
        output.append((sentence, spans, annotation))
        missing_preds = [p for p in missing_preds if p not in matches]

    # Pass 2: case-sensitive discontinuous matches for what is still unaligned.
    if missing_preds:
        for i, sent in enumerate(doc_data['sentences']):
            mentions = find_discontinuous_mentions(sent, missing_preds, case_sensitive=True)
            if mentions:
                for mention in mentions.values():
                    output[i][1].append(','.join(f"{s},{e}" for s, e in mention) + " Disorder")
                missing_preds = [p for p in missing_preds if p not in mentions]
                if not missing_preds:
                    break

    # Pass 3: case-insensitive contiguous matches.
    if missing_preds:
        for i, sent in enumerate(doc_data['sentences']):
            spans, matches = find_disorder_spans_disc(sent.split(), missing_preds, case_sensitive=False)
            if spans:
                # Add the spans exactly once (extending inside a loop over
                # the spans duplicated every prediction len(spans) times).
                output[i][1].extend(spans)
                # Compare in the same normalised form the matcher uses:
                # lower-cased with '*' removed, on both sides.
                matched = {m.lower().replace("*", "") for m in matches}
                missing_preds = [
                    p for p in missing_preds
                    if p.lower().replace("*", "") not in matched
                ]
                if not missing_preds:
                    break

    # Pass 4: case-insensitive discontinuous matches.
    if missing_preds:
        for i, sent in enumerate(doc_data['sentences']):
            mentions = find_discontinuous_mentions(sent, missing_preds, case_sensitive=False)
            if mentions:
                for mention in mentions.values():
                    output[i][1].append(','.join(f"{s},{e}" for s, e in mention) + " Disorder")
                matched = {m.lower() for m in mentions}
                missing_preds = [p for p in missing_preds if p.lower() not in matched]
                if not missing_preds:
                    break

    # Whatever is left after all four passes could not be placed in this document.
    missing += len(missing_preds)

    # Inline format: sentence line, '|'-joined predicted spans, blank line.
    for sent, spans, _gold in output:
        out_chunks.append(sent + '\n')
        out_chunks.append('|'.join(spans) + '\n\n')

# Full converted text; written to disk by the next cell.
out_file = ''.join(out_chunks)
print(missing)

0


In [None]:
# Write the converted predictions. The encoding is explicit so the file does not
# depend on the platform default (the text can contain non-ASCII characters such as ’).
# Plain 'w' is enough: the file is only written, never read back here.
with open('ours_to_dai.txt', 'w', encoding='utf-8') as f:
    f.write(out_file)

In [None]:
# Clone the repository
!git clone https://github.com/dainlp/acl2020-transition-discontinuous-ner.git

# Navigate into the directory
%cd xdai/ner


Cloning into 'acl2020-transition-discontinuous-ner'...
remote: Enumerating objects: 109, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 109 (delta 0), reused 0 (delta 0), pack-reused 101[K
Receiving objects: 100% (109/109), 423.43 KiB | 3.31 MiB/s, done.
Resolving deltas: 100% (17/17), done.
[Errno 2] No such file or directory: 'xdai/ner'
/content


In [None]:
# Switch to the directory inside the cloned repository from which evaluate.py is run (absolute path).
%cd /content/acl2020-transition-discontinuous-ner/code/xdai/ner

/content/acl2020-transition-discontinuous-ner/code/xdai/ner


In [None]:
# Before running this cell:
#   - copy the converted predictions (ours_to_dai.txt) to /content/pred/test.txt, i.e. rename the file to test.txt
#   - put the gold file, in Dai et al.'s format, at /content/gold/test.txt
!python evaluate.py --gold_filepath /content/gold/test.txt --pred_filepath /content/pred/test.txt

micro-precision 0.8140188057149835
micro-recall 0.8414541782378188
micro-f1 0.8275091552355534
sentences_with_disc-micro-precision 0.6760037348272643
sentences_with_disc-micro-recall 0.7218344965104686
sentences_with_disc-micro-f1 0.6981677917068466
disc-mention-micro-precision 0.5098591549295775
disc-mention-micro-recall 0.6407079646017699
disc-mention-micro-f1 0.567843137254902
