In [None]:
# For the CADEC dataset.
# This notebook converts our document-level results into the inline (sentence-level) format used by Dai et al., 2020.
# It then clones Dai et al.'s evaluation script and uses it to compute the results.

In [None]:
from collections import defaultdict
import csv
import re
import pandas as pd
# Provide the document-level preprocessed test set that has NOT been resolved using CER,
# so that the original text is preserved. Each row becomes one dict record.
original_csv = pd.read_csv('/content/CADEC_Test.csv').to_dict(orient='records')

def assemble_documents(csv_file_path, original_csv):
    """Pair every prediction row with its original document, keyed by header.

    Args:
        csv_file_path: Path to the predictions CSV. The first row is skipped
            as a header; column 0 is read as the document text and column 2
            as the predicted labels.
        original_csv: Sequence of records exposing a ``'Text'`` entry that
            holds the original, newline-separated document. Records are
            consumed in order, one per non-blank prediction row.

    Returns:
        dict mapping the lower-cased, space-free first line of the original
        document to a ``(text, labels, original_text)`` tuple. For documents
        whose first line is ``'No side effects.'`` the second line is appended
        to the key, because that sentence opens more than one document.

    Raises:
        IndexError: If there are more non-blank prediction rows than records
            in ``original_csv``.
    """
    documents = dict()
    with open(csv_file_path, 'r', newline='', encoding='utf-8') as file:
        csv_reader = csv.reader(file)
        next(csv_reader, None)  # Skip the header row (tolerates an empty file)

        # Blank rows must not consume an entry of original_csv: csv.reader
        # yields [] for them, while pandas.read_csv (default
        # skip_blank_lines=True) produces no record, so an enumerate() index
        # would drift out of alignment after the first blank row.
        doc_index = 0
        for row in csv_reader:
            if len(row) < 1:
                continue
            if doc_index >= len(original_csv):
                raise IndexError(
                    f"prediction row {doc_index} has no matching record in original_csv"
                )

            text = row[0]
            original_text = original_csv[doc_index]['Text']
            doc_index += 1

            # Rows without a third column are treated as having no labels.
            labels = row[2] if len(row) > 2 else ''
            if labels == 'nan':
                labels = ''

            original_lines = original_text.split('\n')
            current_doc_id = original_lines[0].strip().replace(" ", "")
            if current_doc_id == 'Nosideeffects.':  # this sentence appears twice in the dataset
                second_line = original_lines[1] if len(original_lines) > 1 else ''
                current_doc_id = 'Nosideeffects.' + second_line.strip().replace(" ", "")
                print(current_doc_id)
            current_doc_id = current_doc_id.lower()
            documents[current_doc_id] = (text, labels, original_text)

    return documents
# Path to the prediction CSV produced by DocDiscNER_NER or DocDiscNER_SC.
csv_file_path = '/content/CADEC-MajVoteOf3.csv'
assembled_documents = assemble_documents(csv_file_path, original_csv)
# Number of distinct document keys that were assembled.
print(len(assembled_documents))

Nosideeffects.Mydoctor(conservativeandtop-rate)saysthattakingLipitorisanobrainer.
Nosideeffects.Mydoctor(conservativeandtop-rate)saysthattakingLipitorisanobrainer.
Nosideeffects.MyLDL,totalcholesterolandtriglyceridesfinallywheretheyshouldbe.
Nosideeffects.MyLDL,totalcholesterolandtriglyceridesfinallywheretheyshouldbe.
189


In [None]:
# List every document key that begins with the ambiguous "No side effects" opener.
nosideeffects_keys = [key for key in assembled_documents if key.startswith("nosideeffects")]
for key in nosideeffects_keys:
    print(key)

nosideeffectsnotedwhiletakinga40mgdoseeachday.
nosideeffects.mydoctor(conservativeandtop-rate)saysthattakinglipitorisanobrainer.
nosideeffects.myldl,totalcholesterolandtriglyceridesfinallywheretheyshouldbe.


In [None]:
def is_header(sentence, next_sentence, assembled_documents):
    """Return the document key opened by ``sentence``, or ``None``.

    The key is the sentence with surrounding whitespace and all spaces removed,
    lower-cased. When the space-free sentence is exactly ``'Nosideeffects.'``
    the space-free ``next_sentence`` is appended before lower-casing. The key
    is returned only if it is present in ``assembled_documents``.
    """
    def squash(text):
        return text.strip().replace(" ", "")

    candidate = squash(sentence)
    if candidate == 'Nosideeffects.':
        candidate = 'Nosideeffects.' + squash(next_sentence)
    candidate = candidate.lower()
    return candidate if candidate in assembled_documents else None


def process_sentence_file(file_path, assembled_documents):
    """Group the sentences of a sentence-level file into documents.

    The file is read as repeating three-line records: a tokenized sentence, a
    ``|``-separated annotation line, and a separator line. A sentence opens a
    new document when ``is_header`` maps it to a key of
    ``assembled_documents``; otherwise it is appended to the document that is
    currently being collected. Sentences seen before the first header are
    discarded.

    Args:
        file_path: Path to the sentence-level file.
        assembled_documents: Mapping from document key to a
            ``(text, predictions, original_text)`` sequence.

    Returns:
        dict mapping each document key to a dict with the keys
        ``'sentences'``, ``'annotations'``, ``'text'``, ``'predictions'`` and
        ``'original_text'``.
    """
    documents = {}
    current_doc_id = None
    current_doc_sentences = []
    current_doc_annotations = []

    def store_current_document():
        # Save the document collected so far (no-op before the first header).
        if current_doc_id is None:
            return
        entry = assembled_documents[current_doc_id]
        documents[current_doc_id] = {
            'sentences': current_doc_sentences,
            'annotations': current_doc_annotations,
            'text': entry[0],
            'predictions': entry[1],
            'original_text': entry[2]
        }

    # Explicit encoding, matching how assemble_documents reads its CSV.
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.read().splitlines()

    # Walk the records by index instead of zipping three shifted copies of
    # `lines`: the zip stops two lines early, which dropped the final record
    # whenever the file did not end with a blank separator line.
    for i in range(0, len(lines), 3):
        line = lines[i].strip()
        if not line:
            continue
        raw_annotations = lines[i + 1] if i + 1 < len(lines) else ''
        annotations = raw_annotations.split("|") if raw_annotations else []
        # The following sentence is needed to disambiguate repeated openers.
        next_line = lines[i + 3] if i + 3 < len(lines) else ''

        matching_doc_id = is_header(line, next_line, assembled_documents)
        if matching_doc_id:
            # A new document starts here: save the previous one (if any).
            store_current_document()
            current_doc_id = matching_doc_id
            current_doc_sentences = [line]
            current_doc_annotations = [annotations]
        else:
            # Continue with the current document
            current_doc_sentences.append(line)
            current_doc_annotations.append(annotations)

    # Save the last document if there is one
    store_current_document()

    return documents

# Gold test set, preprocessed into the Dai et al., 2020 sentence-level format.

#file_path = 'Dai Preprocessing/test.txt'
file_path = '/content/test.txt'

documents = process_sentence_file(file_path, assembled_documents)

# Report every document whose number of grouped sentences differs from the
# number of non-blank lines in its original text.
for doc_id, doc_content in documents.items():
    orig_sents = list(filter(str.strip, doc_content['original_text'].strip().split('\n')))
    if len(doc_content['sentences']) == len(orig_sents):
        continue
    print(f"Document ID: {doc_id}")
    print(f"Sentences: {doc_content['sentences']}")
    print(f"Original Sentences: {orig_sents}")
    print(f"Annotations: {doc_content['annotations']}")
    print(f"Predictions: {doc_content['predictions']}")
    print(f"Text: {doc_content['text']}")
    print(f"Original Text: {doc_content['original_text']}")
    print()


In [None]:
import string
# Sanity check: ignoring case and every non-ASCII-letter character, the grouped
# sentences must spell out exactly the same letters as the original text.
for d in documents.values():
    sentence_letters = ''.join(
        c.lower() for sent in d['sentences'] for c in sent if c in string.ascii_letters
    )
    original_letters = ''.join(
        c.lower() for c in d['original_text'] if c in string.ascii_letters
    )
    if sentence_letters != original_letters:
        print(sentence_letters)
        print(original_letters)
        print(d['text'])
        print('----')

In [None]:
# Display the grouped per-document records for manual inspection.
documents

{'brainfoganddecreasedcognitiveskills.': {'sentences': ['Brain fog and decreased cognitive skills .',
   'Does wonders for reducing LDL Cholestrol .',
   'Short term memory and brain fog were brutal .',
   'Since I also have anxiety issues , I was not sure if Lipitor caused side effects .',
   'Stopped liptor and improved diet and memory improved .'],
  'annotations': [['0,1 ADR', '3,5 ADR'], [], ['0,2 ADR', '4,5 ADR'], [], []],
  'text': 'Brain fog and decreased cognitive skills. Does wonders for reducing LDL Cholestrol. Short term memory and brain fog were brutal. Since I also have anxiety issues, I was not sure if Lipitor caused side effects. Stopped liptor and improved diet and memory improved.',
  'predictions': 'disorder: Brain fog; disorder: decreased cognitive skills; disorder: short term memory; disorder: brain fog; disorder: anxiety issues',
  'original_text': 'Brain fog and decreased cognitive skills.\nDoes wonders for reducing LDL Cholestrol.\nShort term memory and brain fo

In [None]:
def split_disorders(disorder_list, case_sensitive=False):
    """Parse a ``"type: span; type: span"`` prediction string.

    Args:
        disorder_list: Semicolon-separated predictions, each of the form
            ``"<type>: <span>"``.
        case_sensitive: When False (default) the spans are lower-cased.

    Returns:
        List of ``(type, span)`` tuples in input order. Punctuation inside a
        span (apostrophe, hyphen, comma, period, slash, ampersand) is
        separated by spaces and whitespace is collapsed, so the span is a
        single-space join of its tokens. Empty fragments and empty spans are
        skipped silently; fragments that do not contain exactly one colon are
        reported with a printed message and skipped.
    """
    if not disorder_list.strip():
        return []

    fragments = [d.strip() for d in disorder_list.split(';')]
    typed_disorders = []
    for d in fragments:
        if not d:
            # Produced by trailing or doubled semicolons; nothing to report.
            continue
        if d.count(":") != 1:
            print(f"{d} in {fragments} is weird")
            continue
        ner_type, span = d.split(':')
        ner_type = ner_type.strip()
        span = span.strip().replace('’', "'").replace("'", " '").replace('-', ' - ').replace(',', ' ,').replace('.', ' . ').replace('/', ' / ').replace('&', ' & ')
        # The replacements above can leave doubled or trailing spaces
        # ("pain." -> "pain . "); such a span can never equal a
        # ' '.join(tokens) string, so collapse the whitespace.
        span = ' '.join(span.split())
        if not span:
            continue
        if not case_sensitive:
            span = span.lower()
        typed_disorders.append((ner_type, span))
    return typed_disorders

# Example: a well-formed prediction string.
data = 'disorder: Muscle twitching; disorder: stiff neck; disorder: lightheadedness; disorder: twitching; disorder: light head'
print(split_disorders(data))
# Example: the last fragment is missing a semicolon, so it contains two colons and is reported and skipped.
print(split_disorders("disorder: vibrating sensation; disorder: Entire body ached; disorder: toes felt arthritic; disorder: problems with coordination; disorder: problems with memory disorder: fell down stairs"))

[('disorder', 'muscle twitching'), ('disorder', 'stiff neck'), ('disorder', 'lightheadedness'), ('disorder', 'twitching'), ('disorder', 'light head')]
disorder: problems with memory disorder: fell down stairs in ['disorder: vibrating sensation', 'disorder: Entire body ached', 'disorder: toes felt arthritic', 'disorder: problems with coordination', 'disorder: problems with memory disorder: fell down stairs'] is weird
[('disorder', 'vibrating sensation'), ('disorder', 'entire body ached'), ('disorder', 'toes felt arthritic'), ('disorder', 'problems with coordination')]


In [None]:
def find_disorder_spans_disc(input_tokens, disorders, case_sensitive=False):
    """Locate contiguous token spans whose text equals a disorder mention.

    Args:
        input_tokens: List of sentence tokens.
        disorders: Iterable of ``(type, text)`` pairs.
        case_sensitive: When False (default) both sides are lower-cased and
            ``*`` characters are removed before comparison; when True the
            texts are compared unchanged.

    Returns:
        ``(spans, mentions)`` where ``spans`` holds ``"start,end type"``
        strings with inclusive token indices and ``mentions`` holds the
        matching single-space joined token text, in the same order.
    """
    def canonical(text):
        if case_sensitive:
            return text
        return text.lower().replace("*", "")

    # Later entries win when two disorders canonicalise to the same text.
    type_by_text = {}
    for label, text in disorders:
        type_by_text[canonical(text)] = label

    found_spans = []
    found_mentions = []
    token_count = len(input_tokens)
    for start in range(token_count):
        for end in range(start, token_count):
            candidate = ' '.join(input_tokens[start:end + 1])
            lookup = canonical(candidate)
            if lookup not in type_by_text:
                continue
            found_spans.append(f"{start},{end} {type_by_text[lookup]}")
            found_mentions.append(candidate)

    return found_spans, found_mentions


# Example: two of the three disorders occur as contiguous token spans.
input_tokens = ["The", "patient", "experienced", "severe", "cough", "and", "mild", "fever"]
disorders = [("ADR", "severe cough"), ("SYMPTOM", "mild fever"), ("DISEASE", "flu")]

spans, mentions = find_disorder_spans_disc(input_tokens, disorders)

print("Spans:", spans)
print("Mentions:", mentions)

Spans: ['3,4 ADR', '6,7 SYMPTOM']
Mentions: ['severe cough', 'mild fever']


In [None]:
def merge_consecutive_spans(spans):
    """Merge adjacent, overlapping or duplicate ``(start, end)`` spans.

    Args:
        spans: Iterable of ``(start, end)`` pairs with inclusive indices.

    Returns:
        List of spans sorted by start index in which any span that starts at
        or before the position right after the previous span's end has been
        folded into that previous span. An empty input yields ``[]``.
    """
    if not spans:
        return []

    # Sort the spans based on the start index
    sorted_spans = sorted(spans, key=lambda x: x[0])

    merged = [sorted_spans[0]]

    for current in sorted_spans[1:]:
        last = merged[-1]

        # `<=` (not `==`) so duplicates and overlaps are merged as well; the
        # same token index can be produced twice when a mention repeats a
        # word, and emitting it twice would yield a malformed span list.
        if current[0] <= last[1] + 1:
            # max() keeps the end when the current span lies inside the last.
            merged[-1] = (last[0], max(last[1], current[1]))
        else:
            # If not consecutive, add as a new span
            merged.append(current)

    return merged

# Example: (6,6)+(7,7) and (9,9)+(10,10) are adjacent and get merged.
spans = [(0,0), (6,6), (7,7), (9,9), (10,10)]
result = merge_consecutive_spans(spans)
print(result)

[(0, 0), (6, 7), (9, 10)]


In [None]:
def find_discontinuous_mentions(sentence, mentions, case_sensitive=True):
    """Locate mentions whose words all occur somewhere in ``sentence``.

    Args:
        sentence: Whitespace-tokenized sentence string.
        mentions: Iterable of ``(type, text)`` pairs.
        case_sensitive: When False, sentence tokens and mention words are
            lower-cased before comparison.

    Returns:
        dict mapping each fully located mention text to ``(type, spans)``.
        Every mention word is mapped to the index of its first occurrence in
        the sentence, and the resulting single-token spans are passed through
        ``merge_consecutive_spans``. Mentions with any word missing from the
        sentence are omitted.
    """
    def fold(words):
        return words if case_sensitive else [word.lower() for word in words]

    sentence_tokens = fold(sentence.split())
    located = {}

    for label, text in mentions:
        pieces = fold(text.split())
        if any(piece not in sentence_tokens for piece in pieces):
            continue
        # First occurrence of each word, as a one-token (start, end) span.
        positions = [(pos, pos) for pos in map(sentence_tokens.index, pieces)]
        located[text] = (label, merge_consecutive_spans(positions))

    return located

# Example: each mention is located through the first occurrence of each of its words.
sentence = "Abd : S / NT / ND , + BS , - HSM , multiple surgical scars ."
mentions = [('Disorder', 'Abd ND'), ('test1', 'Abd NT'), ('test2', 'Abd S'), ('Disorder', 'Abd surgical scars')]

result = find_discontinuous_mentions(sentence, mentions)
print(result)

{'Abd ND': ('Disorder', [(0, 0), (6, 6)]), 'Abd NT': ('test1', [(0, 0), (4, 4)]), 'Abd S': ('test2', [(0, 0), (2, 2)]), 'Abd surgical scars': ('Disorder', [(0, 0), (15, 16)])}


In [None]:

# Convert every document's predictions into sentence-level span lines.
# For each document the predictions are located in four passes, each pass only
# looking at predictions the previous passes did not place:
#   1. exact contiguous match, case-sensitive
#   2. discontinuous (word-by-word) match, case-sensitive
#   3. exact contiguous match, case-insensitive
#   4. discontinuous (word-by-word) match, case-insensitive
# `missing` counts the predictions that no pass could place in any sentence.

missing = 0

out_chunks = []

for doc_id, doc_data in documents.items():
    try:
        all_preds = split_disorders(doc_data['predictions'], case_sensitive=True)
    except Exception:
        # Show the offending record, then re-raise with the original traceback.
        print(doc_data['predictions'])
        print(doc_data)
        raise
    missing_preds = all_preds
    output = []

    # Pass 1: exact, case-sensitive. Also creates one output row per sentence.
    for annotation, sentence in zip(doc_data['annotations'], doc_data['sentences']):
        spans, matches = find_disorder_spans_disc(sentence.split(), all_preds, case_sensitive=True)
        output.append((sentence, spans, annotation))
        missing_preds = [(s, p) for s, p in missing_preds if p not in matches]

    # Pass 2: discontinuous, case-sensitive.
    if missing_preds:
        for i, sent in enumerate(doc_data['sentences']):
            mentions = find_discontinuous_mentions(sent, missing_preds, case_sensitive=True)
            if mentions:
                # `mention_type` rather than `type`, so the builtin is not
                # shadowed in the notebook namespace after this cell runs.
                for mention_type, mention in mentions.values():
                    output[i][1].append(','.join(f"{s},{e}" for s, e in mention) + f" {mention_type}")
                missing_preds = [(s, p) for s, p in missing_preds if p not in mentions.keys()]
                if not missing_preds:
                    break

    # Pass 3: exact, case-insensitive.
    if missing_preds:
        for i, sent in enumerate(doc_data['sentences']):
            spans, matches = find_disorder_spans_disc(sent.split(), missing_preds, case_sensitive=False)
            if spans:
                # Extend exactly once; extending inside a loop over `spans`
                # appended every span len(spans) times.
                output[i][1].extend(spans)
                matched = set(m.lower().replace("*", "") for m in matches)
                missing_preds = [(s, p) for s, p in missing_preds if p.lower() not in matched]
                if not missing_preds:
                    break

    # Pass 4: discontinuous, case-insensitive. These spans are always labelled ADR.
    if missing_preds:
        for i, sent in enumerate(doc_data['sentences']):
            mentions = find_discontinuous_mentions(sent, missing_preds, case_sensitive=False)
            if mentions:
                for _mention_type, mention in mentions.values():
                    output[i][1].append(','.join(f"{s},{e}" for s, e in mention) + " ADR")
                matched = set(m.lower() for m in mentions.keys())
                missing_preds = [(s, p) for s, p in missing_preds if p.lower() not in matched]
                if not missing_preds:
                    break

    # Whatever is left could not be located in any sentence of this document.
    missing += len(missing_preds)

    # One record per sentence: the sentence, its '|'-joined spans, a blank line.
    for sent, spans, gt in output:
        out_chunks.append(sent + '\n')
        out_chunks.append('|'.join(map(lambda x: x.replace('disorder', 'ADR'), spans)) + '\n\n')

out_file = "".join(out_chunks)
print(missing)

 in ['disorder: constant pain in right thigh', '', 'disorder: abdominal pain', 'disorder: intermittent pain throughout body', 'disorder: unable to walk', 'disorder: excruciating pain in the leg', 'disorder: tingling of feet', 'disorder: elevated liver function'] is weird
stomach problems in ['disorder: Psychosis', 'disorder: stomach upset', 'disorder: shaking', 'stomach problems', 'disorder: break down'] is weird
 in ['disorder: Muscle pain in left elbow', 'disorder: pain in feet', ''] is weird
 in ['disorder: Severe Joint Pain', 'disorder: total cholestrol is 300', 'disorder: hdl is 72', ''] is weird
 in ['disorder: stomach cramps', 'disorder: diarrhoa', 'disorder: heavy menstral bleeding with clots even though i had just finished my cycle a week before', '', 'disorder: could not pass urine', 'disorder: full of blood', 'disorder: felt very weak', 'disorder: could not stop shaking', 'disorder: bleeding'] is weird
disorder: problems with memory disorder: fell down stairs in ['disorder: 

In [None]:
# NOTE(review): `doc_data` is whatever the conversion loop above left behind
# (its last document); this cell only works after that loop has run.
preds = doc_data['predictions']

In [None]:
# Show the raw prediction string captured in the previous cell.
print(preds)

disorder: Muscle twitching; disorder: stiff neck; disorder: lightheadedness; disorder: twitching; disorder: light head


In [None]:
# Write the converted predictions to disk. The encoding is given explicitly so
# the result does not depend on the platform's default encoding; plain 'w' is
# enough because the file is never read back through this handle.
with open('ours_to_dai.txt', 'w', encoding='utf-8') as f:
    f.write(out_file)

In [None]:
# Clone the repository
!git clone https://github.com/dainlp/acl2020-transition-discontinuous-ner.git

# Navigate into the directory
%cd xdai/ner

In [None]:
# Switch to the evaluation code directory using its absolute path.
%cd /content/acl2020-transition-discontinuous-ner/code/xdai/ner

In [None]:
# Before running: rename ours_to_dai.txt to test.txt and place it at /content/pred/test.txt,
# and place the gold file (in Dai's format) at /content/gold/test.txt.
!python evaluate.py --gold_filepath /content/gold/test.txt --pred_filepath /content/pred/test.txt