In [None]:
# This is a document-level evaluation that gives a quick assessment of model performance in our output format.
# For comparability with SOTA models, however, the reported results were computed with the evaluation script of Dai et al. (2020) after converting our documents to sentences.

In [None]:
# ShaRe 13 and 14 evaluation: remove the text column before evaluation.

In [None]:
# Implements matching logic similar to the evaluation script of Dai et al. (2020).
import os
def evaluate_spans(predictions, targets):
    """Count exact-match TP, FP and FN for one prediction/target pair.

    Both arguments are ';'-separated span lists in which each span may carry a
    'disorder:' label, e.g. "disorder: nausea; disorder: pain".  Matching is
    one-to-one: a predicted span that equals a still-unmatched target span is
    a true positive and consumes that target span, so repeated mentions are
    only credited as often as they occur in the targets.

    Args:
        predictions: ';'-separated predicted spans.
        targets: ';'-separated gold spans.

    Returns:
        tuple: (TP, FP, FN) for this pair.
    """
    def _normalize(field):
        # Remove the label first and trim afterwards; trimming first leaves the
        # space that follows 'disorder:' attached to the span, which makes
        # 'disorder:x' and 'disorder: x' compare unequal.
        cleaned = (chunk.replace('disorder:', '').strip() for chunk in field.split(';'))
        # A label with no text after it is not a span and must not be counted.
        return [span for span in cleaned if span]

    pred_spans = _normalize(predictions)
    target_spans = _normalize(targets)

    TP = 0
    FP = 0
    for pred in pred_spans:
        if pred in target_spans:
            TP += 1
            target_spans.remove(pred)  # each gold span can be matched only once
        else:
            FP += 1

    # Gold spans that no prediction consumed are the false negatives.
    FN = len(target_spans)

    return TP, FP, FN

def evaluate_file(file_path, output_path="/content/Metrics.tsv"):
    """Score a two-column (prediction, target) TSV and print micro-averaged metrics.

    The first line is treated as a header.  Every following row is scored with
    ``evaluate_spans`` and copied to ``output_path`` with its TP/FP/FN counts
    appended.  The summed counts and the precision, recall and F1 derived from
    them are printed.

    Args:
        file_path: Path of the tab-separated input file.  Column 0 holds the
            predicted spans and column 1 the target spans.
        output_path: Destination of the per-row counts.  Defaults to the path
            that used to be hard-coded.

    Returns:
        None.  Results are reported through ``print`` and the output file.
    """
    total_TP = 0
    total_FP = 0
    total_FN = 0

    with open(file_path, 'r') as file, open(output_path, 'w') as temp_file:
        # The first line is the header: extend it with the three count columns.
        header = next(file, None)
        if header is not None:
            temp_file.write(header.rstrip('\r\n') + "\tTP\tFP\tFN\n")

        for line in file:
            # Drop only the line terminator.  A bare strip() also removes a
            # leading or trailing tab, i.e. an empty prediction or target
            # column, which used to make parts[1] raise IndexError.
            row = line.rstrip('\r\n')
            parts = row.split('\t')
            if len(parts) < 2:
                # Blank or malformed row: nothing to score, copy it through.
                temp_file.write(row + "\n")
                continue

            prediction = parts[0]
            target = parts[1]
            true_positives, false_positives, false_negatives = evaluate_spans(prediction, target)
            temp_file.write(f"{row}\t{true_positives}\t{false_positives}\t{false_negatives}\n")
            total_TP += true_positives
            total_FP += false_positives
            total_FN += false_negatives

    # Micro-averaged scores; each falls back to 0 when its denominator is 0.
    precision = total_TP / (total_TP + total_FP) if (total_TP + total_FP) > 0 else 0
    recall = total_TP / (total_TP + total_FN) if (total_TP + total_FN) > 0 else 0
    f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    print("TP:", total_TP)
    print("FP:", total_FP)
    print("FN:", total_FN)

    print("P:", precision)
    print("R:", recall)
    print("F1:", f1_score)

# Example usage: score the TSV at this path. evaluate_file prints the totals
# and P/R/F1 and writes the per-row counts to /content/Metrics.tsv.
evaluate_file("/content/EvalLLaMA2-7B.tsv")

TP: 5968
FP: 2743
FN: 2013
P: 0.6851107794742279
R: 0.7477759679238191
F1: 0.7150730889048647


In [None]:
# CADEC evaluation: no need to remove any columns from the results file.

In [None]:
# The `if len(parts) >= 3:` guard is there for CADEC, because some rows have an empty output and target.
import os
def evaluate_spans(predictions, targets):
    """Score one row of ';'-separated spans by exact string match.

    A predicted span that equals a still-unmatched gold span is a true
    positive and uses up that gold span, so duplicates pair off one-to-one.
    Predictions without a partner are false positives; gold spans left over
    at the end are false negatives.

    Returns:
        tuple: (TP, FP, FN) counts for this row.
    """
    # Format2 (active): spans carry no label, so trimming whitespace is enough.
    # Format1 spans look like 'disorder: <text>'; for those, additionally apply
    # .replace('disorder:', '') to every trimmed span.
    predicted = [piece for piece in map(str.strip, predictions.split(';')) if piece]
    unmatched_gold = [piece for piece in map(str.strip, targets.split(';')) if piece]

    hits = 0
    misses = 0
    for candidate in predicted:
        try:
            # Removes the first equal gold span; raises if none is left.
            unmatched_gold.remove(candidate)
        except ValueError:
            misses += 1
        else:
            hits += 1

    return hits, misses, len(unmatched_gold)

def evaluate_file(file_path, output_path="/content/Metrics.tsv"):
    """Score a TSV with predictions in column 1 and targets in column 2.

    The first line is treated as a header.  Every following row with at least
    three columns is scored with ``evaluate_spans`` and copied to
    ``output_path`` with its TP/FP/FN counts appended; shorter rows are copied
    unchanged.  The summed counts and the precision, recall and F1 derived
    from them are printed.

    Args:
        file_path: Path of the tab-separated input file.  Column 0 is not used
            for scoring (presumably the source text -- confirm against the
            file that produces it).
        output_path: Destination of the per-row counts.  Defaults to the path
            that used to be hard-coded.

    Returns:
        None.  Results are reported through ``print`` and the output file.
    """
    total_TP = 0
    total_FP = 0
    total_FN = 0

    with open(file_path, 'r') as file, open(output_path, 'w') as temp_file:
        # The first line is the header: extend it with the three count columns.
        header = next(file, None)
        if header is not None:
            temp_file.write(header.rstrip('\r\n') + "\tTP\tFP\tFN\n")

        for line in file:
            # Drop only the line terminator.  A bare strip() also removes the
            # trailing tab of a row whose target column is empty, leaving two
            # parts; such rows were skipped and their predictions never
            # counted as false positives.
            row = line.rstrip('\r\n')
            parts = row.split('\t')
            if len(parts) >= 3:
                prediction = parts[1]
                target = parts[2]
                true_positives, false_positives, false_negatives = evaluate_spans(prediction, target)
                temp_file.write(f"{row}\t{true_positives}\t{false_positives}\t{false_negatives}\n")
                total_TP += true_positives
                total_FP += false_positives
                total_FN += false_negatives
            else:
                # Blank or malformed row: nothing to score, copy it through.
                temp_file.write(f"{row}\n")

    # Micro-averaged scores; each falls back to 0 when its denominator is 0.
    precision = total_TP / (total_TP + total_FP) if (total_TP + total_FP) > 0 else 0
    recall = total_TP / (total_TP + total_FN) if (total_TP + total_FN) > 0 else 0
    f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    print("TP:", total_TP)
    print("FP:", total_FP)
    print("FN:", total_FN)

    print("P:", precision)
    print("R:", recall)
    print("F1:", f1_score)

# Example usage: score the TSV at this path. evaluate_file prints the totals
# and P/R/F1 and writes the per-row counts to /content/Metrics.tsv.
evaluate_file("/content/LLaMA2-7B.tsv")

TP: 629
FP: 333
FN: 360
P: 0.6538461538461539
R: 0.6359959555106168
F1: 0.6447975397232188


In [None]:
# Sanity check: run the span matching on a single example to confirm the metrics are calculated correctly.

In [None]:
# One ';'-separated prediction string in the 'disorder: <span>' format.
predictions = "disorder: Abdominal pain; disorder: abdominal discomfort; disorder: nausea; disorder: severe pain; disorder: ventral hernia; disorder: symptoms; disorder: PTA; disorder: ventral hernia; disorder: symptoms; disorder: PTA; disorder: abdominal discomfort; disorder: nausea; disorder: severe pain; disorder: epigastric ventral hernia; disorder: omental fat stranding; disorder: ventral hernia; disorder: omental inflammatory fat stranding; disorder: omental fat stranding; disorder: inflammatory fat"
# Same span extraction as in evaluate_spans. strip() runs before replace(), so
# every span keeps the space that followed 'disorder:'.
pred_spans = [span.strip().replace('disorder:', '') for span in predictions.split(';') if span.strip()]

print(pred_spans)

[' Abdominal pain', ' abdominal discomfort', ' nausea', ' severe pain', ' ventral hernia', ' symptoms', ' PTA', ' ventral hernia', ' symptoms', ' PTA', ' abdominal discomfort', ' nausea', ' severe pain', ' epigastric ventral hernia', ' omental fat stranding', ' ventral hernia', ' omental inflammatory fat stranding', ' omental fat stranding', ' inflammatory fat']


In [None]:
#target = "disorder: Abdominal pain; disorder: adhesions; disorder: recurrent ventral hernia; disorder: end stage renal disease; disorder: epigastric ventral hernia; disorder: recurrence ventral hernia; disorder: symptoms; disorder: abdominal discomfort; disorder: nausea; disorder: severe pain; disorder: epigastric ventral hernia; disorder: ventral hernia; disorder: inflammatory fat stranding; disorder: prominent loops of small bowel; disorder: fecalization of bowel; disorder: inflammatory fat stranding; disorder: loops of bowel distal; disorder: prominent loops; disorder: decompressed; disorder: incarcerated hernia; disorder: obstruction; disorder: end-stage renal disease; disorder: glomerulonephritis; disorder: IgA diagnosed; disorder: anuric; disorder: calciphylaxis; disorder: hypercalcemia; disorder: arteriovenous fistula; disorder: chronic pain; disorder: rheumatic heart disease; disorder: endocarditis; disorder: hypertension; disorder: Anxiety; disorder: pain; disorder: JVD; disorder: tachy; disorder: murmur; disorder: TTP; disorder: gaurding; disorder: non-distended; disorder: rebound; disorder: Prominent loops; disorder: ventral hernia; disorder: fecalization of bowel contents; disorder: inflammatory fat stranding; disorder: bowel ischemia; disorder: perforation; disorder: ventral hernia; disorder: collapsed; disorder: incarcerated hernia; disorder: bowel obstruction; disorder: bowel loops; disorder: ventral hernia; disorder: lesions; disorder: hemangiomas; disorder: hepatic cyst; disorder: renal osteodystrophy; disorder: subclavian vein stenosis; disorder: complication; disorder: Recurrent ventral hernia; disorder: small bowel obstruction; disorder: compromised bowel; disorder: Chronic renal failure; disorder: wound; disorder: wound; disorder: pain; disorder: pain; disorder: Fever; disorder: Inability to eat; disorder: persistant vomiting; disorder: symptoms"
#prediction = "disorder: Abdominal pain; disorder: abdominal discomfort; disorder: nausea; disorder: severe pain; disorder: ventral hernia; disorder: symptoms; disorder: PTA; disorder: ventral hernia; disorder: symptoms; disorder: PTA; disorder: abdominal discomfort; disorder: nausea; disorder: severe pain; disorder: epigastric ventral hernia; disorder: omental fat stranding; disorder: ventral hernia; disorder: omental inflammatory fat stranding; disorder: omental fat stranding; disorder: inflammatory fat"
# Target and predicted span strings for a single example; both are passed to
# evaluate_spans below.
# NOTE(review): the target literal is split across two lines here. A quoted
# string cannot contain a raw line break, so confirm this is an export artifact.
target = "disorder: chronic obstructive pulmonary disease; disorder: coronary artery disease; disorder: mass; disorder: mass; disorder: collapse; disorder: dyspnea on exertion; disorder: dyspnea on exertion; disorder: lower extremity edema; disorder: left against medical advise; disorder: shortness of breath; disorder: dyspnea; disorder: lower extremity edema; disorder: chronic cough; disorder: weight loss; disorder: hemoptysis; disorder: deep vein thrombosis; disorder: pleural effusion; disorder: cardiomegaly; disorder: INR decreased; disorder: mass; disorder: acute respiratory distress; disorder: respiratory failure; disorder: distress; disorder: mass; disorder: Atrial fibrillation; disorder: Chronic obstructive pulmonary disease; disorder: Rheumatic heart disease; disorder: Coronary artery disease; disorder: Pulmonary artery hypertension; disorder: Biatrial enlargement; disorder: Hyperlipidemia; disorder: Hypothyroidism; disorder: hepatitis; disorder: decreased breath sounds; disorder: coarse breath sounds; disorder: rales; disorder: mechanical heart sounds; disorder: pitting edema; disorder: tenderness; disorder: opacification; disorder: hemothorax; disorder: large right atrium; disorder: atrial fibrillation; disorder: right axis deviation; disorder: T-wave inversions; disorder: ST changes; disorder: mass; disorder: mass; disorder: malignancy; disorder: non-small cell lung cancer; disorder: malignancy; disorder: hypercalcemic; disorder: lung cancer; disorder: mass; disorder: compression; disorder: respiratory function; disorder: symptoms; disorder: respiratory function; disorder: Respiratory failure; disorder: respiratory failure; disorder: lung mass; disorder: copious amount of secretions; disorder: lung mass; disorder: lung disease; disorder: chronic obstructive pulmonary disease; disorder: chronic obstructive pulmonary disease; disorder: terminal illness; disorder: respiratory arrest; disorder: respiratory arrest; disorder: rate junctional; disorder: 
asystolic; disorder: Lung cancer; disorder: atrial fibrillation; disorder: Chronic obstructive pulmonary disease; disorder: Hypercalcemia malignancy; disorder: Coronary artery disease"
prediction = "disorder: chronic obstructive pulmonary disease; disorder: coronary artery disease; disorder: mitral valve; disorder: mass; disorder: collapse; disorder: dyspnea on exertion; disorder: dyspnea on exertion; disorder: lower extremity edema; disorder: chronic cough; disorder: weight loss; disorder: hemoptysis; disorder: decreased breath sounds; disorder: decreased heart rate; disorder: decreased heart sounds; disorder: scattered rales; disorder: mechanical heart sounds; disorder: Abdomen benign; disorder: pit"
def evaluate_spans(predictions, targets):
    """Return (TP, FP, FN) for one pair of ';'-separated 'disorder:' span lists.

    A prediction is a true positive when it exactly equals a target span that
    has not been matched yet; the matched target is then removed so that
    duplicate predictions are credited at most as often as the span occurs in
    the targets.  Unmatched predictions are false positives and unmatched
    targets are false negatives.

    Args:
        predictions: ';'-separated predicted spans, each optionally prefixed
            with 'disorder:'.
        targets: ';'-separated gold spans in the same format.

    Returns:
        tuple: (TP, FP, FN).
    """
    def _spans(field):
        # Remove the label before trimming: trimming first keeps the space
        # after 'disorder:' on the span, so 'disorder:x' would not equal
        # 'disorder: x'.  Items that are empty once the label is gone are
        # dropped instead of being counted as spans.
        return [
            text
            for text in (item.replace('disorder:', '').strip() for item in field.split(';'))
            if text
        ]

    pred_spans = _spans(predictions)
    target_spans = _spans(targets)

    TP = 0
    FP = 0
    for pred in pred_spans:
        if pred in target_spans:
            TP += 1
            target_spans.remove(pred)  # each gold span can be matched only once
        else:
            FP += 1

    # Whatever is still in target_spans was never predicted.
    FN = len(target_spans)

    return TP, FP, FN

# Print the (TP, FP, FN) tuple for the single example defined above.
print(evaluate_spans(prediction, target))


(12, 6, 64)
