In [None]:
# Install medcat
! pip install medcat~=1.16.0
# install seaborn
! pip install seaborn
try:
    from medcat.cat import CAT
except:
    print("WARNING: Runtime will restart automatically and please run other cells thereafter.")
    exit()



In [None]:
import pandas as pd
import numpy as np
import pickle
import seaborn as sns

from matplotlib import pyplot as plt
from medcat.cat import CAT

In [None]:
!rm -f models/medmen_wstatus_2021_oct.zip
!mkdir -p models

# Download from the updated location (follows redirects)
!wget -q -L -O models/medmen_wstatus_2021_oct.zip \
  https://cogstack-medcat-example-models.s3.eu-west-2.amazonaws.com/medcat-example-models/medmen_wstatus_2021_oct.zip

# Sanity checks
!ls -lh models/medmen_wstatus_2021_oct.zip
!file models/medmen_wstatus_2021_oct.zip
!python - << 'PY'
import zipfile
p="models/medmen_wstatus_2021_oct.zip"
print("is_zipfile:", zipfile.is_zipfile(p))


-rw-r--r-- 1 root root 536M Aug 25  2023 models/medmen_wstatus_2021_oct.zip
models/medmen_wstatus_2021_oct.zip: Zip archive data, at least v2.0 to extract, compression method=store
is_zipfile: True


In [None]:
cat = CAT.load_model_pack("models/medmen_wstatus_2021_oct.zip")

INFO:medcat.cat:Found an existing unzipped model pack at: models/medmen_wstatus_2021_oct, the provided zip will not be touched.
INFO:medcat.cat:Loading model pack with dill format
INFO:medcat.cat:{
  "Model ID": null,
  "Last Modified On": null,
  "History (from least to most recent)": [],
  "Description": "No description",
  "Source Ontology": null,
  "Location": null,
  "MetaCAT models": {},
  "Basic CDB Stats": {},
  "Performance": {
    "ner": {},
    "meta": {}
  },
  "Important Parameters (Partial view, all available in cat.config)": {
    "config.ner.min_name_len": {
      "value": 3,
      "description": "Minimum detection length (found terms/mentions shorter than this will not be detected)."
    },
    "config.ner.upper_case_limit_len": {
      "value": 3,
      "description": "All detected terms shorter than this value have to be uppercase, otherwise they will be ignored."
    },
    "config.linking.similarity_threshold": {
      "value": 0.2,
      "description": "If the con

In [None]:
import pandas as pd

df = pd.read_csv("/content/output.csv")

clean = (
    df.iloc[33333: 33734, 0]
      .astype(str)
      .str.replace(r'^FINDINGS:\s*', '', regex=True)
      .str.replace(r"\s+", " ", regex=True)
      .str.strip()
      .reset_index(drop=True)
)

gt_concepts_col = df.iloc[33333: 33734, 1].reset_index(drop=True)


In [None]:
def medcat_pred_concepts(text: str):
    out = cat.get_entities(text)   # dict with "entities"
    ents = out.get("entities", {})

    preds = []
    for _, e in ents.items():
        # Prefer canonical name if present, else surface text
        name = (e.get("detected_name") or e.get("source_value") or "").strip().lower()
        if not name:
            continue

        # Negation/status: depends on model pack; medmen_wstatus includes status-type info
        # Common fields you may see: "meta_anns", "status", "negated" depending on version/pack.
        # We'll handle a few robustly.
        neg = False
        if "negated" in e:
            neg = bool(e["negated"])
        elif "status" in e and isinstance(e["status"], str):
            neg = e["status"].lower() in {"negated", "negative", "absent"}
        elif "meta_anns" in e and isinstance(e["meta_anns"], dict):
            # example-style: { "Status": { "value": "NEGATED", ... } }
            for k, v in e["meta_anns"].items():
                if isinstance(v, dict) and "value" in v and str(v["value"]).lower() in {"negated", "negative", "absent"}:
                    neg = True

        if neg and not name.startswith("no "):
            name = "no " + name

        preds.append(name)

    # de-duplicate, preserve order
    return list(dict.fromkeys(preds))


pred_concepts = [medcat_pred_concepts(t) for t in clean]
for i in range(5):
  print(clean[i])
  print(pred_concepts[i])
  print()


Single portable view of the chest. There is superior traction of the left hilum. Subtle opacity projects over the left scapula in the region of the overlying cardiac lead. Findings are suggestive of underlying scarring. Elsewhere the lungs are clear. Cardiac silhouette is top-normal in size. For technique. No acute osseous abnormality seen, hypertrophic changes seen spine.
['single', 'portable', 'chest', 'superior', 'traction', 'left', 'region', 'cardiac', 'finding', 'suggestive~of', 'lung', 'normal', 'size', 'technique', 'acute', 'osseous', 'abnormality', 'hypertrophic', 'change', 'spine']

The heart is mildly enlarged with a left ventricular configuration. The mediastinal and hilar contours appear unchanged. The lungs appear clear. Blunting of the right posterior costophrenic sulcus may reflect a trace pleural effusion on that side only. Mild degenerative changes are similar along the thoracic spine.
['heart', 'mildly', 'enlarged', 'left', 'configuration', 'mediastinal', 'contour', '

In [None]:
import ast

parsed_gt_concepts = []
for concepts_str in gt_concepts_col:
    # Convert string representation of list to actual list
    concepts_list = ast.literal_eval(concepts_str)

    # Clean each concept: lowercase and strip whitespace
    cleaned_concepts = [concept.lower().strip() for concept in concepts_list]
    parsed_gt_concepts.append(cleaned_concepts)

# Print the first 5 entries to verify
print("First 5 parsed and cleaned ground truth concepts:")
for i in range(5):
    print(parsed_gt_concepts[i])

First 5 parsed and cleaned ground truth concepts:
['single portable view', 'superior traction of left hilum', 'opacity over left scapula', 'suggestive of underlying scarring', 'clear lungs', 'normal cardiac silhouette', 'no acute osseous abnormality']
['mildly enlarged heart with left ventricular configuration', 'normal mediastinal and hilar contours', 'clear lungs', 'trace pleural effusion on right side', 'mild degenerative changes in thoracic spine']
['pa and lateral views', 'moderately enlarged heart with left ventricular configuration', 'interstitial pulmonary edema', 'no pleural effusion', 'no pneumothorax', 'no focal consolidation', 'normal mediastinal contour', 'intact bony structures']
['normal lung volumes', 'no focal consolidation', 'no effusion', 'no pneumothorax', 'stable mediastinal and hilar contours', 'normal heart size']
['well expanded lungs', 'prominent interstitial markings', 'mild pulmonary edema', 'right apical density', 'lung nodule', 'free intraperitoneal air', '

In [None]:
import re

def align_concepts_with_gt(original_text: str, medcat_concepts: list, gt_concepts: list):
    final_concepts = []

    # Convert original_text to lowercase for case-insensitive matching
    text_lower = original_text.lower()

    # Prioritize ground truth concepts if they are present in the original text
    for gt_concept in gt_concepts:
        # Check for whole word match, case-insensitive
        # Escaping special characters in gt_concept to use in regex pattern
        pattern = r'\b' + re.escape(gt_concept) + r'\b'
        if re.search(pattern, text_lower):
            final_concepts.append(gt_concept)

    # Add unique MedCAT concepts, avoiding duplicates and substrings of already added concepts
    for medcat_concept in medcat_concepts:
        is_redundant = False
        # Check if it's already in final_concepts (exact match) or if it's a substring
        if medcat_concept in final_concepts:
            is_redundant = True
        else:
            for final_c in final_concepts:
                # Check if medcat_concept is a substring of an existing final_concept
                if medcat_concept in final_c and medcat_concept != final_c:
                    is_redundant = True
                    break
                # Check if an existing final_concept is a substring of medcat_concept
                if final_c in medcat_concept and medcat_concept != final_c:
                    # Decide which one to keep. For now, prioritize the longer, more specific one if GT is not involved.
                    # Since GT is prioritized, if final_c is from GT, keep it. If medcat_concept is longer and unique, add it.
                    pass # This logic can be refined, for now, simple check for existing substring covers basic redundancy.

        if not is_redundant:
            final_concepts.append(medcat_concept)

    return list(dict.fromkeys(final_concepts)) # Remove any accidental duplicates while preserving order


# Apply the function to the data
aligned_concepts = []
for i in range(len(clean)):
    aligned_concepts.append(align_concepts_with_gt(clean[i], pred_concepts[i], parsed_gt_concepts[i]))

# Print the first 5 aligned concepts to verify
print("First 5 aligned concepts:")
for i in range(5):
    print(f"Original Text: {clean[i]}")
    print(f"Aligned Concepts: {aligned_concepts[i]}")
    print() # Empty line for better readability

First 5 aligned concepts:
Original Text: Single portable view of the chest. There is superior traction of the left hilum. Subtle opacity projects over the left scapula in the region of the overlying cardiac lead. Findings are suggestive of underlying scarring. Elsewhere the lungs are clear. Cardiac silhouette is top-normal in size. For technique. No acute osseous abnormality seen, hypertrophic changes seen spine.
Aligned Concepts: ['single portable view', 'suggestive of underlying scarring', 'no acute osseous abnormality', 'chest', 'superior', 'traction', 'left', 'region', 'cardiac', 'finding', 'suggestive~of', 'lung', 'size', 'technique', 'hypertrophic', 'change', 'spine']

Original Text: The heart is mildly enlarged with a left ventricular configuration. The mediastinal and hilar contours appear unchanged. The lungs appear clear. Blunting of the right posterior costophrenic sulcus may reflect a trace pleural effusion on that side only. Mild degenerative changes are similar along the 

In [None]:
print("First 5 entries showing original text, predicted concepts, parsed ground truth concepts, and final aligned concepts:")
for i in range(5):
    print(f"\n--- Entry {i+1} ---")
    print(f"Original Text: {clean[i]}")
    print(f"Predicted Concepts (MedCAT): {pred_concepts[i]}")
    print(f"Parsed Ground Truth Concepts: {parsed_gt_concepts[i]}")
    print(f"Final Aligned Concepts: {aligned_concepts[i]}")

First 5 entries showing original text, predicted concepts, parsed ground truth concepts, and final aligned concepts:

--- Entry 1 ---
Original Text: Single portable view of the chest. There is superior traction of the left hilum. Subtle opacity projects over the left scapula in the region of the overlying cardiac lead. Findings are suggestive of underlying scarring. Elsewhere the lungs are clear. Cardiac silhouette is top-normal in size. For technique. No acute osseous abnormality seen, hypertrophic changes seen spine.
Predicted Concepts (MedCAT): ['single', 'portable', 'chest', 'superior', 'traction', 'left', 'region', 'cardiac', 'finding', 'suggestive~of', 'lung', 'normal', 'size', 'technique', 'acute', 'osseous', 'abnormality', 'hypertrophic', 'change', 'spine']
Parsed Ground Truth Concepts: ['single portable view', 'superior traction of left hilum', 'opacity over left scapula', 'suggestive of underlying scarring', 'clear lungs', 'normal cardiac silhouette', 'no acute osseous abnorm

In [None]:
def similarity2(a: str, b: str) -> float:
    a_tokens = set(a.lower().split())
    b_tokens = set(b.lower().split())

    if not a_tokens and not b_tokens:
        return 1.0
    if not a_tokens or not b_tokens:
        return 0.0

    intersection = a_tokens & b_tokens
    union = a_tokens | b_tokens

    return (2*len(intersection)) / (len(a_tokens)+len(b_tokens))


In [None]:
def fuzzy_counts_one(gt_list, pred_list, threshold):
    gt = [g.strip().lower() for g in gt_list if g.strip()]
    pr = [p.strip().lower() for p in pred_list if p.strip()]

    used_gt = set()
    TP = 0

    for p in pr:
        best_j = None
        best_score = 0.0

        for j, g in enumerate(gt):
            if j in used_gt:
                continue

            score = similarity2(p, g)
            if score > best_score:
                best_score = score
                best_j = j

        if best_j is not None and best_score >= threshold:
            TP += 1
            used_gt.add(best_j)

    FP = len(pr) - TP
    FN = len(gt) - TP

    return TP, FP, FN


In [None]:
def fuzzy_prf(gt_norm, pred_norm, threshold):
    TP = FP = FN = 0

    for gt, pr in zip(gt_norm, pred_norm):
        t, f, n = fuzzy_counts_one(gt, pr, threshold)
        TP += t
        FP += f
        FN += n

    precision = TP / (TP + FP) if TP + FP else 0
    recall    = TP / (TP + FN) if TP + FN else 0
    f1        = (2 * precision * recall / (precision + recall)) if precision + recall else 0

    return precision, recall, f1, (TP, FP, FN)


In [None]:
for th in [0.5,0.6,0.7,0.75, 0.8, 0.85,0.9]:
    P, R, F1, counts = fuzzy_prf(parsed_gt_concepts, aligned_concepts, threshold=th)
    print(th, P, R, F1)

0.5 0.29323553382233086 0.6314496314496314 0.4004897595725734
0.6 0.1973920130399348 0.4250614250614251 0.2695903829029386
0.7 0.15028524857375714 0.3236223236223236 0.20525378450578807
0.75 0.15028524857375714 0.3236223236223236 0.20525378450578807
0.8 0.15028524857375714 0.3236223236223236 0.20525378450578807
0.85 0.15028524857375714 0.3236223236223236 0.20525378450578807
0.9 0.15028524857375714 0.3236223236223236 0.20525378450578807
