# Introduction
In clinical care settings, ECG interpretations are often recorded as free-text. It can be challenging to translate these into binary labels for training and evaluation due to synonyms, acronyms, grammar, typographical errors, evolving medical terminology, and implied findings.

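This notebook walks through a pattern-based labeler that converts such free-text interpretations into labels. Specifically, we: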
- Apply pattern matching (patterns were hand-curated), maintaining positional information
- Derive a series of entities (e.g., 'tachycardia', 'infarction'), descriptors (e.g., 'probably', 'moderate', 'acute'), and connectives (e.g., 'associated with', 'transitions to')
- Distill relevant information from the descriptors and connectives down into their corresponding entities
- Apply a knowledge graph encoding label relationships to recursively mark labels as true, e.g., labeling *Ventricular tachycardia* when *Torsades de Pointes* was stated.
- Map the resulting entities into labels which can be flexibly manipulated.


In [None]:
import os
import pandas as pd

# Repository root: the parent of the current working directory.
root = os.path.split(os.getcwd())[0]
labeler_dir = os.path.join(root, 'data/mimic_iv_ecg/labeler')
labeler_results_file = os.path.join(labeler_dir, 'labeler_res.pkl')

# Remove a results file left over from an earlier run (presumably so the
# labeler run further down starts fresh instead of restoring old results).
if os.path.exists(labeler_results_file):
    os.remove(labeler_results_file)

# Load labeler

In [None]:
!pip install tqdm
!pip install networkx

In [None]:
import sys
# Put the repository's `labeler/` directory first on the module search path;
# the `pattern_labeler` and `preprocess` imports below are presumably resolved
# from there, so this line has to run before them.
sys.path.insert(0, os.path.join(root, 'labeler/'))

from pattern_labeler import PatternLabelerConfig, PatternLabeler
from preprocess import preprocess_texts

In [None]:
# Build the labeler configuration from `labeler_dir` (presumably the JSON
# definition files stored there — confirm against PatternLabelerConfig.from_json).
labeler_config = PatternLabelerConfig.from_json(labeler_dir)
# Trailing bare expression: the notebook displays the loaded entity templates.
labeler_config.entity_templates

In [None]:
# Instantiate the labeler from the configuration loaded above.
labeler = PatternLabeler(labeler_config)
# Trailing bare expression: the notebook displays the labeler's representation.
labeler

In [None]:
# Plot the subgraph around the "Tachycardia" label (judging by the method
# name, its ancestors in the label graph — confirm against PatternLabeler).
# The keyword arguments only control figure size, node size and font size.
labeler.plot_ancestor_subgraph(
    "Tachycardia",
    figsize=(15, 8),
    node_size=100,
    font_size=8,
)

# Preprocess

In [None]:
# Sample free-text interpretations. The strings are reproduced verbatim,
# including misspellings ("rhtyhm", "seotal") and shorthand ("Stach", "LAD").
interpretations = pd.Series(
    [
        "Sinus rhythm; Possible right atrial abnormality",
        "Sinus tach; Normal electrocardiogram except for rate",
        "Normal sinus rhtyhm; Normal ECG; missing lead v2",
        "Accelerated idioventricular rhythm; LAD; Borderline ECG",
        "Stach with PVC(s); Possible seotal infarct; Undefined",
    ]
)

# Preprocess a copy (the raw series is left untouched) and name the result "text".
texts = preprocess_texts(interpretations.copy()).rename("text")
texts

# Parse

In [None]:
# Run the labeler on a copy of the preprocessed texts.
# `restore_path` points at the results pickle removed at the top of the
# notebook and reloaded in the next section; presumably the labeler saves its
# results there (and/or restores from it) — confirm against PatternLabeler.
labeler_res = labeler(
    texts=texts.copy(),
    restore_path=labeler_results_file,
)

# Analyze results

In [None]:
import pickle

# Reload the labeler results from the pickle file, rebinding `labeler_res`.
# NOTE(review): pickle.load can execute arbitrary code while deserializing —
# only load a results file this notebook (or a trusted run) produced.
with open(labeler_results_file, "rb") as f:
    labeler_res = pickle.load(f)

## View unmatched text

In [None]:
# Keep only the rows whose 'unmatched' text is non-empty.
unmatched = labeler_res.text_results['unmatched'][
    labeler_res.text_results['unmatched'] != ''
].copy()
# Remove every character that is neither a word character nor whitespace,
# then trim. The pattern is a raw string so that `\w` and `\s` reach the regex
# engine as written instead of being invalid string escapes (which emit a
# DeprecationWarning, and a SyntaxWarning on Python 3.12+).
unmatched = unmatched.str.replace(r"[^\w\s]", "", regex=True).str.strip()
# Rows that held nothing but punctuation are now empty; drop them as well.
unmatched = unmatched[unmatched != ''].copy()
unmatched

# Create labels

In [None]:
# Work on a copy of the flat label table so the labeler results stay unmodified.
labels_flat = labeler_res.labels_flat.copy()

In [None]:
# Occurrence count per label name, ignoring names that contain " - ".
vcs = labels_flat.loc[
    ~labels_flat['name'].str.contains(" - ", regex=False),
    'name',
].value_counts()
vcs

In [None]:
# Removed from UHN labels:
# Normal sinus rhythm
# 2nd degree atrioventricular block
# Ventricular pacing
# Atrial pacing

# Labels kept for the final label matrix, in column order.
CONFIRM_LABELS = [
    "Poor data quality",
    "Sinus rhythm",
    "Sinus tachycardia",
    "Premature ventricular contraction",
    "Tachycardia",
    "Right atrial abnormality",
]

In [None]:
# Fraction of texts carrying each confirmed label. `reindex` is used instead
# of `.loc` so that a confirmed label which never occurs is reported as 0
# rather than raising KeyError.
vcs.reindex(CONFIRM_LABELS, fill_value=0) / len(texts)

In [None]:
# Restrict the flat label table to the confirmed labels.
labels_flat_final = labels_flat[labels_flat["name"].isin(CONFIRM_LABELS)]

# One-hot encode the label names, one column per confirmed label in
# CONFIRM_LABELS order. Reindexing the columns (instead of selecting them)
# gives a label that never occurs an all-False column rather than a KeyError.
labels = pd.get_dummies(labels_flat_final['name']).reindex(
    columns=CONFIRM_LABELS, fill_value=False
)
labels.index.name = 'idx'
# Collapse to one row per index value: True if any of its entries has the label.
labels = labels.groupby('idx').any()

# Add in rows which had no labels. The placeholder frame is created already
# filled with False so it is boolean; an empty (object-dtype) frame would turn
# every column of the concatenated result into object dtype.
no_label_rows = pd.DataFrame(
    False,
    index=texts.index[~texts.index.isin(labels.index)].copy(),
    columns=CONFIRM_LABELS,
)
labels = pd.concat([labels, no_label_rows]).sort_index()
labels.index.name = 'idx'
labels

In [None]:
# Sanity check: the label matrix must have exactly one row per input text.
assert len(labels) == len(texts)

# Labeler definition example

If you're looking to define your own labeler, it can be easier to start from Python code, rather than writing the JSON. It can then be converted to JSON for easier distribution and versioning.

In [None]:
from typing import Dict, List, Optional, Union

from pattern_labeler import (
    AttachedDescriptorTem,
    CompoundTem,
    Connective,
    DescriptorTem,
    Entity,
    EntityPattern,
    EntityTem,
    SplitDescriptorTem,
    TravelingDescriptorsTem,
    DescriptorPattern
)

# Entity templates: the entities this example labeler knows about.
# `sup` presumably names the parent entity (or entities) in the label graph,
# e.g. "Tachycardia" under "Arrhythmia" — confirm against pattern_labeler.EntityTem.
ENTITY_TEMPLATES: List[EntityTem] = [
    EntityTem("Sinus rhythm"),
    EntityTem("Arrhythmia"),
    EntityTem("Tachycardia", sup="Arrhythmia"),
    EntityTem("Sinus tachycardia", sup=["Sinus rhythm", "Tachycardia"]),
    EntityTem("Ectopic beat"),
    EntityTem("Ectopic ventricular contraction", sup="Ectopic beat"),
    EntityTem("Bifascicular block"),
    EntityTem("Right bundle branch block"),
    EntityTem("Fascicular block"),
]

# Entity patterns: each pairs a text pattern (first argument) with the name of
# an entity template defined above (second argument) — argument roles assumed
# from the values; verify against pattern_labeler.EntityPattern.
ENTITY_PATTERNS: List[EntityPattern] = [
    EntityPattern("tachycardia", "Tachycardia"),
    EntityPattern("ectopic beat", "Ectopic beat"),
]

# Descriptor templates: named descriptors, each assigned to a category
# (severity, location, quantity or uncertainty).
DESCRIPTOR_TEMPLATES: List[DescriptorTem] = [
    DescriptorTem("Severe", category="severity"),
    DescriptorTem("Sinus", category="location"),
    DescriptorTem("Atrial", category="location"),
    DescriptorTem("Ventricular", category="location"),
    DescriptorTem("Multiple", category="quantity"),
    DescriptorTem("Possible", category="uncertainty"),
    DescriptorTem("Probable", category="uncertainty"),
]

# Descriptor patterns: each pairs a text pattern (first argument) with the name
# of a descriptor template defined above (second argument) — argument roles
# assumed from the values; verify against pattern_labeler.DescriptorPattern.
# Note that the text "probably" is mapped to the "Probable" descriptor.
DESCRIPTOR_PATTERNS: List[DescriptorPattern] = [
    DescriptorPattern("severe", "Severe"),
    DescriptorPattern("atrial", "Atrial"),
    DescriptorPattern("possible", "Possible"),
    DescriptorPattern("probably", "Probable"),
    DescriptorPattern("multiple", "Multiple"),
]

# === Connectives ===
# Words that link entities. "suggests" additionally carries a descriptors list
# and a "causal" tag; presumably the list holds one slot per side of the
# connective (nothing for the first, "Probable" for the second) — confirm
# against pattern_labeler.Connective.
CONNECTIVES: List[Connective] = [
    Connective("and"),
    Connective("suggests", descriptors=[None, "Probable"], tags="causal"),
]

# Split descriptor: the pattern "atrioventricular" is associated with the two
# descriptors "Atrial" and "Ventricular" (presumably it is expanded into both —
# confirm against pattern_labeler.SplitDescriptorTem).
SPLIT_DESCRIPTOR_TEMPLATES: List[SplitDescriptorTem] = [
    SplitDescriptorTem(
        "Atrioventricular",
        split=["Atrial", "Ventricular"],
        patterns="atrioventricular",
    ),
]

# Compound entity: "Bifascicular block" is listed together with its two
# component entities (presumably one implies the other(s) — confirm the
# direction against pattern_labeler.CompoundTem).
COMPOUND_TEMPLATES: List[CompoundTem] = [
    CompoundTem(
        "Bifascicular block",
        ["Right bundle branch block", "Fascicular block"],
    ),
]

# Attached descriptor: ties the name "Atrial tachycardia" to the entity
# "Tachycardia" and the descriptor "Atrial" (argument roles assumed from the
# values; verify against pattern_labeler.AttachedDescriptorTem).
ATTACHED_DESCRIPTOR_TEMPLATES: List[AttachedDescriptorTem] = [
    AttachedDescriptorTem("Atrial tachycardia", "Tachycardia", "Atrial"),
]

# Traveling descriptors for the "Ectopic beat" entity. The entity name is
# spelled exactly as in the entity template and pattern in this example
# ("Ectopic beat", lower-case "b"); templates are presumably matched by name,
# so a differently-cased name would not refer to the same entity — confirm
# against pattern_labeler.TravelingDescriptorsTem.
TRAVELING_DESCRIPTOR_TEMPLATES: List[TravelingDescriptorsTem] = [
    TravelingDescriptorsTem("Ectopic beat", ["Multiple"]),
]

# Numeric value for each descriptor in the "uncertainty" category, keyed by
# descriptor name (how the labeler uses the value is defined in pattern_labeler).
UNCERTAINTY_MAP: Dict[str, float] = dict(Possible=0.5, Probable=0.7)

In [None]:
# Assemble a configuration from the Python definitions in the previous cell.
# This rebinds `labeler_config` and `labeler`, replacing the objects that were
# loaded from JSON earlier in the notebook.
labeler_config = PatternLabelerConfig(
    ENTITY_TEMPLATES,
    ENTITY_PATTERNS,
    descriptor_templates=DESCRIPTOR_TEMPLATES,
    descriptor_patterns=DESCRIPTOR_PATTERNS,
    split_descriptor_templates=SPLIT_DESCRIPTOR_TEMPLATES,
    connectives=CONNECTIVES,
    compound_templates=COMPOUND_TEMPLATES,
    attached_descriptor_templates=ATTACHED_DESCRIPTOR_TEMPLATES,
    traveling_descriptor_templates=TRAVELING_DESCRIPTOR_TEMPLATES,
    uncertainty_map=UNCERTAINTY_MAP,
)
labeler = PatternLabeler(labeler_config)
# Trailing bare expression: the notebook displays the labeler's representation.
labeler

In [None]:
# Export the Python-defined configuration as JSON under data/custom_labeler
# (relative to the repository root) for distribution and versioning.
custom_labeler_dir = os.path.join(root, 'data/custom_labeler')
labeler_config.to_json(custom_labeler_dir)