In [3]:
from striprtf.striprtf import rtf_to_text
import json
import re

# Input file read by main(); the relative path is resolved against the working directory.
RTF_PATH = "ServiceTypeProfiles.rtf"
# Output file that main() writes the extracted heading -> terms mapping to.
OUTPUT_JSON = "service_terms.json"


def is_heading(line):
    """
    Heuristically decide whether a line of text could be a subheading.

    A line is accepted only when, ignoring surrounding whitespace, it
    - is not empty,
    - does not start with "BLOCK",
    - does not start with a number such as "3" or "1.2.4", and
    - contains more than two whitespace-separated words.

    Whether the lines that follow are phrases is not examined here; the
    caller has to check that itself.

    Args:
        line: One line of text. Leading and trailing whitespace is ignored.

    Returns:
        True if the line passes every check above, otherwise False.
    """
    text = line.strip()
    if not text:
        return False

    # Test the stripped text so indentation cannot hide a BLOCK marker
    # or a leading section number.
    if text.startswith("BLOCK"):
        return False

    if re.match(r"^\d+(\.\d+)*", text):
        return False

    # NOTE(review): one- and two-word lines are rejected, so a short title
    # such as "Hospital" can never be a heading — confirm this is intended.
    if len(text.split()) <= 2:
        return False

    return True


def build_service_json(text, terms_per_heading=5):
    """
    Group plain text into a mapping of heading -> following terms.

    The text is split into lines, each line is stripped, and blank lines are
    dropped. The lines are then scanned top to bottom: when a line passes
    is_heading() and each of the next ``terms_per_heading`` lines has at
    least two words, the line becomes a key and those lines become its
    terms. Scanning resumes after the consumed terms; otherwise it moves on
    by one line.

    Args:
        text: Plain text to scan.
        terms_per_heading: Number of lines taken as terms after each
            heading. Defaults to 5.

    Returns:
        dict mapping each heading to ``{"terms": [...]}``. A heading that
        occurs more than once keeps only its last occurrence. Lines too
        close to the end to be followed by a full set of terms are never
        treated as headings.

    Raises:
        ValueError: If ``terms_per_heading`` is less than 1.
    """
    if terms_per_heading < 1:
        raise ValueError("terms_per_heading must be at least 1")

    stripped_lines = (raw.strip() for raw in text.splitlines())
    lines = [entry for entry in stripped_lines if entry]
    result = {}

    # A heading needs a full window of terms after it, so stop scanning once
    # fewer than terms_per_heading lines would remain.
    last_start = len(lines) - terms_per_heading

    i = 0
    while i < last_start:
        line = lines[i]

        if is_heading(line):
            terms = lines[i + 1:i + 1 + terms_per_heading]

            # Every candidate term must have at least two words.
            if all(len(term.split()) >= 2 for term in terms):
                result[line] = {
                    "terms": terms
                }
                # Jump past the heading and the terms it consumed.
                i += terms_per_heading + 1
                continue

        i += 1

    return result


def main():
    """
    Convert the RTF file to JSON and report the result.

    Reads RTF_PATH, converts its content to plain text with rtf_to_text,
    builds the heading -> terms mapping with build_service_json, writes the
    mapping to OUTPUT_JSON, and prints how many headings were extracted and
    where they were saved.
    """
    # errors="ignore" silently drops any bytes that are not valid UTF-8.
    with open(RTF_PATH, "r", encoding="utf-8", errors="ignore") as f:
        rtf_content = f.read()

    plain_text = rtf_to_text(rtf_content)

    service_json = build_service_json(plain_text)

    # ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump(service_json, f, indent=2, ensure_ascii=False)

    print(f"✅ Extracted {len(service_json)} subheadings")
    print(f"📁 Saved to {OUTPUT_JSON}")


# Run the extraction only when this code executes as the top-level module.
if __name__ == "__main__":
    main()


✅ Extracted 153 subheadings
📁 Saved to service_terms.json


In [48]:
import json

# Reload the saved JSON from disk so the following cells work on the file's content.
# NOTE(review): this path repeats the literal stored in OUTPUT_JSON — keep the two in sync.
with open("service_terms.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Number of top-level entries in the loaded JSON.
len(data)

191

In [49]:
import pandas as pd

# Excel workbook to compare against, located in the ground_truth folder.
file_path = "ground_truth/EDI X12 271 Code reference.xlsx"

# An integer sheet_name is a zero-based sheet position, so 12 selects the 13th sheet.
# NOTE(review): presumably that sheet lists the EB03 service type codes — confirm,
# and consider selecting it by name so reordering the workbook cannot break this.
df = pd.read_excel(file_path, sheet_name=12)
# Values of the "Definition" column as a plain Python list.
eb03 = df["Definition"].to_list()
eb03

['Medical Care',
 'Surgical',
 'Consultation',
 'Diagnostic X-Ray',
 'Diagnostic Lab',
 'Radiation Therapy',
 'Anesthesia',
 'Surgical Assistance',
 'Other Medical',
 'Blood Charges',
 'Used Durable Medical Equipment',
 'Durable Medical Equipment Purchase',
 'Ambulatory Service Center Facility',
 'Renal Supplies in the Home',
 'Alternate Method Dialysis',
 'Chronic Renal Disease (CRD) Equipment',
 'Pre-Admission Testing',
 'Durable Medical Equipment Rental',
 'Pneumonia Vaccine',
 'Second Surgical Opinion',
 'Third Surgical Opinion',
 'Social Work',
 'Diagnostic Dental',
 'Periodontics',
 'Restorative',
 'Endodontics',
 'Maxillofacial Prosthetics',
 'Adjunctive Dental Services',
 'Health Benefit Plan Coverage',
 'Plan Waiting Period',
 'Chiropractic',
 'Chiropractic Office Visits',
 'Dental Care',
 'Dental Crowns',
 'Dental Accident',
 'Orthodontics',
 'Prosthodontics',
 'Oral Surgery',
 'Routine (Preventive) Dental',
 'Home Health Care',
 'Home Health Prescriptions',
 'Home Health Vis

In [50]:
# Keys of the loaded JSON as a list; the comparison below matches against these.
heading = list(data.keys())
heading

['Professional (Physician)',
 'Surgical Benefits - Professional (Physician)',
 'Independent Medical Evaluation',
 'Professional (Physician) Visit - Office',
 'Physician Visit - Office: Sick',
 'Physician Visit - Office: Well',
 'Professional (Physician) Visit - Inpatient',
 'Professional (Physician) Visit - Outpatient',
 'Professional (Physician) Visit - Nursing Home',
 'Professional (Physician) Visit - Skilled Nursing Facility',
 'Professional (Physician) Visit - Home',
 'Anesthesiologist',
 'Surgical Assistance',
 'Hospital',
 'Hospital - Inpatient',
 'Hospital - Room and Board',
 'Hospital - Outpatient',
 'Hospital - Ambulatory Surgical',
 'Ambulatory Service Center Facility',
 'Hospital - Emergency Medical',
 'Hospital - Emergency Accident',
 'Intensive Care',
 'Coronary Care',
 'Neonatal Intensive Care',
 'Emergency Services',
 'Urgent Care',
 'Diagnostic X-Ray',
 'Screening X-ray',
 'MRI/CAT Scan',
 'Diagnostic Lab',
 'Screening laboratory',
 'Pathology',
 'Mammogram, High Risk P

In [51]:
# Print every reference definition that has no exactly matching extracted heading.
for service in eb03:
    if service in heading:
        continue  # exact, case-sensitive match found; nothing to report
    print(service)