In [12]:
import pandas as pd
import spacy

nlp = spacy.load("en_core_web_sm")

df = pd.read_csv("../data/medical_transcription.csv")


In [14]:
text = df["transcription"].iloc[0]
doc = nlp(text)

for ent in doc.ents:
    print(ent.text, "->", ent.label_)


23-year-old -> DATE
Seattle -> GPE
Claritin -> PERSON
Zyrtec -> LANGUAGE
Allegra -> ORG
last summer -> DATE
two weeks ago -> DATE
daily -> DATE
Ortho Tri-Cyclen -> PERSON
Allegra -> ORG
130 pounds -> QUANTITY
Nasal -> ORG
Allergic -> ORG
Zyrtec -> NORP
Allegra -> ORG
Samples of Nasonex -> ORG
two -> CARDINAL
three weeks -> DATE


In [16]:
TARGET_LABELS = {
    "DISEASE",
    "DRUG",
    "DATE",
    "PROCEDURE",
    "ORG"
}

In [18]:
filtered_entities = []

for ent in doc.ents:
    if ent.label_ in TARGET_LABELS:
        filtered_entities.append({
            "text": ent.text,
            "label": ent.label_
        })

filtered_entities


[{'text': '23-year-old', 'label': 'DATE'},
 {'text': 'Allegra', 'label': 'ORG'},
 {'text': 'last summer', 'label': 'DATE'},
 {'text': 'two weeks ago', 'label': 'DATE'},
 {'text': 'daily', 'label': 'DATE'},
 {'text': 'Allegra', 'label': 'ORG'},
 {'text': 'Nasal', 'label': 'ORG'},
 {'text': 'Allergic', 'label': 'ORG'},
 {'text': 'Allegra', 'label': 'ORG'},
 {'text': 'Samples of Nasonex', 'label': 'ORG'},
 {'text': 'three weeks', 'label': 'DATE'}]

In [22]:
def extract_entities(text):
    doc = nlp(text)
    entities = []

    for ent in doc.ents:
        if ent.label_ in TARGET_LABELS:
            entities.append({
                "text": ent.text,
                "label": ent.label_
            })

    return entities


In [24]:
extract_entities(df["transcription"].iloc[3])


[{'text': 'Trace', 'label': 'ORG'}]

In [26]:
df["entities"] = df["transcription"].head(10).apply(extract_entities)
df[["medical_specialty", "entities"]]


Unnamed: 0,medical_specialty,entities
0,Allergy / Immunology,"[{'text': '23-year-old', 'label': 'DATE'}, {'t..."
1,Bariatrics,"[{'text': '13 years ago', 'label': 'DATE'}, {'..."
2,Bariatrics,"[{'text': 'ABC', 'label': 'ORG'}, {'text': 'to..."
3,Cardiovascular / Pulmonary,"[{'text': 'Trace', 'label': 'ORG'}]"
4,Cardiovascular / Pulmonary,[]
...,...,...
4994,Allergy / Immunology,
4995,Allergy / Immunology,
4996,Allergy / Immunology,
4997,Allergy / Immunology,


Entity Extraction Notes:
- Used spaCy for baseline NLP
- Extracted only relevant entities to reduce noise
- Focused on information extraction, not diagnosis
- Output structured JSON-like data for downstream use
