In [1]:
import pandas as pd
import re

from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
dutch_stopwords = stopwords.words('dutch')

In [2]:
# Read the preprocessed data and preview the first rows.
# NOTE(review): the path points at a fixed drive letter ('a:/'), so this cell
# only runs where that drive is available — consider a configurable data dir.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which suggests the
# CSV was saved together with its index — confirm whether index_col=0 is wanted.
df_specific = pd.read_csv('a:/df_cleaned.csv')
df_specific.head()

Unnamed: 0,pseudo_id,verslagen_report_tags,verslagen_report_content,verslagen_report_start_date,date,alltext,tokens
0,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Klinische Brief,"Dhr. A.J. Dingemans, huisarts\r\n[STREETNAME] ...",2020-11-26 15:06:00,2020-11-26 15:06:00,dhr aj dingemans huisarts streetnaam city datu...,"['dhr', 'aj', 'dingemans', 'huisarts', 'street..."
1,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, Kliniek: vervolgconsult",Samenvatting: \nRectaal bloedverlies obv diver...,2020-11-26 09:53:00,2020-11-26 09:53:00,samenvatting rectaal bloedverlie obvn divertik...,"['samenvatting', 'rectaal', 'bloedverlie', 'ob..."
2,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Poliklinische Brief,COLOSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] [L...,2020-11-25 14:13:00,2020-11-25 14:13:00,coloscopie betreffen mw initials lastname adre...,"['coloscopie', 'betreffen', 'mw', 'initials', ..."
3,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Poliklinische Brief,GASTROSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] ...,2020-11-25 13:48:00,2020-11-25 13:48:00,gastroscopie betreffen mw initials lastname ad...,"['gastroscopie', 'betreffen', 'mw', 'initials'..."
4,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, Kliniek: vervolgconsult",Samenvatting: \nRectaal bloedverlies ; eenmali...,2020-11-25 08:47:00,2020-11-25 08:47:00,samenvatting rectaal bloedverlie eenmalig hd h...,"['samenvatting', 'rectaal', 'bloedverlie', 'ee..."


In [3]:
# Regex that flags mentions of (acute) gastrointestinal bleeding in Dutch
# clinical text. Every entry of `terms` is a regex fragment; the fragments are
# OR-ed together and compiled case-insensitively into `agib_pattern`.
#
# Not included (these broader candidates were disabled while tuning the
# pattern — TODO confirm whether any should return): a bare "gi", "gastro",
# "bloeding uit het maag-darmkanaal", "bloedverlies gastro-intestinaal".
terms = [
    r"maagdarmbloeding",
    r"gastro[-\s]?intestinale bloeding",
    # The abbreviations are anchored with word boundaries instead of literal
    # surrounding spaces. That way they are also found at the very start or end
    # of a text and next to punctuation or line breaks, and the matched text is
    # the bare abbreviation without padding. The boundaries still prevent hits
    # inside longer words.
    r"\bgib\b",
    r"\bagib\b",
    r"gi\s*bloeding",
]

pattern_str = "|".join(terms)
agib_pattern = re.compile(pattern_str, re.IGNORECASE)



In [4]:
def extract_relevant_sections(text, relevant_headers=None):
    """Return the content of selected report sections, concatenated.

    A section starts at a line that begins with a capitalised header followed
    by a colon (e.g. ``Anamnese:``) and runs until the next such header or the
    end of the text. Multi-word headers are recognised only when every word is
    capitalised and all words are on the same line.

    Parameters
    ----------
    text : str
        Report text that still has its line breaks, capitalisation and colons;
        header detection relies on all three.
    relevant_headers : list of str, optional
        Headers to keep, compared case-insensitively. Defaults to
        ``["anamnese", "conclusie", "beleid", "tractusanamnese", "triagekeuze"]``.

    Returns
    -------
    str
        The stripped content of every requested section that is present, in
        the order of ``relevant_headers``, each followed by ``"\\n"``. Empty
        string when none of the requested sections occur. When a header occurs
        more than once, the last occurrence is used.
    """
    if relevant_headers is None:
        relevant_headers = ["anamnese", "conclusie", "beleid", "tractusanamnese", "triagekeuze"]

    # Split on headers. `^` with MULTILINE also catches a header on the very
    # first line; `[ \t]*` (not `\s*`) keeps a header from spanning line breaks.
    parts = re.split(
        r"^([A-Z][a-z]+(?:[ \t]*[A-Z][a-z]+)*):",
        text,
        flags=re.MULTILINE,
    )

    # With one capture group re.split yields
    # [preamble, header1, content1, header2, content2, ...].
    # The preamble (index 0) belongs to no header, so pairing starts at index 1.
    section_dict = {}
    for header, content in zip(parts[1::2], parts[2::2]):
        section_dict[header.strip().lower()] = content.strip()

    # Keep only the requested sections, in the requested order.
    combined = ""
    for header in relevant_headers:
        key = header.strip().lower()
        if key in section_dict:
            combined += section_dict[key] + "\n"

    return combined


In [5]:

def extract_context(text, pattern, window=5):
    """Return the first match of ``pattern`` with its surrounding words.

    Parameters
    ----------
    text : str
        Text to search.
    pattern : re.Pattern
        Compiled regular expression.
    window : int, default 5
        Maximum number of words (``\\w+`` tokens) to keep on each side of the
        match. ``0`` returns the matched text only; negative values are
        treated as ``0``.

    Returns
    -------
    str or None
        ``"<words before> <matched text> <words after>"`` joined by single
        spaces, or ``None`` when ``pattern`` does not occur in ``text``.
    """
    # Only the first occurrence is reported, so a single search is enough.
    match = pattern.search(text)
    if match is None:
        return None

    window = max(window, 0)
    before_words = re.findall(r'\w+', text[:match.start()])
    after_words = re.findall(r'\w+', text[match.end():])

    # An explicit start index is used because ``before_words[-window:]`` would
    # return the *entire* list for window == 0 (``-0`` is ``0``).
    leading = before_words[max(len(before_words) - window, 0):]
    trailing = after_words[:window]

    return ' '.join(leading + [match.group()] + trailing)





In [6]:
# Flag every report whose preprocessed text ('alltext') contains one of the
# AGIB terms. str(x) guards against non-string cells (e.g. NaN becomes 'nan').
df_specific['agib_mention'] = df_specific['alltext'].apply(lambda x: bool(agib_pattern.search(str(x))))

# Optionally exclude negated matches: negation_pattern looks for a negation
# word (geen/niet/zonder) followed within at most 20 characters by a bleeding
# term.
# NOTE(review): 'negated' is evaluated over the whole report, independently of
# where the AGIB term matched, so one negated bleeding phrase anywhere in the
# report clears 'agib_final' for that report — confirm this is intended.
negation_pattern = re.compile(r"\b(geen|niet|zonder)\b.{0,20}?\b(bloed(braken)?|hematemesis|melena|maagdarmbloeding|gastro[- ]intestinale bloeding|rectaal bloedverlies|vers bloed per anum|bloeding uit het maag[- ]darmkanaal)\b", re.IGNORECASE)
df_specific['negated'] = df_specific['alltext'].apply(lambda x: bool(negation_pattern.search(str(x))))
# Final flag: an AGIB term is present and no negated bleeding phrase was found.
df_specific['agib_final'] = df_specific['agib_mention'] & ~df_specific['negated']


# Store the first AGIB match together with its surrounding words
# (None when the report has no match).
df_specific['agib_context'] = df_specific['alltext'].apply(lambda x: extract_context(str(x), agib_pattern))
# NOTE(review): this is not the last expression of the cell, so the preview is
# not displayed.
df_specific.head()
def extract_match(text, pattern):
    """Return the text of the first match of ``pattern`` in ``text``.

    ``pattern`` is a compiled regular expression. The result is the exact
    matched substring, or ``None`` when the pattern does not occur.
    """
    found = pattern.search(text)
    if found is None:
        return None
    return found.group()

# Store the exact text of the first AGIB match per report (None when there is
# no match), then display the frame as the cell output.
df_specific['agib_match'] = df_specific['alltext'].apply(lambda x: extract_match(str(x), agib_pattern))
df_specific

Unnamed: 0,pseudo_id,verslagen_report_tags,verslagen_report_content,verslagen_report_start_date,date,alltext,tokens,agib_mention,negated,agib_final,agib_context,agib_match
0,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Klinische Brief,"Dhr. A.J. Dingemans, huisarts\r\n[STREETNAME] ...",2020-11-26 15:06:00,2020-11-26 15:06:00,dhr aj dingemans huisarts streetnaam city datu...,"['dhr', 'aj', 'dingemans', 'huisarts', 'street...",False,False,False,,
1,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, Kliniek: vervolgconsult",Samenvatting: \nRectaal bloedverlies obv diver...,2020-11-26 09:53:00,2020-11-26 09:53:00,samenvatting rectaal bloedverlie obvn divertik...,"['samenvatting', 'rectaal', 'bloedverlie', 'ob...",False,False,False,,
2,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Poliklinische Brief,COLOSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] [L...,2020-11-25 14:13:00,2020-11-25 14:13:00,coloscopie betreffen mw initials lastname adre...,"['coloscopie', 'betreffen', 'mw', 'initials', ...",False,False,False,,
3,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Poliklinische Brief,GASTROSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] ...,2020-11-25 13:48:00,2020-11-25 13:48:00,gastroscopie betreffen mw initials lastname ad...,"['gastroscopie', 'betreffen', 'mw', 'initials'...",False,False,False,,
4,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, Kliniek: vervolgconsult",Samenvatting: \nRectaal bloedverlies ; eenmali...,2020-11-25 08:47:00,2020-11-25 08:47:00,samenvatting rectaal bloedverlie eenmalig hd h...,"['samenvatting', 'rectaal', 'bloedverlie', 'ee...",False,False,False,,
...,...,...,...,...,...,...,...,...,...,...,...,...
9572,FD8C682C1F4FDA1E5EC0B760D30875556419BD71,Consult,Samenvatting: \n1e consult\r\n-Type 1e consult...,2015-03-20 08:13:00,2015-03-20 08:13:00,samenvatting consult type consult uitbreiden a...,"['samenvatting', 'consult', 'type', 'consult',...",False,False,False,,
9573,FD8C682C1F4FDA1E5EC0B760D30875556419BD71,"Consult, Kliniek: vervolgconsult",Samenvatting: \nDecursus\r\n-Type decursus: De...,2015-01-14 15:39:00,2015-01-14 15:39:00,samenvatting decursus type decursus decursus s...,"['samenvatting', 'decursus', 'type', 'decursus...",False,False,False,,
9574,FD8C682C1F4FDA1E5EC0B760D30875556419BD71,"Consult, SEH",Samenvatting: \nVerpleegkundige verslaglegging...,2014-12-21 09:31:00,2014-12-21 09:31:00,samenvatting verpleegkundig verslaglegging ver...,"['samenvatting', 'verpleegkundig', 'verslagleg...",False,False,False,,
9575,FD8C682C1F4FDA1E5EC0B760D30875556419BD71,"Consult, SEH",Samenvatting: \nMedisch Dossier\r\n[ Vk Sputov...,2010-11-10 21:03:00,2010-11-10 21:03:00,samenvatting medisch dossier vk sputovamo leef...,"['samenvatting', 'medisch', 'dossier', 'vk', '...",False,False,False,,


In [7]:
# New column holding only the text of the relevant report sections.
# Section headers are recognised by a capitalised word at the start of a line
# followed by a colon. Judging by the displayed rows, the preprocessed
# 'alltext' column is lowercased with punctuation and line breaks removed, so
# headers can never be found there; the raw report content is used instead.
df_specific['agib_relevant_text'] = df_specific['verslagen_report_content'].apply(lambda x: extract_relevant_sections(str(x)))

# Match only within the relevant text
df_specific['agib_mention_relevant'] = df_specific['agib_relevant_text'].apply(lambda x: bool(agib_pattern.search(x)))

# Optional: matched term and its context, restricted to the relevant text.
# NOTE(review): these assignments overwrite the 'agib_match' / 'agib_context'
# columns that were computed from 'alltext' earlier.
df_specific['agib_match'] = df_specific['agib_relevant_text'].apply(lambda x: extract_match(str(x), agib_pattern))
df_specific['agib_context'] = df_specific['agib_relevant_text'].apply(lambda x: extract_context(str(x), agib_pattern))


In [8]:
# # Apply regex
# df_specific['agib_mention'] = df_specific['alltext'].apply(lambda x: bool(agib_pattern.search(str(x))))

# # Optionally exclude negated matches
# negation_pattern = re.compile(r"\b(geen|niet|zonder)\b.{0,20}?\b(bloed(braken)?|hematemesis|melena|maagdarmbloeding|gastro[- ]intestinale bloeding|rectaal bloedverlies|vers bloed per anum|bloeding uit het maag[- ]darmkanaal)\b", re.IGNORECASE)
# df_specific['negated'] = df_specific['alltext'].apply(lambda x: bool(negation_pattern.search(str(x))))
# df_specific['agib_final'] = df_specific['agib_mention'] & ~df_specific['negated']

In [9]:
def extract_context(text, pattern, window=5):
    """Return the first match of ``pattern`` with its surrounding words.

    Parameters
    ----------
    text : str
        Text to search.
    pattern : re.Pattern
        Compiled regular expression.
    window : int, default 5
        Maximum number of words (``\\w+`` tokens) to keep on each side of the
        match. ``0`` returns the matched text only; negative values are
        treated as ``0``.

    Returns
    -------
    str or None
        ``"<words before> <matched text> <words after>"`` joined by single
        spaces, or ``None`` when ``pattern`` does not occur in ``text``.
    """
    # Only the first occurrence is reported, so a single search is enough.
    match = pattern.search(text)
    if match is None:
        return None

    window = max(window, 0)
    before_words = re.findall(r'\w+', text[:match.start()])
    after_words = re.findall(r'\w+', text[match.end():])

    # An explicit start index is used because ``before_words[-window:]`` would
    # return the *entire* list for window == 0 (``-0`` is ``0``).
    leading = before_words[max(len(before_words) - window, 0):]
    trailing = after_words[:window]

    return ' '.join(leading + [match.group()] + trailing)



# Recompute the context column from the full preprocessed text ('alltext').
# This replaces the section-restricted values that the In [7] cell wrote to
# 'agib_context'.
df_specific['agib_context'] = df_specific['alltext'].apply(lambda x: extract_context(str(x), agib_pattern))
# NOTE(review): this is not the last expression of the cell, so the preview is
# not displayed.
df_specific.head()
def extract_match(text, pattern):
    """Return the text of the first match of ``pattern`` in ``text``.

    ``pattern`` is a compiled regular expression. The result is the exact
    matched substring, or ``None`` when the pattern does not occur.
    """
    found = pattern.search(text)
    if found is None:
        return None
    return found.group()

# Recompute the exact first match per report from the full preprocessed text
# ('alltext'); this replaces the section-restricted values that the In [7]
# cell wrote to 'agib_match'. The frame is displayed as the cell output.
df_specific['agib_match'] = df_specific['alltext'].apply(lambda x: extract_match(str(x), agib_pattern))
df_specific

Unnamed: 0,pseudo_id,verslagen_report_tags,verslagen_report_content,verslagen_report_start_date,date,alltext,tokens,agib_mention,negated,agib_final,agib_context,agib_match,agib_relevant_text,agib_mention_relevant
0,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Klinische Brief,"Dhr. A.J. Dingemans, huisarts\r\n[STREETNAME] ...",2020-11-26 15:06:00,2020-11-26 15:06:00,dhr aj dingemans huisarts streetnaam city datu...,"['dhr', 'aj', 'dingemans', 'huisarts', 'street...",False,False,False,,,,False
1,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, Kliniek: vervolgconsult",Samenvatting: \nRectaal bloedverlies obv diver...,2020-11-26 09:53:00,2020-11-26 09:53:00,samenvatting rectaal bloedverlie obvn divertik...,"['samenvatting', 'rectaal', 'bloedverlie', 'ob...",False,False,False,,,,False
2,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Poliklinische Brief,COLOSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] [L...,2020-11-25 14:13:00,2020-11-25 14:13:00,coloscopie betreffen mw initials lastname adre...,"['coloscopie', 'betreffen', 'mw', 'initials', ...",False,False,False,,,,False
3,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Poliklinische Brief,GASTROSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] ...,2020-11-25 13:48:00,2020-11-25 13:48:00,gastroscopie betreffen mw initials lastname ad...,"['gastroscopie', 'betreffen', 'mw', 'initials'...",False,False,False,,,,False
4,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, Kliniek: vervolgconsult",Samenvatting: \nRectaal bloedverlies ; eenmali...,2020-11-25 08:47:00,2020-11-25 08:47:00,samenvatting rectaal bloedverlie eenmalig hd h...,"['samenvatting', 'rectaal', 'bloedverlie', 'ee...",False,False,False,,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9572,FD8C682C1F4FDA1E5EC0B760D30875556419BD71,Consult,Samenvatting: \n1e consult\r\n-Type 1e consult...,2015-03-20 08:13:00,2015-03-20 08:13:00,samenvatting consult type consult uitbreiden a...,"['samenvatting', 'consult', 'type', 'consult',...",False,False,False,,,,False
9573,FD8C682C1F4FDA1E5EC0B760D30875556419BD71,"Consult, Kliniek: vervolgconsult",Samenvatting: \nDecursus\r\n-Type decursus: De...,2015-01-14 15:39:00,2015-01-14 15:39:00,samenvatting decursus type decursus decursus s...,"['samenvatting', 'decursus', 'type', 'decursus...",False,False,False,,,,False
9574,FD8C682C1F4FDA1E5EC0B760D30875556419BD71,"Consult, SEH",Samenvatting: \nVerpleegkundige verslaglegging...,2014-12-21 09:31:00,2014-12-21 09:31:00,samenvatting verpleegkundig verslaglegging ver...,"['samenvatting', 'verpleegkundig', 'verslagleg...",False,False,False,,,,False
9575,FD8C682C1F4FDA1E5EC0B760D30875556419BD71,"Consult, SEH",Samenvatting: \nMedisch Dossier\r\n[ Vk Sputov...,2010-11-10 21:03:00,2010-11-10 21:03:00,samenvatting medisch dossier vk sputovamo leef...,"['samenvatting', 'medisch', 'dossier', 'vk', '...",False,False,False,,,,False


In [10]:
# Number of reports flagged by the regex: summing the 'agib_mention' column
# counts its True rows. Note this is 'agib_mention', not the negation-filtered
# 'agib_final'.
print("Aantal agib_mention == True:", df_specific['agib_mention'].sum())


Aantal agib_mention == True: 197


In [11]:
# Print date, matched term, context and raw content of every report for which
# a match was stored in 'agib_match'.
matched_reports = df_specific.loc[df_specific['agib_match'].notna()]
for i, row in matched_reports.iterrows():
    print(
        f"🩸 Date: {row['date']}",
        f"🩸 Match: {row['agib_match']}",
        f"📍 Context: {row['agib_context']}",
        f"📜 Content: {row['verslagen_report_content']}",
        "-----",
        sep="\n",
    )


🩸 Date: 2016-08-22 15:14:00
🩸 Match: gibloeding
📍 Context: telefoonnumm phonenumber mts triage triagekeus gibloeding gibloeding zwart donkerrode ontlasting triage
📜 Content: Samenvatting: 
Verpleegkundige verslaglegging
-Verantwoordelijk verpleegkundige:: <ObjectGegevens>
[ Vplk SPUTOVAMO ]
-Leeftijd: 83
[ Collumcare ]
[ Sepsis ]
[ Anamnese ]
-Leeftijd anamnese: 83
-(Hoofd)klacht: <ObjectGegevens>
-A = Allergieën (alleen verpleegkundig): AMOXICILLINE / PENICILLINES / AMOXICILLINE/AMPICILLINE / 
-M = Medication: zie lijst
-P = Past: AF
-L = Last meal: 12.30 uur
-E = Event: Rectaal bloedverlies donker  sinds zaterdag    begonnen met buikpijn   na gebruik antibiotica  zwaar gevoel in de buik A vrij B gb C rectaal bloedverlies D alert
[ Neurologische score ]
[ Trombosedienst ]
-Pijnscore invullen?: Ja
-Patientkenmerk: SEH patiënt
[ Pijnscore ]
-Pijnscore: 1
-Score tbv grafiek: 1 
-Verpleegkundige handelingen: IV canule (V), Lab, Urinesediment en Urinekweek
[ Contacten ]
[ Contactpersonen ]

In [12]:
# Print date, matched term, context and raw content of every report that
# belongs to one specific pseudo_id.
patient_reports = df_specific.loc[
    df_specific['pseudo_id'].eq('C71B153E9F184C29E8D39654E41C8B54586E3AF4')
]
for i, row in patient_reports.iterrows():
    print(
        f"🩸 Date: {row['date']}",
        f"🩸 Match: {row['agib_match']}",
        f"📍 Context: {row['agib_context']}",
        f"📜 Content: {row['verslagen_report_content']}",
        "-----",
        sep="\n",
    )


🩸 Date: 2020-01-08 12:58:00
🩸 Match: None
📍 Context: None
📜 Content: Reden van komst / Verwijzing: 
Reden van komst (brief): Aanrijding fiets 

Anamnese: 
Patiënte is op de fiets aangereden door auto (ongeveer 20km/u). Daarbij met hoofd op vooruit gekomen en daarna op straat gevallen. Geen bewustzijnsverlies.
Anamnestisch bekend met onregelmatig hartritme, echter geen documentatie hiervan. Niet onder behandeling van een cardioloog. 
Allergie: geen bekend.
CLOPIDOGREL TABLET   75MG (ORAAL), 1 x per dag 75 milligram
FERROFUMARAAT TABLET 200MG (ORAAL), 1 x per 2 dagen 1 stuk 7-1-2020 laatste
FOLIUMZUUR TABLET 0,5MG (ORAAL), 1 x per dag 1 stuk
LERCANIDIPINE TABLET OMHULD 10MG (ORAAL), 1 x per dag 1 stuk
ENALAPRIL/HYDROCHLOORTHIAZIDE TABLET 20/12,5MG (ORAAL), 1 x per dag 1 stuk




Lichamelijk onderzoek: 
A: vrij, CWK drukpijnlijk C3-C4 
B: SaO2 98% bij kl.  Pul: VAG bdz. Drukpijn sternum+. Geen uitwendig letsel. Geen drukpijn ribben
C: Bloeddruk 158/65mmHg, 122/min . Abd: soepel, niet druk

In [13]:
# Distinct pseudo_ids that have at least one report flagged in 'agib_mention',
# collected into a one-column frame for display.
has_agib_mention = df_specific['agib_mention'] == True
pseudo_ids_with_agib = df_specific.loc[has_agib_mention, 'pseudo_id'].unique()
pseudo_ids_df = pd.DataFrame(data=pseudo_ids_with_agib, columns=['pseudo_id'])
pseudo_ids_df

Unnamed: 0,pseudo_id
0,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6
1,0BC512A65442D0BB4B00FBE05E7EA6283E5C11FD
2,0DD73490997F61870C32B3BB7C9CDE4E801FFF7D
3,0E041554A0B23505AD762D006AB320CF4AF8F969
4,12D65DA1F4CFAC101DE53C050C9037D97F42FC18
5,21031C9814F3BD6EFACD9FBC5AF409820CDABAB4
6,2562E6AE0D16F0504B50CEE14D0D1F0A37596ED8
7,37099C38CFE1055CF6950B3D61CEC774849364D9
8,37BFA228DFA04BE0596DCCC63D245B42B3A15727
9,395219FA6E57FE6DA93F36F3EB68F4667D325895
