In [31]:
import pandas as pd
import re

In [32]:
df_reports = pd.read_csv('a:/bloeding-met-patientenlijst-gedetailleerd/verslagen-abb-corrected.csv')
df_reports['verslagen_report_start_date'] = pd.to_datetime(df_reports['verslagen_report_start_date'], errors='coerce')

df_reports.head()

Unnamed: 0,pseudo_id,verslagen_report_tags,verslagen_report_content,verslagen_report_age_at_time_of_event,verslagen_report_specialism,verslagen_report_start_date,abbreviations_corrected
0,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Klinische Brief,"Dhr. A.J. Dingemans, huisarts\r\n[STREETNAME] ...",87,"Maag-, Darm- en Leverziekten",2020-11-26 15:06:00,"meneer a.j. dingemans, huisarts\r\n[streetname..."
1,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, Kliniek: vervolgconsult",Samenvatting: \nRectaal bloedverlies obv diver...,87,"Maag-, Darm- en Leverziekten",2020-11-26 09:53:00,samenvatting: \nrectaal bloedverlies onder beg...
2,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Poliklinische Brief,COLOSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] [L...,87,"Maag-, Darm- en Leverziekten",2020-11-25 14:13:00,coloscopie\r\n\r\nbetreft\r\nmevrouw [initials...
3,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Poliklinische Brief,GASTROSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] ...,87,"Maag-, Darm- en Leverziekten",2020-11-25 13:48:00,gastroscopie\r\n\r\nbetreft\r\nmevrouw [initia...
4,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, Kliniek: vervolgconsult",Samenvatting: \nRectaal bloedverlies ; eenmali...,87,"Maag-, Darm- en Leverziekten",2020-11-25 08:47:00,samenvatting: \nrectaal bloedverlies ; eenmali...


In [33]:
df_measures = df_reports[['pseudo_id', 'verslagen_report_start_date', 'verslagen_report_specialism', 'verslagen_report_content']]
df_measures.head()

df = df_measures.copy()

In [34]:
df = df_measures.copy()

In [35]:
agib_terms = [
    r"acute gi\s*bloeding",                       
    r"acute maagdarmbloeding",                    
    r"acute gastro[-\s]?intestinale bloeding",    
    r"agib"
]

agib_terms_MDL = agib_terms + [
    r"acute bloeding",               
    r"massaal bloedverlies",         
    # r"shock",          
    r"spoed scopie",                                  
    r"melaena met hypotensie",
    # r"anemie"        
]

gib_terms = [
    r"mel[ae]na",
    r"hematemesis",
    r"hematochez[iy]e",
    r"rectaal bloed",
    r"rectaal verlies",
    r"zwart(e)? ontlasting",
    r"bloed in ontlasting",
    r"divertikelbloeding",
    r"(bloed)?stolsels",
    r"gastro[-\s]?intestinale bloeding",
    r"hemorragie"
]

gib_terms_MDL = gib_terms + [
    r"bloedverlies",
    r"vers bloed per anum",
    r"bloedende divertikels?",
    r"maagdarmbloeding",
    r"invloed van antistolling",
    r"hemoglobinedaling",
    r"bloedende ulcus",
    r"bloedend varices?",
    r"anemie"
]


In [36]:
def build_pattern(term_list):
    return re.compile(r"(" + "|".join(term_list) + r")", flags=re.IGNORECASE)

def extract_matches_and_context(text, pattern, window=5):
    text = str(text)
    matches = []
    for match in pattern.finditer(text):
        print("\u2705 Match found:", match.group())  # DEBUG LINE
        start, end = match.span()
        before = text[:start].split()[-window:]
        after = text[end:].split()[:window]
        context = " ".join(before + [match.group()] + after)
        matches.append(context)
    return matches if matches else None

def extract_match(text, pattern):
    match = pattern.search(text.lower())
    return match.group() if match else None

def apply_term_matching_with_specialism(df, text_col, spec_col, termlist_general, termlist_mdl, target_specialism, prefix):
    pattern_general = build_pattern(termlist_general)
    pattern_mdl = build_pattern(termlist_mdl)

    def match_fn(row):
        pattern = pattern_mdl if row[spec_col] == target_specialism else pattern_general
        return extract_matches_and_context(row[text_col], pattern)
    
    def match_fn_simple(row):
        pattern = pattern_mdl if row[spec_col] == target_specialism else pattern_general
        return extract_match(row[text_col], pattern)
    
    df[f'{prefix}_match'] = df.apply(match_fn_simple, axis=1)
    df[f'{prefix}_context'] = df.apply(match_fn, axis=1)
    df[f'has_{prefix}'] = df[f'{prefix}_context'].apply(lambda x: x is not None)
    return df

In [37]:
# Apply Patterns to Data
# Apply with specialism-aware logic
apply_term_matching_with_specialism(
    df, 
    text_col='verslagen_report_content', 
    spec_col='verslagen_report_specialism', 
    termlist_general=agib_terms, 
    termlist_mdl=agib_terms_MDL, 
    target_specialism='Maag-, Darm- en Leverziekten', 
    prefix='agib')

apply_term_matching_with_specialism(
    df, 
    text_col='verslagen_report_content', 
    spec_col='verslagen_report_specialism', 
    termlist_general=gib_terms, 
    termlist_mdl=gib_terms_MDL, 
    target_specialism='Maag-, Darm- en Leverziekten', 
    prefix='gib')

✅ Match found: Acute bloeding
✅ Match found: Acute bloeding
✅ Match found: Acute bloeding
✅ Match found: Acute bloeding
✅ Match found: Acute bloeding
✅ Match found: Acute bloeding
✅ Match found: Acute bloeding
✅ Match found: Acute bloeding
✅ Match found: Acute bloeding
✅ Match found: Acute bloeding
✅ Match found: Acute bloeding
✅ Match found: Acute bloeding
✅ Match found: Acute bloeding
✅ Match found: Acute bloeding
✅ Match found: Acute bloeding
✅ Match found: Acute bloeding
✅ Match found: Acute bloeding
✅ Match found: Acute bloeding
✅ Match found: Acute bloeding
✅ Match found: Acute bloeding
✅ Match found: Acute bloeding
✅ Match found: Acute bloeding
✅ Match found: Acute bloeding
✅ Match found: Acute bloeding
✅ Match found: Acute bloeding
✅ Match found: Acute bloeding
✅ Match found: Acute bloeding
✅ Match found: Acute bloeding
✅ Match found: Acute bloeding
✅ Match found: acute bloeding
✅ Match found: acute bloeding
✅ Match found: acute bloeding
✅ Match found: rectaal bloed
✅ Match fou

Unnamed: 0,pseudo_id,verslagen_report_start_date,verslagen_report_specialism,verslagen_report_content,agib_match,agib_context,has_agib,gib_match,gib_context,has_gib
0,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,2020-11-26 15:06:00,"Maag-, Darm- en Leverziekten","Dhr. A.J. Dingemans, huisarts\r\n[STREETNAME] ...",,,False,rectaal bloed,[in verband met melaena en rectaal bloed verli...,True
1,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,2020-11-26 09:53:00,"Maag-, Darm- en Leverziekten",Samenvatting: \nRectaal bloedverlies obv diver...,,,False,rectaal bloed,[Samenvatting: Rectaal bloed verlies obv diver...,True
2,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,2020-11-25 14:13:00,"Maag-, Darm- en Leverziekten",COLOSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] [L...,,,False,bloedverlies,[3 Verwijzer: J.T. Kamphuis Indicatie: bloedve...,True
3,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,2020-11-25 13:48:00,"Maag-, Darm- en Leverziekten",GASTROSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] ...,,,False,melena,[3 Verwijzer: J.T. Kamphuis Indicatie: Melena ...,True
4,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,2020-11-25 08:47:00,"Maag-, Darm- en Leverziekten",Samenvatting: \nRectaal bloedverlies ; eenmali...,,,False,rectaal bloed,[Samenvatting: Rectaal bloed verlies ; eenmali...,True
...,...,...,...,...,...,...,...,...,...,...
11085,FD8C682C1F4FDA1E5EC0B760D30875556419BD71,2015-03-20 08:13:00,Interne Geneeskunde,Samenvatting: \n1e consult\r\n-Type 1e consult...,,,False,,,False
11086,FD8C682C1F4FDA1E5EC0B760D30875556419BD71,2015-01-14 15:39:00,Interne Geneeskunde,Samenvatting: \nDecursus\r\n-Type decursus: De...,,,False,,,False
11087,FD8C682C1F4FDA1E5EC0B760D30875556419BD71,2014-12-21 09:31:00,Spoedeisende Hulp,Samenvatting: \nVerpleegkundige verslaglegging...,,,False,,,False
11088,FD8C682C1F4FDA1E5EC0B760D30875556419BD71,2010-11-10 21:03:00,Spoedeisende Hulp,Samenvatting: \nMedisch Dossier\r\n[ Vk Sputov...,,,False,,,False


In [38]:
df['has_agib'].value_counts(), df['has_gib'].value_counts()

(False    11067
 True        23
 Name: has_agib, dtype: int64,
 False    7538
 True     3552
 Name: has_gib, dtype: int64)

In [39]:
len(df[df['has_agib']]['pseudo_id'].unique()), len(df[df['has_gib']]['pseudo_id'].unique())

(3, 104)

In [40]:
len(df[df['has_agib'] | df['has_gib']]['pseudo_id'].unique())

104

In [41]:
def extract_context(text, pattern, window=5):
    text = text.lower()
    matches = []
    for match in pattern.finditer(text):
        start, end = match.start(), match.end()
        before = text[:start]
        after = text[end:]
        before_words = re.findall(r'\w+', before)[-window:]
        after_words = re.findall(r'\w+', after)[:window]
        context = ' '.join(before_words + [match.group()] + after_words)
        matches.append(context)
    return matches[0] if matches else None

def extract_match(text, pattern):
    match = pattern.search(text.lower())
    return match.group() if match else None

In [42]:
# including puncation
negation_pattern = re.compile(r"\b(geen|niet|zonder|uitsluiten|ontkent|ontkend|ontkennen|negatief voor|geen aanwijzing voor|kan uitgesloten worden|geen tekenen van)\b.{0,20}?\b(acute gi\s*bloeding|acute maagdarmbloeding|acute gastro[-\s]?intestinale bloeding| agib |acute bloeding|massaal bloedverlies|shock door bloeding|spoed scopie|hematemesis|melaena met hypotensie|mel[ae]na|hematemesis|hematochez[iy]e|rectaal bloed|rectaal verlies|zwart(e)? ontlasting|bloed in ontlasting|divertikelbloeding|(bloed)?stolsels|gastro[-\s]?intestinale bloeding|hemorragie|bloedverlies|vers bloed per anum|bloedende divertikels?|maagdarmbloeding|invloed van antistolling|hemoglobinedaling|bloedende ulcus|bloedend varices?|anemie)\b", re.IGNORECASE)

# exluding puncation
negation_pattern = re.compile(r"\b(geen|niet|zonder|uitsluiten|ontkent|ontkend|ontkennen|negatief voor|geen aanwijzing voor|kan uitgesloten worden|geen tekenen van)\b.[^.\n;:!?]{0,20}?\b(acute gi\s*bloeding|acute maagdarmbloeding|acute gastro[-\s]?intestinale bloeding| agib |acute bloeding|massaal bloedverlies|shock door bloeding|spoed scopie|hematemesis|melaena met hypotensie|mel[ae]na|hematemesis|hematochez[iy]e|rectaal bloed|rectaal verlies|zwart(e)? ontlasting|bloed in ontlasting|divertikelbloeding|(bloed)?stolsels|gastro[-\s]?intestinale bloeding|hemorragie|bloedverlies|vers bloed per anum|bloedende divertikels?|maagdarmbloeding|invloed van antistolling|hemoglobinedaling|bloedende ulcus|bloedend varices?|anemie)\b", re.IGNORECASE)


df['negated'] = df['verslagen_report_content'].apply(lambda x: bool(negation_pattern.search(str(x))))
df['negated_match'] = df['verslagen_report_content'].apply(lambda x: extract_match(str(x), negation_pattern))
df['negated_context'] = df['verslagen_report_content'].apply(lambda x: extract_matches_and_context(str(x), negation_pattern))

✅ Match found: geen rectaal bloedverlies
✅ Match found: Geen rectaal bloedverlies
✅ Match found: Geen bloedverlies
✅ Match found: Geen stolsels
✅ Match found: niet gebraakt (geen hematemesis
✅ Match found: Niet afgevallen, voor melena
✅ Match found: niet gebraakt (geen hematemesis
✅ Match found: Niet afgevallen, voor melena
✅ Match found: niet gebraakt (geen hematemesis
✅ Match found: Niet afgevallen, voor melena
✅ Match found: geen bloedverlies
✅ Match found: Geen bloedverlies
✅ Match found: Geen melena
✅ Match found: geen melena
✅ Match found: geen melena
✅ Match found: Geen melena
✅ Match found: geen bloedverlies
✅ Match found: geen rectaal bloedverlies
✅ Match found: geen bloedverlies
✅ Match found: geen bloedverlies
✅ Match found: geen bloedverlies
✅ Match found: geen bloedverlies
✅ Match found: geen helderrood bloedverlies
✅ Match found: geen helderrood bloedverlies
✅ Match found: geen melena
✅ Match found: geen rectaal bloedverlies
✅ Match found: geen actief (rectaal) bloedverli

In [43]:
df['negated'].value_counts()

False    9993
True     1097
Name: negated, dtype: int64

In [44]:
df[df['has_agib'] == True]['pseudo_id'].unique()#, df[df['has_gib'] == True]['pseudo_id'].unique()

array(['BEBEFC726C577A7B40F1A467F4E59746FBC7F76B',
       'CF58B7215ED673FD2AC116C49953A9941E73F597',
       'FD8C682C1F4FDA1E5EC0B760D30875556419BD71'], dtype=object)

In [45]:
# final_lst = []

# for i in range(len(df)):
#     if df['negated'][i] == True and df['has_agib'][i] == True:
#         word = df['agib_match'][i]
#         # print('word:', word)
#         if word in df['negated_match'][i]:
#             # print(word)
#             # print(df['negated_match'][i])
#             final_lst.append(True)
#     elif df['negated'][i] == True and df['has_gib'][i] == True:
#         word = df['gib_match'][i]
#         # print('word:', word)
#         if word in df['negated_match'][i]:
#             # print(word)
#             # print(df['negated_match'][i])
#             final_lst.append(True)
#     elif df['negated'][i] == False and df['has_agib'][i] == False:
#         final_lst.append(False)
#     elif df['negated'][i] == False and df['has_gib'][i] == False:
#         final_lst.append(False)

#     elif df['negated'][i] == True and df['has_agib'][i] == False:
#         final_lst.append(False)
#     elif df['negated'][i] == True and df['has_gib'][i] == False:
#         final_lst.append(False)
    
#     elif df['negated'][i] == False and df['has_agib'][i] == True:
#         final_lst.append(True)
#     elif df['negated'][i] == False and df['has_gib'][i] == True:
#         final_lst.append(True)
    
#     else:
#         print('else')


# df['(a)gbi_final'] = final_lst
# df


In [46]:
# # Initialize the column with False
# df['(a)gib_final'] = False

# # Condition 1: If matched term appears in the negated match string → set to True (only if not already True)
# mask_agib = (df['negated'] == True) & (df['has_agib'] == True)
# mask_gib  = (df['negated'] == True) & (df['has_gib'] == True)

# df.loc[mask_agib & (df['agib_match'].str.lower().fillna('').str.strip().isin(
#     df['negated_match'].str.lower().fillna('').str.strip())), '(a)gib_final'] = True

# df.loc[mask_gib & (df['gib_match'].str.lower().fillna('').str.strip().isin(
#     df['negated_match'].str.lower().fillna('').str.strip())), '(a)gib_final'] = True

# # Condition 2: If there's no negation and AGIB or GIB is present → set to True
# df.loc[(df['negated'] == False) & ((df['has_agib'] == True) | (df['has_gib'] == True)), '(a)gib_final'] = True


# Step 1: Initialize the column with False
df['(a)gib_final'] = False

# Step 2: Define row-wise checking function
def is_final_match(row):
    # No AGIB or GIB match at all → skip
    if not row.get('has_agib', False) and not row.get('has_gib', False):
        return False
    
    # No negation at all → valid match
    if not row.get('negated', False):
        return True

    # Normalize text
    negated = str(row.get('negated_match', '')).lower().strip()
    agib = str(row.get('agib_match', '')).lower().strip()
    gib = str(row.get('gib_match', '')).lower().strip()
    print(f"Negated: {negated}, AGIB: {agib}, GIB: {gib}")  # DEBUG LINE

    # If matched term is NOT in negated text, it's valid
    if agib and agib not in negated:
        print(f"AGIB match found: {agib} not in {negated}. True printed")  # DEBUG LINE
        return True
    if gib and gib not in negated:
        print(f"GIB match found: {gib} not in {negated}. True printed")
        return True

    # Else, negation matches the exact term
    return False

# Step 3: Apply the logic
df['(a)gib_final'] = df.apply(is_final_match, axis=1)



Negated: geen rectaal bloedverlies, AGIB: none, GIB: rectaal bloed
AGIB match found: none not in geen rectaal bloedverlies. True printed
Negated: geen rectaal bloedverlies, AGIB: none, GIB: rectaal bloed
AGIB match found: none not in geen rectaal bloedverlies. True printed
Negated: geen stolsels, AGIB: none, GIB: rectaal bloed
AGIB match found: none not in geen stolsels. True printed
Negated: niet gebraakt (geen hematemesis, AGIB: none, GIB: melena
AGIB match found: none not in niet gebraakt (geen hematemesis. True printed
Negated: niet gebraakt (geen hematemesis, AGIB: none, GIB: hematemesis
AGIB match found: none not in niet gebraakt (geen hematemesis. True printed
Negated: niet gebraakt (geen hematemesis, AGIB: none, GIB: hematemesis
AGIB match found: none not in niet gebraakt (geen hematemesis. True printed
Negated: geen bloedverlies, AGIB: none, GIB: bloedverlies
AGIB match found: none not in geen bloedverlies. True printed
Negated: geen bloedverlies, AGIB: none, GIB: melena
AGIB 

In [47]:
df['(a)gib_final'].value_counts()

False    7538
True     3552
Name: (a)gib_final, dtype: int64

In [48]:
df[df['(a)gib_final']]['pseudo_id'].nunique()

104

In [49]:
# df['gib_final'] = df['has_gib'] & ~df['negated']
# len(df[df['gib_final'] == True]['pseudo_id'].unique())

In [50]:
# set(df[df['(a)gib_final']]['pseudo_id'].unique()) - set(df[df['gib_final'] == True]['pseudo_id'].unique())

In [51]:
df[(df['(a)gib_final'] == True) & (df['pseudo_id'] == '37099C38CFE1055CF6950B3D61CEC774849364D9')]

# should be false

Unnamed: 0,pseudo_id,verslagen_report_start_date,verslagen_report_specialism,verslagen_report_content,agib_match,agib_context,has_agib,gib_match,gib_context,has_gib,negated,negated_match,negated_context,(a)gib_final
2446,37099C38CFE1055CF6950B3D61CEC774849364D9,2019-09-12 13:54:00,"Maag-, Darm- en Leverziekten",Aan de weledelgeleerde vrouwe\r\ndrs. E.B. Amb...,,,False,anemie,[AIE rechts (=asympt) 2017 Ferriprieve anemie ...,True,False,,,True
2459,37099C38CFE1055CF6950B3D61CEC774849364D9,2019-02-11 15:30:00,"Maag-, Darm- en Leverziekten",Reden van komst / Verwijzing: \nReden verwijzi...,,,False,anemie,[Reden verwijzing: Langer bestaande ferripriev...,True,True,geen macroscopische bloedverlies,[dat er een anemie is. Geen macroscopische blo...,True
2460,37099C38CFE1055CF6950B3D61CEC774849364D9,2019-02-11 14:01:00,"Maag-, Darm- en Leverziekten",Aan de weledelgeleerde vrouwe\r\ndrs. E.B. Amb...,,,False,anemie,[Reden van verwijzing Recidief ferriprieve ane...,True,True,geen macroscopische bloedverlies,[dat er een anemie is. Geen macroscopische blo...,True
2494,37099C38CFE1055CF6950B3D61CEC774849364D9,2017-09-12 00:00:00,"Maag-, Darm- en Leverziekten",Naam: [LASTNAME] DE-[LASTNAME]*M [BIRTHDATE] V...,,,False,anemie,[duodenum; verkrijgingswijze: biopt; klinische...,True,False,,,True
2495,37099C38CFE1055CF6950B3D61CEC774849364D9,2017-09-11 14:28:00,"Maag-, Darm- en Leverziekten","Máxima Medisch Centrum,\r\nmaag-darm-leverzie...",,,False,anemie,[Verwijzer: F.S. Jonkers. Indicatie: Ferriprie...,True,False,,,True
2496,37099C38CFE1055CF6950B3D61CEC774849364D9,2017-09-11 14:27:00,"Maag-, Darm- en Leverziekten",Samenvatting: \nEndoscopie verslag\r\n[ 370060...,,,False,anemie,[Ter Borg -Indicatie aanvraag: Ferriprieve ane...,True,False,,,True
2497,37099C38CFE1055CF6950B3D61CEC774849364D9,2017-09-11 14:26:00,"Maag-, Darm- en Leverziekten","Máxima Medisch Centrum,\r\nmaag-darm-leverzie...",,,False,anemie,[Verwijzer: F.S. Jonkers. Indicatie: Ferriprie...,True,False,,,True
2498,37099C38CFE1055CF6950B3D61CEC774849364D9,2017-09-11 14:25:00,"Maag-, Darm- en Leverziekten",Samenvatting: \nEndoscopie verslag\r\n[ 370060...,,,False,anemie,[Ter Borg -Indicatie aanvraag: Ferriprieve ane...,True,False,,,True
2500,37099C38CFE1055CF6950B3D61CEC774849364D9,2017-09-04 11:45:00,"Maag-, Darm- en Leverziekten",Samenvatting: \nEerste consult\r\n-Type 1e con...,,,False,anemie,[-Intern: Dr Jonkers -Verwijsreden: Ijzergebre...,True,True,geen zichtbaar bloedverlies,"[Def patroon, dagelijks gevormde defecatie gee...",True


In [52]:
df[(df['(a)gib_final'] == True) & (df['pseudo_id'] == '8BDF23D9D3F6550DC2EAB386B28FD58203CCF2CC')]

#should be false except last row: 5359

Unnamed: 0,pseudo_id,verslagen_report_start_date,verslagen_report_specialism,verslagen_report_content,agib_match,agib_context,has_agib,gib_match,gib_context,has_gib,negated,negated_match,negated_context,(a)gib_final
5296,8BDF23D9D3F6550DC2EAB386B28FD58203CCF2CC,2019-07-30 10:54:00,Interne Geneeskunde,Samenvatting: \n2017 Diabetes mellitus type 2\...,,,False,zwarte ontlasting,"[Geen bloed verlies gezien, geen zwarte ontlas...",True,True,geen zwarte ontlasting,"[cell. Geen bloed verlies gezien, geen zwarte ...",True
5324,8BDF23D9D3F6550DC2EAB386B28FD58203CCF2CC,2019-04-23 21:44:00,"Maag-, Darm- en Leverziekten",Aan de weledelgeleerde vrouwe\r\ndrs. J.E. Sno...,,,False,bloedverlies,[sigmoid verwijderd ). Anamnese Geen bloedverl...,True,True,geen bloedverlies,[in sigmoid verwijderd ). Anamnese Geen bloedv...,True
5334,8BDF23D9D3F6550DC2EAB386B28FD58203CCF2CC,2019-03-27 11:11:00,"Maag-, Darm- en Leverziekten",COLOSCOPIE\r\n\r\nBetreft\r\nDhr. [INITIALS] [...,,,False,anemie,[Verwijzer: J.T. Kamphuis Indicatie: REc anemi...,True,False,,,True
5335,8BDF23D9D3F6550DC2EAB386B28FD58203CCF2CC,2019-03-27 10:52:00,"Maag-, Darm- en Leverziekten",GASTROSCOPIE\r\n\r\nBetreft\r\nDhr. [INITIALS]...,,,False,anemie,[Verwijzer: J.T. Kamphuis Indicatie: Rec anemi...,True,False,,,True
5346,8BDF23D9D3F6550DC2EAB386B28FD58203CCF2CC,2019-02-21 09:44:00,"Maag-, Darm- en Leverziekten",Reden van komst / Verwijzing: \nReden verwijzi...,,,False,bloedverlies,[gastro en coloscopie Anamnese: Geen bloedverl...,True,True,geen bloedverlies,[Voorstel: gastro en coloscopie Anamnese: Geen...,True
5348,8BDF23D9D3F6550DC2EAB386B28FD58203CCF2CC,2019-02-12 11:13:00,Interne Geneeskunde,Samenvatting: \nVoorgeschiedenis\n2017 Diabete...,,,False,zwarte ontlasting,[geen zichtbaar bloedverlies gehad. Geen zwart...,True,True,geen zichtbaar bloedverlies,"[daarna opnieuw bekijken. Laag Hb, geen zichtb...",True
5354,8BDF23D9D3F6550DC2EAB386B28FD58203CCF2CC,2019-01-28 12:22:00,Interne Geneeskunde,Samenvatting: \nVoorgeschiedenis\n2017 Diabete...,,,False,zwarte ontlasting,[opnemen indien bloedverlies bij ontlasting/ z...,True,True,geen bloedverlies,[graag naar huis. Geen klachten. Geen bloedver...,True
5355,8BDF23D9D3F6550DC2EAB386B28FD58203CCF2CC,2019-01-28 09:46:00,Interne Geneeskunde,Samenvatting: \nVoorgeschiedenis\n2017 Diabete...,,,False,zwarte ontlasting,"[prima. Geen bleodverlies bemerkt, geen zwarte...",True,True,geen zwarte ontlasting,"[gaat prima. Geen bleodverlies bemerkt, geen z...",True
5356,8BDF23D9D3F6550DC2EAB386B28FD58203CCF2CC,2019-01-27 11:15:00,Interne Geneeskunde,Samenvatting: \nVoorgeschiedenis\n2017 Diabete...,,,False,melena,[bloed bij de def of melena Wil graag opgenome...,True,True,geen bloed bij de def of melena,"[gaat goed, Bij po insepctie geen bloed bij de...",True
5359,8BDF23D9D3F6550DC2EAB386B28FD58203CCF2CC,2019-01-25 09:59:00,Interne Geneeskunde,Samenvatting: \nVoorgeschiedenis\n2017 Diabete...,,,False,melena,"[po-inspectie ivm laag Hb ( melena , rectaal b...",True,True,geen macroscopisch bloedverlies,[plotseling Hb 3.2. Onverklaard Hb: geen macro...,True


In [53]:
# df_both = df[(df['has_agib'] == True) | (df['negated'] == True)]    
# df_both[df_both['pseudo_id'] == 'CF58B7215ED673FD2AC116C49953A9941E73F597']

In [54]:
patient_agib_lst = df[df['has_agib'] | df['has_gib']]['pseudo_id'].unique()
patient_final_lst = df[df['(a)gib_final']]['pseudo_id'].unique()

removed_patients = set(patient_agib_lst) - set(patient_final_lst)
removed_patients

set()

In [55]:
df_both = df[(df['has_agib'] == True) | (df['negated'] == True)]    
df_both[df_both['pseudo_id'] == '37099C38CFE1055CF6950B3D61CEC774849364D9']

Unnamed: 0,pseudo_id,verslagen_report_start_date,verslagen_report_specialism,verslagen_report_content,agib_match,agib_context,has_agib,gib_match,gib_context,has_gib,negated,negated_match,negated_context,(a)gib_final
2459,37099C38CFE1055CF6950B3D61CEC774849364D9,2019-02-11 15:30:00,"Maag-, Darm- en Leverziekten",Reden van komst / Verwijzing: \nReden verwijzi...,,,False,anemie,[Reden verwijzing: Langer bestaande ferripriev...,True,True,geen macroscopische bloedverlies,[dat er een anemie is. Geen macroscopische blo...,True
2460,37099C38CFE1055CF6950B3D61CEC774849364D9,2019-02-11 14:01:00,"Maag-, Darm- en Leverziekten",Aan de weledelgeleerde vrouwe\r\ndrs. E.B. Amb...,,,False,anemie,[Reden van verwijzing Recidief ferriprieve ane...,True,True,geen macroscopische bloedverlies,[dat er een anemie is. Geen macroscopische blo...,True
2500,37099C38CFE1055CF6950B3D61CEC774849364D9,2017-09-04 11:45:00,"Maag-, Darm- en Leverziekten",Samenvatting: \nEerste consult\r\n-Type 1e con...,,,False,anemie,[-Intern: Dr Jonkers -Verwijsreden: Ijzergebre...,True,True,geen zichtbaar bloedverlies,"[Def patroon, dagelijks gevormde defecatie gee...",True
2503,37099C38CFE1055CF6950B3D61CEC774849364D9,2017-08-30 08:51:00,Interne Geneeskunde,Samenvatting: \nDecursus\r\n-Type decursus: De...,,,False,,,False,True,geen bloedverlies,"[met symptomatische microcytaire anemie, anamn...",False
2504,37099C38CFE1055CF6950B3D61CEC774849364D9,2017-08-29 14:19:00,Interne Geneeskunde,Samenvatting: \nOpname\r\n-Type registratie: U...,,,False,,,False,True,geen bloedverlies,"[met symptomatische microcytaire anemie, anamn...",False


In [56]:
df_both = df[(df['has_gib'] == True)]    
df_both[df_both['pseudo_id'] == 'FAA79717FF2C725767E9469350ACECF640E5FCBC']

Unnamed: 0,pseudo_id,verslagen_report_start_date,verslagen_report_specialism,verslagen_report_content,agib_match,agib_context,has_agib,gib_match,gib_context,has_gib,negated,negated_match,negated_context,(a)gib_final
10854,FAA79717FF2C725767E9469350ACECF640E5FCBC,2020-06-15 11:12:00,"Maag-, Darm- en Leverziekten",Reden van komst / Verwijzing: \nReden verwijzi...,,,False,anemie,[komst / Verwijzing: Reden verwijzing: anemie ...,True,False,,,True
10966,FAA79717FF2C725767E9469350ACECF640E5FCBC,2015-02-24 18:39:00,Interne Geneeskunde,Samenvatting: \nKlinische registratie\r\n-Datu...,,,False,stolsels,[VW (SSF) RvEscalatie: Nabloeding met stolsels...,True,False,,,True


In [57]:
# # Initialize the column with False
# df['agib_final'] = False

# # Condition 1: If matched term appears in the negated match string → set to True (only if not already True)
# mask_agib = (df['negated'] == True) & (df['has_agib'] == True)
# # print('mask_agib:', mask_agib)
# # mask_gib  = (df['negated'] == True) & (df['has_gib'] == True)

# df.loc[mask_agib & (df['agib_match'].str.lower().fillna('').str.strip().isin(
#     df['negated_match'].str.lower().fillna('').str.strip())), 'agib_final'] = True


# # df.loc[mask_gib & (df['gib_match'].str.lower().fillna('').str.strip().isin(
# #     df['negated_match'].str.lower().fillna('').str.strip())), '(a)gib_final'] = True

# # Condition 2: If there's no negation and AGIB or GIB is present → set to True
# df.loc[(df['negated'] == False) & (df['has_agib'] == True), 'agib_final'] = True

# Initialize final column with default False
df['agib_final'] = False

# ✅ Condition 1: Smart substring check — if negated term DOES NOT contain the matched one → it's a valid match
def check_if_match_survives(row):
    if not row.get('has_agib', False):
        return False  # No AGIB match → can't be final

    if not row.get('negated', False):
        return True  # No negation → match stands

    agib_match = str(row.get('agib_match', '')).lower().strip()
    negated_text = str(row.get('negated_match', '')).lower().strip()
    print(f"Checking: {agib_match} in {negated_text}")  # DEBUG LINE

    return agib_match not in negated_text  # Only cancel match if it’s directly negated

# ✅ Apply the logic row-by-row
df['agib_final'] = df.apply(check_if_match_survives, axis=1)



Checking: acute bloeding in geen bloedverlies
Checking: acute bloeding in geen bloedverlies
Checking: acute bloeding in geen bloedverlies
Checking: acute bloeding in geen bloedverlies
Checking: acute bloeding in geen bloedverlies
Checking: acute bloeding in geen bloedverlies
Checking: acute bloeding in geen tekenen bloedverlies
Checking: acute bloeding in geen bloedverlies
Checking: acute bloeding in geen bloedverlies
Checking: acute bloeding in geen direct bloedverlies


In [58]:
patient_agib_final_lst = df[df['agib_final']== True]['pseudo_id'].unique()
print(patient_agib_final_lst)
len(patient_agib_final_lst)

['BEBEFC726C577A7B40F1A467F4E59746FBC7F76B'
 'CF58B7215ED673FD2AC116C49953A9941E73F597'
 'FD8C682C1F4FDA1E5EC0B760D30875556419BD71']


3

In [59]:
df_both = df[(df['has_agib'] == True) & (df['negated'] == True)]    
df_both[df_both['pseudo_id'] == 'CF58B7215ED673FD2AC116C49953A9941E73F597']
# df_both[df_both['has_agib'] == True]['pseudo_id'].unique()

Unnamed: 0,pseudo_id,verslagen_report_start_date,verslagen_report_specialism,verslagen_report_content,agib_match,agib_context,has_agib,gib_match,gib_context,has_gib,negated,negated_match,negated_context,(a)gib_final,agib_final
8313,CF58B7215ED673FD2AC116C49953A9941E73F597,2021-06-18 18:05:00,"Maag-, Darm- en Leverziekten",Aan de weledelgeleerde vrouwe\r\ndrs. S. Marce...,acute bloeding,[vulling. Geen aanwijzing voor een acute bloed...,True,anemie,[met collaps op basis van anemie en hypotensie...,True,True,geen direct bloedverlies,"[borst gehad, wel iets dyspnoeïsch. Geen direc...",True,True


In [60]:
df_both = df[(df['has_agib'] == True) & (df['negated'] == True)]    
# df_both[df_both['pseudo_id'] == '3B21EF377C1BA327A67C2C951A6CB78BEAD5B3FE']['agib_context'].values[0]

In [61]:
df[(df['has_agib'] == True) & (df['negated'] == True)][['agib_match', 'negated_match', 'agib_final']]
# df[(df['has_agib'] == True) & (df['negated'] == False)][['agib_final']]


Unnamed: 0,agib_match,negated_match,agib_final
7483,acute bloeding,geen bloedverlies,True
7490,acute bloeding,geen bloedverlies,True
7496,acute bloeding,geen bloedverlies,True
7498,acute bloeding,geen bloedverlies,True
7501,acute bloeding,geen bloedverlies,True
7504,acute bloeding,geen bloedverlies,True
7506,acute bloeding,geen tekenen bloedverlies,True
7507,acute bloeding,geen bloedverlies,True
7508,acute bloeding,geen bloedverlies,True
8313,acute bloeding,geen direct bloedverlies,True


In [62]:
# Where agib_match and negated_match differ but final is False (which may be wrong!)
mask = (
    (df['has_agib'] == True) &
    (df['negated'] == True) &
    (~df['agib_match'].str.lower().fillna('').str.strip().isin(
        df['negated_match'].str.lower().fillna('').str.strip())) &
    (df['agib_final'] == False)
)

df.loc[mask, ['verslagen_report_content', 'agib_match', 'negated_match', 'agib_final']]


Unnamed: 0,verslagen_report_content,agib_match,negated_match,agib_final


GIB patients

In [63]:
# # Initialize final column with default False
# df['gib_final'] = False

# # ✅ Condition 1: Smart substring check — if negated term DOES NOT contain the matched one → it's a valid match
# def check_if_match_survives(row):
#     if not row.get('has_gib', False):
#         return False  # No AGIB match → can't be final

#     if not row.get('negated', False):
#         return True  # No negation → match stands

#     agib_match = str(row.get('gib_match', '')).lower().strip()
#     negated_text = str(row.get('negated_match', '')).lower().strip()
#     print(f"Checking: {agib_match} in {negated_text}")  # DEBUG LINE

#     return agib_match not in negated_text  # Only cancel match if it’s directly negated

# # ✅ Apply the logic row-by-row
# df['gib_final'] = df.apply(check_if_match_survives, axis=1)

# Step 1: Initialize the column with False
df['gib_final'] = False

# Step 2: Define row-wise checking function
def is_final_match(row):
    # No AGIB or GIB match at all → skip
    if not row.get('has_gib', False):
        return False
    
    # No negation at all → valid match
    if not row.get('negated', False):
        return True

    # Normalize text
    negated = str(row.get('negated_match_gib', '')).lower().strip()
    gib = str(row.get('gib_match', '')).lower().strip()
    print(f"Negated: {negated}, GIB: {gib}")  # DEBUG LINE

    # If matched term is NOT in negated text, it's valid
    if gib and gib not in negated:
        print(f"GIB match found: {gib} not in {negated}. True printed")  # DEBUG LINE
        return True

    # Else, negation matches the exact term
    return False

# Step 3: Apply the logic
df['gib_final'] = df.apply(is_final_match, axis=1)

Negated: , GIB: rectaal bloed
GIB match found: rectaal bloed not in . True printed
Negated: , GIB: rectaal bloed
GIB match found: rectaal bloed not in . True printed
Negated: , GIB: rectaal bloed
GIB match found: rectaal bloed not in . True printed
Negated: , GIB: melena
GIB match found: melena not in . True printed
Negated: , GIB: hematemesis
GIB match found: hematemesis not in . True printed
Negated: , GIB: hematemesis
GIB match found: hematemesis not in . True printed
Negated: , GIB: bloedverlies
GIB match found: bloedverlies not in . True printed
Negated: , GIB: melena
GIB match found: melena not in . True printed
Negated: , GIB: melena
GIB match found: melena not in . True printed
Negated: , GIB: melena
GIB match found: melena not in . True printed
Negated: , GIB: melena
GIB match found: melena not in . True printed
Negated: , GIB: anemie
GIB match found: anemie not in . True printed
Negated: , GIB: anemie
GIB match found: anemie not in . True printed
Negated: , GIB: melena
GIB ma

In [64]:
scopie_terms = [
    r'\bgastroscopie\b', r'\bendoscopie\b', r'\bcoloscopie\b', r'\bercp\b',
    r'\bvideo[-\s]?endoscopie\b', r'\bendoscopisch\b',
    r'\bspoed[-\s]?gastroscopie\b', r'\burgente coloscopie\b'
]


treatment_terms = [
    r'\bpantoprazol\b', 
    r'\bperfusor\b', 
    r'\binfuus\b', 
    r'\btransfusie\b',
    r'\bacenocoumarol\b', 
    r'\bbloeddruk (laag|dalend)\b', 
    r'\bhypotens(ie|ief)\b',
    r'\bvitamine k\b',              # reversal agent for coumarin
    r'\bprotonpompremmer\b',        # class of GI bleed meds
    r'\biv vocht\b',                # fluid support in shock
    r'\bspoedopname\b'              # linked to acute presentation
]


In [65]:
# Apply Patterns to Data
# Apply with specialism-aware logic
apply_term_matching_with_specialism(
    df, 
    text_col='verslagen_report_content', 
    spec_col='verslagen_report_specialism', 
    termlist_general=scopie_terms, 
    termlist_mdl=scopie_terms, 
    target_specialism='Maag-, Darm- en Leverziekten', 
    prefix='scopie')

apply_term_matching_with_specialism(
    df, 
    text_col='verslagen_report_content', 
    spec_col='verslagen_report_specialism', 
    termlist_general=treatment_terms, 
    termlist_mdl=treatment_terms, 
    target_specialism='Maag-, Darm- en Leverziekten', 
    prefix='treatment')

✅ Match found: Gastroscopie
✅ Match found: gastroscopie
✅ Match found: Coloscopie
✅ Match found: Coloscopie
✅ Match found: coloscopie
✅ Match found: coloscopie
✅ Match found: coloscopie
✅ Match found: coloscopie
✅ Match found: COLOSCOPIE
✅ Match found: coloscopie
✅ Match found: Coloscopie
✅ Match found: coloscopie
✅ Match found: GASTROSCOPIE
✅ Match found: gastroscopie
✅ Match found: gastroscopie
✅ Match found: Gastroscopie
✅ Match found: gastroscopie
✅ Match found: Coloscopie
✅ Match found: Coloscopie
✅ Match found: Gastroscopie
✅ Match found: gastroscopie
✅ Match found: Gastroscopie
✅ Match found: gastroscopie
✅ Match found: Gastroscopie
✅ Match found: Gastroscopie
✅ Match found: gastroscopie
✅ Match found: Endoscopie
✅ Match found: Endoscopie
✅ Match found: gastroscopie
✅ Match found: Gastroscopie
✅ Match found: gastroscopie
✅ Match found: gastroscopie
✅ Match found: gastroscopie
✅ Match found: gastroscopie
✅ Match found: gastroscopie
✅ Match found: gastroscopie
✅ Match found: gastr

Unnamed: 0,pseudo_id,verslagen_report_start_date,verslagen_report_specialism,verslagen_report_content,agib_match,agib_context,has_agib,gib_match,gib_context,has_gib,...,negated_context,(a)gib_final,agib_final,gib_final,scopie_match,scopie_context,has_scopie,treatment_match,treatment_context,has_treatment
0,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,2020-11-26 15:06:00,"Maag-, Darm- en Leverziekten","Dhr. A.J. Dingemans, huisarts\r\n[STREETNAME] ...",,,False,rectaal bloed,[in verband met melaena en rectaal bloed verli...,True,...,[Na de endoscopieën is er geen rectaal bloedve...,True,False,True,gastroscopie,[35 g/L (35 - 50) Gastroscopie 25-11-2020 Geen...,True,acenocoumarol,[ook helderrood rectaal bloedverlies onder ace...,True
1,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,2020-11-26 09:53:00,"Maag-, Darm- en Leverziekten",Samenvatting: \nRectaal bloedverlies obv diver...,,,False,rectaal bloed,[Samenvatting: Rectaal bloed verlies obv diver...,True,...,[Op colo pandiverticulose. Beloop: Vpk/ Geen r...,True,False,True,coloscopie,[schoondochter de uitslag van de coloscopie no...,True,acenocoumarol,[Rectaal bloedverlies obv divertikelbloeding; ...,True
2,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,2020-11-25 14:13:00,"Maag-, Darm- en Leverziekten",COLOSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] [L...,,,False,bloedverlies,[3 Verwijzer: J.T. Kamphuis Indicatie: bloedve...,True,...,,True,False,True,coloscopie,[COLOSCOPIE Betreft Mw. [INITIALS] [LASTNAME] ...,True,acenocoumarol,[en bloedverlies per anum onder acenocoumarol ...,True
3,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,2020-11-25 13:48:00,"Maag-, Darm- en Leverziekten",GASTROSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] ...,,,False,melena,[3 Verwijzer: J.T. Kamphuis Indicatie: Melena ...,True,...,,True,False,True,gastroscopie,[GASTROSCOPIE Betreft Mw. [INITIALS] [LASTNAME...,True,acenocoumarol,[en bloedverlies per anum onder acenocoumarol ...,True
4,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,2020-11-25 08:47:00,"Maag-, Darm- en Leverziekten",Samenvatting: \nRectaal bloedverlies ; eenmali...,,,False,rectaal bloed,[Samenvatting: Rectaal bloed verlies ; eenmali...,True,...,"[gehad, vannacht donkerrood bloedverlies gehad...",True,False,True,gastroscopie,[MCV 93 fL INR 1.6 Gastroscopie Geen bloed of ...,True,acenocoumarol,[ontlasting en ureum 13). Onder acenocoumarol ...,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11085,FD8C682C1F4FDA1E5EC0B760D30875556419BD71,2015-03-20 08:13:00,Interne Geneeskunde,Samenvatting: \n1e consult\r\n-Type 1e consult...,,,False,,,False,...,,False,False,False,gastroscopie,[diagnostiek in de vorm van gastroscopie . Zie...,True,,,False
11086,FD8C682C1F4FDA1E5EC0B760D30875556419BD71,2015-01-14 15:39:00,Interne Geneeskunde,Samenvatting: \nDecursus\r\n-Type decursus: De...,,,False,,,False,...,,False,False,False,,,False,,,False
11087,FD8C682C1F4FDA1E5EC0B760D30875556419BD71,2014-12-21 09:31:00,Spoedeisende Hulp,Samenvatting: \nVerpleegkundige verslaglegging...,,,False,,,False,...,,False,False,False,,,False,,,False
11088,FD8C682C1F4FDA1E5EC0B760D30875556419BD71,2010-11-10 21:03:00,Spoedeisende Hulp,Samenvatting: \nMedisch Dossier\r\n[ Vk Sputov...,,,False,,,False,...,,False,False,False,,,False,,,False


In [66]:
def prepare_bleeding_target_pattern(term_list):
    # Remove surrounding \b from each term (if present)
    cleaned_terms = [re.sub(r'^\\b|\\b$', '', t) for t in term_list]
    # Join terms with OR and wrap with \b(...) \b
    return r"\b(" + "|".join(cleaned_terms) + r")\b"

prepare_bleeding_target_pattern(scopie_terms)
prepare_bleeding_target_pattern(treatment_terms)

'\\b(pantoprazol|perfusor|infuus|transfusie|acenocoumarol|bloeddruk (laag|dalend)|hypotens(ie|ief)|vitamine k|protonpompremmer|iv vocht|spoedopname)\\b'

In [67]:
# exluding puncation
negation_pattern_scopie = re.compile(r"\b(geen|niet|zonder|uitsluiten|ontkent|ontkend|ontkennen|negatief voor|geen aanwijzing voor|kan uitgesloten worden|geen tekenen van)\b.[^.\n;:!?]{0,7}?\b(gastroscopie|endoscopie|coloscopie|ercp|video[-\\s]?endoscopie|spoed[-\\s]?gastroscopie|urgente coloscopie)\b", re.IGNORECASE)
# negation_pattern_treatment = re.compile(r"\b(geen|niet|zonder|uitsluiten|ontkent|ontkend|ontkennen|negatief voor|geen aanwijzing voor|kan uitgesloten worden|geen tekenen van)\b.[^.\n;:!?]{0,20}?\b(pantoprazol|perfusor|infuus|transfusie|acenocoumarol|bloeddruk (laag|dalend)|hypotens(ie|ief)|vitamine k|protonpompremmer|iv vocht|spoedopname)\b", re.IGNORECASE)

df['negated_scopie'] = df['verslagen_report_content'].apply(lambda x: bool(negation_pattern_scopie.search(str(x))))
df['negated_match_scopie'] = df['verslagen_report_content'].apply(lambda x: extract_match(str(x), negation_pattern_scopie))
df['negated_context_scopie'] = df['verslagen_report_content'].apply(lambda x: extract_matches_and_context(str(x), negation_pattern_scopie))

# df['negated_treatment'] = df['verslagen_report_content'].apply(lambda x: bool(negation_pattern_treatment.search(str(x))))
# df['negated_match_treatment'] = df['verslagen_report_content'].apply(lambda x: extract_match(str(x), negation_pattern_treatment))
# df['negated_context_treatment'] = df['verslagen_report_content'].apply(lambda x: extract_matches_and_context(str(x), negation_pattern_treatment))

✅ Match found: geen coloscopie
✅ Match found: geen gastroscopie
✅ Match found: geen gastroscopie
✅ Match found: geen gastroscopie
✅ Match found: geen coloscopie
✅ Match found: geen coloscopie
✅ Match found: geen coloscopie
✅ Match found: geen coloscopie
✅ Match found: geen coloscopie
✅ Match found: geen coloscopie
✅ Match found: geen coloscopie
✅ Match found: geen coloscopie
✅ Match found: geen coloscopie
✅ Match found: geen coloscopie
✅ Match found: geen coloscopie
✅ Match found: geen coloscopie
✅ Match found: geen coloscopie
✅ Match found: geen gastroscopie
✅ Match found: geen nieuwe gastroscopie
✅ Match found: geen nieuwe gastroscopie
✅ Match found: geen gastroscopie
✅ Match found: geen endoscopie
✅ Match found: Geen coloscopie
✅ Match found: geen endoscopie
✅ Match found: geen endoscopie
✅ Match found: geen endoscopie
✅ Match found: geen endoscopie
✅ Match found: geen endoscopie
✅ Match found: geen endoscopie
✅ Match found: geen nieuwe coloscopie
✅ Match found: geen gastroscopie
✅ 

In [68]:
# Step 1: Initialize the column with False
df['scopie_final'] = False

# Step 2: Define row-wise checking function
def is_final_match(row):
    # No AGIB or GIB match at all → skip
    if not row.get('has_scopie', False):
        return False
    
    # No negation at all → valid match
    if not row.get('negated', False):
        return True

    # Normalize text
    negated = str(row.get('negated_match_scopie', '')).lower().strip()
    scopie = str(row.get('scopie_match', '')).lower().strip()
    print(f"Negated: {negated}, Scopie: {scopie}")  # DEBUG LINE

    # If matched term is NOT in negated text, it's valid
    if scopie and scopie not in negated:
        print(f"scopie match found: {scopie} not in {negated}. True printed")  # DEBUG LINE
        return True

    # Else, negation matches the exact term
    return False

# Step 3: Apply the logic
df['scopie_final'] = df.apply(is_final_match, axis=1)


Negated: none, Scopie: gastroscopie
scopie match found: gastroscopie not in none. True printed
Negated: none, Scopie: coloscopie
scopie match found: coloscopie not in none. True printed
Negated: none, Scopie: gastroscopie
scopie match found: gastroscopie not in none. True printed
Negated: none, Scopie: gastroscopie
scopie match found: gastroscopie not in none. True printed
Negated: none, Scopie: gastroscopie
scopie match found: gastroscopie not in none. True printed
Negated: none, Scopie: gastroscopie
scopie match found: gastroscopie not in none. True printed
Negated: none, Scopie: coloscopie
scopie match found: coloscopie not in none. True printed
Negated: none, Scopie: gastroscopie
scopie match found: gastroscopie not in none. True printed
Negated: none, Scopie: gastroscopie
scopie match found: gastroscopie not in none. True printed
Negated: none, Scopie: gastroscopie
scopie match found: gastroscopie not in none. True printed
Negated: none, Scopie: gastroscopie
scopie match found: ga

In [69]:
patient_scopie_lst = df[df['has_scopie']== True]['pseudo_id'].unique()
print(patient_scopie_lst)
len(patient_scopie_lst)

['046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6'
 '088C9FD98B8B2CBCB597C17C07AC1845B21F0849'
 '0A5645E02FA818D1629926B6BEFA81CF91C25A46'
 '0BC512A65442D0BB4B00FBE05E7EA6283E5C11FD'
 '0DD73490997F61870C32B3BB7C9CDE4E801FFF7D'
 '0E041554A0B23505AD762D006AB320CF4AF8F969'
 '0E93D98E82D272810A15FDE70270EE41E9C6DB71'
 '12D65DA1F4CFAC101DE53C050C9037D97F42FC18'
 '15CF926899FB0141DABA60251D292FCC89C94C1B'
 '19AD86175C45ED2AA0F752E0178000144E7FEF28'
 '20AAD8E38E7C9D75E44F9EA52336B7003ED239BD'
 '21031C9814F3BD6EFACD9FBC5AF409820CDABAB4'
 '21A750B9A8AB03B4A250D19577A0734E080BC743'
 '2242661E0D65C316E93EF5328929944799379F22'
 '23E295F4F8F7550C76B6C22ABDA7DFFB3FCF1682'
 '2562E6AE0D16F0504B50CEE14D0D1F0A37596ED8'
 '29871744C90865C3425F00A1935BFF9D2354DA44'
 '29B3653AE690547AB14AC7FCD32B21A561D5FF9A'
 '2E7E9399C8366C94770E35BC822203C1B0BDAB07'
 '355F07B9B6AF154431F346CBC4A6722294250C43'
 '3605432FE03B28514E3927DA7E5C53BD177BFD31'
 '36B03320CFFDB5C757F2DF59633886CF0D05AF9D'
 '37099C38CFE1055CF6950B3D61CEC7

103

In [70]:
patient_scopie_final_lst = df[df['scopie_final']== True]['pseudo_id'].unique()
print(patient_scopie_final_lst)
len(patient_scopie_final_lst)

['046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6'
 '088C9FD98B8B2CBCB597C17C07AC1845B21F0849'
 '0A5645E02FA818D1629926B6BEFA81CF91C25A46'
 '0BC512A65442D0BB4B00FBE05E7EA6283E5C11FD'
 '0DD73490997F61870C32B3BB7C9CDE4E801FFF7D'
 '0E041554A0B23505AD762D006AB320CF4AF8F969'
 '0E93D98E82D272810A15FDE70270EE41E9C6DB71'
 '12D65DA1F4CFAC101DE53C050C9037D97F42FC18'
 '15CF926899FB0141DABA60251D292FCC89C94C1B'
 '19AD86175C45ED2AA0F752E0178000144E7FEF28'
 '20AAD8E38E7C9D75E44F9EA52336B7003ED239BD'
 '21031C9814F3BD6EFACD9FBC5AF409820CDABAB4'
 '21A750B9A8AB03B4A250D19577A0734E080BC743'
 '2242661E0D65C316E93EF5328929944799379F22'
 '23E295F4F8F7550C76B6C22ABDA7DFFB3FCF1682'
 '2562E6AE0D16F0504B50CEE14D0D1F0A37596ED8'
 '29871744C90865C3425F00A1935BFF9D2354DA44'
 '29B3653AE690547AB14AC7FCD32B21A561D5FF9A'
 '2E7E9399C8366C94770E35BC822203C1B0BDAB07'
 '355F07B9B6AF154431F346CBC4A6722294250C43'
 '3605432FE03B28514E3927DA7E5C53BD177BFD31'
 '36B03320CFFDB5C757F2DF59633886CF0D05AF9D'
 '37099C38CFE1055CF6950B3D61CEC7

103

In [71]:
negation_pattern_treatment = re.compile(r"\b(geen|niet|zonder|uitsluiten|ontkent|ontkend|ontkennen|negatief voor|geen aanwijzing voor|kan uitgesloten worden|geen tekenen van)\b.[^.\n;:!?]{0,20}?\b(pantoprazol|perfusor|infuus|transfusie|acenocoumarol|bloeddruk (laag|dalend)|hypotens(ie|ief)|vitamine k|protonpompremmer|iv vocht|spoedopname)\b", re.IGNORECASE)

df['negated_treatment'] = df['verslagen_report_content'].apply(lambda x: bool(negation_pattern_treatment.search(str(x))))
df['negated_match_treatment'] = df['verslagen_report_content'].apply(lambda x: extract_match(str(x), negation_pattern_treatment))
df['negated_context_treatment'] = df['verslagen_report_content'].apply(lambda x: extract_matches_and_context(str(x), negation_pattern_treatment))

✅ Match found: geen infuus
✅ Match found: geen infuus
✅ Match found: geen infuus
✅ Match found: geen infuus
✅ Match found: niet herstellend onder vitamine K
✅ Match found: Geen infuus
✅ Match found: Geen infuus
✅ Match found: geen aanvullend infuus
✅ Match found: geen aanvullend infuus
✅ Match found: geen aanvullend infuus
✅ Match found: Geen kortademigheid na transfusie
✅ Match found: geen transfusie
✅ Match found: geen transfusie
✅ Match found: geen transfusie
✅ Match found: Geen geobjectiveerde hypotensie
✅ Match found: Geen geobjectiveerde hypotensie
✅ Match found: geen infuus
✅ Match found: geen infuus
✅ Match found: geen infuus
✅ Match found: geen infuus
✅ Match found: geen transfusie
✅ Match found: geen transfusie
✅ Match found: geen transfusie
✅ Match found: geen transfusie
✅ Match found: geen transfusie
✅ Match found: geen transfusie
✅ Match found: geen transfusie
✅ Match found: geen indicatie voor transfusie
✅ Match found: geen infuus
✅ Match found: geen infuus
✅ Match found:

In [72]:
# Step 1: Initialize the column with False
df['treatment_final'] = False

# Step 2: Define row-wise checking function
def is_final_match(row):
    # No AGIB or GIB match at all → skip
    if not row.get('has_treatment', False):
        return False
    
    # No negation at all → valid match
    if not row.get('negated', False):
        return True

    # Normalize text
    negated = str(row.get('negated_match_treatment', '')).lower().strip()
    treatment = str(row.get('treatment_match', '')).lower().strip()
    print(f"Negated: {negated}, Treatment: {treatment}")  # DEBUG LINE

    # If matched term is NOT in negated text, it's valid
    if treatment and treatment not in negated:
        print(f"Treatment match found: {treatment} not in {negated}. True printed")  # DEBUG LINE
        return True

    # Else, negation matches the exact term
    return False

# Step 3: Apply the logic
df['treatment_final'] = df.apply(is_final_match, axis=1)


Negated: none, Treatment: acenocoumarol
Treatment match found: acenocoumarol not in none. True printed
Negated: none, Treatment: acenocoumarol
Treatment match found: acenocoumarol not in none. True printed
Negated: none, Treatment: acenocoumarol
Treatment match found: acenocoumarol not in none. True printed
Negated: none, Treatment: acenocoumarol
Treatment match found: acenocoumarol not in none. True printed
Negated: none, Treatment: acenocoumarol
Treatment match found: acenocoumarol not in none. True printed
Negated: none, Treatment: acenocoumarol
Treatment match found: acenocoumarol not in none. True printed
Negated: none, Treatment: acenocoumarol
Treatment match found: acenocoumarol not in none. True printed
Negated: geen infuus, Treatment: infuus
Negated: geen infuus, Treatment: infuus
Negated: none, Treatment: infuus
Treatment match found: infuus not in none. True printed
Negated: none, Treatment: transfusie
Treatment match found: transfusie not in none. True printed
Negated: none

In [73]:
patient_treatment_lst = df[df['has_treatment']== True]['pseudo_id'].unique()
print(patient_treatment_lst)
len(patient_treatment_lst)

['046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6'
 '088C9FD98B8B2CBCB597C17C07AC1845B21F0849'
 '0A5645E02FA818D1629926B6BEFA81CF91C25A46'
 '0BC512A65442D0BB4B00FBE05E7EA6283E5C11FD'
 '0DD73490997F61870C32B3BB7C9CDE4E801FFF7D'
 '0E041554A0B23505AD762D006AB320CF4AF8F969'
 '0E93D98E82D272810A15FDE70270EE41E9C6DB71'
 '12D65DA1F4CFAC101DE53C050C9037D97F42FC18'
 '15CF926899FB0141DABA60251D292FCC89C94C1B'
 '19AD86175C45ED2AA0F752E0178000144E7FEF28'
 '20AAD8E38E7C9D75E44F9EA52336B7003ED239BD'
 '21031C9814F3BD6EFACD9FBC5AF409820CDABAB4'
 '21A750B9A8AB03B4A250D19577A0734E080BC743'
 '2242661E0D65C316E93EF5328929944799379F22'
 '23E295F4F8F7550C76B6C22ABDA7DFFB3FCF1682'
 '2562E6AE0D16F0504B50CEE14D0D1F0A37596ED8'
 '29871744C90865C3425F00A1935BFF9D2354DA44'
 '29B3653AE690547AB14AC7FCD32B21A561D5FF9A'
 '2E7E9399C8366C94770E35BC822203C1B0BDAB07'
 '355F07B9B6AF154431F346CBC4A6722294250C43'
 '3605432FE03B28514E3927DA7E5C53BD177BFD31'
 '36B03320CFFDB5C757F2DF59633886CF0D05AF9D'
 '37099C38CFE1055CF6950B3D61CEC7

105

In [74]:
patient_treatment_final_lst = df[df['treatment_final']== True]['pseudo_id'].unique()
print(patient_treatment_final_lst)
len(patient_treatment_final_lst)

['046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6'
 '088C9FD98B8B2CBCB597C17C07AC1845B21F0849'
 '0A5645E02FA818D1629926B6BEFA81CF91C25A46'
 '0BC512A65442D0BB4B00FBE05E7EA6283E5C11FD'
 '0DD73490997F61870C32B3BB7C9CDE4E801FFF7D'
 '0E041554A0B23505AD762D006AB320CF4AF8F969'
 '0E93D98E82D272810A15FDE70270EE41E9C6DB71'
 '12D65DA1F4CFAC101DE53C050C9037D97F42FC18'
 '15CF926899FB0141DABA60251D292FCC89C94C1B'
 '19AD86175C45ED2AA0F752E0178000144E7FEF28'
 '20AAD8E38E7C9D75E44F9EA52336B7003ED239BD'
 '21031C9814F3BD6EFACD9FBC5AF409820CDABAB4'
 '21A750B9A8AB03B4A250D19577A0734E080BC743'
 '2242661E0D65C316E93EF5328929944799379F22'
 '23E295F4F8F7550C76B6C22ABDA7DFFB3FCF1682'
 '2562E6AE0D16F0504B50CEE14D0D1F0A37596ED8'
 '29871744C90865C3425F00A1935BFF9D2354DA44'
 '29B3653AE690547AB14AC7FCD32B21A561D5FF9A'
 '2E7E9399C8366C94770E35BC822203C1B0BDAB07'
 '355F07B9B6AF154431F346CBC4A6722294250C43'
 '3605432FE03B28514E3927DA7E5C53BD177BFD31'
 '36B03320CFFDB5C757F2DF59633886CF0D05AF9D'
 '37099C38CFE1055CF6950B3D61CEC7

105

In [78]:
df_final = df[['pseudo_id', 'verslagen_report_start_date', 'verslagen_report_specialism', 'verslagen_report_content', 'agib_final', 
               'gib_final', 'scopie_final', 'treatment_final']]
df_final

Unnamed: 0,pseudo_id,verslagen_report_start_date,verslagen_report_specialism,verslagen_report_content,agib_final,gib_final,scopie_final,treatment_final
0,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,2020-11-26 15:06:00,"Maag-, Darm- en Leverziekten","Dhr. A.J. Dingemans, huisarts\r\n[STREETNAME] ...",False,True,True,True
1,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,2020-11-26 09:53:00,"Maag-, Darm- en Leverziekten",Samenvatting: \nRectaal bloedverlies obv diver...,False,True,True,True
2,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,2020-11-25 14:13:00,"Maag-, Darm- en Leverziekten",COLOSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] [L...,False,True,True,True
3,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,2020-11-25 13:48:00,"Maag-, Darm- en Leverziekten",GASTROSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] ...,False,True,True,True
4,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,2020-11-25 08:47:00,"Maag-, Darm- en Leverziekten",Samenvatting: \nRectaal bloedverlies ; eenmali...,False,True,True,True
...,...,...,...,...,...,...,...,...
11085,FD8C682C1F4FDA1E5EC0B760D30875556419BD71,2015-03-20 08:13:00,Interne Geneeskunde,Samenvatting: \n1e consult\r\n-Type 1e consult...,False,False,True,False
11086,FD8C682C1F4FDA1E5EC0B760D30875556419BD71,2015-01-14 15:39:00,Interne Geneeskunde,Samenvatting: \nDecursus\r\n-Type decursus: De...,False,False,False,False
11087,FD8C682C1F4FDA1E5EC0B760D30875556419BD71,2014-12-21 09:31:00,Spoedeisende Hulp,Samenvatting: \nVerpleegkundige verslaglegging...,False,False,False,False
11088,FD8C682C1F4FDA1E5EC0B760D30875556419BD71,2010-11-10 21:03:00,Spoedeisende Hulp,Samenvatting: \nMedisch Dossier\r\n[ Vk Sputov...,False,False,False,False


In [None]:
# df_final.to_csv('a:/bloeding-met-patientenlijst-gedetailleerd/df_regex.csv', index=False)