In [2]:
import pandas as pd
import re

from typing import Dict, Tuple, Union, List
from re import Pattern

import language_tool_python
tool = language_tool_python.LanguageTool('nl')


In [3]:
# pip install language-tool-python

In [4]:
# Ensure the required library is installed
# %pip install openpyxl

# Load the abbreviation table from the Excel workbook (path is relative to the working directory).
file_path_abbreviations = "Abbreviations.xlsx"
abbreviations_excel = pd.read_excel(file_path_abbreviations)
# Last expression of the cell: displays the loaded table.
abbreviations_excel

Unnamed: 0,afkorting,betekenis,context word in front,context word behind,context word somewhere before,context not behind,other spellings,Nederlands woord?,Medicatie?,Uitleg woord
0,inco,incontinentie,,,,,"inco\., inc\., inc",nee,,
1,Cath,katheter,,,,,"catheter, cad, CAD, cateter, kateter",nee,,
2,Mw\.,mevrouw,,,,,"mw, mevr\., mevr, mvr",nee,,
3,i\.v\.m\.,in verband met,,,,,ivm,nee,,
4,Haldol,haldoperidol,,,,,,nee,,
...,...,...,...,...,...,...,...,...,...,...
359,prod,productie,,,,,,nee,,
360,tele,telemetrie,,,,,,nee,,
361,wkn,weken,,,,,wk,nee,,
362,PA,doktersassistent,,,,,,ja,,


In [4]:
_NUMBER_CONTEXT = '[getal]'  # context marker meaning "a number stands here"


def _text_or_none(value):
    """Return ``value`` when it is a string, otherwise ``None`` (e.g. NaN from an empty cell)."""
    return value if isinstance(value, str) else None


def _split_spellings(other_spellings):
    """Split the comma-separated 'other spellings' cell into clean, lower-cased spellings."""
    if not isinstance(other_spellings, str):
        return []
    # Split on a comma followed by whitespace, then drop padding and empty pieces: an empty
    # spelling would compile to a pattern that matches at nearly every position in a text.
    pieces = (piece.strip().lower() for piece in re.split(r',\s+', other_spellings))
    return [piece for piece in pieces if piece]


def _special_pattern(word):
    """Return the hand-written regex for symbol-like abbreviations, or ``None`` for ordinary words."""
    if word == "+/-":
        return r'\+/-'
    if word == "+-":
        return r'\+-'
    if "->" in word:
        # Any arrow spelling matches both "->" and "-->".
        return rf'({re.escape("->")}|{re.escape("-->")})'
    if word == ">":
        # A ">" that is not the head of an arrow.
        return r'(?<!-)>'
    if word == "x-":
        # "x-" followed by the word it is glued to.
        return rf'{word}\w+'
    if "#" in word:
        return rf'{word}\b'
    return None


def _contextual_pattern(word, front, behind, somewhere_front, not_behind):
    """Build the regex for an ordinary word from its (optional) context columns."""
    if somewhere_front is not None:
        # NOTE(review): the context group is optional, so it does not restrict the match, and this
        # is the only branch that applies re.escape to the word - confirm both are intended.
        return rf'((?<!\w){re.escape(somewhere_front.lower())})?\s*{re.escape(word)}\b'

    # Text that must NOT follow the match; 'context not behind' is inserted as a regex fragment.
    if not_behind is not None:
        tail = rf'(?! ?{not_behind})(?!\w)'
    else:
        tail = r'(?!\w)'

    if front is not None and behind is not None:
        if front == _NUMBER_CONTEXT and behind == _NUMBER_CONTEXT:
            # number-word-number: no leading word boundary, and a tail only when
            # 'context not behind' is given.
            core = rf'(\d+){word}(\d+)'
            return core + tail if not_behind is not None else core
        if behind == _NUMBER_CONTEXT:
            core = rf'{re.escape(front)}{word}(\d+)'
        elif front == _NUMBER_CONTEXT:
            core = rf'(\d+){word}{re.escape(behind)}'
        else:
            core = rf'{re.escape(front)} {word} {re.escape(behind)}'
    elif behind is not None:
        core = rf'{word}(\d+)' if behind == _NUMBER_CONTEXT else rf'{word} {re.escape(behind)}'
    elif front is not None:
        core = rf'(\d+){word}' if front == _NUMBER_CONTEXT else rf'{re.escape(front)} {word}'
    else:
        core = word

    return rf'(?<!\w){core}{tail}'


def replacement_dict(abbreviations_df: pd.DataFrame) -> Dict[Pattern, Tuple[str, str]]:
    """
    Create a dictionary of compiled regular expressions for abbreviations based on a DataFrame.

    For every row, the abbreviation ('afkorting') and each entry of 'other spellings' get their own
    pattern. The abbreviation keeps its case unless 'Nederlands woord?' is 'nee', in which case it is
    lower-cased; other spellings are always lower-cased. Words are inserted into the pattern as regex
    fragments (not escaped), except for the symbol-like special cases and the
    'context word somewhere before' branch. Surrounding whitespace is stripped from words and empty
    words are skipped.

    Args:
        abbreviations_df (pd.DataFrame): DataFrame with the columns 'afkorting', 'other spellings',
            'betekenis', 'context word in front', 'context word behind', 'context not behind',
            'context word somewhere before' and 'Nederlands woord?'. Non-string cells (e.g. NaN)
            count as "not provided"; the context value '[getal]' stands for a captured number.

    Returns:
        Dict[Pattern, Tuple[str, str]]: Maps each compiled pattern to ``(word, meaning)``, where
        ``meaning`` is the raw 'betekenis' cell. Identical patterns from later rows overwrite
        earlier ones.
    """
    abbreviations_dict = {}

    for _, row in abbreviations_df.iterrows():
        meaning = row['betekenis']
        front = _text_or_none(row['context word in front'])
        behind = _text_or_none(row['context word behind'])
        not_behind = _text_or_none(row['context not behind'])
        somewhere_front = _text_or_none(row['context word somewhere before'])
        is_dutch_word = row['Nederlands woord?'] != 'nee'

        all_words = []
        abbreviation = row['afkorting']
        if isinstance(abbreviation, str) and abbreviation.strip():
            abbreviation = abbreviation.strip()
            all_words.append(abbreviation if is_dutch_word else abbreviation.lower())
        all_words += _split_spellings(row['other spellings'])

        for word in all_words:
            pattern = _special_pattern(word)
            if pattern is None:
                pattern = _contextual_pattern(word, front, behind, somewhere_front, not_behind)
            abbreviations_dict[re.compile(pattern)] = (word, meaning)

    return abbreviations_dict


In [5]:
# Build the pattern -> (abbreviation, meaning) lookup from the loaded abbreviation table.
abbreviations_dict = replacement_dict(abbreviations_excel)

In [6]:
# Function to replace the abbreviations in the text

def replace_abbreviations(text: str, abbreviations_dict: Dict[Pattern, Tuple[str, str]]) -> str:
    """
    Replace abbreviations in the text with their meanings based on the provided dictionary.

    The dictionary is applied twice: first to the text as given (case-sensitive), then to the
    lower-cased text. In the second pass the text is lower-cased before every substitution, so the
    returned text is effectively lower-case whenever at least one entry has a string meaning.

    Placeholders in a meaning are filled per occurrence from what the pattern captured:
    '[getal1]'/'[getal2]' from the first two capture groups when the pattern has two or more
    groups; otherwise '[getal]' when the captured text is an integer, else '[woord]' (with a
    leading 'x-' removed). The captured text is the single group if the pattern has exactly one,
    or the whole match if it has none.

    Args:
        text (str): The input text containing abbreviations. Non-string values (e.g. NaN) are
            returned unchanged.
        abbreviations_dict (Dict[Pattern, Tuple[str, str]]): Dictionary of regex patterns and
            their corresponding abbreviation and meaning. Entries whose meaning is not a string
            are skipped.

    Returns:
        str: The text with abbreviations replaced by their meanings.
    """
    if not isinstance(text, str):
        return text

    def _filled_meaning(meaning, match):
        """Fill the placeholders of ``meaning`` for one regex match."""
        groups = match.groups(default='')
        if len(groups) >= 2:
            return meaning.replace('[getal1]', groups[0]).replace('[getal2]', groups[1])
        captured = groups[0] if groups else match.group(0)
        try:
            int(captured)
        except ValueError:
            return meaning.replace('[woord]', re.sub(r'^x-', '', captured))
        return meaning.replace('[getal]', captured)

    for lowercase_pass in (False, True):
        for pattern, (_abbreviation, meaning) in abbreviations_dict.items():
            if not isinstance(meaning, str):
                continue
            if lowercase_pass:
                text = text.lower()
            # A callable replacement is evaluated for every occurrence (each keeps its own
            # number/word) and its result is inserted literally, so backslashes in a meaning
            # are not interpreted as regex escapes.
            text = re.sub(pattern, lambda match, meaning=meaning: _filled_meaning(meaning, match), text)

    return text

In [7]:
# Load the report export; the alternative input files are kept below/above as commented-out lines.
# NOTE(review): the path is an absolute location on drive a: - it must exist on the machine running this cell.
# df = pd.read_csv('a:/df_cleaned.csv')
df = pd.read_csv('a:/bloeding-met-patientenlijst-gedetailleerd/bloeding-met-patientenlijst-4-verslagen.csv')
# df = pd.read_csv('a:/test-data/bloeding-met-patientenlijst-copy-4-verslagen.csv')
# Last expression of the cell: shows the first rows.
df.head()

Unnamed: 0,pseudo_id,verslagen_report_tags,verslagen_report_content,verslagen_report_age_at_time_of_event,verslagen_report_specialism,verslagen_report_start_date
0,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Klinische Brief,"Dhr. A.J. Dingemans, huisarts\r\n[STREETNAME] ...",87,"Maag-, Darm- en Leverziekten",2020-11-26 15:06:00
1,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, Kliniek: vervolgconsult",Samenvatting: \nRectaal bloedverlies obv diver...,87,"Maag-, Darm- en Leverziekten",2020-11-26 09:53:00
2,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Poliklinische Brief,COLOSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] [L...,87,"Maag-, Darm- en Leverziekten",2020-11-25 14:13:00
3,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Poliklinische Brief,GASTROSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] ...,87,"Maag-, Darm- en Leverziekten",2020-11-25 13:48:00
4,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, Kliniek: vervolgconsult",Samenvatting: \nRectaal bloedverlies ; eenmali...,87,"Maag-, Darm- en Leverziekten",2020-11-25 08:47:00


In [8]:
# Print the complete pattern -> (abbreviation, meaning) mapping for inspection.
print("abbreviations_dict:", abbreviations_dict)



In [9]:
# # Apply the function to the first column and store the result in a new column
# df_abbreviations = df.copy()
# # print(df_abbreviations.head())
# df_abbreviations['abbreviations_corrected'] = df['verslagen_report_content'].apply(lambda x: replace_abbreviations(x, abbreviations_dict))


In [12]:
# Reload the previously saved abbreviation-expanded reports; the cell that computes
# 'abbreviations_corrected' and the line that writes this file are commented out.
# df_abbreviations.to_csv('a:/bloeding-met-patientenlijst-gedetailleerd/verslagen-abb-corrected.csv', index=False)
df_abbreviations = pd.read_csv('a:/bloeding-met-patientenlijst-gedetailleerd/verslagen-abb-corrected.csv')

In [13]:
# Preview the first rows, including the 'abbreviations_corrected' column.
df_abbreviations.head()

Unnamed: 0,pseudo_id,verslagen_report_tags,verslagen_report_content,verslagen_report_age_at_time_of_event,verslagen_report_specialism,verslagen_report_start_date,abbreviations_corrected
0,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Klinische Brief,"Dhr. A.J. Dingemans, huisarts\r\n[STREETNAME] ...",87,"Maag-, Darm- en Leverziekten",2020-11-26 15:06:00,"meneer a.j. dingemans, huisarts\r\n[streetname..."
1,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, Kliniek: vervolgconsult",Samenvatting: \nRectaal bloedverlies obv diver...,87,"Maag-, Darm- en Leverziekten",2020-11-26 09:53:00,samenvatting: \nrectaal bloedverlies onder beg...
2,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Poliklinische Brief,COLOSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] [L...,87,"Maag-, Darm- en Leverziekten",2020-11-25 14:13:00,coloscopie\r\n\r\nbetreft\r\nmevrouw [initials...
3,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Poliklinische Brief,GASTROSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] ...,87,"Maag-, Darm- en Leverziekten",2020-11-25 13:48:00,gastroscopie\r\n\r\nbetreft\r\nmevrouw [initia...
4,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, Kliniek: vervolgconsult",Samenvatting: \nRectaal bloedverlies ; eenmali...,87,"Maag-, Darm- en Leverziekten",2020-11-25 08:47:00,samenvatting: \nrectaal bloedverlies ; eenmali...


In [14]:
# Keep only the reports of a single patient, selected by pseudo_id.
specific_id = df_abbreviations[df_abbreviations['pseudo_id'] == '046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6']
# Last expression of the cell: lists the distinct pseudo_id values left after filtering.
specific_id['pseudo_id'].unique()  

array(['046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6'], dtype=object)

In [15]:
# Print, for every row of the selected patient that has a pseudo_id, the original report text
# followed by the abbreviation-expanded text.
# NOTE(review): this writes full report contents into the notebook output - review before sharing.
for i, row in specific_id[specific_id['pseudo_id'].notnull()].iterrows():
    print(f"🩸 Pseudo ID: {row['pseudo_id']}")
    # print(f"🩸 Date: {row['date']}")
    print(f"📜 Content: {row['verslagen_report_content']}")
    print("-------")
    print(f"📜 Corrected: {row['abbreviations_corrected']}")
    print("----------------------------")

🩸 Pseudo ID: 046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6
📜 Content: Dhr. A.J. Dingemans, huisarts
[STREETNAME] NR  [CITY]





datum
29-11-2020
kenmerk
0007475372 / [PATIENTID]
BSN nr.
[BSN]
betreft
mevrouw [INITIALS] [LASTNAME], geb. [BIRTHDATE] (87)
[STREETNAME], [ZIP]  [CITY]
tel. [PHONENUMBER]

 
Geachte collega,

Bovengenoemde patiënte was opgenomen van 24-11-2020 tot en met 26-11-2020 op de afdeling Maag-, Darm- en Leverziekten in verband met melaena en rectaal bloedverlies. 

Voorgeschiedenis
2002 Diep veneuze trombose en longembolie
2013 Cholecystectomie
2015 Diverticulitis
2016 Atriumfibrilleren met spontane conversie naar sinusritme
2016 Melena, waarvoor geen verklaring werd gevonden. In verband met stabiel Hb en in overleg met patiënte expectatief beleid. 
2018 Vermoeidheid bij sinusbradycardie waarvoor stop metoprolol en tambocor. 

Anamnese
Vanmiddag rond 13.30u fors helderrood bloedverlies met stolsels. Vermengd met ontlasting, mogelijk was deze zwart van kleur. Sinds 5 dage

Preprocessing for spelling check

In [16]:
import pandas as pd
import re
import unicodedata

def clean_text_column(df, column_name, new_column_name=None):
    """
    Clean formatting and placeholders in a text column of a DataFrame.

    Each string is cleaned by: turning every whitespace run (including \\r, \\n, \\t) into a
    single space, removing bracketed placeholder tokens such as [INITIALS], removing the remaining
    ASCII control characters, applying Unicode NFKC normalization, and finally collapsing
    whitespace again and stripping the ends. The final collapse runs last so that removed
    characters never leave double spaces behind.

    Parameters:
    - df: pandas DataFrame; it is modified in place (a column is added or overwritten)
    - column_name: name of the column to clean
    - new_column_name: optional, name of the new column to store cleaned text
                       (if None or empty, the original column is overwritten)

    Returns:
    - the same DataFrame object, with the cleaned column
    """

    def clean_text(text):
        # Leave NaN/None and any other non-string value as-is.
        if not isinstance(text, str):
            return text

        # Step 1: Replace all whitespace runs (\r, \n, \t, ...) with a single space
        text = re.sub(r'\s+', ' ', text)

        # Step 2: Remove placeholder tokens like [INITIALS], [LASTNAME]
        text = re.sub(r'\[[^\]]*\]', '', text)

        # Step 3: Remove the remaining ASCII control characters (including \x07)
        text = re.sub(r'[\x00-\x1F\x7F]', '', text)

        # Step 4: Normalize Unicode (compatibility composition)
        text = unicodedata.normalize('NFKC', text)

        # Step 5: Collapse the gaps left by steps 2-4 and trim the ends
        return re.sub(r'\s+', ' ', text).strip()

    # Apply cleaning
    cleaned = df[column_name].apply(clean_text)

    # Assign to the desired column
    if new_column_name:
        df[new_column_name] = cleaned
    else:
        df[column_name] = cleaned

    return df

In [17]:
# Store the cleaned text in a new column next to the abbreviation-expanded text.
df_abbreviations = clean_text_column(df_abbreviations, 'abbreviations_corrected', 'abbreviations_corrected_cleaned')
# Last expression of the cell: displays the frame with the added column.
df_abbreviations

Unnamed: 0,pseudo_id,verslagen_report_tags,verslagen_report_content,verslagen_report_age_at_time_of_event,verslagen_report_specialism,verslagen_report_start_date,abbreviations_corrected,abbreviations_corrected_cleaned
0,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Klinische Brief,"Dhr. A.J. Dingemans, huisarts\r\n[STREETNAME] ...",87,"Maag-, Darm- en Leverziekten",2020-11-26 15:06:00,"meneer a.j. dingemans, huisarts\r\n[streetname...","meneer a.j. dingemans, huisarts nr datum 29-11..."
1,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, Kliniek: vervolgconsult",Samenvatting: \nRectaal bloedverlies obv diver...,87,"Maag-, Darm- en Leverziekten",2020-11-26 09:53:00,samenvatting: \nrectaal bloedverlies onder beg...,samenvatting: rectaal bloedverlies onder begel...
2,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Poliklinische Brief,COLOSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] [L...,87,"Maag-, Darm- en Leverziekten",2020-11-25 14:13:00,coloscopie\r\n\r\nbetreft\r\nmevrouw [initials...,"coloscopie betreft mevrouw adresgegevens: , ge..."
3,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Poliklinische Brief,GASTROSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] ...,87,"Maag-, Darm- en Leverziekten",2020-11-25 13:48:00,gastroscopie\r\n\r\nbetreft\r\nmevrouw [initia...,"gastroscopie betreft mevrouw adresgegevens: , ..."
4,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, Kliniek: vervolgconsult",Samenvatting: \nRectaal bloedverlies ; eenmali...,87,"Maag-, Darm- en Leverziekten",2020-11-25 08:47:00,samenvatting: \nrectaal bloedverlies ; eenmali...,samenvatting: rectaal bloedverlies ; eenmalig ...
...,...,...,...,...,...,...,...,...
11085,FD8C682C1F4FDA1E5EC0B760D30875556419BD71,Consult,Samenvatting: \n1e consult\r\n-Type 1e consult...,83,Interne Geneeskunde,2015-03-20 08:13:00,samenvatting: \neerste consult\r\n-type eerste...,samenvatting: eerste consult -type eerste cons...
11086,FD8C682C1F4FDA1E5EC0B760D30875556419BD71,"Consult, Kliniek: vervolgconsult",Samenvatting: \nDecursus\r\n-Type decursus: De...,83,Interne Geneeskunde,2015-01-14 15:39:00,samenvatting: \ndecursus\r\n-type decursus: de...,samenvatting: decursus -type decursus: decursu...
11087,FD8C682C1F4FDA1E5EC0B760D30875556419BD71,"Consult, SEH",Samenvatting: \nVerpleegkundige verslaglegging...,83,Spoedeisende Hulp,2014-12-21 09:31:00,samenvatting: \nverpleegkundige verslaglegging...,samenvatting: verpleegkundige verslaglegging -...
11088,FD8C682C1F4FDA1E5EC0B760D30875556419BD71,"Consult, SEH",Samenvatting: \nMedisch Dossier\r\n[ Vk Sputov...,79,Spoedeisende Hulp,2010-11-10 21:03:00,samenvatting: \nmedisch dossier\r\n[ vk sputov...,samenvatting: medisch dossier -leeftijd in jar...


In [18]:
# Re-select the same patient so the subset includes the new 'abbreviations_corrected_cleaned' column.
specific_id = df_abbreviations[df_abbreviations['pseudo_id'] == '046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6']
# Not the last expression of the cell, so this value is computed but not displayed.
specific_id['pseudo_id'].unique()  

# Print the original, abbreviation-expanded and cleaned text of every row that has a pseudo_id.
# NOTE(review): this writes full report contents into the notebook output - review before sharing.
for i, row in specific_id[specific_id['pseudo_id'].notnull()].iterrows():
    print(f"🩸 Pseudo ID: {row['pseudo_id']}")
    # print(f"🩸 Date: {row['date']}")
    print(f"📜 Content: {row['verslagen_report_content']}")
    print("---------------- corrected:") 
    print(f"📜 Corrected: {row['abbreviations_corrected']}")
    print("---------------------------- cleaned:")
    print(f"📜 Cleaned: {row['abbreviations_corrected_cleaned']}")
    print("----------------------------")

🩸 Pseudo ID: 046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6
📜 Content: Dhr. A.J. Dingemans, huisarts
[STREETNAME] NR  [CITY]





datum
29-11-2020
kenmerk
0007475372 / [PATIENTID]
BSN nr.
[BSN]
betreft
mevrouw [INITIALS] [LASTNAME], geb. [BIRTHDATE] (87)
[STREETNAME], [ZIP]  [CITY]
tel. [PHONENUMBER]

 
Geachte collega,

Bovengenoemde patiënte was opgenomen van 24-11-2020 tot en met 26-11-2020 op de afdeling Maag-, Darm- en Leverziekten in verband met melaena en rectaal bloedverlies. 

Voorgeschiedenis
2002 Diep veneuze trombose en longembolie
2013 Cholecystectomie
2015 Diverticulitis
2016 Atriumfibrilleren met spontane conversie naar sinusritme
2016 Melena, waarvoor geen verklaring werd gevonden. In verband met stabiel Hb en in overleg met patiënte expectatief beleid. 
2018 Vermoeidheid bij sinusbradycardie waarvoor stop metoprolol en tambocor. 

Anamnese
Vanmiddag rond 13.30u fors helderrood bloedverlies met stolsels. Vermengd met ontlasting, mogelijk was deze zwart van kleur. Sinds 5 dage

In [19]:
# def correct_spelling(text: Union[str, None], vocab: List[str]) -> Union[str, None]:
#     """
#     Correct spelling mistakes in the input text using LanguageTool and a specified vocabulary.

#     Args:
#         text (Union[str, None]): The input text to correct spelling.
#         vocab (List[str]): A list of words considered correct, which are not flagged as misspellings.

#     Returns:
#         Union[str, None]: The corrected text if input is a string; otherwise, the original input.
#     """
#     spelling_mistakes = []
#     if isinstance(text, str):
#         matches = tool.check(text)
#         is_correctly_spelled = lambda rule: rule.ruleIssueType == 'misspelling' and rule.matchedText in vocab
#         contains_digit = lambda s: any(char.isdigit() for char in s)
        
#         for match in matches:
#             if not is_correctly_spelled(match) and not contains_digit(match.matchedText):
#                 spelling_mistakes.append(match)
        
#         corrected_text = language_tool_python.utils.correct(text, spelling_mistakes)
#         return corrected_text
#     else:
#         return text


In [20]:
from typing import Union, List
import language_tool_python

# Initialize globally once: correct_spelling reads this module-level `tool`.
# The same LanguageTool('nl') construction also appears in the notebook's first import cell.
tool = language_tool_python.LanguageTool('nl')

# Define the function
def correct_spelling(text: Union[str, None], vocab: List[str]) -> Union[str, None]:
    """
    Apply the corrections suggested by the module-level LanguageTool instance ``tool``.

    A suggestion is skipped when it is a 'misspelling' whose matched text is contained in
    ``vocab``, or when its matched text contains a digit. Every other suggestion returned by
    ``tool.check`` (not only misspellings) is applied.

    Args:
        text (Union[str, None]): The text to correct. Non-string values are returned unchanged.
        vocab (List[str]): Container of matched texts that must not be treated as misspellings.

    Returns:
        Union[str, None]: The corrected text for string input; otherwise the original input.
    """
    if isinstance(text, str):
        to_apply = []
        for suggestion in tool.check(text):
            # Known vocabulary flagged as a misspelling: keep the original wording.
            if suggestion.ruleIssueType == 'misspelling' and suggestion.matchedText in vocab:
                continue
            # Never rewrite tokens that contain digits.
            if any(char.isdigit() for char in suggestion.matchedText):
                continue
            to_apply.append(suggestion)
        return language_tool_python.utils.correct(text, to_apply)

    return text


In [23]:
# Create cache dictionary (input text -> corrected text), filled by correct_spelling_cached
cache = {}
# NOTE(review): vocab_tuple is not referenced anywhere else in the visible notebook; iterating
# abbreviations_dict yields its keys (compiled regex patterns), not vocabulary strings.
vocab_tuple = tuple(abbreviations_dict)  # Convert to tuple if needed for hashing

# Function to wrap with cache
def correct_spelling_cached(text):
    """
    Memoized wrapper around correct_spelling, keyed by the input text.

    Prints the text on every call, and prints a second line when the result is served from the
    module-level ``cache`` instead of being recomputed.

    NOTE(review): ``abbreviations_dict`` is passed as ``vocab``; its keys are compiled regex
    patterns, so the string membership test inside correct_spelling presumably never matches -
    confirm which vocabulary was intended.
    """
    print(f"Checking spelling for: {text}")

    if text not in cache:
        # First time this text is seen: compute and remember the correction.
        corrected = correct_spelling(text, vocab=abbreviations_dict)
        cache[text] = corrected
        return corrected

    print(f"Using cached result for: {text}")
    return cache[text]



In [None]:
# Select one patient's row — you can also use df.loc[...] to filter by ID
sample_row = df_abbreviations.iloc[0]  # First row

# Extract the already cleaned, abbreviation-expanded text
text_before = sample_row['abbreviations_corrected_cleaned']

# Run the spelling correction on that text
text_after = correct_spelling(text_before, vocab=abbreviations_dict)

# Create a one-row comparison DataFrame: text before vs. after spelling correction
comparison_df = pd.DataFrame({
    'Original': [text_before],
    'Corrected': [text_after]
})

# Print both columns as Series. The original line used an empty subscript (`[]`), which is a
# SyntaxError and prevented the whole cell from running.
print(comparison_df['Original'])
print('--------------------')
print(comparison_df['Corrected'])


0    meneer a.j. dingemans, huisarts nr datum 29-11...
Name: Original, dtype: object
--------------------
0    Meneer a.j. Dingemans, huisarts nr. datum 29-1...
Name: Corrected, dtype: object


In [29]:

# Not the last expression of the cell, so the frame itself is not displayed here.
comparison_df
# Print the full text of the single compared row, before and after spelling correction.
print(comparison_df['Original'][0])
print('--------------------')
print(comparison_df['Corrected'][0])


meneer a.j. dingemans, huisarts nr datum 29-11-2020 kenmerk 0007475372 / bsn nr. betreft mevrouw , geen bijzonderheden (87) , tel. geachte collega, bovengenoemde patiënte was opgenomen van 24-11-2020 tot en met 26-11-2020 op de afdeling maag-, darm- en leverziekten in verband met melaena en rectaal bloedverlies. voorgeschiedenis 2002 diep veneuze trombose en longembolie 2013 cholecystectomie 2015 diverticulitis 2016 atriumfibrilleren met spontane conversie naar sinusritme 2016 melena, waarvoor geen verklaring werd gevonden. in verband met stabiel hemoglobine en in overleg met patiënte expectatief beleid. 2018 vermoeidheid bij sinusbradycardie waarvoor stop metoprolol en tambocor. anamnese vanmiddag rond 13.30 uur fors helderrood bloedverlies met stolsels. vermengd met ontlasting, mogelijk was deze zwart van kleur. sinds 5 dagen zeurende pijn in de bovenbuik; maagpijn, waarvoor ze is gestopt met koffie drinken en vet eten. in de afgelopen dagen wat minder ontlasting, bij ook minder inta

In [None]:
# # Apply to DataFrame
# df_abbreviations['spelling_corrected'] = df_abbreviations['abbreviations_corrected_cleaned'].apply(correct_spelling_cached)

Checking spelling for: meneer a.j. dingemans, huisarts nr datum 29-11-2020 kenmerk 0007475372 / bsn nr. betreft mevrouw , geen bijzonderheden (87) , tel. geachte collega, bovengenoemde patiënte was opgenomen van 24-11-2020 tot en met 26-11-2020 op de afdeling maag-, darm- en leverziekten in verband met melaena en rectaal bloedverlies. voorgeschiedenis 2002 diep veneuze trombose en longembolie 2013 cholecystectomie 2015 diverticulitis 2016 atriumfibrilleren met spontane conversie naar sinusritme 2016 melena, waarvoor geen verklaring werd gevonden. in verband met stabiel hemoglobine en in overleg met patiënte expectatief beleid. 2018 vermoeidheid bij sinusbradycardie waarvoor stop metoprolol en tambocor. anamnese vanmiddag rond 13.30 uur fors helderrood bloedverlies met stolsels. vermengd met ontlasting, mogelijk was deze zwart van kleur. sinds 5 dagen zeurende pijn in de bovenbuik; maagpijn, waarvoor ze is gestopt met koffie drinken en vet eten. in de afgelopen dagen wat minder ontlasti

In [None]:
# # Apply the function to the first column and store the result in a new column

# df_abbreviations['spelling_corrected'] = df_abbreviations['abbreviations_corrected_cleaned'].apply(lambda x: correct_spelling(x, abbreviations_dict))

In [None]:
# df_abbreviations.to_csv('a:/bloeding-met-patientenlijst-gedetailleerd/spelling-abb-corrected.csv', index=False)
# df_abbreviations.to_csv('a:/test-data/bloeding-met-patientenlijst-copy-4-verslagen-spelling-abb-corrected.csv')