In [2]:
import pandas as pd
import re

from typing import Dict, Tuple, Union, List
from re import Pattern

import language_tool_python
# Module-level LanguageTool checker created for language code 'nl';
# correct_spelling() reads this global and calls tool.check() on each text.
tool = language_tool_python.LanguageTool('nl')


In [3]:
# pip install language-tool-python

In [4]:
# Load the abbreviation table from the Excel workbook.
# Reading .xlsx files with pandas requires openpyxl; install it first if it is missing:
# %pip install openpyxl

file_path_abbreviations = "Abbreviations.xlsx"
abbreviations_excel = pd.read_excel(file_path_abbreviations)
abbreviations_excel.head()

Unnamed: 0,afkorting,betekenis,context word in front,context word behind,context word somewhere before,context not behind,other spellings,Nederlands woord?,Medicatie?,Uitleg woord
0,inco,incontinentie,,,,,"inco\., inc\., inc",nee,,
1,Cath,katheter,,,,,"catheter, cad, CAD, cateter, kateter",nee,,
2,Mw\.,mevrouw,,,,,"mw, mevr\., mevr, mvr",nee,,
3,i\.v\.m\.,in verband met,,,,,ivm,nee,,
4,Haldol,haldoperidol,,,,,,nee,,


In [5]:
def replacement_dict(abbreviations_df: pd.DataFrame) -> Dict[Pattern, Tuple[str, str]]:
    """
    Compile the abbreviation table into a regex lookup.

    Each row contributes one pattern per spelling: the ``afkorting`` itself
    (lower-cased only when ``Nederlands woord?`` equals ``'nee'``) plus every
    lower-cased entry of the ``", "``-separated ``other spellings`` column.
    Spellings are inserted into the pattern as-is, i.e. they are treated as
    regex source, except in the "context word somewhere before" form where
    they are escaped.

    Args:
        abbreviations_df (pd.DataFrame): Table with the columns ``afkorting``,
            ``other spellings``, ``betekenis``, ``context word in front``,
            ``context word behind``, ``context not behind``,
            ``context word somewhere before`` and ``Nederlands woord?``.
            Cells that are not strings (e.g. NaN) are treated as absent.

    Returns:
        Dict[Pattern, Tuple[str, str]]: Compiled pattern -> (spelling, meaning).
        When two spellings yield the same pattern, the later row wins.
    """
    number_group = r'(\d+)'
    number_marker = '[getal]'

    def is_text(value) -> bool:
        return isinstance(value, str)

    def symbol_regex(token: str):
        """Fixed regex for symbol-like spellings; None for ordinary spellings."""
        if token == "+/-":
            return r'\+/-'
        if token == "+-":
            return r'\+-'
        if "->" in token:
            return rf'({re.escape("->")}|{re.escape("-->")})'
        if token == ">":
            return rf'(?<!-)+{token}'
        if token == "x-":
            return rf'{token}\w+'
        if "#" in token:
            return rf'{token}\b'
        return None

    def contextual_regex(token: str, before, after, anywhere_before, forbidden_after) -> str:
        """Wrap ``token`` in the word-boundary and context constraints of its row."""
        # A "somewhere before" context takes precedence over all other columns.
        if is_text(anywhere_before):
            return rf'((?<!\w){re.escape(anywhere_before.lower())})?\s*{re.escape(token)}\b'

        has_veto = is_text(forbidden_after)
        veto = rf'(?! ?{forbidden_after})' if has_veto else ''

        if is_text(before) and is_text(after):
            if after == number_marker and before == number_marker:
                # Number on both sides: no leading boundary, and the trailing
                # boundary is only added together with the veto.
                core = number_group + token + number_group
                return core + veto + r'(?!\w)' if has_veto else core
            if after == number_marker:
                body = re.escape(before) + token + number_group
            elif before == number_marker:
                body = number_group + token + re.escape(after)
            else:
                body = f'{re.escape(before)} {token} {re.escape(after)}'
        elif is_text(after):
            if after == number_marker:
                body = token + number_group
            else:
                body = f'{token} {re.escape(after)}'
        elif is_text(before):
            if before == number_marker:
                body = number_group + token
            else:
                body = f'{re.escape(before)} {token}'
        else:
            body = token

        return r'(?<!\w)' + body + veto + r'(?!\w)'

    patterns = {}

    for _, entry in abbreviations_df.iterrows():
        short_form = entry['afkorting']
        variants = entry['other spellings']
        expansion = entry['betekenis']
        before = entry['context word in front']
        after = entry['context word behind']
        forbidden_after = entry['context not behind']
        anywhere_before = entry['context word somewhere before']
        dutch_flag = entry['Nederlands woord?']

        # Collect every spelling that should trigger this row's expansion.
        spellings = []
        if is_text(short_form):
            spellings.append(short_form.lower() if dutch_flag == 'nee' else short_form)
        if is_text(variants):
            spellings.extend(variant.lower() for variant in variants.split(", "))

        for token in spellings:
            regex = symbol_regex(token)
            if regex is None:
                regex = contextual_regex(token, before, after, anywhere_before, forbidden_after)
            patterns[re.compile(regex)] = (token, expansion)

    return patterns


In [6]:
# Compile the abbreviation table into {compiled pattern: (spelling, meaning)},
# the lookup that replace_abbreviations() iterates over.
abbreviations_dict = replacement_dict(abbreviations_excel)

In [7]:
# Function to replace the abbreviations in the text

def _fill_placeholders(match: re.Match, meaning: str) -> str:
    """Return ``meaning`` with its placeholders filled from one regex match.

    Mirrors what ``re.findall`` would report for the pattern: with two or more
    groups the first two fill ``[getal1]``/``[getal2]``; otherwise the single
    group (or the whole match when the pattern has no group) fills ``[getal]``
    when it is an integer, and ``[woord]`` (minus a leading ``x-``) when not.
    """
    groups = match.groups(default='')
    if len(groups) > 1:
        return meaning.replace('[getal1]', groups[0]).replace('[getal2]', groups[1])

    captured = groups[0] if groups else match.group(0)
    try:
        int(captured)
    except ValueError:
        return meaning.replace('[woord]', re.sub(r'^x-', '', captured))
    return meaning.replace('[getal]', captured)


def _expand_pass(text: str, abbreviations_dict: Dict[Pattern, Tuple[str, str]], lowercase: bool) -> str:
    """Apply every pattern with a string meaning once, in dictionary order."""
    for pattern, (_abbreviation, meaning) in abbreviations_dict.items():
        if not isinstance(meaning, str):
            # Rows without a meaning (e.g. NaN) cannot be expanded.
            continue
        if lowercase:
            text = text.lower()
        # A callable replacement is evaluated per occurrence, so each match gets
        # its own captured number/word, and the meaning is inserted literally
        # (backslashes are not interpreted as a regex template).
        text = re.sub(
            pattern,
            lambda found, meaning=meaning: _fill_placeholders(found, meaning),
            text,
        )
    return text


def replace_abbreviations(text: str, abbreviations_dict: Dict[Pattern, Tuple[str, str]]) -> str:
    """
    Replace abbreviations in the text with their meanings based on the provided dictionary.

    Two passes are made over ``abbreviations_dict``: first on the text as
    written (so case-sensitive patterns can match), then on the lower-cased
    text. As a consequence the returned text is lower-cased whenever at least
    one entry has a string meaning. Placeholders in a meaning (``[getal]``,
    ``[getal1]``, ``[getal2]``, ``[woord]``) are filled per occurrence from the
    groups captured by that occurrence.

    Args:
        text (str): The input text containing abbreviations. Non-string input
            (e.g. NaN) is returned unchanged.
        abbreviations_dict (Dict[Pattern, Tuple[str, str]]): Dictionary of regex patterns and their
            corresponding abbreviation and meaning.

    Returns:
        str: The text with abbreviations replaced by their meanings.
    """
    if not isinstance(text, str):
        return text

    text = _expand_pass(text, abbreviations_dict, lowercase=False)
    return _expand_pass(text, abbreviations_dict, lowercase=True)

In [8]:
# Load the cleaned reports; the 'verslagen_report_content' column is the text
# that gets abbreviation-expanded in a later cell.
# NOTE(review): 'a:/df_cleaned.csv' is a hardcoded absolute path — consider a configurable data directory.
df = pd.read_csv('a:/df_cleaned.csv')
df.head()

Unnamed: 0,pseudo_id,verslagen_report_tags,verslagen_report_content,verslagen_report_start_date,date,alltext,tokens
0,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Klinische Brief,"Dhr. A.J. Dingemans, huisarts\r\n[STREETNAME] ...",2020-11-26 15:06:00,2020-11-26 15:06:00,dhr aj dingemans huisarts streetnaam city datu...,"['dhr', 'aj', 'dingemans', 'huisarts', 'street..."
1,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, Kliniek: vervolgconsult",Samenvatting: \nRectaal bloedverlies obv diver...,2020-11-26 09:53:00,2020-11-26 09:53:00,samenvatting rectaal bloedverlie obvn divertik...,"['samenvatting', 'rectaal', 'bloedverlie', 'ob..."
2,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Poliklinische Brief,COLOSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] [L...,2020-11-25 14:13:00,2020-11-25 14:13:00,coloscopie betreffen mw initials lastname adre...,"['coloscopie', 'betreffen', 'mw', 'initials', ..."
3,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Poliklinische Brief,GASTROSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] ...,2020-11-25 13:48:00,2020-11-25 13:48:00,gastroscopie betreffen mw initials lastname ad...,"['gastroscopie', 'betreffen', 'mw', 'initials'..."
4,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, Kliniek: vervolgconsult",Samenvatting: \nRectaal bloedverlies ; eenmali...,2020-11-25 08:47:00,2020-11-25 08:47:00,samenvatting rectaal bloedverlie eenmalig hd h...,"['samenvatting', 'rectaal', 'bloedverlie', 'ee..."


In [9]:
# Debug output: prints every compiled pattern together with its (spelling, meaning) pair.
# NOTE(review): this dumps the whole dictionary; consider previewing only a few entries.
print("abbreviations_dict:", abbreviations_dict)



In [10]:
# Expand the abbreviations of every report and store the result in a new column.
# The work happens on a copy so that `df` itself is left untouched.
df_abbreviations = df.copy()
df_abbreviations['abbreviations_corrected'] = df['verslagen_report_content'].apply(
    replace_abbreviations, args=(abbreviations_dict,)
)


In [11]:
# Preview the new 'abbreviations_corrected' column next to the original report text.
df_abbreviations.head()

Unnamed: 0,pseudo_id,verslagen_report_tags,verslagen_report_content,verslagen_report_start_date,date,alltext,tokens,abbreviations_corrected
0,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Klinische Brief,"Dhr. A.J. Dingemans, huisarts\r\n[STREETNAME] ...",2020-11-26 15:06:00,2020-11-26 15:06:00,dhr aj dingemans huisarts streetnaam city datu...,"['dhr', 'aj', 'dingemans', 'huisarts', 'street...","meneer a.j. dingemans, huisarts\r\n[streetname..."
1,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, Kliniek: vervolgconsult",Samenvatting: \nRectaal bloedverlies obv diver...,2020-11-26 09:53:00,2020-11-26 09:53:00,samenvatting rectaal bloedverlie obvn divertik...,"['samenvatting', 'rectaal', 'bloedverlie', 'ob...",samenvatting: \nrectaal bloedverlies onder beg...
2,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Poliklinische Brief,COLOSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] [L...,2020-11-25 14:13:00,2020-11-25 14:13:00,coloscopie betreffen mw initials lastname adre...,"['coloscopie', 'betreffen', 'mw', 'initials', ...",coloscopie\r\n\r\nbetreft\r\nmevrouw [initials...
3,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,Poliklinische Brief,GASTROSCOPIE\r\n\r\nBetreft\r\nMw. [INITIALS] ...,2020-11-25 13:48:00,2020-11-25 13:48:00,gastroscopie betreffen mw initials lastname ad...,"['gastroscopie', 'betreffen', 'mw', 'initials'...",gastroscopie\r\n\r\nbetreft\r\nmevrouw [initia...
4,046D1FFEBDD40E1665D0ABA6DD8FC9F8BC4351C6,"Consult, Kliniek: vervolgconsult",Samenvatting: \nRectaal bloedverlies ; eenmali...,2020-11-25 08:47:00,2020-11-25 08:47:00,samenvatting rectaal bloedverlie eenmalig hd h...,"['samenvatting', 'rectaal', 'bloedverlie', 'ee...",samenvatting: \nrectaal bloedverlies ; eenmali...


In [12]:
def correct_spelling(text: Union[str, None], vocab: List[str]) -> Union[str, None]:
    """
    Apply LanguageTool's suggestions to ``text``, sparing known words and numbers.

    Every match reported by the module-level ``tool`` is applied, except
    matches of issue type ``'misspelling'`` whose matched text is in ``vocab``
    and matches whose matched text contains a digit.

    Args:
        text (Union[str, None]): The input text to correct.
        vocab (List[str]): Words considered correct; a misspelling match on one
            of these is not applied.

    Returns:
        Union[str, None]: The corrected text if input is a string; otherwise, the original input.
    """
    if not isinstance(text, str):
        return text

    def is_known_word(issue) -> bool:
        return issue.ruleIssueType == 'misspelling' and issue.matchedText in vocab

    def has_digit(fragment) -> bool:
        return any(character.isdigit() for character in fragment)

    issues_to_apply = [
        issue
        for issue in tool.check(text)
        if not is_known_word(issue) and not has_digit(issue.matchedText)
    ]
    return language_tool_python.utils.correct(text, issues_to_apply)


In [None]:
# Apply the spelling correction to the abbreviation-expanded text and store the result in a new column.
#
# correct_spelling() tests `matchedText in vocab`, so the vocabulary must hold plain strings.
# The keys of `abbreviations_dict` are compiled regex patterns, which never compare equal to a
# matched word, so passing the dictionary itself protects nothing. The words of the expansions
# are used instead, lower-cased because replace_abbreviations() lower-cases the text.
# NOTE(review): confirm that the expansions are the intended protected vocabulary.
known_words = {
    word
    for _abbreviation, meaning in abbreviations_dict.values()
    if isinstance(meaning, str)
    for word in re.findall(r'\w+', meaning.lower())
}

df_abbreviations['spelling_corrected'] = df_abbreviations['abbreviations_corrected'].apply(
    lambda x: correct_spelling(x, known_words)
)

In [None]:
# Persist the enriched frame; index=False leaves the row index out of the CSV.
# NOTE(review): hardcoded absolute output path — confirm it is valid on the target machine.
df_abbreviations.to_csv('a:/df_Silke_abbreviations.csv', index=False)