# In [8]:
import pandas as pd
import inflect
import re
import contractions

# Shared inflect engine; convert_numbers_in_string calls its
# number_to_words() method to spell out digits as English words.
p = inflect.engine()

# Standardizing formats: spell out the digits in strings such as "17",
# "17-year-old", "3-4", or "17yo".
# A single alternation classifies every digit run in one left-to-right pass:
#   standalone - no word character on either side (same as \b\d+\b)
#   glued      - any other digit run, i.e. one touching a letter/underscore
_NUMBER_PATTERN = re.compile(
    r'(?P<standalone>(?<!\w)\d+(?!\w))|(?P<glued>\d+)'
)


def convert_numbers_in_string(s):
    """Replace every run of digits in ``s`` with its English words.

    A standalone number ("17", or each side of "3-4") is replaced by the
    words alone. A number glued to a word character ("17yo") is replaced
    by the words followed by a single space, so the text it was attached
    to is split off ("seventeen yo").

    Each digit run is converted exactly once, at its own position. Doing
    the replacement number-by-number over the whole string would let a
    short number rewrite the inside of a longer one (e.g. "3" inside
    "30yo"), which is why a single substitution pass is used.

    Args:
        s: The text to convert.

    Returns:
        A new string with all digit runs spelled out by the module-level
        inflect engine ``p``.
    """
    def _spell_out(match):
        word = p.number_to_words(match.group())
        # Only glued numbers get the separating space.
        if match.lastgroup == 'glued':
            return word + " "
        return word

    return _NUMBER_PATTERN.sub(_spell_out, s)

# Handling contractions: clinical shorthand mapped to its expanded form.
# Built once at import time instead of on every call. Keys are compared
# case-sensitively against whole whitespace-delimited tokens.
# TODO: decide whether "x" should expand to "times".
_CUSTOM_CONTRACTIONS = {
    "yo": "year old",
    "y.o.": "year old",
    "y/o": "year old",
    "y": "year",
    "yr": "year",
    "f": "female",
    "m": "male",
    "mo": "month",
    "c/o": "complains of",
    "c/of": "complains of",
    "c/m": "complains of",
    "cc": "chief complaint",
    "h/o": "history of",
    "pt": "patient",
    "w": "week",
    "wk": "week",
    "hrs": "hours",
    "hx": "history",
    "pmh": "past medical history",
    "pmhx": "past medical history",
    "psh": "past surgical history",
    "psurghx": "past surgical history",
    "pshh": "past surgical history",
    "meds": "medications",
    "hosp": "hospital",
    "fh": "Family history",
    "fhx": "Family history",
    "fmh": "Family history",
    "sh": "social history",
    "soc": "social history",
    "Rx": "prescription",
    "ros": "review of systems",
    "hpi": "history of present illness",
}


def expand_contractions(text):
    """Expand English contractions and clinical shorthand in ``text``.

    The text is first passed through ``contractions.fix``. It is then
    split on whitespace, and every token that exactly equals a key of
    ``_CUSTOM_CONTRACTIONS`` is replaced by the mapped expansion. Tokens
    with attached punctuation or different letter case are left as-is.

    Args:
        text: The text to expand.

    Returns:
        The expanded text with tokens joined by single spaces; any run of
        whitespace in the input (including newlines) is therefore
        collapsed to one space.
    """
    # General English contractions first, then the domain-specific table.
    text = contractions.fix(text)
    return ' '.join(
        _CUSTOM_CONTRACTIONS.get(word, word) for word in text.split()
    )



def process_data(df):
    """Clean the ``pn_history`` column of ``df`` in place and return ``df``.

    Missing values in every column are replaced with the string
    'Unknown'. The ``pn_history`` column is then cast to ``str``, has its
    numbers spelled out by ``convert_numbers_in_string``, and has its
    shorthand expanded by ``expand_contractions``.

    Args:
        df: DataFrame containing a ``pn_history`` column. It is modified
            in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Applies to the whole frame, not just the history column.
    df.fillna('Unknown', inplace=True)

    # Run the column through the text pipeline, one stage at a time.
    history = df['pn_history'].astype(str)
    history = history.apply(convert_numbers_in_string)
    history = history.apply(expand_contractions)
    df['pn_history'] = history

    return df

# Load the raw notes from patient_notes.csv (path relative to the working
# directory).
df = pd.read_csv('./patient_notes.csv')

# Normalise the 'pn_history' column: numbers spelled out, shorthand expanded.
df = process_data(df)

# Save the processed frame to a separate file, data.csv, without the index
# column; the input file is not overwritten.
df.to_csv('data.csv', index=False)
