In [1]:
import sys, os, csv, ast, random, time, re
import pandas as pd
from pathlib import Path
from collections import deque  # Queue for questions
from faker import Faker
from dateutil.relativedelta import relativedelta
from datetime import datetime

from variables import HEALTH_CONDITIONS, COMMON_RECREATIONAL_DRUGS, COMMON_HEALTH_SUPPLEMENTS, COMMON_ALLERGIES, DR_APPOINTMENT_REASONS, TYPING_STYLES, CONVERSATIONAL_TONE, PERSONALITY_TRAITS, EDGE_CASE_SCENARIOS, INDIGENOUS, MIDDLE_EASTERN_REGIONS, EUROPEAN_REGIONS, EAST_ASIAN_REGIONS, SOUTH_ASIAN_REGIONS, SOUTHEAST_ASIAN_REGIONS, LATIN_AMERICAN_REGIONS, AFRICAN_REGIONS, OTHER_REGIONS
import variables
from ethnicity_codes import ETHNICITY_CODES

# import Langchain and Ollama
from langchain_ollama import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate

model1 = OllamaLLM(base_url="localhost:11435", 
                   model="llama3.3:70b", 
                   temperature=0.7, 
                   num_ctx = 4096,
                   top_k = 50,
                   top_p = 0.95)

#llama3.1:70b-instruct-q4_0

In [2]:
print(Path.cwd())

/home/ninc-user/finetuning/transcript_generation-main/patient_creation


In [3]:
random.seed()
# This function would take a prompt, a variables file, and then call the LLM to generate a patient profile
def generate_patient_profile(variables):
    '''
    Generates a fictional patient profile by filling in missing details based on structured variables.

    Parameters
    ----------
    variables: dict
        A dictionary containing predefined patient variables, some of which may be incomplete or placeholders.

    Returns
    -------
    dict
        A fully populated patient profile dictionary with realistic details, ensuring coherence and consistency.

    Function Overview
    -----------------
    - Loads the provided `variables` as a dictionary representing patient data.
    - Defines a structured profile template with various categories such as General Information, Medical History, Family History, and Lifestyle.
    - Uses `hydrate_patient_profile` to populate fields with initial values.
    - Identifies missing or placeholder values that require completion.
    - Generates a structured prompt for an LLM to fill in missing details while maintaining logical consistency.
    - Ensures anglicization of names and birthplaces if necessary, adhering to strict formatting requirements.
    - Invokes an LLM (`model1.invoke(prompt)`) to generate missing information.
    - Updates the patient dictionary using `hydrate_dict()` with the newly generated details.
    - Returns the completed patient dictionary.
    '''
    
    # Load variables from a Python file (assuming it's structured as a dictionary)
    patient_variables = variables
    
    # Patient profile template (corresponding to your structured format)
    profile_template = """
    ### General Information
    - Full name: {full_name}
    - Preferred name: {preferred_name}
    - Date of birth: {date_of_birth}
    - Age: {age}
    - Sex: {sex}
    - Handedness: {handedness}
    
    ### Residence and Marital Status
    - Current city/town of residence: {residence}
    - Relationship status: {relationship_status}
    - Children or dependents: {children_status}
      {children_details}
    
    ### Employment and Financial Status
    - Current employment status: {employment_status}
      {employment_details}
    - Disabled: {disabled}
    {disability_details}
    - Disability assistance: {disability_status}
      {disability_assistance_details}
    
    ### Medical Care Information
    - List of current doctors: {doctors}
    - Allergies: {allergies}
    - Current medications and dosages: {medications}
    - Health supplements: {supplements}
    
    ### Substance Use
    - Nicotine, marijuana, alcohol use frequency: {substance_use}
    
    ### Medical History
    - Reason for Appointment: {dr_reason}
    - Health conditions/diagnoses: {health_conditions}
    - Previous hospitalizations or surgeries: {hospitalizations}
    - Head injuries or concussions: {head_injuries}
    - History of seizures: {seizures}
    - Rehab or substance counseling: {rehab_history}
    
    ### Family History
    - Psychiatric conditions in family: {family_psychiatric}
    - Neurological or genetic conditions in family: {family_neurological}
    - Siblings: {siblings}
    
    ### Personal History
    - Birthplace: {birthplace}
    - Canadian citizen: {citizenship_status}
    
    ### Education
    - Difficulties in elementary school: {elementary_difficulties}
    - High school performance: {high_school_performance}
    - Further education after high school: {further_education}
    
    ### Employment and Relationships
    - Previous work history: {previous_work}
    - Previous marriages/long-term relationships: {previous_relationships}
    
    ### Hobbies and Lifestyle
    - Hobbies: {hobbies}
    - How to relax on a stressful day: {relaxation_methods}
    
    ### Personality
    - Conversational tone: {conversational_tone}
    - Typing style: {typing_style}
    - Personality traits: {personality_traits}
    """
    
    # Hydrate randomly generated information
    patient_dict = hydrate_patient_profile(variables, {})
    patient_dict
    
    # fields_to_fill = {k: v for k, v in patient_dict.items() if v == ""}
    
    # Extract fields with placeholders
    fields_to_fill = {k: v for k, v in patient_dict.items() if isinstance(v, str) and v.startswith('{') and v.endswith('}')}

    # Convert to a more readable format for the LLM
    fields_to_fill_str = ', '.join([f"{key}: {value}" for key, value in fields_to_fill.items()])
    print(fields_to_fill_str)

    prompt = f"""
    I have a fictional patient profile, and I need to fill in some sections with realistic-sounding but completely made-up details. 
    Here are the sections with placeholders that need to be completed:

    {fields_to_fill_str}
    

    Please generate brief, suitable details for each section that make sense in the context of a fictional character and their background. The information should be coherent with the rest of the profile provided below,
    and there must NOT be contradicting information. For example, if the patient has no siblings, then there should not be siblings mentioned in the Family History.
    
    
    Here is the partially filled out dict:

    {patient_dict}
    
     If the full name and city of their birthplace are not in the english alphabet, please anglicize them and output it inline with the other fields, as such:
    "Anglicized Full name: value", "Anglicized Clinician Name: value", "Anglicized Birthplace: value", and "Anglicized Doctor Name: value". You MUST adhere to this format, and do not include any more text on the same line as these outputs.
    For the Birthplace, you MUST include the ENTIRETY of the Birthplace name in the value, so please include both the city name AND either the country name or the country code.
    
    YOU MUST return the output in the following format for each section:
    "Field Name: value"
    """


    # Call the LLM
    try:
        response = model1.invoke(prompt)
        
        # Get the generated profile
        print("Response: ", response)
    
    except Exception as e:
        print(f"Error generating patient profile: {e}")
        
    updated_patient_dict = hydrate_dict(patient_dict, response)
    
    return updated_patient_dict

In [4]:
def generate_health_conditions(n):
    '''
    Generates a list of n random health conditions along with associated medications and dosages.

    Parameters
    ----------
    n: int
        The number of random health conditions to select.

    Returns
    -------
    tuple (list, list)
        - A list of n randomly chosen health conditions.
        - A corresponding list of tuples, where each tuple contains:
          - A randomly selected medication for the condition (if applicable).
          - A randomly selected dosage (starting or therapeutic dose).
          - If no medication is available for a condition, `None` is added to the list.

    Function Overview
    -----------------
    - Randomly selects `n` health conditions from the `HEALTH_CONDITIONS` dictionary.
    - For each condition:
      - If medications exist, there is an 80% chance that a medication is assigned.
      - A random medication is chosen from the available options.
      - A corresponding dosage is selected from the available dosage list.
    - Returns two lists: one with selected health conditions and another with medications and dosages (or `None` if no medication is assigned).
    '''
    
    health_conditions = random.sample(list(HEALTH_CONDITIONS.keys()), n)
    meds_with_dosages = []
    
    for condition in health_conditions:
        meds = list(HEALTH_CONDITIONS[condition].keys())
        
        # Check if there are any medications for the condition
        if len(meds) > 0:
            # 80% chance that meds may be taken for that health condition
            if random.random() <= 0.8:
                med_name = random.choice(meds)
                # Select a random dosage (starting or therapeutic dose)
                dosage = random.choice(list(HEALTH_CONDITIONS[condition][med_name].values()))
                meds_with_dosages.append((med_name, dosage))
        else:
            meds_with_dosages.append(None)
    
    return health_conditions, meds_with_dosages

def create_clinician_name()->str:
    '''Generates a random clinician's name.

    Returns
    -------
    str
        A randomly generated full name for a clinician.

    Function Overview
    -----------------
    - Uses the `Faker` library with an ethnicity-specific provider (`ethnicity_picker()`).
    - Generates a random name.
    - Returns the generated name.'''
    ethnicity, checker = ethnicity_picker()
    
    fake = Faker(ethnicity)
    dr = fake.name()
    return dr

def create_appt_date()->str:
    '''Generates a random appointment date within the next five years.

    Returns
    -------
    str
        A string representing a randomly chosen month and day (e.g., "February 14").

    Function Overview
    -----------------
    - Uses the `Faker` library to generate a random date between today and five years in the future.
    - Formats the date as "<Month> <Day>" (e.g., "March 5").
    - Returns the formatted date.'''
    
    fake = Faker()
    appt_date = fake.date_between("today", "+5y").strftime("%B %#d")
    return appt_date

In [5]:
# Function to pick a full demographic profile
def pick_demographics(categories, weights):
    '''
    Randomly selects a demographic category based on weighted probabilities.

    Parameters
    ----------
    categories: list
        A list of possible demographic categories to choose from.
    weights: list
        A list of corresponding weights that determine the likelihood of selecting each category.

    Returns
    -------
    any
        A randomly chosen category from `categories`, selected based on the given `weights`.

    Function Overview
    -----------------
    - Uses `random.choices()` to select one category from `categories`, considering the specified `weights`.
    - The `weights` list determines the probability of each category being chosen.
    - Calls `random.seed()` to ensure randomness is not influenced by previous runs.
    - Returns the selected category.
    '''
    
    random.seed()
    choice = random.choices(categories, weights)[0]
    return choice

In [6]:
def get_dob(age_range):
    """
    Generates a random date of birth (DOB) based on an age range category.

    Parameters
    ----------
    age_range: int
        An integer representing the age range category:
        - 0: 18-25 years old
        - 1: 26-35 years old
        - 2: 36-45 years old
        - 3: 46-60 years old
        - 4: 61+ years old

    Returns
    -------
    datetime.date
        A randomly generated date of birth within the specified age range.

    Raises
    ------
    ValueError
        If an invalid age range is provided.

    Function Overview
    -----------------
    - Uses the `Faker` library to generate a random birthdate within the given age range.
    - Supports five predefined age categories.
    - If an invalid `age_range` is given, raises a `ValueError`.
    """
    
    fake = Faker()
    match age_range:
        case 0:  # 18-25
            dob = fake.date_of_birth(minimum_age = 18, maximum_age=25)
            return dob
        case 1:  # 26-35
            dob = fake.date_of_birth(minimum_age = 26, maximum_age=35)
            return dob
        case 2:  # 36-45
            dob = fake.date_of_birth(minimum_age = 36, maximum_age=45)
            return dob
        case 3:  # 46-60
            dob = fake.date_of_birth(minimum_age = 46, maximum_age=60)
            return dob
        case 4:  # 60+
            dob = fake.date_of_birth(minimum_age = 61, maximum_age=112)
            return dob  # 112 is the age of the current oldest living person in Vancouver
        case _:  # Default case
            raise ValueError("Invalid age range")

In [7]:
def ethnicity_picker():
    """
    Randomly selects an ethnicity category based on predefined regional groups and weighted probabilities.

    Returns
    -------
    str
        A randomly chosen ethnicity from one of the predefined regional groups.

    Function Overview
    -----------------
    - Defines a list of ethnicity region categories, each represented by a predefined constant.
    - Assigns weighted probabilities to each category based on estimated distributions.
    - Uses `pick_demographics()` to randomly select an ethnicity group according to the weights.
    - Chooses a random ethnicity from the selected regional group.
    - Returns the randomly chosen ethnicity.
    """
    
    random.seed()
    indigenous = 0
    ethnicity_categories = [INDIGENOUS, MIDDLE_EASTERN_REGIONS, EUROPEAN_REGIONS, EAST_ASIAN_REGIONS, SOUTH_ASIAN_REGIONS, SOUTHEAST_ASIAN_REGIONS, LATIN_AMERICAN_REGIONS, AFRICAN_REGIONS, OTHER_REGIONS]
    ethnicity_weights = [0.0243, 0.0334, 0.4313, 0.2328, 0.1417, 0.0763, 0.0198, 0.0158, 0.0251]

    ethnicity_group = pick_demographics(ethnicity_categories, ethnicity_weights)
    print(ethnicity_group)
    
    if ethnicity_group == INDIGENOUS:
        indigenous = 1
    ethnicity = random.choice(ethnicity_group)
    return ethnicity, indigenous

In [8]:
def hydrate_patient_profile(variables, preset_attributes = {}) -> dict:
    """
    Generates a fully populated patient profile with demographic, medical, and personal details.

    Parameters
    ----------
    variables: dict
        A dictionary containing predefined patient variables.
    preset_attributes: dict, optional
        A dictionary of attributes that should be pre-set rather than generated. Defaults to an empty dictionary.

    Returns
    -------
    dict
        A patient profile dictionary containing structured demographic, medical, and personal history details.

    Function Overview
    -----------------
    - Randomly assigns ethnicity, sex, and name using `ethnicity_picker()` and `Faker()`.
    - Selects an age category and determines the corresponding date of birth (`get_dob()`).
    - Generates employment status, relationship status, and whether the patient has children based on age and marital status.
    - Determines handedness, address, and occupation.
    - Assigns a general practitioner or states "None" if no doctor is assigned.
    - Randomly selects allergies, medical conditions, and medications (`generate_health_conditions()`).
    - Determines substance use, past hospitalizations, head injuries, seizures, and family medical history.
    - Generates educational background and employment history based on age and disability status.
    - Assigns likelihood of past marriages/long-term relationships.
    - Adds hobbies, relaxation methods, and personality-related details (typing style, conversational tone).
    - Introduces optional "Do Not Disclose" scenarios for sensitive topics.
    - Adds potential edge case scenarios to test LLM robustness.
    - Returns the fully populated patient dictionary.

    Notes
    -----
    - Uses probability distributions and weights to ensure realism in assigned characteristics.
    - Ensures generated attributes are internally consistent (e.g., age aligns with education/work history).
    - Supports preset values in `preset_attributes` to maintain control over specific fields.
    """

    # Define multiple demographic categories
    gender_categories = [0, 1] # Male, Female
    gender_weights = [0.49, 0.51]

    age_categories = [0, 1, 2, 3, 4] # 18-25, 26-35, 36-45, 46-60, 60+
    age_weights_men = [0.2979, 0.1916, 0.1557, 0.2266, 0.1282]
    age_weights_women = [0.2861, 0.1683, 0.1483, 0.2343, 0.1630]
    
    random.seed()
    
    ethnicity, checker = ethnicity_picker()

    indigenous = checker
    
    fake_eth = Faker(ethnicity)
    patient = dict()
    print("Checker: ", indigenous)
    if indigenous == 1:
        print("indy")
        patient['Ethnicity Code'] = "en_Ind"
        patient["Ethnicity"] = "Indigenous"
    else: 
        print("not indy")
        patient['Ethnicity Code'] = ethnicity
        patient["Ethnicity"] = ETHNICITY_CODES.get(patient["Ethnicity Code"], "Unknown")
    
    #doctor data
    patient["Clinician Name"] = create_clinician_name()
    patient["Appointment Date"] = create_appt_date()
    
    #basic information
    sex = pick_demographics(gender_categories,gender_weights)
    middle_name = random.randint(0,1)
    
    if sex == 1:
        patient['Sex'] = "Female"
        try:
            patient['Full Name'] = fake_eth.name_female() 
        except Exception as g:
            print(f"Error generating female patient name: {g}")
            patient['Full Name'] = fake_eth.name_nonbinary()
            
        age_range = pick_demographics(age_categories, age_weights_women)
    else:
        try:
            patient['Sex'] = "Male"
            patient['Full Name'] = fake_eth.name_male()
        except Exception as g:
            print(f"Error generating male patient name: {g}")
            patient['Full Name'] = fake_eth.name_nonbinary()
            
        age_range = pick_demographics(age_categories, age_weights_men)
        
    fake = Faker()

    if "Age" not in patient.keys():
        dob = get_dob(age_range)
        patient['Age'] = relativedelta(datetime.now(), dob).years

        if random.random() <= 0.618: # employment rate
            job = fake.job()
        else:
            job = "unemployed"
        if patient['Age'] <= 22:
            job = "student " + job
        if patient['Age'] >= 75:
            job = "retired " + job

    if "Date of Birth" not in patient.keys():
        patient['Date of Birth'] =dob.strftime("%B %#d %Y") #windows: replace %-d with %#d

    # see document for calculation justifications, and sheets for performing calculations (?)
    relationship_status = [0,1,2,3,4] # single, LAT, married/common-law, divorced/separated, widowded
    relationship_weights = [
        [69.26, 23.09, 7.4, 0.25, 0.0],  # 18-25
        [31.73, 10.58, 55.35, 2.15, 0.2],  # 26-35
        [10.88, 3.63, 76.0, 8.15, 1.35],    # 36-45
        [9, 3, 68.33, 15.33, 8.17],   # 46-60
        [11.25, 3.75, 54.17, 8.83, 33.5]     # 60+
]
            
    if 'Relationship Status' not in patient.keys():
        if patient['Age'] <= 25:
            # print("PATIENT LESS THAN 25")
            relationship_int = pick_demographics(relationship_status, relationship_weights[0])
        elif patient['Age'] <= 35:
            # print("PATIENT LESS THAN 35")
            relationship_int = pick_demographics(relationship_status, relationship_weights[1])
        elif patient['Age'] <= 45:
            # print("PATIENT LESS THAN 45")
            relationship_int = pick_demographics(relationship_status, relationship_weights[2])
        elif patient['Age'] <= 60:
            # print("PATIENT LESS THAN 60")
            relationship_int = pick_demographics(relationship_status, relationship_weights[3])
        else:
            # print("PATIENT IS OLD")
            relationship_int = pick_demographics(relationship_status, relationship_weights[4])
            
        if relationship_int == 0:
            # print("SINGLE")
            patient['Relationship Status'] = "Single"
        elif relationship_int == 1:
            # print("LAT")
            patient['Relationship Status'] = "Long-term relationship"
        elif relationship_int == 2:
            # print("MARRIED")
            patient['Relationship Status'] = random.choice(["Married", "Common-Law"])
        elif relationship_int == 3:
            # print("DIV")
            patient['Relationship Status'] = random.choice(["Divorced", "Separated"])
        else:
            # print("WD")
            patient['Relationship Status'] = "Widowded"

    #https://open.canada.ca/data/en/dataset/5ae719d0-f4c7-4059-8c04-16886f9a5bbd families by age of eldest partner and # of children
    #https://open.canada.ca/data/en/dataset/ecdee020-5919-4996-8d3d-c3df75f50ca0 population estimate by age and gender

    in_family = [ # women, men
        [0.0756, 0.0701],    # 0-24
        [0.5102, 0.4765], # 25-34
        [0.8340, 0.8178], # 35-44
        [0.8426, 08557], # 45-54
        # [0.7444, 0.7622], # 55-64
        # [0.6873, 0.7995] # 65+
]
    w_children = [ # yes, no
        [0.6997, 0.3003],    # 0-24
        [0.5193, 0.4807], # 25-34
        [0.2056, 0.7944], # 35-44
        [0.1771, 0.8229], # 45-54
        # [0.4516, 0.5484], # 55-64
        # [0.7738, 0.2262] # 65+
]
    num_children = [ # 1, 2, 3
        [0.7440, 0.2016, 0.0546],    # 0-24
        [0.5057, 0.3338, 0.1605], # 25-34
        [0.3148, 0.4478, 0.2374], # 35-44
        [0.3592, 0.4346, 0.2062], # 45-54
        # [0.5692, 0.3216, 0.1092], # 55-64
        # [0.8212, 0.1485, 0.0303] # 65+
]
    if patient["Age"] < 25:
        age_idx = 0
    elif patient["Age"] < 35:
        age_idx = 1
    elif patient["Age"] < 45:
        age_idx = 2
    else:
        age_idx = 3 

    if sex == 0: # male
        in_fam = pick_demographics([0,1] [in_family[age_idx][0], 1-in_family[age_idx][0]])
    else: # female
        in_fam = pick_demographics([0,1] [in_family[age_idx][1], 1-in_family[age_idx][1]])

    if in_fam == 1:
        if 'Children' not in patient.keys() and 'Children Details' not in patient.keys():
            children = pick_demographics([0,1], child_weights)
        
            if children == 1:
                patient['Children'] = pick_demographics([1,2,3], num_children[age_idx])
                patient['Children Details'] = "{Children Details}"
            else:
                patient['Children'] = 0
    else:
        patient['Children'] = 0

    handedness_weights = [0.89, 0.1, 0.01]
    handedness_choices = ["right", "left", "ambidextrous"]
    if "Handedness" not in patient.keys():
        patient['Handedness'] = pick_demographics(handedness_choices, handedness_weights)
    
    # residence
    if "Address" not in patient.keys():
        patient['Address'] = f"{fake.street_address()}, {fake.city()}, British Columbia, Canada"
        
    # employment status
    if "Occupation" not in patient.keys():
        patient['Occupation'] = job
    
    # medical care information
    current_doctor_weights = [80, 20]
    current_doctor_options = ["Yes", "No"]
    if 'Current Doctors' not in patient.keys():
        if pick_demographics(current_doctor_options, current_doctor_weights) == "Yes":
            patient['Current Doctor(s)'] = f"Dr. {create_clinician_name()}, General Practioner"
        else:
            patient['Current Doctor(s)'] = "None"

    allergy_weights_f = [28.9, 100-28.9]
    allergy_weights_m = [25.6, 100-25.6]
    if "Allergies" not in patient.keys():
        if sex == 0: #male
            allergy = pick_demographics(["Yes", "No"], allergy_weights_m)
        else: #female
            allergy = pick_demographics(["Yes", "No"], allergy_weights_f)

        if allergy == "Yes":
            patient['Allergies'] = f"{', '.join(random.sample(COMMON_ALLERGIES, random.randint(1,3)))}"

    if "Medical Conditions" not in patient.keys() and "Medications" not in patient.keys():
        conditions, meds = generate_health_conditions(random.randint(1,3))
        patient['Medical Conditions'] = ", ".join(conditions)
        patient['Medications'] = ", ".join([f"{med_name} ({dosage})" for med_name, dosage in meds])
        
    if "Health Supplements" not in patient.keys():
        patient['Health Supplements'] = random.sample(COMMON_HEALTH_SUPPLEMENTS, random.randint(0,3))
        
    # substance use
    if 'Recreational Drug Usage' not in patient.keys():
        if random.random() <= 0.3:
            patient['Recreational Drug Usage'] = f'{", ".join(random.sample(["beer", "wine", "cannabis", "cigarettes", "shrooms", "psychedelics"], random.randint(1,4)))}'
            if "Rehab or Substance Counselling" not in patient.keys():
                patient["Rehab or Substance Counselling"] = "{Rehab or Substance Counselling}"
                
    # medical history
    if "Reason for Appointment" not in patient.keys():
        reasons = random.sample(DR_APPOINTMENT_REASONS, random.randint(1,3))
        patient["Reason for Appointment"] = ", ".join(reasons)

    if "Previous Hospitalizations or Surgeries" not in patient.keys():
        patient['Previous Hospitalizations or Surgeries'] = random.choice(["Yes", "No"])
        
        if patient['Previous Hospitalizations or Surgeries'] == "Yes":
            patient['Previous Hospitalizations or Surgeries Details'] = "{Previous Hospitalizations or Surgeries Details}"

    concussion_weights = [0.3, 0.7]
    if "Head Injuries or Concussions" not in patient.keys():
        patient['Head Injuries or Concussions'] = pick_demographics(["Yes", "No"], concussion_weights)
        
        if patient['Head Injuries or Concussions'] == "Yes":
            patient["Head Injuries or Concussions Details"] = "{Head Injuries or Concussions Details}"

    seizure_weights = [0.1, 0.9]
    if "History of Seizures" not in patient.keys():
        patient['History of Seizures'] = pick_demographics(["Yes", "No"], seizure_weights)
        
    
    # family history
    if "Family History of Health Conditions" not in patient.keys():
        patient['Family History of Health Conditions'] = random.choice(["Yes", "No"])
        
        if patient['Family History of Health Conditions'] == "Yes":
            patient['Family History of Health Conditions Details'] = "{Family History of Health Conditions Details}"

    siblings_weights = [0.40, 0.40, 0.15, 0.05]
    if 'Siblings' not in patient.keys() and 'Sibling Details' not in patient.keys():
        patient['Siblings'] = pick_demographics(["0", "1", "2", "3+"], siblings_weights)

        if patient['Siblings'] == "3+":
            patient['Siblings'] = random.choice(["3", "4", "5"])
        
        if not patient['Siblings'] == "0":
            patient['Sibling Details'] = "{Sibling Details}"
        
    # personal history
    canadian_weights = [0.912, 0.088] 
    if "Canadian Citizenship" not in patient.keys():
        patient['Canadian Citizenship'] = pick_demographics(["Yes", "No"], canadian_weights)

    if ethnicity == "fr_CA" or ethnicity == "en_CA":
        patient['Canadian Citizenship'] = "Yes"
    born_in_canada = ""
    if patient['Canadian Citizenship'] == "Yes":
        born_in_canada_weights = [53.1, 46.9]
        born_in_canada = pick_demographics(["Yes", "No"], born_in_canada_weights)
        
    if "Birthplace" not in patient.keys():
        if born_in_canada == "Yes":
            # print("CANADIAN")
            patient['Birthplace'] = f"{fake.city()}, Canada"
        else: 
            try:
                country = fake_eth.current_country()
                patient['Birthplace'] = f"{fake_eth.city()}, {country}"
            except Exception as c:
                print(f"Error generating current country for profile: {c}")
                country_code = fake_eth.current_country_code()
                patient['Birthplace'] = f"{fake_eth.city()}, {country_code}"


    disability_probability_total_w = [ # yes, no
        [0.247, 0.753], #18-24
        [0.269, 0.731], #25-64
        [0.418, 0.582]  # 65+
    ] 
    disability_probability_total_m = [ # yes, no
        [0.158, 0.842],
        [0.211, 0.789],
        [0.389, 0.611]
    ]

    # Determine age category index
    if patient['Age'] < 25:
        age_idx = 0
    elif patient['Age'] < 65:
        age_idx = 1
    else:
        age_idx = 2

    # Assigning probabilities based on sex
    sex_idx = 0 if patient['Sex'] == "Male" else 1  # Male = 0, Female = 1

    if 'Disabled' not in patient.keys():
        if age_idx == 0: # <25
            if sex_idx == 0: 
                patient['Disabled'] = pick_demographics(["Yes", "No"], disability_probability_total_m[0])
            else:
                patient['Disabled'] = pick_demographics(["Yes", "No"], disability_probability_total_w[0])
        if age_idx == 1 or age_idx == 2: # 25-64
            if sex_idx == 0:
                patient['Disabled'] = pick_demographics(["Yes", "No"], disability_probability_total_m[1])
            else:
                patient['Disabled'] = pick_demographics(["Yes", "No"], disability_probability_total_w[1])
        if age_idx == 3:
            if sex_idx == 0: # 65+
                patient['Disabled'] = pick_demographics(["Yes", "No"], disability_probability_total_m[2])
            else:
                patient['Disabled'] = pick_demographics(["Yes", "No"], disability_probability_total_w[2])

    # https://www150.statcan.gc.ca/n1/pub/11-627-m/11-627-m2023063-eng.htm -- this is for all canadians, not just disabled ones
    # https://www150.statcan.gc.ca/n1/daily-quotidien/231201/dq231201b-eng.htm
    
    comorbidity_labels = [0, 1, 2] # only one, two or three, four (or more)
    
    comorbidity_num = [0.29, # only one
                         0.37, # two or three
                         0.34] # four (or more)
    comorbid_val = pick_demographics(comorbidity_labels, comorbidity_num)

    # TODO -- normalize to remove "unknown" category
    disability_list = ["Mental-health related", 
                       "Pain-related", "Seeing", 
                       "Learning", "Memory", 
                       "Mobility", "Flexibility", 
                       "Hearing", "Dexterity", 
                       "Developmental"]
    # Normalize the Dist (?)
    # https://www150.statcan.gc.ca/n1/daily-quotidien/231201/dq231201b-eng.htm

    # s = 38.6 + 61.8 + 27.4 + 20.7 + 18.2 + 39.2 + 40.3 + 20.7 + 18.4 + 5.7
    
    disability_probability_types = [38.6, 61.8,
                                   27.4, 20.7,
                                   18.2, 39.2,
                                   40.3, 20.7,
                                   18.4, 5.7]

    # Only assign mental disabilities **if the patient is already disabled**
    if patient['Disabled'] == "Yes":
        if comorbid_val == 0: #only one
            patient['Disability'] = pick_demographics(disability_list, disability_probability_types)

        else:
            patient['Disability'] = set()  # Using a set to ensure uniqueness

            if comorbid_val == 1:
                n = random.randint(2, 3)  # Randomly pick 2 or 3 disabilities
            else:
                n = 4  # Always pick 3 disabilities for comorbid_val > 1

            while len(patient['Disability']) < n:
                patient['Disability'].add(pick_demographics(disability_list, disability_probability_types))

            patient['Disability'] = list(patient['Disability'])  # Convert back to a list

        patient['Disability Details'] = "{Disability Details}"

    # https://inclusioncanada.ca/wp-content/uploads/2024/06/CDB-Vision-and-Design.pdf
    
    # Assign Disability Assistance (Only for disabled patients)
    if patient['Disabled'] == "Yes":
        disability_assistance_weights = [0.25, 0.75]  # 24.8%-28.4% of disabled people receive assistance
        if 'Disability Assistance' not in patient.keys():
            patient['Disability Assistance'] = pick_demographics(["Yes", "No"], disability_assistance_weights)

            if patient['Disability Assistance'] == "Yes":
                patient['Disability Assistance Details'] = "{Disability Assistance Details}"
    
    # education --
    # https://studentsuccess.gov.bc.ca/school-district/099/report/fsa
    # https://studentsuccess.gov.bc.ca/school-district/099/report/grad-assess

    # Adjusted probability distributions for elementary school performance
    elementary_school_weights = {
        "All Other": [0.266, 0.620, 0.114],  # Emerging, On Track, Extending
        "Indigenous": [0.508, 0.463, 0.029],
        "Disabilities or Diverse Abilities": [0.464, 0.444, 0.092]
    }

    # Probability distributions for high school performance
    # Dropout stats:
    # https://catalogue.data.gov.bc.ca/dataset/1c6256d0-c120-4de1-817b-fb291732f8a4/resource/2640bc34-d559-499d-b8a8-5dd0a0cbd823/download/completion_rate_residents_only_1999-2000_to_2023-2024.csv
    
    dropout_stats = [0.011, 0.244, 0.213] #All other, Indigenous, Disabilities or Diverse Abilities
    high_school_weights = {
        "All Other": [0.048, 0.232, 0.563, 0.157],  # Emerging, Developing, Proficient, Extending
        "Indigenous": [0.163, 0.389, 0.407, 0.041],
        "Disabilities or Diverse Abilities": [0.165, 0.360, 0.390, 0.085]
    }

    

    # Assign Categories
    if patient.get('Ethnicity') == "Indigenous":
        category = "Indigenous"
        dropout = pick_demographics(["Yes", "No"], [dropout_stats[1], 1-dropout_stats[1]])
    elif patient['Disabled'] == "Yes":
        category = "Disabilities or Diverse Abilities"
        dropout = pick_demographics(["Yes", "No"], [dropout_stats[2], 1-dropout_stats[2]])
    else:
        category = "All Other"
        dropout = pick_demographics(["Yes", "No"], [dropout_stats[0], 1-dropout_stats[0]])

    # Assign elementary school performance
    if "Elementary School Performance" not in patient.keys():
        patient["Elementary School Performance"] = pick_demographics(
            ["Emerging", "On Track", "Extending"], elementary_school_weights[category]
        )

    # Assign high school performance
    if "High School Performance" not in patient.keys():
        if dropout == "Yes":
            patient["High School Performance"] = "Drop out"
        else:
            patient["High School Performance"] = pick_demographics(["Emerging", "Developing", "Proficient", "Extending"], high_school_weights[category])

    # https://www12.statcan.gc.ca/census-recensement/2021/as-sa/fogs-spg/alternative.cfm?topic=11&lang=E&dguid=2021A00035915&objectId=4_2
    # Define education probabilities for 25-64 age group in Vancouver
    education_weights = [0.432, 0.273, 0.294]  # [Pursuing] University, [Pursuing] Post-Secondary (No Degree), No Further Education

    # only have stats for 25-64, applying to rest of ages due to lack of information
    
    if 25 <= patient['Age'] <= 64:
        patient['Further Education After Highschool'] = pick_demographics(
            ["University Degree", "Post-Secondary Certificate or Diploma", "No Further Education"],
            education_weights
        )
    elif 18 <= patient['Age'] < 25:
        patient['Further Education After Highschool'] = pick_demographics(
            ["Pursuing University Degree", "Pursuing Post-Secondary Certificate or Diploma", "No Further Education"],
        education_weights
        )
    else:
        patient['Further Education After Highschool'] = "Unknown"  # In case of error

        
    # previous employment and relationships
    # https://www12.statcan.gc.ca/census-recensement/2021/as-sa/fogs-spg/page.cfm?topic=12&lang=E&dguid=2021S0503933

    # Define probability of having previous work history by age
    work_history_weights = {
        "18-24": [0.479, 1-0.479],  
        "25-64": [0.756, 1-0.756],  
        "65+": [0.756, 1-0.756]  # same as 25-64 due to lack of statistics
    }

    # Determine correct probability
    if patient['Age'] < 25:
        work_weights = work_history_weights["18-24"]
    elif patient['Age'] < 65:
        work_weights = work_history_weights["25-64"]
    else:
        work_weights = work_history_weights["65+"]

    # Assign work history based on probabilities
    if "Previous Work History" not in patient.keys():
        patient["Previous Work History"] = pick_demographics(
            ["Has Work History", "No Work History"], work_weights
        )

        if patient["Previous Work History"] == "Has Work History":
            patient["Previous Work History"] = "{Previous Work History}"

    # Define probability of having previous marriages/relationships by age
    # https://www12.statcan.gc.ca/census-recensement/2021/as-sa/fogs-spg/alternative.cfm?topic=4&lang=E&dguid=2021S0503933&objectId=6
    # relationship_history_weights = {
    #     "18-24": [0.15, 0.85],  # Low likelihood of past relationships
    #     "25-34": [0.35, 0.65],  # Increasing likelihood
    #     "35-64": [0.60, 0.40],  # Most likely have had a prior relationship
    #     "65+": [0.80, 0.20]  # Majority have had at least one past relationship
    # }

    # lack of avaliable data
    relationship_history_weights = [1-0.306, 0.306]

    relationship_weights = relationship_history_weights
    # Determine correct probability
    # if patient['Age'] < 25:
    #     relationship_weights = relationship_history_weights["18-24"]
    # elif patient['Age'] < 35:
    #     relationship_weights = relationship_history_weights["25-34"]
    # elif patient['Age'] < 65:
    #     relationship_weights = relationship_history_weights["35-64"]
    # else:
    #     relationship_weights = relationship_history_weights["65+"]
    
    # Assign relationship history based on probabilities
    if "Previous Marriages/Long-Term Relationships" not in patient.keys():
        patient["Previous Marriages/Long-Term Relationships"] = pick_demographics(
            ["Has Previous Relationships/Marriages", "No Previous Relationships"], 
            relationship_weights
        )
        if patient["Previous Marriages/Long-Term Relationships"] == "Has Previous Relationships/Marriages":
            patient["Previous Marriages/Long-Term Relationships"] = "{Previous Marriages/Long-Term Relationships}"


    # hobbies and lifestyle
    if "Hobbies" not in patient.keys():
        patient["Hobbies"] = "{Hobbies}"
    if "Relaxation Methods" not in patient.keys():
        patient["Relaxation Methods"] = "{Relaxation Methods}"

    # attributes with possible do not disclose options
    dnd_attributes = [
        "Past Trauma", 
        "Substance Abuse", 
    ]

    for attribute in dnd_attributes:
        if attribute not in patient.keys():
            cond = random.random() #returns [0,1)
            if cond <= 0.5: patient[attribute] = "yes"
            else: patient[attribute] = "None"
    
    # create variation in conversations and tones
    if "Typing Style" not in patient.keys():
        patient["Typing Style"] = random.choice(TYPING_STYLES)
    if "Conversational Tone" not in patient.keys():
        patient["Conversational Tone"] = random.choice(CONVERSATIONAL_TONE)
    if "Personality Traits" not in patient.keys():
        patient["Personality Traits"] = random.sample(PERSONALITY_TRAITS, random.randint(1,4))
    if "Edge Case Scenario" not in patient.keys():
        cond = random.random() #returns [0,1)
        if cond <= 0.5: patient["Edge Case Scenario"] = random.choice(EDGE_CASE_SCENARIOS)
        else: patient[attribute] = "N/A"
        
    print("Patient Dict - BEFORE LLM", patient)
    return patient

In [9]:
def clean_stars_from_dict(patient_dict):
    '''
    Cleans leading '** ' and surrounding spaces from string values in a patient dictionary.

    Parameters
    ----------
    patient_dict: dict
        A dictionary containing patient information.

    Returns
    -------
    dict
        The cleaned dictionary with unnecessary formatting removed from string values.

    Function Overview
    -----------------
    - Iterates over the dictionary and checks if each value is a string.
    - Removes any leading '** ' and surrounding spaces from string values.
    - Returns the updated dictionary.
    '''
    for key, value in patient_dict.items():
        if isinstance(value, str):  # Only clean string values
            patient_dict[key] = value.lstrip('** ').strip()  # Remove leading '** ' and any surrounding spaces
    return patient_dict

# Function to update the dictionary with LLM output using flexible pattern matching
def hydrate_dict(patient_dict, llm_output):
    '''
    Updates a patient dictionary with details extracted from LLM-generated output using pattern matching.

    Parameters
    ----------
    patient_dict: dict
        A dictionary containing patient information, potentially with missing or placeholder values.
    llm_output: str
        The text output generated by the LLM, containing new details to populate missing fields.

    Returns
    -------
    dict
        The updated patient dictionary with extracted values inserted into their respective fields.

    Function Overview
    -----------------
    - Defines regex patterns to extract key details (e.g., Anglicized names, medical history, family details) from `llm_output`.
    - Iterates through each pattern, searching for matches in `llm_output`.
    - Updates `patient_dict` with extracted values if a match is found.
    - Calls `clean_stars_from_dict()` to remove formatting artifacts from string values.
    - Returns the updated dictionary with hydrated fields.
    '''
    
    # Define patterns to capture various fields in the LLM output
    patterns = {
        'Full Name': r"Anglicized Full Name[:\s]*([^\n]+)",
        'Birthplace': r"Anglicized Birthplace[:\s]*([^\n]+)",
        'Clinician Name': r"Anglicized Clinician Name[:\s]*([^\n]+)",
        'Current Doctor(s)': r"Anglicized Doctor Name[:\s]*([^\n]+)",
        'Children Details' : r"Children Details[:\s]*([^\n]+)",
        'Disability Assistance Details': r"Disability Assistance Details[:\s]*([^\n]+)",
        'Disability Details': r"Disability Details[:\s]*([^\n]+)",
        'Previous Hospitalizations or Surgeries Details' : r"Previous Hospitalizations or Surgeries Details[:\s]*([^\n]+)",
        'Head Injuries or Concussions Details' : r"Head Injuries or Concussions Details[:\s]*([^\n]+)",
        'Rehab or Substance Counselling': r"Rehab or Substance Counselling[:\s]*([^\n]+)",
        'Family History of Health Conditions Details': r"Family History of Health Conditions Details[:\s]*([^\n]+)",
        'Sibling Details': r"Sibling Details[:\s]*([^\n]+)",
        'Previous Work History': r"Previous Work History[:\s]*([^\n]+)",
        'Previous Marriages/Long-Term Relationships': r"Previous Marriages/Long-Term Relationships[:\s]*([^\n]+)",
        'Hobbies': r"Hobbies[:\s]*([^\n]+)",
        'Relaxation Methods': r"Relaxation Methods[:\s]*([^\n]+)"
    }

    # Loop through each pattern and search for matches in the LLM output
    for key, pattern in patterns.items():
        match = re.search(pattern, llm_output, re.IGNORECASE)
        if match:
            patient_dict[key] = match.group(1).strip()  # Update dict with captured value
    
    clean_stars_from_dict(patient_dict)
    return patient_dict



In [10]:
def patient_to_csv(patient:dict, filepath:str="./llm_patients.csv"):
    '''
    Saves or appends a patient dictionary to a CSV file.

    Parameters
    ----------
    patient: dict 
        A dictionary containing patient details.
    filepath: str, optional
        The filepath for the CSV file where patient data will be stored. Defaults to "./llm_patients.csv".

    Function Overview
    -----------------
    - Attempts to read an existing CSV file at `filepath` using pandas (`pd.read_csv()`).
    - If the file exists:
      - Converts the existing data into a dictionary (`current_dict`).
      - Ensures all columns in the existing CSV are included in the new patient entry.
      - Appends the new patient data to the existing structure.
      - Saves the updated data back to the CSV file.
    - If the file does not exist:
      - Creates a new CSV file with `patient` as the first entry.
    - Uses `|` as the delimiter to prevent issues with commas in text fields.
    '''
    try:
        current_df = pd.read_csv(filepath, sep="|")
        current_dict = current_df.to_dict("list")
        existing_len = current_df.shape[0]
        
        #add variables in current dict not in patient dict
        missing_cols = list(set(list(current_dict.keys())) - set(list(patient.keys())))
        for missing_col in missing_cols:
            patient[missing_col] = ""
        
        #add everything from patient dict to to the current dict
        for col in patient.keys():
            if col in current_dict:
                current_dict[col].append(patient[col])
            else:
                #create a new column for variable in patient dict not in current dict
                buffer = [""]*existing_len
                buffer.append(patient[col])
                current_dict[col] = buffer

        updated_df = pd.DataFrame(current_dict)

    except FileNotFoundError:
        print(f"creating file at {filepath}")
        updated_dict = {}
        #save dictionary as csv with keys
        for col in patient.keys():
            updated_dict[col] = [f"{patient[col]}"]
        updated_df = pd.DataFrame(updated_dict)

    updated_df.to_csv(filepath, sep="|",index=False)

def patient_creator(filepath = "./llm_patients.csv"):
    """
    Generates a randomized patient profile and adds it to a CSV file.

    Parameters
    ----------
    filepath: str, optional
        The filepath for the CSV file where patient data will be stored. Defaults to "./llm_patients.csv".

    Returns
    -------
    dict
        The generated patient dictionary.

    Function Overview
    -----------------
    - Calls `generate_patient_profile(variables)` to create a randomized patient profile.
    - Saves the generated patient profile to a CSV file using `patient_to_csv()`.
    - Returns the generated patient dictionary.
    """
    
    patient = generate_patient_profile(variables)
    patient_to_csv(patient, filepath)
    return patient

In [11]:
i=0
while i < 1000:
    patient_creator(filepath="../llm_patients_040225.csv")
    i += 1
    print(i)
# patient_creator(filepath="../llm_patients.csv")
# 1000 patients - 3:09:24

['ja_JP', 'ko_KR', 'zh_CN', 'zh_TW']
Checker:  0
not indy
['cs_CZ', 'da_DK', 'de_AT', 'de_CH', 'de_DE', 'el_GR', 'en_CA', 'en_US', 'en_GB', 'en_IE', 'es_ES', 'fi_FI', 'fr_CA', 'fr_CH', 'fr_FR', 'hr_HR', 'hu_HU', 'hy_AM', 'it_IT', 'nl_BE', 'nl_NL', 'no_NO', 'pl_PL', 'pt_PT', 'ro_RO', 'ru_RU', 'sk_SK', 'sv_SE', 'uk_UA']
['cs_CZ', 'da_DK', 'de_AT', 'de_CH', 'de_DE', 'el_GR', 'en_CA', 'en_US', 'en_GB', 'en_IE', 'es_ES', 'fi_FI', 'fr_CA', 'fr_CH', 'fr_FR', 'hr_HR', 'hu_HU', 'hy_AM', 'it_IT', 'nl_BE', 'nl_NL', 'no_NO', 'pl_PL', 'pt_PT', 'ro_RO', 'ru_RU', 'sk_SK', 'sv_SE', 'uk_UA']
Patient Dict - BEFORE LLM {'Ethnicity Code': 'zh_TW', 'Ethnicity': 'Chinese - Taiwan', 'Clinician Name': 'Orvokki Nousiainen-Alatalo', 'Appointment Date': 'June 17', 'Sex': 'Male', 'Full Name': '李家豪', 'Age': 36, 'Date of Birth': 'October 17 1988', 'Relationship Status': 'Married', 'Children': 'Yes', 'Children Details': '{Children Details}', 'Handedness': 'right', 'Address': '15982 Roman Via, New Ronald, British Col