In [1]:
import sys, os, csv, ast, random, time
from pathlib import Path
import pandas as pd
import time
from collections import deque  # Queue for questions
import re
import random
from faker import Faker
from dateutil.relativedelta import relativedelta
from datetime import datetime

from variables import HEALTH_CONDITIONS, COMMON_RECREATIONAL_DRUGS, COMMON_HEALTH_SUPPLEMENTS, COMMON_ALLERGIES, DR_APPOINTMENT_REASONS, TYPING_STYLES, CONVERSATIONAL_TONE, PERSONALITY_TRAITS, EDGE_CASE_SCENARIOS, MIDDLE_EASTERN_REGIONS, EUROPEAN_REGIONS, EAST_ASIAN_REGIONS, SOUTH_ASIAN_REGIONS, SOUTHEAST_ASIAN_REGIONS, LATIN_AMERICAN_REGIONS, AFRICAN_REGIONS, OTHER_REGIONS
from ethnicity_codes import ETHNICITY_CODES

# import Langchain and Ollama
from langchain_ollama import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate

# Shared LLM client: generate_patient_profile() sends its prompt through
# model1.invoke(). It targets an Ollama server at localhost:11435 and requests
# the "llama3.3:70b" model; temperature / top_k / top_p are the sampling
# settings and num_ctx is forwarded to Ollama unchanged.
# NOTE(review): 11435 differs from Ollama's usual 11434 -- presumably a
# forwarded or secondary instance; confirm before running elsewhere.
model1 = OllamaLLM(base_url="localhost:11435", 
                   model="llama3.3:70b", 
                   temperature=0.7, 
                   num_ctx = 4096,
                   top_k = 50,
                   top_p = 0.95)

#llama3.1:70b-instruct-q4_0

In [2]:
print(Path.cwd())

/home/ninc-user/finetuning/transcript_generation-main/patient_creation


In [3]:
import variables

In [4]:
random.seed()
# Build a randomly hydrated patient profile, then ask the LLM to fill in the remaining "{placeholder}" fields.
def generate_patient_profile(variables):
    """Generate one synthetic patient profile, using the LLM for free-text fields.

    The profile is first filled with randomly sampled values by
    ``hydrate_patient_profile``.  Every value that is still a ``"{...}"``
    placeholder string is listed in a prompt, the prompt is sent to
    ``model1``, and the reply is merged back into the dict by ``hydrate_dict``.

    Args:
        variables: Passed through unchanged to ``hydrate_patient_profile``.

    Returns:
        Whatever ``hydrate_dict(patient_dict, response)`` returns.

    Raises:
        Exception: Any error raised by ``model1.invoke`` is logged and
            re-raised.  Previously it was swallowed and the function then
            failed with an unrelated ``UnboundLocalError`` on ``response``.
    """
    # Hydrate randomly generated information
    patient_dict = hydrate_patient_profile(variables, {})

    # Fields the LLM still has to write are marked with "{Field Name}" placeholders
    fields_to_fill = {
        key: value
        for key, value in patient_dict.items()
        if isinstance(value, str) and value.startswith('{') and value.endswith('}')
    }

    # Convert to a more readable format for the LLM
    fields_to_fill_str = ', '.join([f"{key}: {value}" for key, value in fields_to_fill.items()])
    print(fields_to_fill_str)

    prompt = f"""
    I have a fictional patient profile, and I need to fill in some sections with realistic-sounding but completely made-up details. 
    Here are the sections with placeholders that need to be completed:

    {fields_to_fill_str}
    

    Please generate brief, suitable details for each section that make sense in the context of a fictional character and their background. The information should be coherent with the rest of the profile provided below,
    and there must NOT be contradicting information. For example, if the patient has no siblings, then there should not be siblings mentioned in the Family History.
    
    
    Here is the partially filled out dict:

    {patient_dict}
    
     If the full name and city of their birthplace are not in the english alphabet, please anglicize them and output it inline with the other fields, as such:
    "Anglicized Full name: value", "Anglicized Clinician Name: value", "Anglicized Birthplace: value", and "Anglicized Doctor Name: value". You MUST adhere to this format, and do not include any more text on the same line as these outputs.
    For the Birthplace, you MUST include the ENTIRETY of the Birthplace name in the value, so please include both the city name AND either the country name or the country code.
    
    YOU MUST return the output in the following format for each section:
    "Field Name: value"
    """

    # Call the LLM; without a response there is nothing to merge, so the
    # failure is reported and propagated instead of being swallowed.
    try:
        response = model1.invoke(prompt)
    except Exception as e:
        print(f"Error generating patient profile: {e}")
        raise

    print("Response: ", response)

    updated_patient_dict = hydrate_dict(patient_dict, response)

    return updated_patient_dict

In [5]:
# def generate_health_conditions(n):
#     '''gets a list of n random health conditions along with a list of a randomly selected associated medication for that condition'''
#     health_condition = random.sample(list(HEALTH_CONDITIONS.keys()), n)
#     meds = []
#     for i in health_condition:
#         if len(HEALTH_CONDITIONS[i]) > 0:
#             #chance that meds may not be taken for that health condition
#             if random.random() <= 0.8:
#                 med_name = random.choice(HEALTH_CONDITIONS[i])
#                 meds.append(med_name)
#         else:
#             pass
#     return health_condition, meds

def generate_health_conditions(n):
    '''Pick n random health conditions and, for some of them, a medication with a dosage.

    Args:
        n: Number of distinct conditions to draw from HEALTH_CONDITIONS.
           random.sample raises ValueError if n exceeds the number available.

    Returns:
        (health_conditions, meds_with_dosages) where health_conditions is a
        list of condition names and meds_with_dosages is a list of
        (med_name, dosage) tuples. The second list is NOT aligned with the
        first: a condition contributes no entry when it has no listed
        medication, or when the 20% "not medicated" branch is taken.
    '''
    health_conditions = random.sample(list(HEALTH_CONDITIONS.keys()), n)
    meds_with_dosages = []

    for condition in health_conditions:
        meds = list(HEALTH_CONDITIONS[condition].keys())

        # Conditions without any listed medication contribute nothing.
        # (A None entry used to be appended here, which broke callers that
        # unpack every element as a (med_name, dosage) pair.)
        if not meds:
            continue

        # 80% chance that medication is taken for this condition
        if random.random() <= 0.8:
            med_name = random.choice(meds)
            # Select one of the dosages listed for that medication
            dosage = random.choice(list(HEALTH_CONDITIONS[condition][med_name].values()))
            meds_with_dosages.append((med_name, dosage))

    return health_conditions, meds_with_dosages

def create_clinician_name()->str:
    '''Return a random full name for a clinician.

    The name comes from a Faker instance created for the locale that
    ethnicity_picker() returns.
    '''
    locale_code = ethnicity_picker()
    return Faker(locale_code).name()

def create_appt_date()->str:
    '''Return a random appointment date as "<Month> <day>" (e.g. "February 14").

    The date is drawn by Faker between today and five years from now. The
    day has no leading zero on any platform: it is taken from the date's
    .day attribute instead of the "%#d" strftime flag, which only strips
    the padding on Windows.
    '''
    fake = Faker()
    appt = fake.date_between("today", "+5y")
    return f"{appt.strftime('%B')} {appt.day}"

def get_random_names(fake, num_people, age_low, age_high)->list:
    '''Build num_people strings of the form "<first name> (<age>)", e.g. "Lary (12)".

    First names come from fake.first_name(); each age is a random integer
    between age_low and age_high, both inclusive.
    '''
    return [
        f"{fake.first_name()} ({random.randint(age_low, age_high)})"
        for _ in range(num_people)
    ]

In [6]:
# Weighted random choice: returns one item from `categories`, drawn according to `weights`
def pick_demographics(categories, weights):
    """Return one element of ``categories`` drawn with the given relative ``weights``.

    Args:
        categories: Sequence of candidate values.
        weights: Relative weights, one per category; they need not sum to 1.

    Returns:
        The single chosen category.

    The module-level RNG is used as-is. It is no longer reseeded on every
    call, so a caller that runs ``random.seed(n)`` first gets reproducible
    draws.
    """
    return random.choices(categories, weights)[0]

In [7]:
def get_dob(age_range):
    """Return a random date of birth for the given age-bracket index.

    Brackets: 0 -> 18-25, 1 -> 26-35, 2 -> 36-45, 3 -> 46-60, 4 -> 61-112.
    The date comes from Faker's date_of_birth with those minimum and
    maximum ages.

    Raises:
        ValueError: if age_range is not one of the indices above.
    """
    fake = Faker()
    # (minimum_age, maximum_age) per bracket; 112 is the age of the current
    # oldest living person in Vancouver.
    age_bounds = ((18, 25), (26, 35), (36, 45), (46, 60), (61, 112))
    for bracket, (youngest, oldest) in enumerate(age_bounds):
        if age_range == bracket:
            return fake.date_of_birth(minimum_age = youngest, maximum_age=oldest)
    raise ValueError("Invalid age range")

In [8]:
def ethnicity_picker():
    """Pick a random locale code for the patient's or clinician's background.

    A regional group is chosen first with the weights below, then one entry
    of that group's list is chosen uniformly. The result is used elsewhere
    in this notebook as the Faker locale and as the key into ETHNICITY_CODES.

    The RNG is no longer reseeded here, so seeding ``random`` once before
    generation makes the picks reproducible.
    """
    ethnicity_categories = [MIDDLE_EASTERN_REGIONS, EUROPEAN_REGIONS, EAST_ASIAN_REGIONS, SOUTH_ASIAN_REGIONS, SOUTHEAST_ASIAN_REGIONS, LATIN_AMERICAN_REGIONS, AFRICAN_REGIONS, OTHER_REGIONS]
    # Relative weight of each regional group, in the same order as above
    ethnicity_weights = [0.0334, 0.4313, 0.2328, 0.1417, 0.0763, 0.0198, 0.0158, 0.0251]

    ethnicity_group = pick_demographics(ethnicity_categories, ethnicity_weights)

    return random.choice(ethnicity_group)

In [9]:
def hydrate_patient_profile(variables, preset_attributes = {}) -> dict:
    """Build the randomly generated part of a synthetic patient profile.

    Demographics, medical details, disability status, education and
    personality attributes are sampled with the weights hard-coded below.
    Free-text fields that the LLM has to write later are stored as
    placeholder strings of the form ``"{Field Name}"``.

    Args:
        variables: Not used by this function; kept so existing callers work.
        preset_attributes: Currently ignored. The profile always starts from
            an empty dict, so every ``if key not in patient`` guard below
            passes. NOTE(review): the guards suggest presets were meant to
            seed ``patient``; wiring that in also requires deriving ``dob``,
            ``job``, ``age_range`` and ``relationship_int`` from the presets.

    Returns:
        dict mapping field names to generated values or ``"{...}"``
        placeholders. The dict is also printed before being returned.
    """
    # ------------------------------------------------------------------
    # Demographic sampling tables
    # ------------------------------------------------------------------
    gender_categories = [0, 1]  # 0 = Male, 1 = Female
    gender_weights = [0.49, 0.51]

    age_categories = [0, 1, 2, 3, 4]  # 18-25, 26-35, 36-45, 46-60, 60+
    age_weights_men = [0.2979, 0.1916, 0.1557, 0.2266, 0.1282]
    age_weights_women = [0.2861, 0.1683, 0.1483, 0.2343, 0.1630]

    ethnicity = ethnicity_picker()

    # Locale-specific generator for the name and (foreign) birthplace
    fake_eth = Faker(ethnicity)

    patient = dict()
    patient['Ethnicity Code'] = ethnicity
    patient["Ethnicity"] = ETHNICITY_CODES.get(patient["Ethnicity Code"], "Unknown")

    # doctor data
    patient["Clinician Name"] = create_clinician_name()
    patient["Appointment Date"] = create_appt_date()

    # ------------------------------------------------------------------
    # Basic information
    # ------------------------------------------------------------------
    sex = pick_demographics(gender_categories, gender_weights)

    if sex == 1:
        patient['Sex'] = "Female"
        try:
            patient['Full Name'] = fake_eth.name_female()
        except Exception as name_error:
            # Fall back to the locale's gender-neutral name generator
            print(f"Error generating female patient name: {name_error}")
            patient['Full Name'] = fake_eth.name_nonbinary()
        age_range = pick_demographics(age_categories, age_weights_women)
    else:
        patient['Sex'] = "Male"
        try:
            patient['Full Name'] = fake_eth.name_male()
        except Exception as name_error:
            print(f"Error generating male patient name: {name_error}")
            patient['Full Name'] = fake_eth.name_nonbinary()
        age_range = pick_demographics(age_categories, age_weights_men)

    # Default-locale generator for job, address and Canadian city
    fake = Faker()

    if "Age" not in patient:
        dob = get_dob(age_range)
        patient['Age'] = relativedelta(datetime.now(), dob).years

        # 61.8% chance of having a job; the youngest and oldest get a prefix
        if random.random() <= 0.618:
            job = fake.job()
        else:
            job = "unemployed"
        if patient['Age'] <= 22:
            job = "student " + job
        if patient['Age'] >= 75:
            job = "retired " + job

    if "Date of Birth" not in patient:
        # Built from .day/.year rather than the "%#d" strftime flag, which
        # only removes the leading zero on Windows.
        patient['Date of Birth'] = f"{dob.strftime('%B')} {dob.day} {dob.year}"

    # ------------------------------------------------------------------
    # Relationship status and children
    # ------------------------------------------------------------------
    relationship_status = [0, 1, 2, 3, 4]  # single, LAT, married/common-law, divorced/separated, widowed
    relationship_weights = [
        [67.35, 25, 7.4, 0.25, 0.0],   # 18-25
        [30.3, 12, 55.35, 2.15, 0.2],  # 26-35
        [7.5, 7, 76.0, 8.15, 1.35],    # 36-45
        [0, 12, 68.33, 15.33, 8.17],   # 46-60
        [0, 15, 54.17, 8.83, 33.5],    # 60+
    ]

    if 'Relationship Status' not in patient:
        if patient['Age'] <= 25:
            relationship_int = pick_demographics(relationship_status, relationship_weights[0])
        elif patient['Age'] <= 35:
            relationship_int = pick_demographics(relationship_status, relationship_weights[1])
        elif patient['Age'] <= 45:
            relationship_int = pick_demographics(relationship_status, relationship_weights[2])
        elif patient['Age'] <= 60:
            relationship_int = pick_demographics(relationship_status, relationship_weights[3])
        else:
            relationship_int = pick_demographics(relationship_status, relationship_weights[4])

        if relationship_int == 0:
            patient['Relationship Status'] = "Single"
        elif relationship_int == 1:
            patient['Relationship Status'] = "Long-term relationship"
        elif relationship_int == 2:
            patient['Relationship Status'] = random.choice(["Married", "Common-Law"])
        elif relationship_int == 3:
            patient['Relationship Status'] = random.choice(["Divorced", "Separated"])
        else:
            patient['Relationship Status'] = "Widowed"

    # Percent chance of having children; rows = age bracket, columns =
    # Single, LAT, Married/Common-law, Divorced/Separated, Widowed
    child_probability_matrix = [
        [5, 10, 30, 5, 1],     # 18-25
        [20, 30, 70, 50, 30],  # 26-35
        [40, 50, 85, 70, 60],  # 36-45
        [50, 60, 90, 80, 80],  # 46-60
        [40, 50, 80, 70, 90],  # 60+
    ]
    child_percentage = child_probability_matrix[age_range][relationship_int]
    child_weights = [child_percentage, 100 - child_percentage]

    if 'Children' not in patient and 'Children Details' not in patient:
        patient['Children'] = pick_demographics(["Yes", "No"], child_weights)

        if patient['Children'] == "Yes":
            patient['Children Details'] = "{Children Details}"

    handedness_weights = [0.89, 0.1, 0.01]
    handedness_choices = ["right", "left", "ambidextrous"]
    if "Handedness" not in patient:
        patient['Handedness'] = pick_demographics(handedness_choices, handedness_weights)

    # residence
    if "Address" not in patient:
        patient['Address'] = f"{fake.street_address()}, {fake.city()}, British Columbia, Canada"

    # employment status
    if "Occupation" not in patient:
        patient['Occupation'] = job

    # ------------------------------------------------------------------
    # Medical care information
    # ------------------------------------------------------------------
    current_doctor_weights = [80, 20]
    current_doctor_options = ["Yes", "No"]
    # NOTE(review): the guard checks 'Current Doctors' while the value is
    # stored under 'Current Doctor(s)'; harmless while presets are unused.
    if 'Current Doctors' not in patient:
        if pick_demographics(current_doctor_options, current_doctor_weights) == "Yes":
            patient['Current Doctor(s)'] = f"Dr. {create_clinician_name()}, General Practitioner"
        else:
            patient['Current Doctor(s)'] = "None"

    allergy_weights_f = [28.9, 100 - 28.9]
    allergy_weights_m = [25.6, 100 - 25.6]
    if "Allergies" not in patient:
        if sex == 0:  # male
            allergy = pick_demographics(["Yes", "No"], allergy_weights_m)
        else:  # female
            allergy = pick_demographics(["Yes", "No"], allergy_weights_f)

        # The key is only present for patients who have allergies
        if allergy == "Yes":
            patient['Allergies'] = f"{', '.join(random.sample(COMMON_ALLERGIES, random.randint(1,3)))}"

    if "Medical Conditions" not in patient and "Medications" not in patient:
        conditions, meds = generate_health_conditions(random.randint(1, 3))
        patient['Medical Conditions'] = ", ".join(conditions)
        # Skip empty entries so a condition without medication cannot break
        # the (med_name, dosage) unpacking.
        prescribed = [entry for entry in meds if entry is not None]
        patient['Medications'] = ", ".join([f"{med_name} ({dosage})" for med_name, dosage in prescribed])

    if "Health Supplements" not in patient:
        patient['Health Supplements'] = random.sample(COMMON_HEALTH_SUPPLEMENTS, random.randint(0, 3))

    # substance use
    if 'Recreational Drug Usage' not in patient:
        if random.random() <= 0.3:
            patient['Recreational Drug Usage'] = f'{", ".join(random.sample(["beer", "wine", "cannabis", "cigarettes", "shrooms", "psychedelics"], random.randint(1,4)))}'
            if "Rehab or Substance Counselling" not in patient:
                patient["Rehab or Substance Counselling"] = "{Rehab or Substance Counselling}"

    # ------------------------------------------------------------------
    # Medical history
    # ------------------------------------------------------------------
    if "Reason for Appointment" not in patient:
        reasons = random.sample(DR_APPOINTMENT_REASONS, random.randint(1, 3))
        patient["Reason for Appointment"] = ", ".join(reasons)

    ## TODO: still a 50/50 coin flip, no statistics applied yet
    if "Previous Hospitalizations or Surgeries" not in patient:
        patient['Previous Hospitalizations or Surgeries'] = random.choice(["Yes", "No"])

        if patient['Previous Hospitalizations or Surgeries'] == "Yes":
            patient['Previous Hospitalizations or Surgeries Details'] = "{Previous Hospitalizations or Surgeries Details}"

    concussion_weights = [0.3, 0.7]
    if "Head Injuries or Concussions" not in patient:
        patient['Head Injuries or Concussions'] = pick_demographics(["Yes", "No"], concussion_weights)

        if patient['Head Injuries or Concussions'] == "Yes":
            patient["Head Injuries or Concussions Details"] = "{Head Injuries or Concussions Details}"

    seizure_weights = [0.1, 0.9]
    if "History of Seizures" not in patient:
        patient['History of Seizures'] = pick_demographics(["Yes", "No"], seizure_weights)

    # family history
    if "Family History of Health Conditions" not in patient:
        patient['Family History of Health Conditions'] = random.choice(["Yes", "No"])

        if patient['Family History of Health Conditions'] == "Yes":
            patient['Family History of Health Conditions Details'] = "{Family History of Health Conditions Details}"

    siblings_weights = [0.40, 0.40, 0.15, 0.05]
    if 'Siblings' not in patient and 'Sibling Details' not in patient:
        patient['Siblings'] = pick_demographics(["0", "1", "2", "3+"], siblings_weights)

        if patient['Siblings'] == "3+":
            patient['Siblings'] = random.choice(["3", "4", "5"])

        if not patient['Siblings'] == "0":
            patient['Sibling Details'] = "{Sibling Details}"

    # ------------------------------------------------------------------
    # Personal history
    # ------------------------------------------------------------------
    canadian_weights = [0.912, 0.088]
    if "Canadian Citizenship" not in patient:
        patient['Canadian Citizenship'] = pick_demographics(["Yes", "No"], canadian_weights)

    # Canadian locales always count as citizens
    if ethnicity == "fr_CA" or ethnicity == "en_CA":
        patient['Canadian Citizenship'] = "Yes"
    born_in_canada = ""
    if patient['Canadian Citizenship'] == "Yes":
        born_in_canada_weights = [53.1, 46.9]
        born_in_canada = pick_demographics(["Yes", "No"], born_in_canada_weights)

    if "Birthplace" not in patient:
        if born_in_canada == "Yes":
            patient['Birthplace'] = f"{fake.city()}, Canada"
        else:
            try:
                country = fake_eth.current_country()
                patient['Birthplace'] = f"{fake_eth.city()}, {country}"
            except Exception as c:
                # Some locales cannot resolve a country name; use the code
                print(f"Error generating current country for profile: {c}")
                country_code = fake_eth.current_country_code()
                patient['Birthplace'] = f"{fake_eth.city()}, {country_code}"

    # ------------------------------------------------------------------
    # Disability
    # ------------------------------------------------------------------
    learning_probability_matrix = [  # men, women
        [0.085, 0.097],  # 18-24
        [0.049, 0.056],  # 25-64
        [0.038, 0.043],  # 65+
    ]
    # NOTE(review): this table was labelled "women, men", yet both tables
    # are indexed with sex_idx where 0 = Male. Confirm the column order.
    developmental_probability_matrix = [  # women, men
        [0.046, 0.033],  # 18-24
        [0.017, 0.012],  # 25-64
        [0.007, 0.005],  # 65+
    ]

    # 20.5% are disabled, but only 5.2% have disability assistance
    disabled_weights = [0.205, 0.795]  # 20.5% chance of being disabled

    if 'Disabled' not in patient:
        patient['Disabled'] = pick_demographics(["Yes", "No"], disabled_weights)

    # Determine age category index
    if patient['Age'] < 25:
        age_idx = 0
    elif patient['Age'] < 65:
        age_idx = 1
    else:
        age_idx = 2

    sex_idx = 0 if patient['Sex'] == "Male" else 1  # Male = 0, Female = 1

    # Scale mental disability probabilities to be within the disabled group
    learning_probability = learning_probability_matrix[age_idx][sex_idx] / 0.205
    developmental_probability = developmental_probability_matrix[age_idx][sex_idx] / 0.205

    # Ensure the combined probability does not exceed 100%
    total_mental_probability = learning_probability + developmental_probability
    if total_mental_probability > 1:
        scale_factor = 1 / total_mental_probability
        learning_probability *= scale_factor
        developmental_probability *= scale_factor

    # Disability details are only assigned to patients who are disabled
    if patient['Disabled'] == "Yes":
        if "Learning Disability" not in patient:
            patient['Learning Disability'] = pick_demographics(["Yes", "No"], [learning_probability, 1 - learning_probability])

        if "Developmental Disability" not in patient:
            patient['Developmental Disability'] = pick_demographics(["Yes", "No"], [developmental_probability, 1 - developmental_probability])

        has_learning = patient['Learning Disability'] == "Yes"
        has_developmental = patient['Developmental Disability'] == "Yes"

        # Each disability that is present gets its own details placeholder
        if has_learning:
            patient['Learning Disability Details'] = "{Learning Disability Details}"
        if has_developmental:
            patient['Developmental Disability Details'] = "{Developmental Disability Details}"

        # A disabled patient with no mental disability has a physical one
        if not (has_learning or has_developmental):
            patient['Physical Disability'] = "Yes"
            patient['Physical Disability Details'] = "{Physical Disability Details}"

        # NOTE(review): 5.2% is applied within the disabled group; if the
        # source figure is 5.2% of the whole population the conditional
        # rate would be higher. Confirm.
        disability_assistance_weights = [0.052, 0.948]
        if 'Disability Assistance' not in patient:
            patient['Disability Assistance'] = pick_demographics(["Yes", "No"], disability_assistance_weights)

            if patient['Disability Assistance'] == "Yes":
                patient['Disability Assistance Details'] = "{Disability Assistance Details}"

    # ------------------------------------------------------------------
    # Education -- ChatGPT-estimated distributions
    # ------------------------------------------------------------------
    elementary_school_weights = {
        "No Disabilities": [0.10, 0.30, 0.40, 0.20],  # Poor, Average, Good, Excellent
        "Learning Disability": [0.15, 0.35, 0.35, 0.15],  # third weight was 35, a typo for 0.35
        "Developmental Disability": [0.30, 0.35, 0.25, 0.10],
    }

    high_school_weights = {
        "No Disabilities": [0.05, 0.10, 0.30, 0.35, 0.20],  # Dropout, Poor, Average, Good, Excellent
        "Learning Disability": [0.10, 0.20, 0.35, 0.25, 0.10],
        "Developmental Disability": [0.20, 0.25, 0.30, 0.20, 0.05],
    }

    # Determine disability category (learning takes precedence)
    if patient.get("Learning Disability") == "Yes":
        disability_category = "Learning Disability"
    elif patient.get("Developmental Disability") == "Yes":
        disability_category = "Developmental Disability"
    else:
        disability_category = "No Disabilities"

    if "Elementary School Performance" not in patient:
        patient["Elementary School Performance"] = pick_demographics(
            ["poor", "average", "good", "excellent"], elementary_school_weights[disability_category]
        )

    if "High School Performance" not in patient:
        patient["High School Performance"] = pick_demographics(
            ["dropped out", "poor", "average", "good", "excellent"], high_school_weights[disability_category]
        )

    # University, Post-Secondary (No Degree), No Further Education
    education_weights = [0.53, 0.23, 0.24]          # 25-64 age group in Vancouver
    education_weights_65_plus = [0.28, 0.22, 0.50]  # ChatGPT-estimated rates

    if 25 <= patient['Age'] <= 64:
        patient['Further Education After Highschool'] = pick_demographics(
            ["University Degree", "Post-Secondary Certificate or Diploma", "No Further Education"],
            education_weights
        )
    elif 18 <= patient['Age'] < 25:
        # Under 25: the same weights, phrased as still in progress
        patient['Further Education After Highschool'] = pick_demographics(
            ["Pursuing University Degree", "Pursuing Post-Secondary Certificate or Diploma", "No Further Education"],
            education_weights
        )
    elif patient['Age'] >= 65:
        patient['Further Education After Highschool'] = pick_demographics(
            ["University Degree", "Post-Secondary Certificate or Diploma", "No Further Education"],
            education_weights_65_plus
        )
    else:
        patient['Further Education After Highschool'] = "Unknown"  # In case of error

    # ------------------------------------------------------------------
    # Previous employment and relationships -- ChatGPT-estimated statistics
    # ------------------------------------------------------------------
    work_history_weights = {
        "18-24": [0.50, 0.50],  # 50% chance of having work history
        "25-64": [0.95, 0.05],  # 95% chance of having work history
        "65+": [1.00, 0.00],    # Nearly all 65+ have work history
    }

    if patient['Age'] < 25:
        work_weights = work_history_weights["18-24"]
    elif patient['Age'] < 65:
        work_weights = work_history_weights["25-64"]
    else:
        work_weights = work_history_weights["65+"]

    if "Previous Work History" not in patient:
        patient["Previous Work History"] = pick_demographics(
            ["Has Work History", "No Work History"], work_weights
        )

        # A positive answer becomes a placeholder for the LLM to elaborate
        if patient["Previous Work History"] == "Has Work History":
            patient["Previous Work History"] = "{Previous Work History}"

    relationship_history_weights = {
        "18-24": [0.15, 0.85],  # Low likelihood of past relationships
        "25-34": [0.35, 0.65],  # Increasing likelihood
        "35-64": [0.60, 0.40],  # Most likely have had a prior relationship
        "65+": [0.80, 0.20],    # Majority have had at least one past relationship
    }

    if patient['Age'] < 25:
        past_relationship_weights = relationship_history_weights["18-24"]
    elif patient['Age'] < 35:
        past_relationship_weights = relationship_history_weights["25-34"]
    elif patient['Age'] < 65:
        past_relationship_weights = relationship_history_weights["35-64"]
    else:
        past_relationship_weights = relationship_history_weights["65+"]

    if "Previous Marriages/Long-Term Relationships" not in patient:
        patient["Previous Marriages/Long-Term Relationships"] = pick_demographics(
            ["Has Previous Relationships/Marriages", "No Previous Relationships"],
            past_relationship_weights
        )
        if patient["Previous Marriages/Long-Term Relationships"] == "Has Previous Relationships/Marriages":
            patient["Previous Marriages/Long-Term Relationships"] = "{Previous Marriages/Long-Term Relationships}"

    # ------------------------------------------------------------------
    # Hobbies, lifestyle and personality
    # ------------------------------------------------------------------
    if "Hobbies" not in patient:
        patient["Hobbies"] = "{Hobbies}"
    if "Relaxation Methods" not in patient:
        patient["Relaxation Methods"] = "{Relaxation Methods}"

    # attributes with possible do not disclose options
    dnd_attributes = [
        "Past Trauma",
        "Substance Abuse",
    ]

    for attribute in dnd_attributes:
        if attribute not in patient:
            if random.random() <= 0.5:
                patient[attribute] = "yes"
            else:
                patient[attribute] = "None"

    # create variation in conversations and tones
    if "Typing Style" not in patient:
        patient["Typing Style"] = random.choice(TYPING_STYLES)
    if "Conversational Tone" not in patient:
        patient["Conversational Tone"] = random.choice(CONVERSATIONAL_TONE)
    if "Personality Traits" not in patient:
        patient["Personality Traits"] = random.sample(PERSONALITY_TRAITS, random.randint(1, 4))
    if "Edge Case Scenario" not in patient:
        if random.random() <= 0.5:
            patient["Edge Case Scenario"] = random.choice(EDGE_CASE_SCENARIOS)
        else:
            # Previously written to patient[attribute], which overwrote
            # "Substance Abuse" with "N/A" through the leaked loop variable.
            patient["Edge Case Scenario"] = "N/A"

    print("Patient Dict - BEFORE LLM", patient)
    return patient

            
    
#     age = str(random.randint(18, 80))
#     birth_date = f"{random.randint(1, 12)}/{random.randint(1, 28)}/{2024 - int(age)}"
#     birth_date = str(birth_date)
#     handedness = random.choice(["left", "right", "both"])
#     health_condition = random.choice(list(variables.HEALTH_CONDITIONS.keys()))
#     medications = ", ".join(random.sample(variables.HEALTH_CONDITIONS[health_condition], k=2))
#     dr_reason = random.choice(variables.DR_APPOINTMENT_REASONS)
#     conversational_tone = random.choice(variables.CONVERSATIONAL_TONE)
#     typing_style = random.choice(variables.TYPING_STYLES)
#     recreational_drugs = random.choice(variables.COMMON_RECREATIONAL_DRUGS)
#     health_supplements = random.choice(variables.COMMON_HEALTH_SUPPLEMENTS)
#     allergies = random.choice(variables.COMMON_ALLERGIES)
#     selected_personality_traits = random.sample(variables.PERSONALITY_TRAITS, random.randint(1, 4))


# Disability Prevalence by Age, Gender, and Type

We estimate the prevalence for each gender in each age group by **proportionally distributing** the known gender-specific rates based on the overall prevalence in each age group.

### Formula for Adjustment:
P(gender, age) = P(gender overall) × (P(age overall) / P(total overall))

where:

- P(gender, age) = estimated prevalence for a specific gender and age group
- P(gender overall) = known prevalence for men or women in the total population
- P(age overall) = known prevalence for the given age group (without gender split)
- P(total overall) = known prevalence for the entire 15+ population (sum of all three disabilities)

---

### Adjusted Disability Prevalence Rates by Age and Gender (%):

| Age Group         | Learning Disability (Women) | Learning Disability (Men) | Developmental Disability (Women) | Developmental Disability (Men) | Memory Disability (Women) | Memory Disability (Men) |
|-------------------|---------------------------|-------------------------|---------------------------------|-----------------------------|---------------------------|-------------------------|
| Youth (15-24)     | 9.69%                       | 8.54%                    | 3.29%                           | 4.56%                        | 4.49%                       | 3.51%                    |
| Working Age (25-64) | 5.58%                     | 4.92%                    | 1.21%                           | 1.68%                        | 5.05%                       | 3.95%                    |
| Seniors (65+)     | 4.32%                       | 3.81%                    | 0.52%                           | 0.72%                        | 7.41%                       | 5.79%                    |

---

These adjusted values are estimates: because the formula scales women's and men's rates by the same age-group factor, it assumes the women-to-men prevalence ratio of each disability type is the same in every age group.


In [10]:
import re

def clean_stars_from_dict(patient_dict):
    """
    Strip leading markdown asterisks and surrounding whitespace from string values.

    For every value that is a ``str``, any leading run of ``*`` and space
    characters is removed (``lstrip`` treats its argument as a set of
    characters, not as a literal prefix), then whitespace is trimmed from both
    ends. Asterisks at the end of a value are left alone. Non-string values
    are not touched.

    The dictionary is modified in place and the same object is returned.
    """
    patient_dict.update({
        field: text.lstrip('** ').strip()
        for field, text in patient_dict.items()
        if isinstance(text, str)
    })
    return patient_dict

# Function to update the dictionary with LLM output using flexible pattern matching
def hydrate_dict(patient_dict, llm_output):
    """
    Fill patient_dict with field values found in the LLM's text output.

    Each field has a label regex; the first case-insensitive match of the
    label in llm_output wins. Whatever follows the label (after any colons and
    whitespace) up to the end of that line is stored under the field's key,
    overwriting any previous value. Fields whose label is not found are left
    unchanged.

    Afterwards every string value in the dictionary is passed through
    clean_stars_from_dict. The dictionary is modified in place and returned.

    Parameters
    ----------
    patient_dict: dict
        patient dictionary to update
    llm_output: str
        raw text returned by the LLM
    """
    # (dictionary key, label regex). "[:\s]*" also consumes newlines, so the
    # captured value may sit on the line after the label.
    field_patterns = (
        ('Full Name', r"Anglicized Full Name[:\s]*([^\n]+)"),
        ('Birthplace', r"Anglicized Birthplace[:\s]*([^\n]+)"),
        ('Clinician Name', r"Anglicized Clinician Name[:\s]*([^\n]+)"),
        ('Current Doctor(s)', r"Anglicized Doctor Name[:\s]*([^\n]+)"),
        ('Children Details', r"Children Details[:\s]*([^\n]+)"),
        ('Disability Assistance Details', r"Disability Assistance Details[:\s]*([^\n]+)"),
        ('Learning Disability Details', r"Learning Disability Details[:\s]*([^\n]+)"),
        ('Developmental Disability Details', r"Developmental Disability Details[:\s]*([^\n]+)"),
        ('Physical Disability Details', r"Physical Disability Details[:\s]*([^\n]+)"),
        ('Previous Hospitalizations or Surgeries Details', r"Previous Hospitalizations or Surgeries Details[:\s]*([^\n]+)"),
        ('Head Injuries or Concussions Details', r"Head Injuries or Concussions Details[:\s]*([^\n]+)"),
        ('Rehab or Substance Counselling', r"Rehab or Substance Counselling[:\s]*([^\n]+)"),
        ('Family History of Health Conditions Details', r"Family History of Health Conditions Details[:\s]*([^\n]+)"),
        ('Sibling Details', r"Sibling Details[:\s]*([^\n]+)"),
        ('Previous Work History', r"Previous Work History[:\s]*([^\n]+)"),
        ('Previous Marriages/Long-Term Relationships', r"Previous Marriages/Long-Term Relationships[:\s]*([^\n]+)"),
        ('Hobbies', r"Hobbies[:\s]*([^\n]+)"),
        ('Relaxation Methods', r"Relaxation Methods[:\s]*([^\n]+)"),
    )

    for field, regex in field_patterns:
        found = re.search(regex, llm_output, flags=re.IGNORECASE)
        if found is None:
            continue  # label absent from the LLM output: keep the current value
        patient_dict[field] = found.group(1).strip()

    # Remove markdown bold markers left at the start of captured values.
    clean_stars_from_dict(patient_dict)
    return patient_dict



In [11]:
def patient_to_csv(patient:dict, filepath:str="./llm_patients.csv"):
    '''
    Appends the patient dictionary as one row to the "|"-separated csv file at filepath.

    The whole file is read, extended by one row and written back. Columns are
    the union of the file's existing columns and the patient's keys: a key the
    file does not have yet becomes a new column (blank for all earlier rows),
    and a column the patient does not have is left blank for the new row.
    If the file does not exist (or is completely empty) it is created with the
    patient's keys as the header.

    The patient dictionary itself is not modified.

    Parameters
    ----------
    patient: dict
        randomized patient dictionary
    filepath: str
        filepath associated with the csv of patients; defaults to "./llm_patients.csv"
    '''
    try:
        # Read every cell back as the exact text that was written.
        # With pandas' defaults, stored strings such as "N/A" or "None" are
        # parsed as NaN and come out blank on the next write, and integer
        # columns containing a blank are re-written as floats.
        current_df = pd.read_csv(filepath, sep="|", dtype=str, keep_default_na=False)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # No usable file yet: start a new one from this patient alone.
        print(f"creating file at {filepath}")
        updated_df = pd.DataFrame(
            {col: [f"{value}"] for col, value in patient.items()}
        )
    else:
        current_dict = current_df.to_dict("list")
        existing_len = current_df.shape[0]

        # Work on a copy so the caller's dictionary does not gain blank keys.
        row = dict(patient)

        # columns already in the file that this patient has no value for
        for missing_col in current_dict.keys() - row.keys():
            row[missing_col] = ""

        for col, value in row.items():
            if col in current_dict:
                current_dict[col].append(value)
            else:
                # new column: blank for every existing row, then this patient's value
                current_dict[col] = [""] * existing_len + [value]

        updated_df = pd.DataFrame(current_dict)

    updated_df.to_csv(filepath, sep="|", index=False)

def patient_creator(filepath = "./llm_patients.csv"):
    """
    Builds one patient with generate_patient_profile(variables), writes it to
    the patients csv at filepath via patient_to_csv, and returns the patient.
    """
    new_patient = generate_patient_profile(variables)
    patient_to_csv(new_patient, filepath=filepath)
    return new_patient

# if __name__ == '__main__':
#     patient_creator({}, filepath="../patients.csv")

In [12]:
# Generate 1000 patients, printing the running count after each one is saved.
for i in range(1, 1001):
    patient_creator(filepath="../llm_patients_1000.csv")
    print(i)
# patient_creator(filepath="../llm_patients.csv")
# 1000 patients - 3:09:24

Patient Dict - BEFORE LLM {'Ethnicity Code': 'hy_AM', 'Ethnicity': 'Armenian - Armenia', 'Clinician Name': 'Emil Matějka', 'Appointment Date': 'June 26', 'Sex': 'Male', 'Full Name': 'Սահակ Բեկնազարյան', 'Age': 51, 'Date of Birth': 'November 23 1973', 'Relationship Status': 'Common-Law', 'Children': 'Yes', 'Children Details': '{Children Details}', 'Handedness': 'right', 'Address': '81714 Oconnell Ramp Apt. 094, South Donaldchester, British Columbia, Canada', 'Occupation': 'unemployed', 'Current Doctor(s)': 'Dr. Róża Doroba, General Practioner', 'Medical Conditions': 'narcissistic personality disorder, avoidant personality disorder, obsessive-compulsive disorder', 'Medications': 'Fluoxetine (Prozac) (20 mg/day)', 'Health Supplements': ['calcium'], 'Reason for Appointment': 'difficulty managing life transitions or stressors, difficulty concentrating or making decisions', 'Previous Hospitalizations or Surgeries': 'No', 'Head Injuries or Concussions': 'No', 'History of Seizures': 'No', 'Fam