In [1]:
import sys, os, csv, ast, random, time
from pathlib import Path
import pandas as pd
import time
from collections import deque  # Queue for questions
import re
import random
from faker import Faker
from dateutil.relativedelta import relativedelta
from datetime import datetime

from variables import HEALTH_CONDITIONS, COMMON_RECREATIONAL_DRUGS, COMMON_HEALTH_SUPPLEMENTS, COMMON_ALLERGIES, DR_APPOINTMENT_REASONS, TYPING_STYLES, CONVERSATIONAL_TONE, PERSONALITY_TRAITS, EDGE_CASE_SCENARIOS, NAME_REGIONS

# import Langchain and Ollama
from langchain_ollama import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate

model1 = OllamaLLM(base_url="localhost:11435", 
                   model="llama3.1:70b-instruct-q4_0", 
                   temperature=0.7, 
                   num_ctx = 4096,
                   top_k = 50,
                   top_p = 0.95)


In [2]:
print(Path.cwd())

C:\Users\aleynaw\Desktop\transcript_generation-main\patient_creation


In [3]:
import variables

In [4]:
random.seed()
# This function would take a prompt, a variables file, and then call the LLM to generate a patient profile
def generate_patient_profile(variables):
    # Load variables from a Python file (assuming it's structured as a dictionary)
    patient_variables = variables
    
    # Patient profile template (corresponding to your structured format)
    profile_template = """
    ### General Information
    - Full name: {full_name}
    - Preferred name: {preferred_name}
    - Date of birth: {date_of_birth}
    - Age: {age}
    - Sex: {sex}
    - Handedness: {handedness}
    
    ### Residence and Marital Status
    - Current city/town of residence: {residence}
    - Relationship status: {relationship_status}
    - Children or dependents: {children_status}
      {children_details}
    
    ### Employment and Financial Status
    - Current employment status: {employment_status}
      {employment_details}
    - Disability assistance: {disability_status}
      {disability_details}
    
    ### Medical Care Information
    - List of current doctors: {doctors}
    - Allergies: {allergies}
    - Current medications and dosages: {medications}
    - Health supplements: {supplements}
    
    ### Substance Use
    - Nicotine, marijuana, alcohol use frequency: {substance_use}
    
    ### Medical History
    - Reason for Appointment: {dr_reason}
    - Health conditions/diagnoses: {health_conditions}
    - Previous hospitalizations or surgeries: {hospitalizations}
    - Head injuries or concussions: {head_injuries}
    - History of seizures: {seizures}
    - Rehab or substance counseling: {rehab_history}
    
    ### Family History
    - Psychiatric conditions in family: {family_psychiatric}
    - Neurological or genetic conditions in family: {family_neurological}
    - Siblings: {siblings}
    
    ### Personal History
    - Birthplace: {birthplace}
    - Canadian citizen: {citizenship_status}
    - Developmental difficulties: {developmental_difficulties}
    
    ### Education
    - Difficulties in elementary school: {elementary_difficulties}
    - High school performance: {high_school_performance}
    - Further education after high school: {further_education}
    
    ### Employment and Relationships
    - Previous work history: {previous_work}
    - Previous marriages/long-term relationships: {previous_relationships}
    
    ### Hobbies and Lifestyle
    - Hobbies: {hobbies}
    - How to relax on a stressful day: {relaxation_methods}
    
    ### Personality
    - Conversational tone: {conversational_tone}
    - Typing style: {typing_style}
    - Personality traits: {personality_traits}
    """
    
    # Hydrate randomly generated information
    patient_dict = hydrate_patient_profile(variables, {})
    patient_dict
    
    # fields_to_fill = {k: v for k, v in patient_dict.items() if v == ""}
    
    # Extract fields with placeholders
    fields_to_fill = {k: v for k, v in patient_dict.items() if isinstance(v, str) and v.startswith('{') and v.endswith('}')}

    # Convert to a more readable format for the LLM
    fields_to_fill_str = ', '.join([f"{key}: {value}" for key, value in fields_to_fill.items()])
    print(fields_to_fill_str)

    prompt = f"""
    I have a fictional patient profile, and I need to fill in some sections with realistic-sounding but completely made-up details. 
    Here are the sections with placeholders that need to be completed:

    {fields_to_fill_str}
    

    Please generate brief, suitable details for each section that make sense in the context of a fictional character and their background. The information should be coherent with the rest of the profile provided below,
    and there must NOT be contradicting information. For example, if the patient has no siblings, then there should not be siblings mentioned in the Family History.
    
    
    Here is the partially filled out dict:

    {patient_dict}
    
     If the full name is in not in the english alphabet, please anglicize it and output it inline with the other fields, as such:
    "Anglicized Full name: value".
    
    YOU MUST return the output in the following format for each section:
    "Field Name: value"
    """
 

    
#     # Now, create the prompt that will guide the LLM in generating a patient profile
#     prompt = f"""
#     Given a partially filled out profile of a synthetic psychiatric patient, I would like you to fill in the rest
#     of the information in the template to generate a fully fleshed out patient. Here are some specific rules:
    
#     - You must make up a **FAKE** name for the following:
#         - Their full name
#         - Their city/town of residence (must be in British Columbia)
#         - Their children's names (if applicable)
#         - Their current doctors' names (if applicable)
#         - Their siblings' names (if applicable)
#         - The name of the company they are employed at (if employed)
    
#     - The **number of children and siblings** can vary, including **0**. You do not need to create children or siblings if not relevant.
    
#     - The patient **does not need** to have a nickname (Preferred name) unless it naturally fits their character.
    
#     - For traits in the Family History section, it's okay if there are **no psychiatric, neurological, or genetic conditions** in the family.
    
#     - For everything else, please generate **realistic synthetic information** that meshes well with the already filled out information.
    
#     Please ensure the names, places, and other details vary from previous completions.
    
#     Here is the half-filled out template:
#     {profile_template}
#     """


    # Call the LLM (assuming using OpenAI GPT-based models or LLaMA)
    try:
        response = response = model1.invoke(prompt)
        
        # Get the generated profile
        print("Response: ", response)
    
    except Exception as e:
        print(f"Error generating patient profile: {e}")
        
    updated_patient_dict = hydrate_dict(patient_dict, response)
    
    return updated_patient_dict

In [5]:
# def generate_health_conditions(n):
#     '''gets a list of n random health conditions along with a list of a randomly selected associated medication for that condition'''
#     health_condition = random.sample(list(HEALTH_CONDITIONS.keys()), n)
#     meds = []
#     for i in health_condition:
#         if len(HEALTH_CONDITIONS[i]) > 0:
#             #chance that meds may not be taken for that health condition
#             if random.random() <= 0.8:
#                 med_name = random.choice(HEALTH_CONDITIONS[i])
#                 meds.append(med_name)
#         else:
#             pass
#     return health_condition, meds

def generate_health_conditions(n):
    '''Gets a list of n random health conditions along with a list of a randomly selected associated medication and dosage for that condition.'''
    health_conditions = random.sample(list(HEALTH_CONDITIONS.keys()), n)
    meds_with_dosages = []
    
    for condition in health_conditions:
        meds = list(HEALTH_CONDITIONS[condition].keys())
        
        # Check if there are any medications for the condition
        if len(meds) > 0:
            # 80% chance that meds may be taken for that health condition
            if random.random() <= 0.8:
                med_name = random.choice(meds)
                # Select a random dosage (starting or therapeutic dose)
                dosage = random.choice(list(HEALTH_CONDITIONS[condition][med_name].values()))
                meds_with_dosages.append((med_name, dosage))
        else:
            meds_with_dosages.append(None)
    
    return health_conditions, meds_with_dosages

def create_clinician_name()->str:
    '''returns string of random name of a clinician'''
    fake = Faker()
    dr = fake.first_name() + " " + fake.last_name()
    return dr

def create_appt_date()->str:
    '''returns string with random month and date (e.g. February 14)'''
    fake = Faker()
    appt_date = fake.date_between("today", "+5y").strftime("%B %#d")
    return appt_date

def get_random_names(fake, num_people, age_low, age_high)->list:
    '''returns a list of strings with num_people number of people with first name, random age ages between age_low and age_high (e.g. "Lary (12)")'''
    output = []
    for i in range(num_people):
        output.append(f"{fake.first_name()} ({random.randint(age_low, age_high)})")
    return output

In [6]:
def hydrate_patient_profile(variables, preset_attributes = {}) -> dict:
    
    random.seed()
    fake = Faker(NAME_REGIONS)
    patient = dict()
    #doctor data
    patient["Clinician Name"] = create_clinician_name()
    patient["Appointment Date"] = create_appt_date()
    
    #basic information
    sex = random.randint(0,1)
    middle_name = random.randint(0,1)
    
    if sex == 0:
        patient['Sex'] = "Female"
        patient['Full Name'] = fake.name_female() 
        if middle_name == 1:
            patient['Full Name'] = f"{fake.first_name_female()} " + patient['Full Name']
    else:
        patient['Sex'] = "Male"
        patient['Full Name'] = fake.name_male()
        if middle_name == 1:
            patient['Full Name'] = f"{fake.first_name_male()} " + patient['Full Name']
            
    fake = Faker()
    if "Age" not in patient.keys():
        dob = fake.date_of_birth(minimum_age = 20, maximum_age=80)
        patient['Age'] = relativedelta(datetime.now(), dob).years

        if random.random() <= 0.8: 
            job = fake.job()
        else:
            job = "unemployed"
        if patient['Age'] <= 22:
            job = "student " + job
        if patient['Age'] >= 75:
            job = "retired " + job

    if "Date of Birth" not in patient.keys():
        patient['Date of Birth'] =dob.strftime("%B %#d %Y") #windows: replace %-d with %#d
        
    if "Handedness" not in patient.keys():
        patient['Handedness'] = random.choice(["left", "right", "ambidextrous"])
    
    
    # residence and marital status
    if "Address" not in patient.keys():
        patient['Address'] = f"{fake.street_address()}, {fake.city()}, British Columbia, Canada"
        
    if 'Relationship Status' not in patient.keys():
        patient['Relationship Status'] = random.choice(["married", "long term relationship", "casual relationship", "divorced", "windowed", "single"])
        
    if 'Children' not in patient.keys() and 'Children Details' not in patient.keys():
        patient['Children'] = random.choice(["Yes", "No"])
        
        if patient['Children'] == "Yes":
            patient['Children Details'] = "{Children Details}"
        
    # employment and financial status
    if "Occupation" not in patient.keys():
        patient['Occupation'] = job
        
    if 'Disability Assistance' not in patient.keys():
        patient['Disability Assistance'] = random.choice (["Yes", "No"])
        
        if patient['Disability Assistance'] == "Yes":
            patient['Disability Details'] = "{Disability Details}"
    
    # medical care information
    if 'Current Doctors' not in patient.keys():
        patient['Current Doctors'] = "{Current Doctors}"
        
    if "Allergies" not in patient.keys():
        patient['Allergies'] = f"{', '.join(random.sample(COMMON_ALLERGIES, random.randint(0,3)))}"

    if "Medical Conditions" not in patient.keys() and "Medications" not in patient.keys():
        conditions, meds = generate_health_conditions(random.randint(1,3))
        patient['Medical Conditions'] = ", ".join(conditions)
        patient['Medications'] = ", ".join([f"{med_name} ({dosage})" for med_name, dosage in meds])
        
    if "Health Supplements" not in patient.keys():
        patient['Health Supplements'] = random.sample(COMMON_HEALTH_SUPPLEMENTS, random.randint(0,3))
        
    # substance use
    if 'Recreational Drug Usage' not in patient.keys():
        if random.random() <= 0.3:
            patient['Recreational Drug Usage'] = f'{", ".join(random.sample(["beer", "wine", "cannabis", "cigarettes", "shrooms", "psychedelics"], random.randint(1,4)))}'
            if "Rehab or Substance Counselling" not in patient.keys():
                patient["Rehab or Substance Counselling"] = "{Rehab or Substance Counselling}"
                
    # medical history
    if "Reason for Appointment" not in patient.keys():
        reasons = random.sample(DR_APPOINTMENT_REASONS, random.randint(1,3))
        patient["Reason for Appointment"] = ", ".join(reasons)
    
    if "Previous Hospitalizations or Surgeries" not in patient.keys():
        patient['Previous Hospitalizations or Surgeries'] = random.choice(["Yes", "No"])
        
        if patient['Previous Hospitalizations or Surgeries'] == "Yes":
            patient['Previous Hospitalizations or Surgeries Details'] = "{Previous Hospitalizations or Surgeries Details}"
        
    if "Head Injuries or Concussions" not in patient.keys():
        patient['Head Injuries or Concussions'] = random.choice(["Yes", "No"])
        
        if patient['Head Injuries or Concussions'] == "Yes":
            patient["Head Injuries or Concussions Details"] = "{Head Injuries or Concussions Details}"
    
    if "History of Seizures" not in patient.keys():
        patient['History of Seizures'] = random.choice(["Yes", "No"])
        
    
    
    # family history
    if "Family History of Health Conditions" not in patient.keys():
        patient['Family History of Health Conditions'] = random.choice(["Yes", "No"])
        
        if patient['Family History of Health Conditions'] == "Yes":
            patient['Family History of Health Conditions Details'] = "{Family History of Health Conditions Details}"
        
    if 'Siblings' not in patient.keys() and 'Sibling Details' not in patient.keys():
        patient['Siblings'] = random.choice(["Yes", "No"])
        
        if patient['Siblings'] == "Yes":
            patient['Sibling Details'] = "{Sibling Details}"
        
    # personal history
    if "Birthplace" not in patient.keys():
        patient['Birthplace'] = f"{fake.city()}, British Columbia, Canada"
    
    if "Canadian Citizenship" not in patient.keys():
        patient['Canadian Citizenship'] = random.choice(["Yes", "No"])
        
    if "Developmental Difficulties" not in patient.keys():
        patient['Developmental Difficulties'] = random.choice(["Yes", "No"])
    
    # education
    if "Elementary School Performance" not in patient.keys():
        patient["Elementary School Performance"] = random.choice(["poor", "average", "good", "excellent"])
    if "High School Performance" not in patient.keys():
        patient["High School Performance"] = random.choice(["dropped out", "poor", "average", "good", "excellent"])
    if "Further Education after High School" not in patient.keys():
        patient['Canadian Citizenship'] = random.choice(["Yes", "No"])
        
    # previous employment and relationships
    if "Previous Work History" not in patient.keys():
        patient["Previous Work History"] = "{Previous Work History}"
    if "Previous Marriages/Long-Term Relationships" not in patient.keys():
        patient["Previous Marriages/Long-Term Relationships"] = "{Previous Marriages/Long-Term Relationships}"
        
    # hobbies and lifestyle
    if "Hobbies" not in patient.keys():
        patient["Hobbies"] = "{Hobbies}"
    if "Relaxation Methods" not in patient.keys():
        patient["Relaxation Methods"] = "{Relaxation Methods}"

    # attributes with possible do not disclose options
    dnd_attributes = [
        "Past Trauma", 
        "Substance Abuse", 
    ]

    for attribute in dnd_attributes:
        if attribute not in patient.keys():
            cond = random.random() #returns [0,1)
            if cond <= 0.5: patient[attribute] = "yes"
            else: patient[attribute] = "None"
    
    # create variation in conversations and tones
    if "Typing Style" not in patient.keys():
        patient["Typing Style"] = random.choice(TYPING_STYLES)
    if "Conversational Tone" not in patient.keys():
        patient["Conversational Tone"] = random.choice(CONVERSATIONAL_TONE)
    if "Personality Traits" not in patient.keys():
        patient["Personality Traits"] = random.sample(PERSONALITY_TRAITS, random.randint(1,4))
    if "Edge Case Scenario" not in patient.keys():
        cond = random.random() #returns [0,1)
        if cond <= 0.5: patient["Edge Case Scenario"] = random.choice(EDGE_CASE_SCENARIOS)
        else: patient[attribute] = "N/A"
        
    print("Patient Dict - BEFORE LLM", patient)
    return patient

            
    
#     age = str(random.randint(18, 80))
#     birth_date = f"{random.randint(1, 12)}/{random.randint(1, 28)}/{2024 - int(age)}"
#     birth_date = str(birth_date)
#     handedness = random.choice(["left", "right", "both"])
#     health_condition = random.choice(list(variables.HEALTH_CONDITIONS.keys()))
#     medications = ", ".join(random.sample(variables.HEALTH_CONDITIONS[health_condition], k=2))
#     dr_reason = random.choice(variables.DR_APPOINTMENT_REASONS)
#     conversational_tone = random.choice(variables.CONVERSATIONAL_TONE)
#     typing_style = random.choice(variables.TYPING_STYLES)
#     recreational_drugs = random.choice(variables.COMMON_RECREATIONAL_DRUGS)
#     health_supplements = random.choice(variables.COMMON_HEALTH_SUPPLEMENTS)
#     allergies = random.choice(variables.COMMON_ALLERGIES)
#     selected_personality_traits = random.sample(variables.PERSONALITY_TRAITS, random.randint(1, 4))


In [7]:
import re

def clean_stars_from_dict(patient_dict):
    for key, value in patient_dict.items():
        if isinstance(value, str):  # Only clean string values
            patient_dict[key] = value.lstrip('** ').strip()  # Remove leading '** ' and any surrounding spaces
    return patient_dict

# Function to update the dictionary with LLM output using flexible pattern matching
def hydrate_dict(patient_dict, llm_output):
    # Define patterns to capture various fields in the LLM output
    patterns = {
        'Full Name': r"Anglicized Full Name[:\s]*([^\n]+)",
        'Children Details' : r"Children Details[:\s]*([^\n]+)",
        'Disability Details': r"Disability Details[:\s]*([^\n]+)",
        'Current Doctors': r"Current Doctors[:\s]*([^\n]+)",
        'Previous Hospitalizations or Surgeries Details' : r"Previous Hospitalizations or Surgeries Details[:\s]*([^\n]+)",
        'Head Injuries or Concussions Details' : r"Head Injuries or Concussions Details[:\s]*([^\n]+)",
        'Rehab or Substance Counselling': r"Rehab or Substance Counselling[:\s]*([^\n]+)",
        'Family History of Health Conditions Details': r"Family History of Health Conditions Details[:\s]*([^\n]+)",
        'Sibling Details': r"Sibling Details[:\s]*([^\n]+)",
        'Previous Work History': r"Previous Work History[:\s]*([^\n]+)",
        'Previous Marriages/Long-Term Relationships': r"Previous Marriages/Long-Term Relationships[:\s]*([^\n]+)",
        'Hobbies': r"Hobbies[:\s]*([^\n]+)",
        'Relaxation Methods': r"Relaxation Methods[:\s]*([^\n]+)"
    }

    # Loop through each pattern and search for matches in the LLM output
    for key, pattern in patterns.items():
        match = re.search(pattern, llm_output, re.IGNORECASE)
        if match:
            patient_dict[key] = match.group(1).strip()  # Update dict with captured value
    
    clean_stars_from_dict(patient_dict)
    return patient_dict



In [8]:
def patient_to_csv(patient:dict, filepath:str="./llm_patients.csv"):
    '''
    Outputs the patient dictionary to the csv file at filepath.

    Parameters
    ----------
    patient: dict 
        randomized patient dictionary
    filepath: str
        filepath associated with the csv of patients; defaults to patients.csv"
    '''
    try:
        current_df = pd.read_csv(filepath, sep="|")
        current_dict = current_df.to_dict("list")
        existing_len = current_df.shape[0]
        
        #add variables in current dict not in patient dict
        missing_cols = list(set(list(current_dict.keys())) - set(list(patient.keys())))
        for missing_col in missing_cols:
            patient[missing_col] = ""
        
        #add everything from patient dict to to the current dict
        for col in patient.keys():
            if col in current_dict:
                current_dict[col].append(patient[col])
            else:
                #create a new column for variable in patient dict not in current dict
                buffer = [""]*existing_len
                buffer.append(patient[col])
                current_dict[col] = buffer

        updated_df = pd.DataFrame(current_dict)

    except FileNotFoundError:
        print(f"creating file at {filepath}")
        updated_dict = {}
        #save dictionary as csv with keys
        for col in patient.keys():
            updated_dict[col] = [f"{patient[col]}"]
        updated_df = pd.DataFrame(updated_dict)

    updated_df.to_csv(filepath, sep="|",index=False)

def patient_creator(filepath = "./llm_patients.csv"):
    """
    creates a randomized patient with optional preset attributes and adds it to the patients csv at the filepath
    """
    patient = generate_patient_profile(variables)
    patient_to_csv(patient, filepath)
    return patient

# if __name__ == '__main__':
#     patient_creator({}, filepath="../patients.csv")

In [9]:
patient_creator(filepath="../llm_patients.csv")

Patient Dict - BEFORE LLM {'Clinician Name': 'Destiny Taylor', 'Appointment Date': 'June 22', 'Sex': 'Female', 'Full Name': '清水 美加子', 'Age': 41, 'Date of Birth': 'June 28 1983', 'Handedness': 'left', 'Address': '88700 Philip Knolls Apt. 271, Port Randy, British Columbia, Canada', 'Relationship Status': 'married', 'Children': 'Yes', 'Children Details': '{Children Details}', 'Occupation': 'Market researcher', 'Disability Assistance': 'Yes', 'Disability Details': '{Disability Details}', 'Current Doctors': '{Current Doctors}', 'Allergies': 'ibuprofen, opioid pain medications, fragrances', 'Medical Conditions': 'panic disorder, social anxiety disorder', 'Medications': 'Alprazolam (Xanax) (0.25-0.5 mg, 3 times daily), Paroxetine (Paxil) (20-60 mg/day)', 'Health Supplements': ['multivitamins', 'herbal supplements'], 'Reason for Appointment': 'feelings of detachment or estrangement from oneself or surroundings, difficulty concentrating or making decisions', 'Previous Hospitalizations or Surger

{'Clinician Name': 'Destiny Taylor',
 'Appointment Date': 'June 22',
 'Sex': 'Female',
 'Full Name': '清水 美加子',
 'Age': 41,
 'Date of Birth': 'June 28 1983',
 'Handedness': 'left',
 'Address': '88700 Philip Knolls Apt. 271, Port Randy, British Columbia, Canada',
 'Relationship Status': 'married',
 'Children': 'Yes',
 'Children Details': 'Two children, Emily (12) and James (9), both living with the patient and her husband.',
 'Occupation': 'Market researcher',
 'Disability Assistance': 'Yes',
 'Disability Details': 'The patient has dysgraphia, a learning disability that affects writing skills, which was diagnosed in childhood. She also experiences occasional seizures due to an unknown cause.',
 'Current Doctors': '',
 'Allergies': 'ibuprofen, opioid pain medications, fragrances',
 'Medical Conditions': 'panic disorder, social anxiety disorder',
 'Medications': 'Alprazolam (Xanax) (0.25-0.5 mg, 3 times daily), Paroxetine (Paxil) (20-60 mg/day)',
 'Health Supplements': ['multivitamins', 'h