### Prompt Engineering OUTPATIENT HEALTHCARE + VERIFICATION ###
This notebook adds a second model output (`verified_op`) that indicates whether the outpatient prediction can be verified from the information given.

In [3]:
import os
import pandas as pd
import numpy as np
import logging

logger = logging.getLogger(__name__)

# Import this module with autoreload
%load_ext autoreload
%autoreload 2
import llmt
from llmt.llmtools import Prompt
from llmt.llmtools import MentalHealth, OutpatientServices
from llmt.llmtools import process_prompt
from llmt.openai import OpenAI, create_messages
from llmt.performance import Performance

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Parameters
model = 'gpt-4o'
temperature = 0

# Directories and files
# os.path.expanduser('~') is used instead of os.environ.get('HOME'): the latter
# returns None when HOME is unset, which makes os.path.join raise a TypeError.
# When HOME is set, expanduser('~') resolves to the same directory.
data_dir = os.path.join(os.path.expanduser('~'), 'home_data', 'hcp')
test_file_name = 'hcp-alldata-250413.parquet'
test_file = os.path.join(data_dir, test_file_name)

# Keep only the rows marked as 'train' (the labeled data) and store the
# three label columns as integers.
df = (
    pd.read_parquet(test_file)
    .loc[lambda frame: frame['dset'] == 'train']
    .reset_index(drop=True)
    .astype({'mental_health': int,
             'inpatient': int,
             'outpatient': int})
)
display(df.head())
print(df.shape)

Unnamed: 0,id,name,description,mental_health,inpatient,outpatient,dset
0,431643-07,Actriv,Provider of healthcare staffing services based...,2,0,0,train
1,310749-31,Alima,Operator of a non-governmental organization in...,0,0,0,train
2,162054-28,Apothecare,Provider of pharmacy services intended to prov...,0,0,0,train
3,597285-28,April Health (Clinics/Outpatient Services),Provider of mental health services intended to...,1,0,1,train
4,373978-90,Arise Child and Family Service,Operator of independent living centers caterin...,2,0,0,train


(187, 7)


### Prompt development ###

In [5]:
# Choose one example company labeled 1 for both mental health and outpatient
mental_health = 1
outpatient = 1
is_match = (df['mental_health'] == mental_health) & (df['outpatient'] == outpatient)
id_list = list(df.loc[is_match, 'id'].unique())
print(f'Found {len(id_list)} company ids')

# Position of the example within the matching ids
idx = 5
company_id = id_list[idx]
# To inspect a specific company instead, override the id:
# company_id = '135432-64'

# First row for the chosen id, shown in full and then as name + description
ser_id = df.loc[df['id'] == company_id].iloc[0]
display(ser_id)
name, description = ser_id['name'], ser_id['description']
print()
print(name)
print(description)

Found 87 company ids


id                                                       277413-40
name                                                Eleanor Health
description      Provider of evidence-based outpatient care and...
mental_health                                                    1
inpatient                                                        0
outpatient                                                       1
dset                                                         train
Name: 23, dtype: object


Eleanor Health
Provider of evidence-based outpatient care and addiction recovery services intended to help people suffering from substance abuse disorder. The company's services offer an integrated approach that includes medication-assisted treatment for addiction, evidence-based outpatient care, behavioral health and personalized recovery plans, enabling patients to fasten the recovery process.


In [6]:
# The system prompt is looked up by name: '<variable>_system_<zero-padded version>'
variable = 'outpatient'
prompt_version = 1
prompt_name = f'{variable}_system_{str(prompt_version).zfill(2)}'
prompt_store = Prompt()
system_prompt = prompt_store.load(prompt_name=prompt_name)

# The user prompt carries the business name and its description
def create_user_prompt(name: str, description: str, variable: str):
    """Build the user prompt asking whether an organization provides a service.

    Args:
        name: Name of the organization.
        description: Free-text description of the organization.
        variable: Service type inserted into the question
            ("... provide {variable} healthcare services?").

    Returns:
        The filled-in template after it has been passed through
        ``process_prompt``.
    """
    return process_prompt(f"""
        The organization {name} is described as: {description} 
        Does this organization provide {variable} healthcare services?
        """)

# Assemble the messages for the example company
user_prompt = create_user_prompt(
    name=name,
    description=description,
    variable=variable,
)
messages = create_messages(
    system_prompt=system_prompt,
    user_prompt=user_prompt,
)

In [7]:
# Show the system prompt and the user prompt, separated by a blank line
print(system_prompt, '', user_prompt, sep='\n')

You are a classification assistant. Your task is to analyze a healthcare-related organization's name and description, and return two Boolean values based on the following rules:


Outpatient_Services (Boolean):
Return True if the organization clearly provides direct outpatient medical services to human patients. These services include in-person or telemedicine-based care such as primary care, specialty care, behavioral or mental health therapy (only if it is not a software-only platform), urgent care, or other services that do not require overnight hospitalization. Return False if the organization does not clearly provide such services.


Verifiable (Boolean):
Return True if the classification in Outpatient_Services can be confidently determined using:

The provided name and description
The assistant’s general knowledge of the organization
Whether the organization clearly matches any of the exclusion criteria
Return False if the description is too vague or lacks sufficient information 

In [8]:
# Send the example prompt to the model.
# `model` and `temperature` are taken from the Parameters cell. Re-assigning them
# here (and hard-coding temperature=0 in the call) would silently override that
# configuration, including for the loop over all companies that reuses them.
response_format = OutpatientServices
client = OpenAI().create_client()

output = OpenAI().send_messages(messages=messages,
                                model=model,
                                response_format=response_format,
                                temperature=temperature,
                                client=client)
print(output)
# Replace the boolean with binary outcome prediction (anything other than True becomes 0)
key_list = ['pred_op', 'verified_op']
output.update({key: 1 if output.get(key) == True else 0 for key in key_list})
print(output)

{'pred_op': True, 'pred_op_score': 0.95, 'verified_op': True, 'refusal': None}
{'pred_op': 1, 'pred_op_score': 0.95, 'verified_op': 1, 'refusal': None}


In [9]:
# Classify every company: one model call per unique id, in sorted id order
company_id_list = sorted(df['id'].unique())
results_df_list = []
# Boolean fields of the model output that are recoded as 1/0
key_list = ['pred_op', 'verified_op']

for c, company_id in enumerate(company_id_list):
    # Report progress on every 20th company
    if not (c + 1) % 20:
        print(f'Sending description {c + 1} / {len(company_id_list)} to the model')

    # All rows for this id; the prompt is built from the first one
    df_id = df.loc[df['id'] == company_id]
    user_prompt_id = create_user_prompt(
        name=df_id['name'].iloc[0],
        description=df_id['description'].iloc[0],
        variable=variable,
    )
    messages = create_messages(system_prompt=system_prompt, user_prompt=user_prompt_id)
    output = OpenAI().send_messages(
        messages=messages,
        model=model,
        temperature=temperature,
        response_format=response_format,
        client=client,
    )

    # Replace the boolean with binary outcome prediction
    binary_values = {key: 1 if output.get(key) == True else 0 for key in key_list}
    output.update(binary_values)
    # Attach every output field as a column on this company's rows
    results_df_list.append(df_id.assign(**output))

results_df = pd.concat(results_df_list, axis=0, ignore_index=True)

# Save the results next to the input data
results_file_name = f'{variable}_{str(prompt_version).zfill(2)}_results.parquet'
results_file = os.path.join(data_dir, results_file_name)
results_df.to_parquet(results_file)
print(results_file)

Sending description 20 / 187 to the model
Sending description 40 / 187 to the model
Sending description 60 / 187 to the model
Sending description 80 / 187 to the model
Sending description 100 / 187 to the model
Sending description 120 / 187 to the model
Sending description 140 / 187 to the model
Sending description 160 / 187 to the model
Sending description 180 / 187 to the model
/app/home_data/hcp/outpatient_01_results.parquet


In [10]:
# Compare the model's `pred_op` predictions against the labeled `outpatient` column
performance = Performance(data=results_df)
display(performance.binary_performance(true_col='outpatient', pred_col='pred_op'))

{'p': 94,
 'n': 42,
 'tp': 76,
 'tn': 31,
 'fp': 11,
 'fn': 18,
 'recall': 0.8085,
 'precision': 0.8736,
 'min_precision': 0.6912,
 'specificity': 0.7381,
 'f_score': 0.8398}