### Script to run all prompts on the data set ###

In [None]:
import os
import pandas as pd
import numpy as np
import logging
import time

logger = logging.getLogger(__name__)

# Import this module with autoreload
%load_ext autoreload
%autoreload 2
import llmt
from llmt.llmtools import Prompt
from llmt.llmtools import MentalHealth, OutpatientServices, InpatientServices
from llmt.llmtools import process_prompt
from llmt.openai import OpenAI, create_messages
from llmt.performance import Performance

In [None]:
# Parameters
model = 'gpt-4o'
temperature = 0

# Directories and files
# HOME is not guaranteed to be set (e.g. on Windows); os.path.join would raise
# a TypeError on None, so fall back to the platform's home directory.
home_dir = os.environ.get('HOME')
if home_dir is None:
    home_dir = os.path.expanduser('~')
data_dir = os.path.join(home_dir, 'home_data', 'hcp')
test_file_name = 'hcp-alldata-250413.parquet'
test_file = os.path.join(data_dir, test_file_name)
df_all = pd.read_parquet(test_file)

# Filter the labeled data (rows with dset == 'train') and store the three
# label columns as integers
df_train = (
    df_all.loc[df_all['dset'] == 'train']
    .reset_index(drop=True)
    .astype({'mental_health': int,
             'inpatient': int,
             'outpatient': int})
)
display(df_train.head())
print(df_train.shape)

In [None]:
# Add some additional samples: rows without a mental_health label
test_samples = 10
random_state = 111
df_test = (
    df_all.loc[df_all['mental_health'].isnull()]
    .sample(n=test_samples, replace=False, random_state=random_state)
    .reset_index(drop=True)
)
# Combine the training and test samples and shuffle the rows
# (sample(frac=1) returns all rows in random order)
df = (
    pd.concat([df_train, df_test], axis=0, ignore_index=True)
    .sample(frac=1, random_state=random_state)
    .reset_index(drop=True)
)
# Report the number of unique ids per data set. The count is computed outside
# the f-string: reusing the enclosing quote character inside an f-string
# expression is a SyntaxError before Python 3.12.
for dset in ['train', 'test']:
    n_ids = len(df.loc[df['dset'] == dset, 'id'].unique())
    print(f'{dset}: {n_ids}')

display(df.head())

In [None]:
def _predict_binary(name, description, version, variable, response_format,
                    fields, model='gpt-4o', client=None, temperature=0):
    """Ask the model one yes/no question about an organization.

    Shared implementation behind predict_mh, predict_ip and predict_op, which
    differ only in the variable name, the response format and the fields kept.

    Args:
        name: Name of the organization, inserted into the user prompt.
        description: Description of the organization, inserted into the
            user prompt.
        version: Version of the system prompt; zero-padded to two digits to
            build the prompt name '<variable>_system_<version>'.
        variable: Name of the outcome, used in the prompt name and in the
            question text.
        response_format: Object passed through as ``response_format`` to
            ``OpenAI().send_messages``.
        fields: Keys of the model output to keep.
        model: Model name passed to ``send_messages``.
        client: Client passed to ``send_messages``.
        temperature: Sampling temperature passed to ``send_messages``.

    Returns:
        dict mapping each key in ``fields`` to 1 if the model output for that
        key equals True, and to 0 otherwise (including when the key is
        missing from the output).
    """
    prompt_name = f'{variable}_system_{str(version).zfill(2)}'
    system_prompt = Prompt().load(prompt_name=prompt_name)
    user_prompt = process_prompt(f"""
                    The organization {name} is described as: {description} 
                    Does this organization provide {variable} healthcare services?
                    """)
    messages = create_messages(system_prompt=system_prompt, user_prompt=user_prompt)
    output = OpenAI().send_messages(messages=messages,
                                    model=model,
                                    temperature=temperature,
                                    response_format=response_format,
                                    client=client)
    # Replace the boolean fields with binary outcome predictions and keep only
    # the requested fields. The comparison is deliberately `== True` (not
    # `is True`) so that the mapping matches the previous behavior exactly.
    return {field: 1 if output.get(field) == True else 0  # noqa: E712
            for field in fields}


def predict_mh(name, description, version, model='gpt-4o', client=None, temperature=0):
    """Predict whether an organization provides mental health services.

    Loads the system prompt 'mental_health_system_<version>' and sends it with
    a question about the organization to the model, using MentalHealth as the
    response format.

    Returns:
        dict with the single key 'pred_mh' holding 1 or 0.
    """
    return _predict_binary(name=name,
                           description=description,
                           version=version,
                           variable='mental_health',
                           response_format=MentalHealth,
                           fields=('pred_mh',),
                           model=model,
                           client=client,
                           temperature=temperature)


def predict_ip(name, description, version, model='gpt-4o', client=None, temperature=0):
    """Predict whether an organization provides inpatient services.

    Loads the system prompt 'inpatient_system_<version>' and sends it with a
    question about the organization to the model, using InpatientServices as
    the response format.

    Returns:
        dict with the single key 'pred_ip' holding 1 or 0.
    """
    return _predict_binary(name=name,
                           description=description,
                           version=version,
                           variable='inpatient',
                           response_format=InpatientServices,
                           fields=('pred_ip',),
                           model=model,
                           client=client,
                           temperature=temperature)


def predict_op(name, description, version, model='gpt-4o', client=None, temperature=0):
    """Predict whether an organization provides outpatient services.

    Loads the system prompt 'outpatient_system_<version>' and sends it with a
    question about the organization to the model, using OutpatientServices as
    the response format.

    Returns:
        dict with the keys 'pred_op' and 'verified_op', each holding 1 or 0.
    """
    return _predict_binary(name=name,
                           description=description,
                           version=version,
                           variable='outpatient',
                           response_format=OutpatientServices,
                           fields=('pred_op', 'verified_op'),
                           model=model,
                           client=client,
                           temperature=temperature)

In [None]:
# Smoke test: run the three prediction functions on a single company
company_id_list = sorted(df['id'].unique())
company_id = company_id_list[10]
df_id = df.loc[df['id'] == company_id]
# Take the name and description from the first row of that company
name, description = (df_id[col].values[0] for col in ('name', 'description'))
client = OpenAI().create_client()

model_params = dict(model='gpt-4o', client=client, temperature=0)

# Mental health uses version 2 of its system prompt, the other two version 1
response_mh = predict_mh(name=name, description=description, version=2, **model_params)
print(response_mh)

response_ip = predict_ip(name=name, description=description, version=1, **model_params)
print(response_ip)

response_op = predict_op(name=name, description=description, version=1, **model_params)
print(response_op)

In [None]:
client = OpenAI().create_client()
model_params = {'model': 'gpt-4o',
                'client': client,
                'temperature': 0}

# Number of times the full set of prompts is repeated on the data
runs = 10

# Run the prompt on all data
company_id_list = sorted(df['id'].unique())
results_df_list = []
start_time = time.perf_counter()

# The rows of a company do not change between runs, so filter the data frame
# once per company instead of once per company and run.
company_rows = {company_id: df.loc[df['id'] == company_id]
                for company_id in company_id_list}

for run in range(runs):
    # Report the time elapsed so far at the start of every run
    execution_time = time.perf_counter() - start_time
    execution_time_min = np.round(execution_time/60, decimals=1)
    print(f'Execution time: {execution_time_min} minutes.')
    print(f'STARTING RUN {run + 1} / {runs}')
    for c, company_id in enumerate(company_id_list):
        # Progress message for every 20th company
        if (c + 1) % 20 == 0:
            print(f'Sending description {c + 1} / {len(company_id_list)} to the model')

        df_id = company_rows[company_id]
        name = df_id['name'].values[0]
        description = df_id['description'].values[0]

        # Mental health predictions
        response_mh = predict_mh(name=name, description=description, version=2, **model_params)

        # Inpatient predictions
        response_ip = predict_ip(name=name, description=description, version=1, **model_params)

        # Outpatient predictions
        response_op = predict_op(name=name, description=description, version=1, **model_params)

        # Attach all predictions plus the run number and temperature.
        # assign() returns a new data frame, so the cached rows in
        # company_rows stay untouched for the next run.
        df_id = df_id.assign(**response_mh,
                             **response_ip,
                             **response_op,
                             run=run,
                             temp=model_params.get('temperature'))

        # Add the new data frame to the list
        results_df_list.append(df_id)

results_df = pd.concat(results_df_list, axis=0, ignore_index=True)

# Save the results
results_file_name = '250420_results_10_runs.parquet'
results_file = os.path.join(data_dir, results_file_name)
results_df.to_parquet(results_file)
print(results_file)
end_time = time.perf_counter()
execution_time = end_time - start_time
execution_time_min = np.round(execution_time/60, decimals=1)
print(f'Execution time: {execution_time_min} minutes.')

In [None]:
# Load the data and check the results
# results_file_name = 'inpatient_01_results.parquet'
results_file_name = '250420_results_10_runs.parquet'
results_file = os.path.join(data_dir, results_file_name)
df = pd.read_parquet(results_file)

# Evaluate on the training rows only: these are the rows selected as labeled
# data earlier in the notebook, while the added test rows were drawn from rows
# without a mental_health label.
# NOTE(review): results_train was previously computed but never used and the
# full data frame was scored instead -- confirm that scoring only the labeled
# rows is the intended behavior.
results_train = df.loc[df['dset'] == 'train']

# Mapping true variable and predictions
variable_dict = {'mental_health': 'pred_mh',
                 'inpatient': 'pred_ip',
                 'outpatient': 'pred_op'}

for true_col, pred_col in variable_dict.items():
    print(f'PERFORMANCE: {true_col.upper()}')
    performance = Performance(data=results_train.copy())
    display(performance.binary_performance(true_col=true_col, pred_col=pred_col))
    print()