### Script to run all prompts on the data set ###

In [1]:
import os
import pandas as pd
import numpy as np
import logging
import time
from pathlib import Path

logger = logging.getLogger(__name__)

# Import this module with autoreload
%load_ext autoreload
%autoreload 2
import llmt
from llmt.llmtools import Prompt
from llmt.llmtools import process_prompt
from llmt.openai import OpenAIModel, OpenAI
from llmt.openai import MentalHealth, OutpatientServices, InpatientServices, create_messages
from llmt.performance import Performance

In [2]:
# Directories and files
# HOME may be unset (e.g. on Windows); fall back to Path.home() so os.path.join
# does not fail with a TypeError on None.
data_dir = os.path.join(os.environ.get('HOME') or Path.home(), 'home_data', 'hcp')
experiment_name = 'hcp_experiment_test'
# All artifacts written by this notebook go under <data_dir>/<experiment_name>.
output_dir = os.path.join(data_dir, experiment_name)
Path(output_dir).mkdir(exist_ok=True, parents=True)

test_file_name = 'hcp-alldata-250413.parquet'
test_file = os.path.join(data_dir, test_file_name)
df_all = pd.read_parquet(test_file)

# Filter the labeled data: keep the rows flagged as 'train' and store the three
# label columns as integers.
label_dtypes = {'mental_health': int,
                'inpatient': int,
                'outpatient': int}
df_train = (
    df_all.loc[df_all['dset'] == 'train']
    .reset_index(drop=True)
    .astype(label_dtypes)
)
display(df_train.head())
print(df_train.shape)

Unnamed: 0,id,name,description,mental_health,inpatient,outpatient,dset
0,431643-07,Actriv,Provider of healthcare staffing services based...,2,0,0,train
1,310749-31,Alima,Operator of a non-governmental organization in...,0,0,0,train
2,162054-28,Apothecare,Provider of pharmacy services intended to prov...,0,0,0,train
3,597285-28,April Health (Clinics/Outpatient Services),Provider of mental health services intended to...,1,0,1,train
4,373978-90,Arise Child and Family Service,Operator of independent living centers caterin...,2,0,0,train


(187, 7)


In [3]:
# Add some additional samples drawn from the rows that have no mental-health label
test_samples = 0
random_state = 111
df_test = (
    df_all.loc[df_all['mental_health'].isnull()]
    .sample(n=test_samples, replace=False, random_state=random_state)
    .reset_index(drop=True)
)

# Combine the training and test samples, then shuffle the rows reproducibly.
# NOTE(review): in the recorded output the label columns appear as floats after
# this concat even though df_train stored them as int - confirm downstream code
# is fine with that.
df = (
    pd.concat([df_train, df_test], axis=0, ignore_index=True)
    .sample(frac=1, random_state=random_state)
    .reset_index(drop=True)
)

# Report the number of distinct company ids per data-set split.
for dset in ['train', 'test']:
    # Computed outside the f-string: reusing the same quote character inside an
    # f-string expression is a SyntaxError on Python versions before 3.12.
    n_ids = len(df.loc[df['dset'] == dset, 'id'].unique())
    print(f'{dset}: {n_ids}')

display(df.head())

train: 187
test: 0


Unnamed: 0,id,name,description,mental_health,inpatient,outpatient,dset
0,248187-88,River Crest Hospital,Provider of psychiatric diagnostic medical ser...,1.0,1.0,1.0,train
1,127477-00,Curry Health District,Operator of a healthcare network in Southern O...,0.0,1.0,1.0,train
2,11809-18,BrightSpring Health Services (NAS: BTSG),BrightSpring Health Services Inc is a home and...,2.0,0.0,0.0,train
3,144253-45,Bon Secours Baltimore Hospital,Operator of a full-service hospital in Baltimo...,0.0,1.0,1.0,train
4,439172-02,East Tennessee Behavioral Health,Operator of an inpatient behavioral health hos...,1.0,1.0,2.0,train


In [None]:
# Run configuration
temperature = 0
runs = 1
results_file_base = '250421_results'

# Run the prompt on all data: one pass over every distinct company id, sorted
company_id_list = sorted(list(df['id'].unique()))
start_time = time.perf_counter()

# Instantiate the model class
model = OpenAIModel()

for run in range(runs):
    # Each run is written to its own zero-padded parquet file in output_dir.
    results_run_file_name = f'{results_file_base}_{str(run).zfill(2)}.parquet'
    results_run_file = os.path.join(output_dir, results_run_file_name)
    print(f'STARTING RUN {run + 1} / {runs}: {results_run_file_name}')
    results_run_df_list = []
    for c, company_id in enumerate(company_id_list):
        # Progress message every 20 companies
        if (c + 1) % 20 == 0:
            print(f'Sending description {c + 1} / {len(company_id_list)} to the model')

        # Rows for this company; name/description are taken from the first row.
        df_id = df.loc[df['id'] == company_id]
        name = df_id['name'].values[0]
        description = df_id['description'].values[0]

        # Each predict_* response is unpacked into assign(), so it must be a
        # mapping of column name -> value.
        # NOTE(review): `version` presumably selects the prompt variant - confirm
        # in llmt.openai.

        # Mental health predictions
        response_mh = model.predict_mh(name=name, description=description, version=2, temperature=temperature)
        df_id = df_id.assign(**response_mh)

        # Inpatient predictions
        response_ip = model.predict_ip(name=name, description=description, version=1, temperature=temperature)
        df_id = df_id.assign(**response_ip)

        # Outpatient predictions
        response_op = model.predict_op(name=name, description=description, version=1, temperature=temperature)
        df_id = df_id.assign(**response_op)

        results_run_df_list.append(df_id)

    # Combine the per-company frames and record the temperature used
    results_run_df = pd.concat(results_run_df_list, axis=0, ignore_index=True)
    results_run_df = results_run_df.assign(temperature=temperature)
    # Save it
    results_run_df.to_parquet(results_run_file)

    # Report cumulative elapsed time once the run has finished, so the final
    # run's duration is reported too.
    execution_time = time.perf_counter() - start_time
    execution_time_min = np.round(execution_time/60, decimals=1)
    print(f'Execution time: {execution_time_min} minutes.')

In [None]:
# Load the data and check the results
# NOTE(review): this file name does not follow the '<results_file_base>_NN.parquet'
# pattern written by the run cell - confirm it points at the intended results.
results_file_name = 'hcp_experiment_01_01_t1.parquet'
results_file = os.path.join(output_dir, results_file_name)
# NOTE: this rebinds `df` (previously the shuffled input frame) to the saved results.
df = pd.read_parquet(results_file)

# Score only the 'train' rows; the other rows are drawn from data without a
# mental-health label.
results_train = df.loc[df['dset'] == 'train']

# Mapping true variable and predictions
variable_dict = {'mental_health': 'pred_mh',
                 'inpatient': 'pred_ip',
                 'outpatient': 'pred_op'}

for true_col, pred_col in variable_dict.items():
    print(f'PERFORMANCE: {true_col.upper()}')
    # Pass a copy so Performance cannot modify results_train.
    performance = Performance(data=results_train.copy())
    display(performance.binary_performance(true_col=true_col, pred_col=pred_col))
    print()