### Export experimental predictions ###

In [1]:
import os
import glob
import numpy as np
import pandas as pd
import glob
from matplotlib import pyplot as plt
import seaborn as sns

# Notebook appearance: stretch the cell container to the full browser width
# and widen the numpy / pandas text output.
from IPython.display import HTML, display

display(HTML("<style>.container { width:100% !important; }</style>"))
np.set_printoptions(linewidth=110)
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 100),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Load IPython's autoreload extension before importing the project package.
# Mode 2 reloads modules before each cell is executed, so edits to llmt are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
import llmt
from llmt.performance import Performance

In [3]:
# Directories and data files
experiment_name = 'exports_500mh3ip1op1'
# expanduser('~') resolves the home directory even when $HOME is unset;
# os.environ.get('HOME') would return None there and os.path.join would
# fail with a TypeError.
data_root = os.path.join(os.path.expanduser('~'), 'home_data')
data_dir = os.path.join(data_root, 'hcp')
output_dir = os.path.join(data_dir, experiment_name)

# Collect the data from the completed experiment
parquet_pattern = os.path.join(output_dir, '250423_mh3ip1op1_*.parquet')
parquet_file_list = sorted(glob.glob(parquet_pattern))
print(len(parquet_file_list))
if not parquet_file_list:
    # Report the pattern that matched nothing instead of letting pd.concat
    # fail later with its generic "No objects to concatenate" error.
    raise FileNotFoundError(f'No parquet files match {parquet_pattern}')

df_list = []
for file in parquet_file_list:
    # The run number is the second-to-last '_'-separated token of the file name
    run_number = int(os.path.basename(file).rsplit('_', maxsplit=2)[1])
    df_list.append(pd.read_parquet(file).assign(run=run_number))

# One frame for all runs, ordered by run, then temperature, then id
df = (
    pd.concat(df_list, axis=0, ignore_index=True)
    .sort_values(by=['run', 'temperature', 'id'], ascending=True)
    .reset_index(drop=True)
)
print(df.shape)
display(df.head())

1
(687, 13)


Unnamed: 0,id,name,description,mental_health,inpatient,outpatient,dset,pred_mh,pred_ip,pred_op,verified_op,temperature,run
0,100289-80,At Home Health Services,Provider of home healthcare services. The comp...,,,,test,1,0,1,1,0.0,1
1,100296-37,Behavioral Pathway Systems,Provider of healthcare benchmarking services i...,,,,test,0,0,0,1,0.0,1
2,100363-69,Chilton Hospital,Operator of inpatient hospital center. The com...,1.0,1.0,1.0,train,1,1,1,1,0.0,1
3,100501-21,Fortwood Center,Provider of outpatient mental health care serv...,,,,test,1,0,1,1,0.0,1
4,100593-37,Leadership Health,Operator of a healthcare program platform inte...,,,,test,0,0,0,0,0.0,1


In [4]:
# Ground-truth label column -> column holding the matching prediction
variable_dict = dict(
    mental_health='pred_mh',
    inpatient='pred_ip',
    outpatient='pred_op',
)

# Prompt version recorded for each label column; the values mirror the
# 'mh3ip1op1' tag used in the experiment file names.
prompt_version = dict(mental_health=3, inpatient=1, outpatient=1)

In [11]:
# Order the rows for export: all training rows first, then all test rows,
# each group sorted alphabetically by name. The 'run' column is dropped.
dset_frames = []
for dset_name in ('train', 'test'):
    in_dset = df['dset'] == dset_name
    subset = (
        df[in_dset]
        .drop(columns='run')
        .sort_values('name')
        .reset_index(drop=True)
    )
    print(f'dset {dset_name}: {subset.shape}')
    dset_frames.append(subset)

# ignore_index gives the combined frame one continuous 0..n-1 index
df_sorted = pd.concat(dset_frames, ignore_index=True)
display(df_sorted.head())

dset train: (187, 12)
dset test: (500, 12)


Unnamed: 0,id,name,description,mental_health,inpatient,outpatient,dset,pred_mh,pred_ip,pred_op,verified_op,temperature
0,491639-77,144-Bed Hospital Facility Joint Venture in Den...,Provider of behavioral health services located...,1.0,1.0,1.0,train,1,1,1,1,0.0
1,10995-58,Acadia Healthcare (NAS: ACHC),Acadia Healthcare Co Inc acquires and develops...,1.0,1.0,2.0,train,1,1,0,1,0.0
2,431643-07,Actriv,Provider of healthcare staffing services based...,2.0,0.0,0.0,train,0,0,0,1,0.0
3,310749-31,Alima,Operator of a non-governmental organization in...,0.0,0.0,0.0,train,1,1,1,1,0.0
4,107240-50,Alvarado Parkway Institute,Operator of a psychiatric health care facility...,1.0,1.0,1.0,train,1,1,1,1,0.0


In [14]:
# Export the sorted predictions as a .csv file in the experiment directory.
# NOTE: to_csv writes the row index as the first (unnamed) column by default.
csv_file_name = '250423_mh3ip1op1_T0.csv'
csv_file = os.path.join(output_dir, csv_file_name)
df_sorted.to_csv(path_or_buf=csv_file)

In [22]:
# Build one performance row per label column, scored on the training rows only
df_train = df_sorted[df_sorted['dset'] == 'train']
temperature = 0
performance_df_list = []
for true_col, pred_col in variable_dict.items():
    scorer = Performance(data=df_train)
    performance_dict = scorer.binary_performance(true_col=true_col, pred_col=pred_col)
    label_row = pd.DataFrame(performance_dict, index=[0])
    # Positional inserts: 'category' becomes the first column, then
    # 'temperature' and 'prompt_version' go in at positions 2 and 3.
    label_row.insert(loc=0, column='category', value=true_col)
    label_row.insert(loc=2, column='temperature', value=temperature)
    label_row.insert(loc=3, column='prompt_version', value=prompt_version.get(true_col))
    performance_df_list.append(label_row)

# Stack the per-label rows into the summary table and show it
performance_df = pd.concat(performance_df_list, axis=0, ignore_index=True)
performance_df

Unnamed: 0,category,p,temperature,prompt_version,n,tp,tn,fp,fn,recall,precision,min_precision,specificity,f_score
0,mental_health,136,0,3,35,108,29,6,28,0.7941,0.9474,0.7953,0.8286,0.864
1,inpatient,69,0,1,113,68,103,10,1,0.9855,0.8718,0.3791,0.9115,0.9252
2,outpatient,94,0,1,42,79,31,11,15,0.8404,0.8778,0.6912,0.7381,0.8587


In [26]:
# Write the performance summary next to the exported predictions
stat_file_base = '250423_mh3ip1op1_T0_performance'
stat_file_csv = os.path.join(output_dir, stat_file_base + '.csv')
performance_df.to_csv(path_or_buf=stat_file_csv)