### OpenAI performance calculations for HCP models and data sets ###
We have updates on data sets and prompts.
Use this code as a template to run predictions for the training and test sets.

In [3]:
import os
import copy
import numpy as np
import pandas as pd
import time
import logging
from pandas.core.frame import DataFrame

# Module-level logger, named after the current module
logger = logging.getLogger(__name__)

# Appearance of the Notebook
from IPython.display import display, HTML
# Inject a CSS rule that forces elements of class "container" to full width
display(HTML("<style>.container { width:100% !important; }</style>"))
# Allow 110 characters per line when numpy arrays are printed
np.set_printoptions(linewidth=110)
# Raise the pandas display limits: up to 500 rows, 100 columns, 1000 characters wide
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 1000)

# Import this module with autoreload
%load_ext autoreload
%autoreload 2
from llmt.performance import Performance

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# The data root is taken from the DATA environment variable. Fail early with a clear
# message when it is unset: os.path.join(None, ...) would otherwise raise an opaque TypeError.
data_root = os.environ.get('DATA')
if data_root is None:
    raise RuntimeError("Environment variable 'DATA' is not set; it must point to the data root directory")

# Directory of this run and the ground-truth training files it contains
data_dir = os.path.join(data_root, 'hcp', 'run_250720')
train_file_name_list = ['hcp-train-250413.parquet', 'hcp-train-250701.parquet']

In [8]:
# Count the labels for the ground-truth training files.
# For every training file and every label column, display a table of label counts
# (sorted by label value, highest first) and print the sum of the displayed counts.
col_list = ['mental_health', 'inpatient', 'outpatient']
for train_file_name in train_file_name_list:
    # The data set name is the file name without its extension
    dataset_name = os.path.splitext(train_file_name)[0]
    data_file = os.path.join(data_dir, train_file_name)
    df = pd.read_parquet(data_file)
    print(f'dataset: {dataset_name}')
    for col in col_list:
        ct = (
            df[col]
            .value_counts()
            .to_frame()
            .reset_index(drop=False)
            .sort_values(by=col, ascending=False)
            .reset_index(drop=True)
        )
        ct.insert(loc=0, column='dataset', value=dataset_name)
        display(ct)
        # Double-quoted f-string on purpose: reusing the enclosing quote character inside
        # the replacement field is a SyntaxError on Python versions before 3.12.
        print(f"TOTAL: {ct['count'].sum()}")

dataset: hcp-train-250413


Unnamed: 0,dataset,mental_health,count
0,hcp-train-250413,2.0,16
1,hcp-train-250413,1.0,136
2,hcp-train-250413,0.0,35


TOTAL: 187


Unnamed: 0,dataset,inpatient,count
0,hcp-train-250413,2.0,5
1,hcp-train-250413,1.0,69
2,hcp-train-250413,0.0,113


TOTAL: 187


Unnamed: 0,dataset,outpatient,count
0,hcp-train-250413,2.0,51
1,hcp-train-250413,1.0,94
2,hcp-train-250413,0.0,42


TOTAL: 187
dataset: hcp-train-250701


Unnamed: 0,dataset,mental_health,count
0,hcp-train-250701,1,125
1,hcp-train-250701,0,62


TOTAL: 187


Unnamed: 0,dataset,inpatient,count
0,hcp-train-250701,1,71
1,hcp-train-250701,0,116


TOTAL: 187


Unnamed: 0,dataset,outpatient,count
0,hcp-train-250701,1,115
1,hcp-train-250701,0,72


TOTAL: 187


### Performance calculations ###

In [32]:
# Mapping: ground-truth column name -> prediction column name.
# Keeping the pairs in one place ensures the correct columns are combined.
# The last two entries are the combination columns needed for the performance table.
col_dict = {
    'mental_health': 'pred_mh',
    'inpatient': 'pred_ip',
    'outpatient': 'pred_op',
    'mental_health_inpatient': 'pred_mh_ip',
    'mental_health_outpatient': 'pred_mh_op',
}

def performance_table(data:DataFrame, true_pred_cols:dict) -> DataFrame:
    """
    Create a binary performance table.

    For every (true column, predicted column) pair the binary performance is
    computed with ``Performance.binary_performance`` and stored as one row.

    :param data: frame handed to ``Performance``; presumably it holds all the
        true and predicted columns named in ``true_pred_cols``
    :param true_pred_cols: mapping of true column name -> predicted column name
    :return: one row per pair, in mapping order; the first column 'category'
        holds the true column name, the remaining columns are whatever
        ``binary_performance`` returns (assumed to be a dict of scalars)
    """
    rows = []
    for row_idx, true_col in enumerate(true_pred_cols):
        pred_col = true_pred_cols[true_col]
        metrics = Performance(data=data).binary_performance(true_col=true_col, pred_col=pred_col)
        # A one-element index turns the metrics dict into a single-row frame
        row = pd.DataFrame(metrics, index=[row_idx])
        row.insert(loc=0, column='category', value=true_col)
        rows.append(row)
    return pd.concat(rows, axis=0)

In [29]:
# Choose the training data set whose predictions are evaluated
dset_idx = 1
train_file_name = train_file_name_list[dset_idx]
print(f'DATA SET: {train_file_name}')

# The predictions are stored next to the training file as '<name>-samples.parquet'
pred_file_name = f'{os.path.splitext(train_file_name)[0]}-samples.parquet'
print(pred_file_name)
pred_file = os.path.join(data_dir, pred_file_name)
df = pd.read_parquet(pred_file)

# Models and prompts found in the predictions
model_name_list = list(df['model'].unique())
prompt_list = list(df['prompt'].unique())

# Select a model and a prompt by their position in the lists above
m = 1
model_name = model_name_list[m]

p = 1
prompt = prompt_list[p]
print(f'MODEL: {model_name} PROMPT: {prompt}')

# Keep only the rows of the selected model/prompt combination
is_selected = (df['model'] == model_name) & (df['prompt'] == prompt)
df1 = df.loc[is_selected]

# Performance table for the selection, labelled with the model and the prompt
performance_df = performance_table(data=df1, true_pred_cols=col_dict)
performance_df.insert(loc=0, column='model', value=model_name)
performance_df.insert(loc=1, column='prompt', value=prompt)
display(performance_df)

DATA SET: hcp-train-250701.parquet
hcp-train-250701-samples.parquet
MODEL: gpt-4.1 PROMPT: 2


Unnamed: 0,model,prompt,category,p,n,tp,tn,fp,fn,recall,precision,min_precision,specificity,f_score
0,gpt-4.1,2,mental_health,125,62,90,54,8,35,0.72,0.9184,0.6684,0.871,0.8072
1,gpt-4.1,2,inpatient,71,116,70,109,7,1,0.9859,0.9091,0.3797,0.9397,0.9459
2,gpt-4.1,2,outpatient,115,72,91,52,20,24,0.7913,0.8198,0.615,0.7222,0.8053
3,gpt-4.1,2,mental_health_inpatient,62,125,52,119,6,10,0.8387,0.8966,0.3316,0.952,0.8667
4,gpt-4.1,2,mental_health_outpatient,102,85,69,71,14,33,0.6765,0.8313,0.5455,0.8353,0.7459


### Summary stats for the test predictions ###

In [33]:
# Show the true -> predicted column mapping before summarising the test predictions
display(col_dict)

{'mental_health': 'pred_mh',
 'inpatient': 'pred_ip',
 'outpatient': 'pred_op',
 'mental_health_inpatient': 'pred_mh_ip',
 'mental_health_outpatient': 'pred_mh_op'}

In [36]:
# Names of the prediction columns, i.e. the values of col_dict.
# NOTE(review): dict.values() is a live view, so pred_cols follows any later change to col_dict.
pred_cols = col_dict.values()
print(pred_cols)

dict_values(['pred_mh', 'pred_ip', 'pred_op', 'pred_mh_ip', 'pred_mh_op'])


In [62]:
# Load the predictions for the test set and preview the first rows and the shape
test_file_name = 'hcp-test-250701-samples.parquet'
test_file = os.path.join(data_dir, test_file_name)
df = pd.read_parquet(test_file)
display(df.head(2))
print(df.shape)
# Use the first model and prompt found in the file as labels.
# NOTE(review): presumably the file holds a single model/prompt combination — confirm.
model = df['model'].unique()[0]
prompt = df['prompt'].unique()[0]

Unnamed: 0,model,prompt,id,name,description,dset,pred_mh,pred_ip,pred_op,verified_op,pred_mh_ip,pred_mh_op
0,gpt-4.1,2,525519-64,10-4 Medical,Provider focused on whole-person healthcare in...,test,1.0,0.0,1.0,1.0,0.0,1.0
1,gpt-4.1,2,111309-13,12 Keys Rehab,Provider of rehabilitation services intended t...,test,1.0,1.0,0.0,0.0,1.0,0.0


(1838, 12)


In [40]:
# Find rows where the description or the name is missing
missing_mask = df['description'].isna() | df['name'].isna()
df_missing = df.loc[missing_mask]
display(df_missing[['id', 'name', 'description']])

Unnamed: 0,id,name,description
310,145425-97,Cherokee Health Systems,
529,256053-16,Florida Recovery Group,
1645,130632-04,Venture Forthe,
1755,128649-61,accreditation commission for health care,
1761,316456-30,building blocks counseling,


In [79]:
# Count the 0/1 predictions for every prediction column of the test set
category_order = [0, 1]
stat_df_list = []
for col in pred_cols:
    # Drop rows with a missing id or prediction, then turn the prediction
    # into an ordered categorical with the categories 0 and 1
    df_col = (
        df[['id', col]]
        .dropna(axis=0)
        .astype({col: int})
        .astype({col: 'category'})
    )
    df_col[col] = df_col[col].cat.set_categories(category_order, ordered=True)

    # One row per category with its count, sorted by the category value
    cnt = (
        df_col[col]
        .value_counts()
        .to_frame()
        .reset_index(drop=False)
        .sort_values(by=col, ascending=True)
        .rename(columns={col: 'value'})
    )

    # 'bool' is the textual True/False form of 'value'
    bool_labels = cnt['value'].apply(lambda v: 'True' if v==1 else 'False')

    # Label the counts with the model, the prompt and the prediction column
    cnt.insert(loc=0, column='model', value=model)
    cnt.insert(loc=1, column='prompt', value=prompt)
    cnt.insert(loc=2, column='category', value=col)
    cnt.insert(loc=4, column='bool', value=bool_labels)
    stat_df_list.append(cnt)

# Stack the per-column counts into one table with a fresh index
stat_df = pd.concat(stat_df_list, axis=0, ignore_index=True)
display(stat_df)

Unnamed: 0,model,prompt,category,value,bool,count
0,gpt-4.1,2,pred_mh,0,False,741
1,gpt-4.1,2,pred_mh,1,True,1092
2,gpt-4.1,2,pred_ip,0,False,1322
3,gpt-4.1,2,pred_ip,1,True,511
4,gpt-4.1,2,pred_op,0,False,668
5,gpt-4.1,2,pred_op,1,True,1165
6,gpt-4.1,2,pred_mh_ip,0,False,1450
7,gpt-4.1,2,pred_mh_ip,1,True,383
8,gpt-4.1,2,pred_mh_op,0,False,883
9,gpt-4.1,2,pred_mh_op,1,True,950


Unnamed: 0,model,prompt,category,value,bool,count
1,gpt-4.1,2,pred_mh_op,0,False,883
0,gpt-4.1,2,pred_mh_op,1,True,950
