### Multi-Model Performance Evaluation on the training set ###
Use the manual labels as ground truth

In [1]:
import os
import copy
import numpy as np
import pandas as pd
from typing import List
import glob
import logging
from collections import deque

logger = logging.getLogger(__name__)

# Binary perfomance metrics
from sklearn.metrics import confusion_matrix

# Import this module with autoreload
%load_ext autoreload
%autoreload 2
import llmt
from llmt.performance import Performance, binary_performance

In [2]:
data_root = os.environ.get('DATA')
data_dir = os.path.join(data_root, 'hcp')
model_dir = os.path.join(data_dir, 'models')
model_file_list = sorted(glob.glob(os.path.join(model_dir, 'hcp_predictions_*.parquet')))
print(*model_file_list, sep='\n')
pred_col_list = ['pred_mh', 'pred_ip', 'pred_op']

/app/data/hcp/models/hcp_predictions_20250520_gpt-4o.parquet
/app/data/hcp/models/hcp_predictions_ollama_20250520_deepseek-v3_671b.parquet
/app/data/hcp/models/hcp_predictions_ollama_20250520_llama2_13b.parquet
/app/data/hcp/models/hcp_predictions_ollama_20250520_llama2_70b.parquet
/app/data/hcp/models/hcp_predictions_ollama_20250520_llama2_7b.parquet
/app/data/hcp/models/hcp_predictions_ollama_20250520_llama3_70b.parquet
/app/data/hcp/models/hcp_predictions_ollama_20250520_llama3_8b.parquet
/app/data/hcp/models/hcp_predictions_ollama_20250520_llama4_maverick.parquet


In [3]:
# Data set that we sent to the HCP team
df_file_name = 'Predictions-2025-04-24.xlsx'
df_file = os.path.join(data_dir, df_file_name)
dfxl = pd.read_excel(df_file)
# Train and test data
df_train = dfxl.loc[dfxl['dset'] == 'train']
df_test = dfxl.loc[dfxl['dset'] == 'test']
display(df_train.head(2))
print(dfxl.shape)
print(len(dfxl['id'].unique()))
print(df_test.shape)
print(df_train.shape)

Unnamed: 0,id,name,description,mental_health,inpatient,outpatient,dset,pred_mh,pred_ip,pred_op,verified_op,temperature
0,491639-77,144-Bed Hospital Facility Joint Venture in Den...,Provider of behavioral health services located...,1.0,1.0,1.0,train,1,1,1,1,0
1,10995-58,Acadia Healthcare (NAS: ACHC),Acadia Healthcare Co Inc acquires and develops...,1.0,1.0,2.0,train,1,1,0,1,0


(687, 12)
687
(500, 12)
(187, 12)


### Helper functions ###

In [4]:
# Helper functions
def flat_list(input_list: list) -> list:
    flattened_list = [item for sublist in input_list for item in sublist]
    return flattened_list 
    
def rotate_columns(data, by=1):
    col_deque = deque(list(data.columns))
    col_deque.rotate(by)
    return data[list(col_deque)]
    
# A function to combine columns
def combine_columns(data, true_col_list, pred_col_list):
    ''' Combine binary columns like [true_mh, true_ip] '''
    pf = Performance(data=data)
    true_combined = pf.combine_binary_columns(input_col_list=true_col_list).\
        drop(true_col_list, axis=1)
    pred_combined = pf.combine_binary_columns(input_col_list=pred_col_list).\
        drop(pred_col_list, axis=1)
    data_combined = data.\
        merge(true_combined, on='id', how='left').\
        merge(pred_combined, on='id', how='left')
    # Rename the columns in the output 
    return data_combined

### Performance on the training set ###

In [5]:
# Column names
true_mh = 'mental_health'
true_ip = 'inpatient'
true_op = 'outpatient'
true_mh_ip = 'mh_ip'
true_mh_op = 'mh_op'

pred_mh = 'pred_mh'
pred_ip = 'pred_ip'
pred_op = 'pred_op'
pred_mh_ip = 'pred_mh_ip'
pred_mh_op = 'pred_mh_op'

# Dictionary with true:pred column names
col_dict = {true_mh: pred_mh, true_ip:pred_ip, true_op:pred_op,
            true_mh_ip: pred_mh_ip,
            true_mh_op: pred_mh_op}

# This function uses the definitions above
def combine_mental_health_columns(data, 
                                  true_mh_col=true_mh,
                                  true_ip_col=true_ip,
                                  true_op_col=true_op,
                                  true_mh_ip_col=true_mh_ip,
                                  true_mh_op_col=true_mh_op,
                                  pred_mh_col=pred_mh,
                                  pred_ip_col=pred_ip,
                                  pred_op_col=pred_op, 
                                  pred_mh_ip_col=pred_mh_ip,
                                  pred_mh_op_col=pred_mh_op):
                                  
    ''' Combine mental health with inpatient/outpatient services columns '''
    data_ip = combine_columns(data=data, true_col_list=[true_mh_col, true_ip_col], pred_col_list=[pred_mh_col, pred_ip_col])
    data_op = combine_columns(data=data_ip, true_col_list=[true_mh_col, true_op_col], pred_col_list=[pred_mh_col, pred_op_col])
    output = data_op.\
        rename(columns={f'{true_mh_col}_{true_ip_col}': true_mh_ip_col,
                        f'{true_mh_col}_{true_op_col}': true_mh_op_col,
                        f'{pred_mh_col}_{pred_ip_col}': pred_mh_ip_col,
                        f'{pred_mh_col}_{pred_op_col}': pred_mh_op_col})
    return output

display(col_dict)

{'mental_health': 'pred_mh',
 'inpatient': 'pred_ip',
 'outpatient': 'pred_op',
 'mh_ip': 'pred_mh_ip',
 'mh_op': 'pred_mh_op'}

In [6]:
# We compare the same variable between different models
# Start with the variable
var_list = list(col_dict.keys())

df_train_list = []
performance_var_list = []
for v, true_col in enumerate(var_list):
    pred_col = col_dict.get(true_col)
    
    performance_model_list = []
    for m, model_file in enumerate(model_file_list):
        df_model = pd.read_parquet(model_file)
        df_train = df_model.loc[df_model['dset'] == 'train'].reset_index(drop=True)
    
        # Create the combined variable columnd
        df_train_combined = combine_mental_health_columns(data=df_train)
        pf = Performance(data=df_train_combined)
        df_train_list.append(df_train_combined)
        
        # Performance metrics for this model
        performance_dict = pf.binary_performance(true_col=true_col, pred_col=pred_col)
        performance_df = pd.DataFrame(performance_dict, index=[m]).\
            assign(model=df_train['model'].values[0])
        performance_model_list.append(rotate_columns(performance_df))
    
    performance_model = rotate_columns(pd.concat(performance_model_list, axis=0, ignore_index=True).\
        assign(category=true_col)).\
        sort_values(by='specificity', ascending=False).\
        reset_index(drop=True)
    performance_var_list.append(performance_model)
    display(performance_model)

train_performance_df = pd.concat(performance_var_list, axis=0, ignore_index=True)
train_output_df = pd.concat(df_train_list, axis=0, ignore_index=True)
display(train_output_df.head(2))
print(train_output_df.shape)

Unnamed: 0,category,model,p,n,tp,tn,fp,fn,recall,precision,min_precision,specificity,f_score
0,mental_health,llama2_7b,136,35,2,35,0,134,0.0147,1.0,0.7953,1.0,0.029
1,mental_health,deepseek-v3_671b,136,35,89,33,2,47,0.6544,0.978,0.7953,0.9429,0.7841
2,mental_health,llama4_maverick,136,35,102,30,5,34,0.75,0.9533,0.7953,0.8571,0.8395
3,mental_health,gpt-4o,136,35,112,29,6,24,0.8235,0.9492,0.7953,0.8286,0.8819
4,mental_health,llama3_70b,136,35,113,25,10,23,0.8309,0.9187,0.7953,0.7143,0.8726
5,mental_health,llama2_70b,136,35,129,25,10,7,0.9485,0.9281,0.7953,0.7143,0.9382
6,mental_health,llama2_13b,136,35,82,22,13,54,0.6029,0.8632,0.7953,0.6286,0.71
7,mental_health,llama3_8b,136,35,126,21,14,10,0.9265,0.9,0.7953,0.6,0.913


Unnamed: 0,category,model,p,n,tp,tn,fp,fn,recall,precision,min_precision,specificity,f_score
0,inpatient,llama4_maverick,69,113,64,109,4,5,0.9275,0.9412,0.3791,0.9646,0.9343
1,inpatient,deepseek-v3_671b,69,113,64,107,6,5,0.9275,0.9143,0.3791,0.9469,0.9209
2,inpatient,gpt-4o,69,113,68,103,10,1,0.9855,0.8718,0.3791,0.9115,0.9252
3,inpatient,llama3_70b,69,113,69,99,14,0,1.0,0.8313,0.3791,0.8761,0.9079
4,inpatient,llama3_8b,69,113,69,39,74,0,1.0,0.4825,0.3791,0.3451,0.6509
5,inpatient,llama2_70b,69,113,69,26,87,0,1.0,0.4423,0.3791,0.2301,0.6133
6,inpatient,llama2_13b,69,113,63,19,94,6,0.913,0.4013,0.3791,0.1681,0.5575
7,inpatient,llama2_7b,69,113,69,1,112,0,1.0,0.3812,0.3791,0.0088,0.552


Unnamed: 0,category,model,p,n,tp,tn,fp,fn,recall,precision,min_precision,specificity,f_score
0,outpatient,deepseek-v3_671b,94,42,66,33,9,28,0.7021,0.88,0.6912,0.7857,0.7811
1,outpatient,gpt-4o,94,42,77,31,11,17,0.8191,0.875,0.6912,0.7381,0.8462
2,outpatient,llama4_maverick,94,42,74,31,11,20,0.7872,0.8706,0.6912,0.7381,0.8268
3,outpatient,llama3_70b,94,42,78,28,14,16,0.8298,0.8478,0.6912,0.6667,0.8387
4,outpatient,llama3_8b,94,42,94,10,32,0,1.0,0.746,0.6912,0.2381,0.8545
5,outpatient,llama2_13b,94,42,93,0,42,1,0.9894,0.6889,0.6912,0.0,0.8122
6,outpatient,llama2_70b,94,42,94,0,42,0,1.0,0.6912,0.6912,0.0,0.8174
7,outpatient,llama2_7b,94,42,94,0,42,0,1.0,0.6912,0.6912,0.0,0.8174


Unnamed: 0,category,model,p,n,tp,tn,fp,fn,recall,precision,min_precision,specificity,f_score
0,mh_ip,llama2_7b,62,104,0,102,2,62,0.0,0.0,0.3735,0.9808,0.0
1,mh_ip,deepseek-v3_671b,62,104,49,100,4,13,0.7903,0.9245,0.3735,0.9615,0.8522
2,mh_ip,llama4_maverick,62,104,55,98,6,7,0.8871,0.9016,0.3735,0.9423,0.8943
3,mh_ip,gpt-4o,62,104,60,92,12,2,0.9677,0.8333,0.3735,0.8846,0.8955
4,mh_ip,llama3_70b,62,104,61,88,16,1,0.9839,0.7922,0.3735,0.8462,0.8777
5,mh_ip,llama2_13b,62,104,31,58,46,31,0.5,0.4026,0.3735,0.5577,0.446
6,mh_ip,llama3_8b,62,104,60,50,54,2,0.9677,0.5263,0.3735,0.4808,0.6818
7,mh_ip,llama2_70b,62,104,62,38,66,0,1.0,0.4844,0.3735,0.3654,0.6526


Unnamed: 0,category,model,p,n,tp,tn,fp,fn,recall,precision,min_precision,specificity,f_score
0,mh_op,llama2_7b,87,40,2,40,0,85,0.023,1.0,0.685,1.0,0.0449
1,mh_op,deepseek-v3_671b,87,40,56,36,4,31,0.6437,0.9333,0.685,0.9,0.7619
2,mh_op,llama4_maverick,87,40,66,34,6,21,0.7586,0.9167,0.685,0.85,0.8302
3,mh_op,gpt-4o,87,40,72,33,7,15,0.8276,0.9114,0.685,0.825,0.8675
4,mh_op,llama3_70b,87,40,73,31,9,14,0.8391,0.8902,0.685,0.775,0.8639
5,mh_op,llama2_13b,87,40,52,27,13,35,0.5977,0.8,0.685,0.675,0.6842
6,mh_op,llama2_70b,87,40,86,25,15,1,0.9885,0.8515,0.685,0.625,0.9149
7,mh_op,llama3_8b,87,40,84,24,16,3,0.9655,0.84,0.685,0.6,0.8984


Unnamed: 0,id,name,description,mental_health,inpatient,outpatient,dset,pred_mh,pred_ip,pred_op,verified_op,model,mh_ip,pred_mh_ip,mh_op,pred_mh_op
0,97840-81,Vheda Health,Developer of an integrated healthcare platform...,0.0,0.0,0.0,train,0,0,0,1,gpt-4o,0.0,0,0.0,0
1,97356-34,Referral Solutions Group,Developer of an online platform designed to he...,0.0,0.0,0.0,train,0,0,0,1,gpt-4o,0.0,0,0.0,0


(7480, 16)


In [7]:
# Save the results
performance_name = 'performance-trainset-8models.csv'
performance_file = os.path.join(data_dir, performance_name)
train_performance_df.to_csv(performance_file)

output_name = 'predictions-trainset-8models-2025-06-21.csv'
output_file = os.path.join(data_dir, output_name)
train_output_df.to_csv(output_file)

### Performance for the test set ###
Use a reference model as the ground truth

In [12]:
# Ground truth model file
print(*model_file_list, sep='\n')
ground_truth_model = 'gpt-4o'
ground_truth_model_file = [file for file in model_file_list if ground_truth_model in file][0]
model_file_list_test = [file for file in model_file_list if ground_truth_model not in file]
ground_truth_columns = ['id', 'dset', 'model', 'pred_mh', 'pred_ip', 'pred_op']

print()
print(ground_truth_model_file)
df_gt = pd.read_parquet(ground_truth_model_file, columns=ground_truth_columns)
df_gt_test = df_gt.loc[df_gt['dset'] == 'test'].reset_index(drop=True)

# Let's rename the columns that we want to use as the ground truth
df_gt_test = df_gt_test.rename(columns={'model': 'reference',
                                        'pred_mh': 'mental_health',
                                        'pred_ip': 'inpatient',
                                        'pred_op': 'outpatient'})

/app/data/hcp/models/hcp_predictions_20250520_gpt-4o.parquet
/app/data/hcp/models/hcp_predictions_ollama_20250520_deepseek-v3_671b.parquet
/app/data/hcp/models/hcp_predictions_ollama_20250520_llama2_13b.parquet
/app/data/hcp/models/hcp_predictions_ollama_20250520_llama2_70b.parquet
/app/data/hcp/models/hcp_predictions_ollama_20250520_llama2_7b.parquet
/app/data/hcp/models/hcp_predictions_ollama_20250520_llama3_70b.parquet
/app/data/hcp/models/hcp_predictions_ollama_20250520_llama3_8b.parquet
/app/data/hcp/models/hcp_predictions_ollama_20250520_llama4_maverick.parquet

/app/data/hcp/models/hcp_predictions_20250520_gpt-4o.parquet


In [13]:
display(df_gt_test.head())

Unnamed: 0,id,dset,reference,mental_health,inpatient,outpatient
0,99842-23,test,gpt-4o,1,0,1
1,99155-98,test,gpt-4o,1,0,1
2,98451-10,test,gpt-4o,0,0,0
3,98105-14,test,gpt-4o,0,1,1
4,97868-17,test,gpt-4o,0,0,0
