### Export experimental predictions ###

In [1]:
import os
import glob
import numpy as np
import pandas as pd
import glob
from matplotlib import pyplot as plt
import seaborn as sns

# Appearance of the Notebook
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
np.set_printoptions(linewidth=110)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 1000)

# Import this module with autoreload
%load_ext autoreload
%autoreload 2
import llmt
from llmt.performance import Performance

In [2]:
# Directories and data files
experiment_name = 'exports_500mh3ip1op1'
data_root = os.path.join(os.environ.get('HOME'), 'home_data')
data_dir = os.path.join(data_root, 'hcp')
output_dir = os.path.join(data_dir, experiment_name)

# Collect the data from the completed experiment
parquet_file_list = sorted(glob.glob(os.path.join(output_dir, f'250423_mh3ip1op1_*.parquet')))
print(len(parquet_file_list))

df_list = []
for file in parquet_file_list:
    df_file = pd.read_parquet(file)
    df_file = df_file.\
        assign(run=int(os.path.basename(file).rsplit('_', maxsplit=2)[1]))
    df_list.append(df_file)
df = pd.concat(df_list, axis=0, ignore_index=True).\
        sort_values(by=['run', 'temperature', 'id'], ascending=True).\
        reset_index(drop=True)
print(df.shape)
display(df.head())

1
(687, 13)


Unnamed: 0,id,name,description,mental_health,inpatient,outpatient,dset,pred_mh,pred_ip,pred_op,verified_op,temperature,run
0,100289-80,At Home Health Services,Provider of home healthcare services. The comp...,,,,test,1,0,1,1,0.0,1
1,100296-37,Behavioral Pathway Systems,Provider of healthcare benchmarking services i...,,,,test,0,0,0,1,0.0,1
2,100363-69,Chilton Hospital,Operator of inpatient hospital center. The com...,1.0,1.0,1.0,train,1,1,1,1,0.0,1
3,100501-21,Fortwood Center,Provider of outpatient mental health care serv...,,,,test,1,0,1,1,0.0,1
4,100593-37,Leadership Health,Operator of a healthcare program platform inte...,,,,test,0,0,0,0,0.0,1


In [3]:
# Mapping true variable and predictions
variable_dict = {'mental_health': 'pred_mh',
                 'inpatient': 'pred_ip',
                 'outpatient': 'pred_op'}

prompt_version = {'mental_health': 3,
                  'inpatient': 1,
                  'outpatient': 1}

In [4]:
# Sort the data frame starting with the training set
df_list = []
for dset in ['train', 'test']:
    df_dset = df.loc[df['dset'] == dset].\
        drop('run', axis=1).\
        sort_values(by='name', ascending=True).\
        reset_index(drop=True)
    print(f'dset {dset}: {df_dset.shape}')
    df_list.append(df_dset)

df_sorted = pd.concat(df_list, axis=0, ignore_index=True)
display(df_sorted.tail())

dset train: (187, 12)
dset test: (500, 12)


Unnamed: 0,id,name,description,mental_health,inpatient,outpatient,dset,pred_mh,pred_ip,pred_op,verified_op,temperature
682,316350-82,the camden center,Provider of mental health and addiction treatm...,,,,test,1,0,1,1,0.0
683,326761-39,the neurovation center,Provider of counseling and neurofeedback servi...,,,,test,1,0,1,1,0.0
684,463651-93,uMore,Developer of a comprehensive library of therap...,,,,test,0,0,0,1,0.0
685,321669-64,warrior surf foundation,"Provider of free surf therapy, wellness coachi...",,,,test,1,0,0,1,0.0
686,321782-86,welcome home ministries,Faith-based organization offering recovery sup...,,,,test,0,1,0,1,0.0


In [5]:
# Save the data set as a .csv file
csv_file_name = '250423_mh3ip1op1_T0.csv'
csv_file = os.path.join(output_dir, csv_file_name)
df_sorted.to_csv(csv_file)

In [6]:
# Make sure we save the performance as well
df_train = df_sorted.loc[df_sorted['dset'] == 'train']
df_train = df_train.astype({key: int for key in variable_dict.keys()})
temperature = 0
performance_df_list = []
for true_col, pred_col in variable_dict.items():
    performance_dict = Performance(data=df_train).\
                    binary_performance(true_col=true_col, pred_col=pred_col)
    performance_df = pd.DataFrame(performance_dict, index=[0])
    performance_df.insert(loc=0, column='category', value=true_col)
    performance_df.insert(loc=2, column='temperature', value=temperature)
    performance_df.insert(loc=3, column='prompt_version', value=prompt_version.get(true_col))
    performance_df_list.append(performance_df)
performance_df = pd.concat(performance_df_list, axis=0, ignore_index=True)
performance_df

# Save the summary statistics
stat_file_base = '250423_mh3ip1op1_T0_performance'
stat_file_csv = os.path.join(output_dir, f'{stat_file_base}.csv')
performance_df.to_csv(stat_file_csv)

### Combine categories ###

### Unknown values ###
How did the model predict the unknown categories

In [7]:
# Remove the unknown categories
var1 = 'mental_health'
pred_var1 = variable_dict.get(var1)

# The second category can be either inpatient or outpatient
var2_list = ['inpatient', 'outpatient']

performance_df_list = []
for var2 in var2_list:
    
    pred_var2 = variable_dict.get(var2)
    category_label = f'{var1} + {var2}'
    
    # Filter out the unknown catagories
    for var in [var1, var2]:
        df12 = df_train.copy().\
                        loc[df_train[var].isin([0, 1])].\
                        reset_index(drop=True)
    
    # Create the combined category
    df12 = df12[['id', 'name', 'description', var1, var2, pred_var1, pred_var2]] 
    df12 = df12.assign(y_true=df12.apply(lambda s: 1 if (s[var1] == 1) & (s[var2] == 1) else 0, axis=1),
                       y_pred=df12.apply(lambda s: 1 if (s[pred_var1] == 1) & (s[pred_var2] == 1) else 0, axis=1))
    y_true = df12['y_true'].values
    y_pred = df12['y_pred'].values
    
    # Performance metrics
    performance_dict = Performance(data=df12).binary_performance(true_col='y_true', pred_col='y_pred')
    performance_df_var = pd.DataFrame(performance_dict, index=[0])
    performance_df_var.insert(loc=0, column='category', value=category_label)
    performance_df_list.append(performance_df_var)
    
performance_df = pd.concat(performance_df_list, axis=0, ignore_index=True)
display(performance_df)

Unnamed: 0,category,p,n,tp,tn,fp,fn,recall,precision,min_precision,specificity,f_score
0,mental_health + inpatient,62,120,58,107,13,4,0.9355,0.8169,0.3407,0.8917,0.8722
1,mental_health + outpatient,87,49,71,38,11,16,0.8161,0.8659,0.6397,0.7755,0.8402


In [8]:
for var in variable_dict.keys():
    pred_var = variable_dict.get(var)
    var_cols = ['id', 'name', 'description', var, pred_var]
    print(var.upper())
    df_train_var = df_train.loc[df_train[var] == 2, var_cols]
    preds = df_train_var[pred_var].\
                    value_counts().\
                    to_frame().\
                    reset_index(drop=False).\
                    sort_values(by=pred_var, ascending=True).\
                    reset_index(drop=True)
    display(preds)

MENTAL_HEALTH


Unnamed: 0,pred_mh,count
0,0,11
1,1,5


INPATIENT


Unnamed: 0,pred_ip,count
0,0,4
1,1,1


OUTPATIENT


Unnamed: 0,pred_op,count
0,0,29
1,1,22


### Look at some mental health predictions that are wrong ###

In [9]:
# False positives
var = 'mental_health'
pred_var = variable_dict.get(var)
col_list = ['id', 'name', 'description', var, pred_var]
fp_df = df_train.loc[(df_train['mental_health'] == 0) & (df_train['pred_mh'] == 1), col_list].\
                reset_index(drop=True)
for i in range(len(fp_df)):
    name = fp_df.iloc[i].get('name')
    description = fp_df.iloc[i].get('description')
    true = fp_df.iloc[i].get(var)
    pred = fp_df.iloc[i].get(pred_var)
    print(f'{name.upper()} {var.upper()}: {true}, {pred_var.upper()}: {pred}')
    print(description)
    print()

ALIMA MENTAL_HEALTH: 0, PRED_MH: 1
Operator of a non-governmental organization intended to provide medical care in humanitarian emergency regions. The company specializes in rapid deployment and emergency medical interventions in response to crises such as natural disasters, epidemics, and conflicts and offers primary healthcare, maternal and child services, surgery, nutrition programs, and mental support, enabling vulnerable and underserved populations with clinical care and improving health outcomes.

ASCELLUS MENTAL_HEALTH: 0, PRED_MH: 1
Operator of trauma prevention and treatment clinics intended to treat employees suffering from chronic pain. The company's clinics deliver customized treatment options, reduce compensation claims costs, and empower injured workers to return to work sooner, enabling doctors and practitioners to restore their physical and emotional well-being.

BON SECOURS BALTIMORE HOSPITAL MENTAL_HEALTH: 0, PRED_MH: 1
Operator of a full-service hospital in Baltimore

In [10]:
# False negatives (missed positives)
# False positives
var = 'mental_health'
pred_var = variable_dict.get(var)
col_list = ['id', 'name', 'description', var, pred_var]
fp_df = df_train.loc[(df_train['mental_health'] == 1) & (df_train['pred_mh'] == 0), col_list].\
                reset_index(drop=True)
for i in range(len(fp_df)):
    name = fp_df.iloc[i].get('name')
    description = fp_df.iloc[i].get('description')
    true = fp_df.iloc[i].get(var)
    pred = fp_df.iloc[i].get(pred_var)
    print(f'{name.upper()} {var.upper()}: {true}, {pred_var.upper()}: {pred}')
    print(description)
    print()

AMERICAN ACADEMIC HEALTH SYSTEM MENTAL_HEALTH: 1, PRED_MH: 0
Operator of academic and community-based acute care hospitals based in El Segundo, California. The company specializes in working with medical staff, nurses and other providers, enabling community-based organizations to address social determinants of health such as behavioral health issues, lack of access to quality healthcare services, environmental disparities, food scarcity and trauma.

BEHEALTH SOLUTIONS MENTAL_HEALTH: 1, PRED_MH: 0
Provider of digital health and wellness services intended to address various behavioral and mental health concerns. The company's engaging and interactive self-help programs incorporate effective cognitive-behavioral techniques that are generally only available through face-to-face visits with specially trained clinicians, enabling patients to get scientifically validated behavioral health programs online and through mobile devices.

BEAVER DAM COMMUNITY HOSPITALS MENTAL_HEALTH: 1, PRED_MH: 0
