### OpenAI predictions and performance for updated training set ###

In [4]:
import os
import copy
import numpy as np
import pandas as pd
import time

# Appearance of the Notebook
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
np.set_printoptions(linewidth=110)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 1000)

# Import this module with autoreload
%load_ext autoreload
%autoreload 2
from llmt.openaimodel import OpenAIModel

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
data_dir = os.path.join(os.environ.get('DATA'), 'hcp')
print(data_dir)
train_file_name = 'hcp-train-250701.parquet'
train_file = os.path.join(data_dir, train_file_name)
print(train_file)
train_df = pd.read_parquet(train_file)
print(train_df.shape)
print(len(train_df['id'].unique()))
display(train_df.head(2))
company_id_list = list(train_df['id'].unique())
print(len(company_id_list))

/app/data/hcp
/app/data/hcp/hcp-train-250701.parquet
(187, 7)
187


Unnamed: 0,id,name,description,mental_health,inpatient,outpatient,dset
0,491639-77,144-Bed Hospital Facility Joint Venture in Den...,Provider of behavioral health services located...,1,1,1,train
1,10995-58,Acadia Healthcare (NAS: ACHC),Acadia Healthcare Co Inc acquires and develops...,1,1,1,train


187


In [18]:
# Define the names of the true and predicted columns
# We need to define these names somewhere to combine the correct columns
true_col_dict = {'true_mh_col': 'mental_health',
                 'true_ip_col': 'inpatient',
                 'true_op_col': 'outpatient',
                 'true_mh_ip_col': 'mh_ip',
                 'true_mh_op_col': 'mh_op'}

pred_col_dict = {'pred_mh_col': 'pred_mh',
                 'pred_ip_col': 'pred_ip',
                 'pred_op_col': 'pred_op',
                 'pred_mh_ip_col': 'pred_mh_ip',
                 'pred_mh_op_col': 'pred_mh_op'}

col_dict = true_col_dict
col_dict.update(pred_col_dict)

### Functions ###

In [20]:
# Run the prediction functions for a specific model
def predict_classes(model, name, description, temperature, version_dict=None):
    if version_dict is None:
        version_dict = {'mh': 3, 'ip': 1, 'op': 1}
    response_list = []
    response_mh = model.predict_mh(name=name, 
                                   description=description, 
                                   temperature=temperature, 
                                   version=prompt_dict.get('mh'))
    response_list.append(response_mh)
    response_ip = model.predict_ip(name=name, 
                                   description=description, 
                                   temperature=temperature, 
                                   version=prompt_dict.get('ip'))
    response_list.append(response_ip)
    response_op = model.predict_op(name=name, 
                                   description=description, 
                                   temperature=temperature, 
                                   version=prompt_dict.get('op'))
    response_list.append(response_op)
    assert len(response_list) == 3
    return response_list

# Combine mental health with inpatient/outpatient columns
def combine_mental_health_columns(data, **col_dict):
    ''' Combine mental health with inpatient/outpatient services columns '''
    data_ip = combine_columns(data=data, 
                              true_col_list=[col_dict.get('true_mh_col'), col_dict.get('true_ip_col')], 
                              pred_col_list=[col_dict.get('pred_mh_col'), col_dict.get('pred_ip_col')])
    
    data_op = combine_columns(data=data_ip, 
                              true_col_list=[col_dict.get('true_mh_col'), col_dict.get('true_op_col')], 
                              pred_col_list=[col_dict.get('pred_mh_col'), col_dict.get('pred_op_col')])
    output = data_op.\
        rename(columns={f'{col_dict.get('true_mh_col')}_{col_dict.get('true_ip_col')}': col_dict.get('true_mh_ip_col'),
                        f'{col_dict.get('true_mh_col')}_{col_dict.get('true_op_col')}': col_dict.get('true_mh_op_col'),
                        f'{col_dict.get('pred_mh_col')}_{col_dict.get('pred_ip_col')}': col_dict.get('pred_mh_ip_col'),
                        f'{col_dict.get('pred_mh_col')}_{col_dict.get('pred_op_col')}': col_dict.get('pred_mh_op_col')})
    return output

### Run the models on the test data ###

In [21]:
df = copy.deepcopy(train_df)
temperature = 0
output_base_name = 'pred_train_250704'
model_file_name = f'{output_base_name}.parquet'
start_time = time.perf_counter()

# LOOP OVER THE MODELS
deployment_id = 'gpt-4o-1120'
model = OpenAIModel(model=deployment_id)

# LOOP OVER THE PROMPT VERSIONS
version_dict = {'mh': 3, 'ip': 1, 'op': 1}

# LOOP OVER THE COMPANY IDs
results_df_list = []
for c, company_id in enumerate(company_id_list[:20]):
    if (c +1) % 20 == 0:
        dt = (time.perf_counter() - start_time) / 60
        print(f'Running predictions for id {c + 1} / {len(company_id_list)}: {dt: .2f} minutes')
    
    company_df = df.loc[df['id'] == company_id]
    name = company_df['name'].to_list()[0]
    description = company_df['description'].to_list()[0]

    response_list = predict_classes(model=model, name=name, description=description, 
                                    temperature=temperature, version_dict=version_dict)
    for response in response_list:
        company_df = company_df.assign(**response)
    
    
    results_df_list.append(company_df)
    
# Assemble the data complete data frame with predictions
results_df = pd.concat(results_df_list, axis=0, ignore_index=True).\
    assign(model=deployment_id)

Running predictions for id 20 / 187:  0.40 minutes


In [22]:
display(results_df.head())

Unnamed: 0,id,name,description,mental_health,inpatient,outpatient,dset,pred_mh,pred_ip,pred_op,verified_op,model
0,491639-77,144-Bed Hospital Facility Joint Venture in Den...,Provider of behavioral health services located...,1,1,1,train,1,1,1,1,gpt-4o-1120
1,10995-58,Acadia Healthcare (NAS: ACHC),Acadia Healthcare Co Inc acquires and develops...,1,1,1,train,1,1,0,1,gpt-4o-1120
2,431643-07,Actriv,Provider of healthcare staffing services based...,1,0,0,train,0,0,0,1,gpt-4o-1120
3,310749-31,Alima,Operator of a non-governmental organization in...,0,0,0,train,1,1,1,1,gpt-4o-1120
4,107240-50,Alvarado Parkway Institute,Operator of a psychiatric health care facility...,1,1,1,train,1,1,1,1,gpt-4o-1120
