### OpenAI predictions and performance for updated training set ###

In [20]:
import os
import copy
import numpy as np
import pandas as pd
import time

# Appearance of the Notebook
# Inject a CSS rule that sets elements with the `.container` class to 100% width,
# so wide DataFrame output is not squeezed into a narrow centre column.
# NOTE(review): presumably `.container` is the page wrapper of the notebook front
# end in use - confirm the rule actually has an effect there.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
# NumPy: allow up to 110 characters per printed line before wrapping arrays.
np.set_printoptions(linewidth=110)

# pandas: display limits, kept in one table so they are easy to scan and tune.
_pandas_display_options = {
    'display.max_rows': 500,
    'display.max_columns': 100,
    'display.width': 1000,
}
for _option_name, _option_value in _pandas_display_options.items():
    pd.set_option(_option_name, _option_value)

# Import this module with autoreload
%load_ext autoreload
%autoreload 2
from llmt.openaimodel import OpenAIModel

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
# Resolve the HCP data folder below the directory named by the DATA
# environment variable. Fail early with an explicit message when the variable
# is missing; otherwise os.path.join(None, ...) raises an opaque TypeError.
data_root = os.environ.get('DATA')
if data_root is None:
    raise RuntimeError(
        "Environment variable 'DATA' is not set; it must point to the "
        "directory that contains the 'hcp' folder."
    )
data_dir = os.path.join(data_root, 'hcp')
print(data_dir)

# Training set to load.
train_file_name = 'hcp-train-250701.parquet'
train_file = os.path.join(data_dir, train_file_name)
print(train_file)

# Read the training frame and collect the distinct company ids once;
# the list is reused by later cells.
train_df = pd.read_parquet(train_file)
company_id_list = list(train_df['id'].unique())

# Sanity output: frame shape, number of distinct ids, and a two-row preview.
print(train_df.shape)
print(len(company_id_list))
display(train_df.head(2))
# The id count is printed a second time to keep the cell output unchanged.
print(len(company_id_list))

/app/data/hcp
/app/data/hcp/hcp-train-250701.parquet
(187, 7)
187


Unnamed: 0,id,name,description,mental_health,inpatient,outpatient,dset
0,491639-77,144-Bed Hospital Facility Joint Venture in Den...,Provider of behavioral health services located...,1,1,1,train
1,10995-58,Acadia Healthcare (NAS: ACHC),Acadia Healthcare Co Inc acquires and develops...,1,1,1,train


187


### Run the models on the training data ###

In [29]:
# --- Run configuration ---
# Work on a copy (DataFrame.copy is deep by default) so this cell never
# modifies the loaded training frame.
df = train_df.copy()
temperature = 0
output_base_name = 'pred_train_250704'
start_time = time.perf_counter()
model_file_name = f'{output_base_name}.parquet'

# Label written to the `model` column of the results.
# NOTE(review): `model` (used below) is not created in any cell visible here;
# presumably it is an OpenAIModel instance for this deployment - confirm it is
# constructed before this cell so that Restart & Run All works.
deployment_id = 'gpt-4o-1120'

# Prompt version per prediction task. Each key `k` selects the method
# `model.predict_<k>` and its value is passed as `version`.
prompt_dict = {'mh': 3, 'ip': 1, 'op': 1}

# Number of companies to send to the model (taken from the front of
# company_id_list). Set to None to run every company.
n_companies = 5

# --- Predict, one company at a time ---
results_df_list = []
for company_id in company_id_list[:n_companies]:
    company_df = df.loc[df['id'] == company_id]
    # Name and description are taken from the first row matching this id.
    name = company_df['name'].iloc[0]
    description = company_df['description'].iloc[0]

    # Run every task in prompt_dict order (mh, ip, op). Each response is
    # unpacked into assign(), so its keys become columns of company_df.
    for task, version in prompt_dict.items():
        predict = getattr(model, f'predict_{task}')
        response = predict(name=name,
                           description=description,
                           temperature=temperature,
                           version=version)
        company_df = company_df.assign(**response)
    results_df_list.append(company_df)

# Stack the per-company frames and tag every row with the deployment label.
results_df = (
    pd.concat(results_df_list, axis=0, ignore_index=True)
    .assign(model=deployment_id)
)

In [30]:
display(results_df)

Unnamed: 0,id,name,description,mental_health,inpatient,outpatient,dset,pred_mh,pred_ip,pred_op,verified_op,model
0,491639-77,144-Bed Hospital Facility Joint Venture in Den...,Provider of behavioral health services located...,1,1,1,train,1,1,1,1,gpt-4o-1120
1,10995-58,Acadia Healthcare (NAS: ACHC),Acadia Healthcare Co Inc acquires and develops...,1,1,1,train,1,1,0,1,gpt-4o-1120
2,431643-07,Actriv,Provider of healthcare staffing services based...,1,0,0,train,0,0,0,1,gpt-4o-1120
3,310749-31,Alima,Operator of a non-governmental organization in...,0,0,0,train,1,1,1,1,gpt-4o-1120
4,107240-50,Alvarado Parkway Institute,Operator of a psychiatric health care facility...,1,1,1,train,1,1,1,1,gpt-4o-1120
