### Script template to produce both OpenAI and Ollama predictions ###

In [1]:
import os
import pandas as pd
import numpy as np
import logging
import time
from pathlib import Path

# Import this module with autoreload
%load_ext autoreload
%autoreload 2

from llmt.openaimodel import OpenAIModel
from llmt.ollamamodel import OllamaModel

In [2]:
# Data directory
data_root = os.environ.get('DATA')
data_dir = os.path.join(data_root, 'hcp')

# Experiment name (for file names)
experiment_name = 'hcp_predictions_20250519'

# Set the output directory
output_dir = os.path.join(data_dir, f'{experiment_name}_output')
Path(output_dir).mkdir(parents=True, exist_ok=True)

### Prepare the data set ###

In [3]:
# Load the data set (the samples that were chosen in April)
dataset_name = 'hcp-traintest-250423.parquet'
df = pd.read_parquet(os.path.join(data_dir, dataset_name))
print(df.shape)
display(df.head())
for dset in ['train', 'test']:
    print(f'Samples {dset.upper()}: {len(df.loc[df["dset"]==dset, "id"].unique())}')
company_id_list = list(df['id'].unique())

(687, 7)


Unnamed: 0,id,name,description,mental_health,inpatient,outpatient,dset
0,97840-81,Vheda Health,Developer of an integrated healthcare platform...,0.0,0.0,0.0,train
1,97356-34,Referral Solutions Group,Developer of an online platform designed to he...,0.0,0.0,0.0,train
2,96958-00,Franklin Community Health Network,Provider of evergreen behavioral and emergency...,1.0,1.0,1.0,train
3,87934-24,Forefront TeleCare,Provider of virtual behavioral health care int...,1.0,0.0,1.0,train
4,83377-27,Cone Health,Provider of not-for-profit healthcare network ...,1.0,1.0,2.0,train


Samples TRAIN: 187
Samples TEST: 500


In [4]:
# Log file
log_file = os.path.join(output_dir, f'{experiment_name}.log')
logging.basicConfig(
    level=logging.WARNING,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler(log_file, mode='w'),
        logging.StreamHandler()
    ],
    datefmt='%Y%m%d-%H:%M'
)
logger = logging.getLogger(__name__)

In [5]:
# OpenAI experiment
model_name = 'gpt-4o'
model = OpenAIModel(model=model_name)
temperature = 0
start_time = time.perf_counter()
model_file_name = f'{experiment_name}_{model_name}.parquet'
results_df_list = []
for c, company_id in enumerate(company_id_list[:20]):
    if (c + 1) % 20 == 0:
        print(f'Sample {c + 1} / {len(company_id_list)}')
    company_df = df.loc[df['id'] == company_id]
    name = company_df['name'].to_list()[0]
    description = company_df['description'].to_list()[0]
    response_mh = model.predict_mh(name=name, description=description, temperature=temperature, version=3)
    response_ip = model.predict_ip(name=name, description=description, temperature=temperature, version=1)
    response_op = model.predict_op(name=name, description=description, temperature=temperature, version=1)
    company_df = company_df.assign(**response_mh)
    company_df = company_df.assign(**response_ip)
    company_df = company_df.assign(**response_op)
    results_df_list.append(company_df)
results_df = pd.concat(results_df_list, axis=0, ignore_index=True).\
                assign(model=model_name)
# Save the data for this model
results_df.to_parquet(os.path.join(output_dir, model_file_name))

Sample 20 / 687


In [6]:
# Local Ollama experiments
model_name = 'llama2:7b'
model_name_str = model_name.replace(':','_')
model = OllamaModel(model=model_name)
temperature = 0
start_time = time.perf_counter()
model_file_name = f'{experiment_name}_{model_name_str}.parquet'
results_df_list = []
for c, company_id in enumerate(company_id_list[:20]):
    if (c + 1) % 20 == 0:
        print(f'Sample {c + 1} / {len(company_id_list)}')
    company_df = df.loc[df['id'] == company_id]
    name = company_df['name'].to_list()[0]
    description = company_df['description'].to_list()[0]
    response_mh = model.predict_mh(name=name, description=description, temperature=temperature, version=3)
    response_ip = model.predict_ip(name=name, description=description, temperature=temperature, version=1)
    response_op = model.predict_op(name=name, description=description, temperature=temperature, version=1)
    company_df = company_df.assign(**response_mh)
    company_df = company_df.assign(**response_ip)
    company_df = company_df.assign(**response_op)
    results_df_list.append(company_df)
results_df = pd.concat(results_df_list, axis=0, ignore_index=True).\
                assign(model=model_name_str)
# Save the data for this model
results_df.to_parquet(os.path.join(output_dir, model_file_name))

llama2:7b: 100%|██████████| 557/557 [00:43<00:00, 12.7B/s, success]                  


Sample 20 / 687
