### Run local models with data ###

In [36]:
import os
import numpy as np
import pandas as pd
import logging
import time

logger = logging.getLogger(__name__)

# Import this module with autoreload
%load_ext autoreload
%autoreload 2
from llmt.filetools import FileOP
from llmt.ollamamodel import Ollama, OllamaModel
from llmt.llmtools import Prompt

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [34]:
# Load the data
# Directories and files
# expanduser('~') resolves the home directory even when $HOME is unset;
# os.environ.get('HOME') would return None there and os.path.join would raise.
data_dir = os.path.join(os.path.expanduser('~'), 'home_data', 'hcp')
test_file_name = 'hcp-alldata-250413.parquet'

# Fetch the file from its URL (FileOP is handed data_dir as the download
# directory) and read the returned path as parquet
url = f'https://dsets.s3.us-east-1.amazonaws.com/{test_file_name}'
test_file = FileOP().download_from_url(url=url, download_dir=data_dir)
df = pd.read_parquet(test_file)

# Keep only the labeled rows (dset == 'train'), renumber the index and
# cast the three label columns to int
label_dtypes = {'mental_health': int,
                'inpatient': int,
                'outpatient': int}
df = (
    df.loc[df['dset'] == 'train']
    .reset_index(drop=True)
    .astype(label_dtypes)
)
display(df.head())
print(df.shape)

# Unique company ids, in order of first appearance; later cells loop over this list
company_id_list = list(df['id'].unique())
print(f'Number of unique companies: {len(company_id_list)}')

Created .parquet file.


Unnamed: 0,id,name,description,mental_health,inpatient,outpatient,dset
0,431643-07,Actriv,Provider of healthcare staffing services based...,2,0,0,train
1,310749-31,Alima,Operator of a non-governmental organization in...,0,0,0,train
2,162054-28,Apothecare,Provider of pharmacy services intended to prov...,0,0,0,train
3,597285-28,April Health (Clinics/Outpatient Services),Provider of mental health services intended to...,1,0,1,train
4,373978-90,Arise Child and Family Service,Operator of independent living centers caterin...,2,0,0,train


(187, 7)
Number of unique companies: 187


In [9]:
# Choose one example company by its label values
mental_health = 1
inpatient = 1
label_mask = (df['mental_health'] == mental_health) & (df['inpatient'] == inpatient)
id_list = list(df.loc[label_mask, 'id'].unique())
print(f'Found {len(id_list)} company ids\n')

# Take the id at position idx among the matches, then read the name and
# description from the first row belonging to that company
idx = 10
company_id = id_list[idx]
company_df = df.loc[df['id'] == company_id]
name, description = (company_df[col].values[0] for col in ('name', 'description'))
print(name, description, sep='\n\n')

Found 62 company ids

Purpose Healing Center

Provider of alcohol and drug treatment centers intended for inpatient and outpatient programs. The company offers a variety of programs including medical detox, medication assisted treatment, and inpatient and outpatient rehab, patients, including dual diagnosis, mental health services, enabling patients to get a safe recovery environment away from drugs.


In [31]:
# Show the model names reported by Ollama().list_models()
ollama_client = Ollama()
model_list = ollama_client.list_models()
print(model_list)

# Mental health messages: version-3 prompt for the example company chosen above
prompt_builder = Prompt()
messages = prompt_builder.create_mh_messages(
    name=name,
    description=description,
    version=3,
)

['mistral:7b', 'llama2:13b', 'llama2:7b']


In [32]:
# Inspect the prompt messages built for the example company
print(messages)

[{'role': 'system', 'content': 'You are an AI system assisting a healthcare policy researcher in identifying whether a business qualifies as a medical facility or organization that provides direct mental or behavioral healthcare services to human patients.\nA qualifying business must meet all of the following criteria:\n\nIt is a specialized healthcare facility or organization, such as a general hospital, mental health clinic, psychiatric hospital, counseling center, or behavioral health treatment center.  \nIt provides direct services to human patients, in-person or online, including assessments, diagnoses, therapy (individual, group, or family), psychiatric evaluations, medication management, and/or crisis intervention.  \nServices are delivered by licensed mental health professionals such as psychiatrists, psychologists, counselors, clinical social workers, or psychiatric nurse practitioners.\n\nExclude any business that falls into the following categories, even if they contribute t

In [33]:
# Run the three predictions (mental health, inpatient, outpatient) for the
# example company with one model, printing each response as it arrives
model = OllamaModel(model='mistral:7b')
temperature = 0.5

# Arguments shared by all three prediction calls
example_kwargs = dict(name=name, description=description, temperature=temperature)

response_mh = model.predict_mh(version=3, **example_kwargs)
print(response_mh)

response_ip = model.predict_ip(version=1, **example_kwargs)
print(response_ip)

response_op = model.predict_op(version=1, **example_kwargs)
print(response_op)

{'pred_mh': 1}
{'pred_ip': 1}
{'pred_op': 1, 'verified_op': 1}


In [None]:
# Run the entire data set through one model and save the per-company predictions
model_name_list = ['mistral:7b', 'llama2:7b']

# Model evaluated in this run; change the index to select another entry
model_name = model_name_list[1]
model = OllamaModel(model=model_name)
logger.info('Model %s', model_name)

# Create a new file for each model (':' is replaced so the model name can be
# embedded in the file name)
model_file_name = model_name.replace(':', '_')
results_model_file_name = f'testrun_{model_file_name}.parquet'
# Results are written to the data directory defined in the data-loading cell
output_dir = data_dir
results_run_df_list = []

# Time the prediction loop itself: the timer starts right before the loop and
# is read right after it, so the reported duration covers the model calls
n_companies = len(company_id_list)
start_time = time.perf_counter()
for c, company_id in enumerate(company_id_list, start=1):
    if c % 20 == 0:
        print(f'Sample {c}/{n_companies}')
    company_df = df.loc[df['id'] == company_id]
    name = company_df['name'].values[0]
    description = company_df['description'].values[0]
    # `temperature` is the value set in the single-example prediction cell
    response_mh = model.predict_mh(name=name, description=description, version=3, temperature=temperature)
    response_ip = model.predict_ip(name=name, description=description, version=1, temperature=temperature)
    response_op = model.predict_op(name=name, description=description, version=1, temperature=temperature)
    # Merge the three response dicts (later keys win, as with sequential
    # assign calls) and attach them as columns to this company's rows
    predictions = {**response_mh, **response_ip, **response_op}
    company_df = company_df.assign(**predictions)
    results_run_df_list.append(company_df)

execution_time_min = (time.perf_counter() - start_time) / 60
print(f'Execution time: {execution_time_min:.2f} min')

# Stack the per-company frames, tag them with the run settings and save
results_run_df = pd.concat(results_run_df_list, axis=0, ignore_index=True)
results_run_df = results_run_df.assign(temperature=temperature, model=model_file_name)
results_run_df.to_parquet(os.path.join(output_dir, results_model_file_name))