### Pydantic OpenAI for text classification ###
https://github.com/daveebbelaar/openai-python-tutorial

In [None]:
import os
import numpy as np
import pandas as pd
from typing import Literal
from textwrap import dedent

# OpenAI and Pydantic libraries
import openai
from openai import AzureOpenAI
import pydantic
from pydantic import BaseModel, Field

# Appearance of the Notebook
from IPython.display import display, HTML

# Let the notebook container use the full browser width
display(HTML("<style>.container { width:100% !important; }</style>"))

# Wider line width when printing NumPy arrays
np.set_printoptions(linewidth=110)

# Larger display limits when printing pandas objects
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 100),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Import this module with autoreload
# NOTE: the two `%` lines are IPython magics and only work inside a notebook /
# IPython session; presumably autoreload is enabled so that edits to the local
# `llmt` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
import llmt
# print(f'Package version: {llmt.__version__}')
# Report the installed Pydantic version
print(f'Pydantic version: {pydantic.__version__}')

### Create the OpenAI client ###

In [None]:
# Connection settings for the Azure OpenAI client; the endpoint and key are
# read from environment variables (None when a variable is not set)
api_dict = dict(
    api_version='2025-03-01-preview',
    azure_endpoint=os.environ.get('API_ENDPOINT'),
    api_key=os.environ.get('API_KEY'),
)

# Project name, printed for reference only
api_project = os.environ.get('API_PROJECT')
print(api_project)

# Model name needs to be in the deployment for the endpoint
model_name = 'gpt-4o'

# Now, we can create the API client
client = AzureOpenAI(**api_dict)

In [None]:
# Load the test data
# Use $HOME when it is set; otherwise fall back to the platform's notion of the
# user's home directory so os.path.join() never receives None.
home_dir = os.environ.get('HOME') or os.path.expanduser('~')
data_dir = os.path.join(home_dir, 'home_data', 'hcp')
test_file_name = 'hcp-alldata-250413.parquet'
test_file = os.path.join(data_dir, test_file_name)
df = pd.read_parquet(test_file)

# Quick look at the data: first rows, overall shape, number of distinct names
display(df.head())
print(df.shape)
print(len(df['name'].unique()))

In [None]:
# Let's count the samples
# For each mental_health flag (1 first, then 0), show the value counts of the
# 'inpatient' and 'outpatient' columns.
patient_service_list = ['inpatient', 'outpatient']
for mental_health in (1, 0):
    is_selected = df['mental_health'] == mental_health
    df_mh = df.loc[is_selected]
    # Blank line to separate the two groups in the output
    print()
    for patient in patient_service_list:
        value_counts = df_mh[patient].value_counts()
        counts = value_counts.to_frame().reset_index(drop=False)
        print(f'MENTAL HEALTH: {mental_health}')
        display(counts)

### Create the prompts and response format ###

In [None]:
# Some simple text processing, like removing new lines
def proc_prompt(prompt: str) -> str:
    """Collapse an indented, multi-line prompt into a single line of text.

    Every line is stripped of leading and trailing whitespace, empty lines are
    dropped, and the remaining lines are joined with exactly one space.

    Joining with a space (instead of deleting the newline) keeps the last word
    of one line from fusing with the first word of the next when the line has
    no trailing space. Stripping line by line removes the indentation even
    when interpolated text contains a line that starts at column 0.

    Args:
        prompt: Prompt text, typically a triple-quoted (f-)string.

    Returns:
        The prompt as a single line without leading or trailing whitespace.
    """
    stripped_lines = (line.strip() for line in prompt.split('\n'))
    return ' '.join(line for line in stripped_lines if line)
    
# Assemble the message
def create_messages(system_prompt: str, user_prompt: str):
    """Build the chat message list: the system message first, then the user message.

    Args:
        system_prompt: Content of the 'system' message.
        user_prompt: Content of the 'user' message.

    Returns:
        A two-element list of ``{'role': ..., 'content': ...}`` dictionaries.
    """
    return [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': user_prompt},
    ]

### Prompt Engineering and Output Format ###

In [None]:
# Pydantic model to format the API response
class MentalHealth(BaseModel):
    # Yes/no classification of the business
    mental_health_care: bool = Field(
        description='A business that provides mental health or behavioral healthcare services for human patients.',
    )
    # Confidence attached to the classification; the 0-1 range is only stated
    # in the description text, it is not enforced by a validator
    mental_health_care_score: float = Field(
        description='Confidence that the business provides mental health or behavioral healthcare services (0-1).',
    )

# Definition of a mental/behavioral healthcare business; it is embedded in the
# system prompt below.
# NOTE: the lines of this literal end in a trailing space on purpose of the
# current processing: proc_prompt removes the newlines, so the trailing space
# is what separates the last word of a line from the first word of the next.
mental_health_business_definition = """ 
    A healthcare business that provides mental and behavioral health care services to human patients is typically a 
    specialized facility or organization, such as a mental health clinic, psychiatric hospital, or counseling center, 
    that offers assessments, diagnoses, and evidence-based treatments for various mental health and behavioral disorders. 
    These services may include individual and group therapy, psychiatric evaluations, medication management, 
    and crisis intervention, delivered by a team of professionals such as psychiatrists, psychologists, licensed counselors, and social workers. 
    The aim of such a business is to support patients in managing their conditions, improving their mental health and overall well-being, 
    and fostering resilience and recovery in a compassionate and confidential environment.
"""

# Flatten the definition into a single line of text
mental_health_business_definition = proc_prompt(mental_health_business_definition)

# System prompt: describes the assistant's task and embeds the (already
# flattened) business definition. Lines keep their trailing space so words
# stay separated once the newlines are removed by proc_prompt.
system_prompt = f""" 
    You are an advanced AI system designed to assist a healthcare policy researcher in determining whether a business qualifies as a 
    medical facility, such as a hospital or clinic, that provides mental or behavioral healthcare services 
    for human patients according to the following definition: {mental_health_business_definition}"""

# Flatten the system prompt into a single line of text
system_prompt = proc_prompt(system_prompt)

# For the user prompt, we want to include the name of the business and the description
def create_user_prompt(name: str, description: str):
    """Build the single-line user prompt for one business.

    The template is written as one logical line instead of an indented
    triple-quoted block. Interpolating free text into an indented block before
    it is dedented leaves the template's indentation in the prompt whenever the
    description contains a line break, because the common margin is lost.

    Args:
        name: Name of the business or provider.
        description: Free-text description of the business.

    Returns:
        The user prompt after processing by ``proc_prompt`` (newlines removed).
    """
    user_prompt = (
        f'The business or provider {name} is described as: {description} '
        'Does the business provide mental or behavioral health care services?'
    )
    return proc_prompt(user_prompt)

In [None]:
# Select a sample
mental_health = 0
inpatient = 0
outpatient = 0

# Build the row filter first, then reset the index of the *filtered frame*.
# (Calling reset_index on one of the boolean masks would re-label that mask
# and misalign it with the others whenever df has a non-default index.)
sample_mask = (
    (df['mental_health'] == mental_health)
    & (df['inpatient'] == inpatient)
    & (df['outpatient'] == outpatient)
)
df_sample = df.loc[sample_mask].reset_index(drop=True)

n_samples = df_sample.shape[0]
print(f'Samples with "mental_health = {mental_health}" & "inpatient = {inpatient}" & "outpatient = {outpatient}": {n_samples}')

# Select one sample
# The position is fixed here; for a random draw use np.random.seed(4) followed
# by idx = np.random.randint(n_samples) instead.
idx = 11
print(idx)

# Pull the name and description of the chosen row
sample_row = df_sample.iloc[idx]
name = sample_row.get('name')
description = sample_row.get('description')

# Build the chat messages for this business and show them
user_prompt = create_user_prompt(name=name, description=description)
messages = create_messages(system_prompt=system_prompt, user_prompt=user_prompt)
print(*messages, sep='\n\n')

### Send the messages to the model ###

In [None]:
# Communicate with the model
# The MentalHealth model is passed as the response format so the reply is
# parsed into that schema; temperature is set to 0.
request_kwargs = {
    'model': model_name,
    'messages': messages,
    'temperature': 0,
    'response_format': MentalHealth,
}
response = client.beta.chat.completions.parse(**request_kwargs)

# Keep the first returned choice
response_choice = response.choices[0]

In [None]:
# Here is the response that we want
response_message = response_choice.message
parsed_response = response_message.parsed
# `parsed` is None when the model refuses to answer, so fall back to an empty
# dictionary instead of failing on None.model_dump()
response_message_dict = parsed_response.model_dump() if parsed_response is not None else {}
print(response_message_dict)

# We can get other useful information
response_dict = response_choice.model_dump()

# Create an output dictionary
# The refusal text is an attribute of the message, not of the choice, so it is
# read from the message; looking it up at the top level of the dumped choice
# always yields None.
output_dict = response_message_dict.copy()
output_dict.update({'refusal': response_message.refusal})