In [1]:
# =================================================================
# IMPORTS
# =================================================================

import os
import json
import base64
from io import BytesIO
from dotenv import load_dotenv
from openai import OpenAI
from PIL import Image
import gradio as gr


In [2]:
# =================================================================
# CONFIGURATION & INITIALIZATION
# =================================================================

# Pull variables from a local .env file, letting them override the environment
load_dotenv(override=True)

# Report whether the key was found (only its first 8 characters are shown)
openai_api_key = os.getenv('OPENAI_API_KEY')
print(
    f"OpenAI API Key exists and begins {openai_api_key[:8]}"
    if openai_api_key
    else "OpenAI API Key not set"
)

# Chat model used for every completion request, and the shared API client
MODEL = "gpt-4.1-mini"
openai = OpenAI()




OpenAI API Key exists and begins sk-proj-


In [None]:
# =================================================================
# SYSTEM PROMPT & TOOL DEFINITION
# =================================================================

# System prompt sent as the first message of every chat completion in chat().
# The <QUOTE></QUOTE> tags it asks for are what extract_quote() and
# remove_quote_tags() look for, so the tag spelling here must match the
# patterns used by those helpers.
systemPrompt = '''You are a wise Buddhist monk. When a user shares how they are feeling, provide:

1. A comforting Buddhist quote (wrapped in <QUOTE></QUOTE> tags)
2. Additional wisdom on how to interpret the quote

Format your response like this:
<QUOTE>The quote text here</QUOTE>
Additional commentary and wisdom here...

Do not include the author name if it is just "Buddha" or "Unknown", unless it is a specific other teacher. 

If the feeling is not shared (e.g., someone says hello), simply say hello back and ask how they are feeling. No need for quote tags in that case.

When the user shares a genuine feeling or emotional state (like sad, happy, anxious, peaceful, etc.), you should call the generate_spiritual_image tool to create a comforting visual representation.'''

# Function-calling schema that lets the model request an image.
# The single required argument, "feeling", is forwarded to generate_spiritual_image().
image_tool = dict(
    type="function",
    function=dict(
        name="generate_spiritual_image",
        description="Generate a serene, spiritual Buddhist-style image to comfort the user based on their emotional state. Only call this when the user has shared a genuine feeling or emotion (e.g., sad, anxious, peaceful, grateful). Do NOT call this for greetings like 'hello' or general questions.",
        parameters=dict(
            type="object",
            properties=dict(
                feeling=dict(
                    type="string",
                    description="The user's emotional state or feeling to visualize (e.g., 'sadness', 'anxiety', 'peace', 'gratitude')",
                ),
            ),
            required=["feeling"],
            additionalProperties=False,
        ),
    ),
)

# List of tools passed to every chat completion request
tools = [image_tool]


In [None]:
# =================================================================
# MAIN CHAT FUNCTION (With Tool Calling)
# =================================================================

def extract_quote(text):
    """
    Pull out the passage enclosed by the first <QUOTE>...</QUOTE> pair.

    Args:
        text (str): Assistant reply that may contain quote tags

    Returns:
        str: The enclosed passage with surrounding whitespace stripped,
             or the input unchanged when no complete tag pair is present
    """
    import re
    # DOTALL lets the quote span several lines; the lazy group stops at the first closing tag
    quote_pattern = re.compile(r'<QUOTE>(.*?)</QUOTE>', re.DOTALL)
    found = quote_pattern.search(text)
    if found is None:
        return text
    return found.group(1).strip()


def remove_quote_tags(text):
    """
    Strip every <QUOTE> and </QUOTE> marker, keeping the text between them.

    Args:
        text (str): Assistant reply that may contain quote tags

    Returns:
        str: The same text with all opening and closing quote tags deleted
    """
    import re
    tag_pattern = re.compile(r'<QUOTE>|</QUOTE>')
    return tag_pattern.sub('', text)


def _run_tool_call(tool_call):
    """Execute one tool call requested by the model and return the text to send back as its result."""
    tool_name = tool_call.function.name
    if tool_name != "generate_spiritual_image":
        # Still answer, so the API sees a result for every tool_call_id
        return f"Error: unknown tool '{tool_name}'"

    try:
        arguments = json.loads(tool_call.function.arguments or "{}")
    except json.JSONDecodeError as exc:
        return f"Error: could not parse tool arguments ({exc})"

    feeling = arguments.get("feeling") if isinstance(arguments, dict) else None
    if not feeling:
        return "Error: the 'feeling' argument is required"

    return generate_spiritual_image(feeling)


def chat(history, max_tool_rounds=5):
    """
    Main chat callback that handles conversation with the Buddhist monk AI,
    including tool calling for conditional image generation.

    Args:
        history (list): List of message dictionaries with 'role' and 'content' keys.
            The last entry is treated as the user's newest message.
        max_tool_rounds (int): Upper bound on consecutive tool-calling rounds.
            At least one round is always honoured; once the bound is reached
            the model is asked again with tool_choice="none" so it must reply
            in text.

    Returns:
        tuple: (updated_history, audio_bytes, image_object). `history` is
            extended in place with the assistant reply. `audio_bytes` is None
            when there is no text to speak. `image_object` is whatever
            generate_spiritual_image() stored in `current_image` during this
            turn, or None.
    """
    global current_image
    current_image = None  # Reset image for each message

    # Handle empty history edge case
    if not history:
        return history, None, None

    # Extract the user's latest message (added by put_message_in_chatbot)
    message = history[-1]["content"]

    # Keep only role/content from earlier turns, then add the new message explicitly
    history_for_api = [{"role": h["role"], "content": h["content"]} for h in history[:-1]]
    messages = (
        [{"role": "system", "content": systemPrompt}]
        + history_for_api
        + [{"role": "user", "content": message}]
    )

    # First API call with tools available
    response = openai.chat.completions.create(
        model=MODEL,
        messages=messages,
        tools=tools
    )

    rounds = 0
    while response.choices[0].finish_reason == "tool_calls":
        assistant_message = response.choices[0].message

        # The assistant message carrying the tool calls is appended exactly once,
        # followed by one tool result per tool_call_id.
        messages.append(assistant_message)
        for tool_call in assistant_message.tool_calls or []:
            messages.append({
                "role": "tool",
                "content": _run_tool_call(tool_call),
                "tool_call_id": tool_call.id
            })

        rounds += 1
        if rounds >= max_tool_rounds:
            # Budget exhausted: forbid further tool calls so the model answers in text
            response = openai.chat.completions.create(
                model=MODEL,
                messages=messages,
                tools=tools,
                tool_choice="none"
            )
            break

        # Call API again with tool results
        response = openai.chat.completions.create(
            model=MODEL,
            messages=messages,
            tools=tools
        )

    # The content can be None (e.g. a message made only of tool calls); treat as empty text
    reply_text = response.choices[0].message.content or ""

    # Speak just the quote (or the full message if no quote); skip TTS when there is nothing to say
    quote_only = extract_quote(reply_text)
    audio = talker(quote_only) if quote_only.strip() else None

    # Remove quote tags for clean display in chat
    display_message = remove_quote_tags(reply_text)
    history.append({"role": "assistant", "content": display_message})

    # Return the image if it was generated, otherwise None
    return history, audio, current_image


In [None]:
# =================================================================
# HELPER FUNCTIONS - OpenAI API Calls
# =================================================================

def generate_spiritual_image(feeling):
    """
    TOOL FUNCTION: ask the "dall-e-3" image model for a Buddhist-style painting.
    Invoked from the chat loop when the model requests this tool.

    Args:
        feeling (str): The emotional state to depict; interpolated into the prompt

    Returns:
        str: Confirmation text for the model. The picture itself is not returned;
             it is stored in the module-level `current_image`.
    """
    global current_image

    print(f"ðŸŽ¨ TOOL CALLED: Generating spiritual image for feeling: {feeling}")

    painting_prompt = f"A serene, spiritual, abstract Buddhist style painting representing the feeling of {feeling}"
    generation = openai.images.generate(
        model="dall-e-3",
        prompt=painting_prompt,
        size="1024x1024",
        n=1,
        response_format="b64_json",
    )

    # The API returns the image base64-encoded; decode and hand the bytes to PIL
    decoded_bytes = base64.b64decode(generation.data[0].b64_json)
    current_image = Image.open(BytesIO(decoded_bytes))

    return f"Spiritual image generated for {feeling}"


def talker(message):
    """
    Turn text into spoken audio through the OpenAI speech endpoint.

    Args:
        message (str): Text to be spoken

    Returns:
        The `content` attribute of the speech response (the audio payload)
    """
    speech_request = {
        "model": "gpt-4o-mini-tts",
        "voice": "onyx",  # Try: alloy, echo, fable, onyx, nova, shimmer
        "input": message,
    }
    speech = openai.audio.speech.create(**speech_request)
    return speech.content


# Global variable to store current image.
# generate_spiritual_image() stores the opened PIL image here; chat() resets it
# to None at the start of every turn and returns it at the end of the turn.
# NOTE(review): this is module-level state, so every caller of chat() shares
# it — confirm that is acceptable if more than one user can be connected at once.
current_image = None


In [None]:
# =================================================================
# GRADIO UI SETUP
# =================================================================

def put_message_in_chatbot(message, history):
    """
    First step of the submit chain: record what the user typed.
    Runs before chat(), which then reads the entry added here.

    Args:
        message (str): Text taken from the input box
        history (list): Conversation so far, as role/content dictionaries

    Returns:
        tuple: ("", new_history) - the empty string clears the input box and
               new_history is a fresh list ending with the user's message
    """
    user_entry = {"role": "user", "content": message}
    extended_history = history + [user_entry]  # builds a new list; the input is not mutated
    cleared_textbox = ""
    return cleared_textbox, extended_history


# Build the interface with Blocks so the layout can be arranged row by row
with gr.Blocks() as ui:
    # Row 1: conversation on the left, generated picture on the right
    with gr.Row():
        chatbot = gr.Chatbot(height=500, type="messages")
        image_output = gr.Image(height=500, interactive=False)

    # Row 2: spoken quote, started automatically when new audio arrives
    with gr.Row():
        audio_output = gr.Audio(autoplay=True)

    # Row 3: where the user types
    with gr.Row():
        message = gr.Textbox(label="Share how you're feeling:")

    # Step 1: on submit, move the typed text into the chat history and clear the box
    submit_event = message.submit(
        put_message_in_chatbot,
        inputs=[message, chatbot],
        outputs=[message, chatbot]
    )

    # Step 2: once step 1 has finished, chat() produces the reply, audio and image
    submit_event.then(
        chat,
        inputs=chatbot,
        outputs=[chatbot, audio_output, image_output]
    )

# Start the app
# inbrowser=True: Opens in default browser
# inline=False: Prevents inline display in notebook
ui.launch(inbrowser=True, inline=False)


In [None]:
# =================================================================
# NOTES & OPTIMIZATION SUMMARY
# =================================================================

"""
✅ CODE STRUCTURE (Optimal Order):
1. Imports - All dependencies grouped together
2. Configuration - API key check, model name, OpenAI client
3. System Prompt & Tool Definition - AI behavior + image generation tool
4. Helper Functions - generate_spiritual_image() and talker()
5. Main Chat Function - Core conversation logic with tool calling
6. Gradio UI - Interface definition and launch

✅ KEY FEATURES:
- Multi-modal Buddhist monk chatbot
- Text responses with quote + commentary (displayed in chat)
- Text-to-speech audio (ONLY the quote, not commentary)
- **SMART** DALL-E image generation (only when user shares a feeling!)
- Conversation history maintained throughout session

✅ AUDIO OPTIMIZATION:
- System prompt instructs LLM to wrap quotes in <QUOTE></QUOTE> tags
- extract_quote() function extracts just the quote from response
- Audio only speaks the pure quote, not the commentary
- User sees full message in chat, hears only the quote
- Creates a more meditative, focused audio experience

✅ TOOL CALLING IMPLEMENTATION:
- Image generation is a TOOL that the LLM decides to call
- LLM only calls the tool when user shares genuine emotion (sad, anxious, etc.)
- Greetings like "hello" won't trigger expensive image generation
- Saves cost by generating images only when contextually appropriate

✅ HOW IT WORKS:
1. User sends message
2. LLM analyzes if it's a feeling/emotion
3. If YES → calls generate_spiritual_image tool → image appears
4. If NO (greeting/question) → responds without calling tool → no image
5. Audio extracts and speaks ONLY the quote portion

✅ OPTIMIZATION NOTES:
- Tool-based approach reduces unnecessary API costs
- Global variable stores generated image temporarily
- Regex extraction cleanly separates quote from commentary
- Proper tool calling loop handles multiple tool calls
- type="messages" on gr.Chatbot ensures the role/content message format

✅ COST SAVINGS:
- DALL-E 3 images: $0.04-$0.12 per image (only when needed)
- Expected image cost reduction: 50-70%
- TTS audio runs for all responses but only on quote text (shorter = cheaper)
"""
