# Speech and Image Creation

This Jupyter notebook implements a multimodal AI assistant that combines chat, image generation, and text-to-speech capabilities using OpenAI's APIs. The application provides an interactive interface built with Gradio that allows users to:

- Engage in text-based conversations with an AI assistant
- Generate images based on text prompts using DALL-E 3
- Convert text responses to speech using OpenAI's text-to-speech API

## Key Components
### API Integration

- Chat completions (GPT-4o mini model)
- Image generation (DALL-E 3)
- Text-to-speech synthesis (TTS-1 model with Alloy voice)

### User Interface

- Built using Gradio blocks
- Features a split-screen layout 

### Core Functionality

- Real-time chat capabilities with message history
- Automatic image generation based on conversation context
- Text-to-speech conversion of AI responses

### Technical Requirements

- Python libraries: openai, gradio, Pillow (imported as PIL), pydub, python-dotenv (imported as dotenv)
- OpenAI API key (loaded from environment variables)
- Audio playback capabilities

In [None]:
import os
import requests
import gradio as gr 
import base64
from dotenv import load_dotenv
from openai import OpenAI
from io import BytesIO
from PIL import Image
from pydub import AudioSegment
from pydub.playback import play

In [None]:
# Setup: load environment variables, choose the models, create the API client.
load_dotenv()

# Model identifiers used by the chat and image helpers in later cells.
MODEL_CHAT = "gpt-4o-mini"
MODEL_IMAGES = "dall-e-3"

openai_api_key = os.environ.get('OPENAI_API_KEY')

# Report whether a key was found; only its first 8 characters are printed.
print(
    f"OpenAI API Key exists and begins {openai_api_key[:8]}"
    if openai_api_key
    else "OpenAI API Key not set"
)

# Client created with default settings (no explicit arguments).
openai = OpenAI()

In [None]:
# System prompt placed first in every chat request built by chat().
# NOTE(review): the text contains a duplicated "and reply"; the string is sent
# to the model verbatim, so confirm the intended wording before editing it.
system_message = "You are just receiving prompts and reply and reply exactly the sentence I have just said"

In [None]:
def chat(message, history):
    """Return the assistant's reply to *message* given the earlier *history*.

    The request consists of the system prompt, the previous turns in
    *history*, and finally the new user message.
    """
    system_turn = {"role": "system", "content": system_message}
    user_turn = {"role": "user", "content": message}
    conversation = [system_turn] + history + [user_turn]

    completion = openai.chat.completions.create(model=MODEL_CHAT, messages=conversation)
    first_choice = completion.choices[0]
    return first_choice.message.content

# Launch a text-only chat UI that calls chat() for every user message.
# chat() concatenates history directly into the request's message list, so it
# presumably relies on type="messages" delivering role/content dicts — confirm
# against the Gradio documentation.
gr.ChatInterface(fn=chat, type="messages").launch()

In [None]:
def image_generation(image):
    """Generate one 1024x1024 picture for the description *image*.

    The picture is requested in base64-encoded form, decoded, and returned
    as an image object opened with PIL.
    """
    request_options = {
        "model": MODEL_IMAGES,
        "prompt": f"You will create an {image}",
        "size": "1024x1024",
        "n": 1,
        "response_format": "b64_json",
    }
    generated = openai.images.generate(**request_options)

    # Only one image is requested, so the first entry is the result.
    raw_bytes = base64.b64decode(generated.data[0].b64_json)
    return Image.open(BytesIO(raw_bytes))

In [None]:
def handle_tool_call(message):
    """Run the image tool requested by an assistant message.

    Args:
        message: An assistant message whose first entry in ``tool_calls``
            carries JSON-encoded arguments with an ``image`` key (the text
            description of the picture to create).

    Returns:
        A ``(response, image)`` tuple. ``response`` is the ``role="tool"``
        message answering the tool call; ``image`` is the description taken
        from the tool arguments.
    """
    # Imported here because the notebook's import cell does not bring in json.
    import json

    tool_call = message.tool_calls[0]
    arguments = json.loads(tool_call.function.arguments)
    image = arguments.get('image')

    # NOTE(review): the generated picture is not part of the return value;
    # confirm whether the second tuple element should be the picture instead
    # of the description.
    image_generation(image)

    response = {
        "role": "tool",
        # The generated picture is an image object and cannot be JSON-encoded,
        # so the tool message reports a text status instead.
        "content": json.dumps({"image": image, "result": "image generated"}),
        "tool_call_id": tool_call.id
    }
    return response, image

In [None]:
# Schema describing the image tool to the chat model. The single "image"
# argument matches the key that handle_tool_call reads from the tool call's
# arguments.
image_function = {
    "name": "image_generation",
    "description": (
        "Generate an image from a text description. Call this whenever the "
        "user asks for a picture, drawing, or illustration."
    ),
    "parameters": {
        "type": "object",
        "properties": {
            "image": {
                "type": "string",
                "description": "Text description of the image to create",
            },
        },
        "required": ["image"],
        "additionalProperties": False,
    },
}

# Tool list in the shape used for the chat completions `tools` argument.
tools = [{"type": "function", "function": image_function}]

In [None]:
def talker(message):
    """Speak *message* aloud.

    The text is synthesized with the "tts-1" model and the "alloy" voice,
    the returned bytes are decoded as MP3, and the audio is played.
    """
    speech = openai.audio.speech.create(model="tts-1", voice="alloy", input=message)

    # Decode straight from memory; nothing is written to disk.
    mp3_buffer = BytesIO(speech.content)
    play(AudioSegment.from_file(mp3_buffer, format="mp3"))

In [None]:
# Manual check of talker() using a French sentence.
talker("Bonjour, comment ça va aujourd'hui")

In [None]:
def chat(history):
    """Answer the latest turn in *history*, illustrate the reply, and speak it.

    Args:
        history: Conversation so far as a list of role/content dicts. The
            assistant's reply is appended to this list in place.

    Returns:
        A ``(history, image)`` tuple: the updated conversation and the
        picture generated from the assistant's reply.
    """
    messages = [{"role": "system", "content": system_message}] + history
    response = openai.chat.completions.create(model=MODEL_CHAT, messages=messages)
    reply = response.choices[0].message.content

    # Build the picture from the reply text. Passing the whole completion
    # object would interpolate its repr into the image prompt.
    image = image_generation(reply)

    history += [{"role":"assistant", "content":reply}]
    talker(reply)

    return history, image

In [None]:
# Passing in inbrowser=True in the last line will cause a Gradio window to pop up immediately.

# Layout: one row holding the chat transcript and the generated image, one row
# with the text entry box, and one row with the Clear button.
with gr.Blocks() as ui:
    with gr.Row():
        chatbot = gr.Chatbot(height=500, type="messages")
        image_output = gr.Image(height=500)
    with gr.Row():
        entry = gr.Textbox(label="Chat with our AI Assistant:")
    with gr.Row():
        clear = gr.Button("Clear")

    def do_entry(message, history):
        """Append the user's message to history and empty the textbox.

        Returns an empty string (the new textbox value) and the updated
        history (the new chatbot value).
        """
        history += [{"role":"user", "content":message}]
        return "", history

    # On submit: record the user's turn and clear the textbox first, then call
    # chat() with the chatbot history to fill in the reply and the image.
    entry.submit(do_entry, inputs=[entry, chatbot], outputs=[entry, chatbot]).then(
        chat, inputs=chatbot, outputs=[chatbot, image_output]
    )
    # Clear sends None to the chatbot only; image_output is not among the outputs.
    clear.click(lambda: None, inputs=None, outputs=chatbot, queue=False)

ui.launch(inbrowser=True)