# GPT-4V camera capture assistant, based on the OpenAI GPT-4V (vision) and TTS APIs

[By Brain Assistant](https://assistant.ruoguedu.com)

With the assistant, GPT can now not only see but also interpret the world around us.
Open it from [Colab](https://colab.research.google.com/github/davideuler/awesome-assistant-api/blob/main/GPT-4V-Vision-Interpreter-by-Camera-And-TTS.ipynb)

A demo from:
[Awesome Assistant API.](https://github.com/davideuler/awesome-assistant-api)

The Tools and core technologies used here:
- **OpenCV (Open Source Computer Vision Library):** A library of programming functions aimed at real-time computer vision.

- **OpenAI’s API:** Provides access to powerful AI models capable of understanding and generating natural language and, now, analyzing images.

In [None]:
!pip install openai opencv-python requests

In [None]:
import os
# Prompt for the OpenAI API key with getpass (the typed key is not echoed)
# and store it in the OPENAI_API_KEY environment variable of this process.
import getpass
os.environ['OPENAI_API_KEY'] = getpass.getpass("Enter the OpenAI API Key(which starts with sk-): ")


Enter the OpenAI API Key(which starts with sk-): ··········


In [38]:
from openai import OpenAI
import cv2
import time
import base64
import requests

# for colab camera
from IPython.display import display, Javascript
from google.colab.output import eval_js
from base64 import b64decode

# Module-level OpenAI SDK client; its key is read from the OPENAI_API_KEY
# environment variable (None if the variable is unset).
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))


class CameraApp:
    """Capture a photo, ask a vision model about it, and speak the answer.

    Two capture paths are provided: ``take_photo`` (browser camera inside
    Google Colab) and ``capture_image`` (local camera through OpenCV). The
    captured JPEG is base64-encoded, posted to the OpenAI chat completions
    endpoint, and the textual answer is converted to an MP3 file.
    """

    def __init__(self, camera_index, api_key):
        """Store the settings used by the capture and request methods.

        Args:
            camera_index: Device index passed to ``cv2.VideoCapture`` by
                ``initialize_camera``.
            api_key: OpenAI API key sent as the bearer token by
                ``send_request``.
        """
        self.camera_index = camera_index
        self.api_key = api_key

    def initialize_camera(self):
        """Open the OpenCV capture device and keep it in ``self.cap``.

        Returns:
            True if the device was opened, False otherwise (a message is
            printed in that case).
        """
        self.cap = cv2.VideoCapture(self.camera_index)
        if not self.cap.isOpened():
            print("Cannot open camera")
            return False
        return True

    # take photo in Google Colab:
    # https://colab.research.google.com/notebooks/snippets/advanced_outputs.ipynb#scrollTo=buJCl90WhNfq

    def take_photo(self, filename='photo.jpg', quality=0.8):
        """Take a photo with the browser camera in Colab and save it as JPEG.

        Injects a small JavaScript widget showing the live video and a
        "Capture" button, waits for the click, and writes the returned
        JPEG data URL to ``filename``.

        Args:
            filename: Path of the image file to write.
            quality: JPEG quality passed to ``canvas.toDataURL``.

        Returns:
            The ``filename`` that was written.
        """
        js = Javascript('''
        async function takePhoto(quality) {
          const div = document.createElement('div');
          const capture = document.createElement('button');
          capture.textContent = 'Capture';
          div.appendChild(capture);

          const video = document.createElement('video');
          video.style.display = 'block';
          const stream = await navigator.mediaDevices.getUserMedia({video: true});

          document.body.appendChild(div);
          div.appendChild(video);
          video.srcObject = stream;
          await video.play();

          // Resize the output to fit the video element.
          google.colab.output.setIframeHeight(document.documentElement.scrollHeight, true);

          // Wait for Capture to be clicked.
          await new Promise((resolve) => capture.onclick = resolve);

          const canvas = document.createElement('canvas');
          canvas.width = video.videoWidth;
          canvas.height = video.videoHeight;
          canvas.getContext('2d').drawImage(video, 0, 0);
          stream.getVideoTracks()[0].stop();
          div.remove();
          return canvas.toDataURL('image/jpeg', quality);
        }
        ''')
        display(js)
        data = eval_js('takePhoto({})'.format(quality))
        # The data URL looks like "data:image/jpeg;base64,<payload>";
        # only the payload after the comma is image data.
        binary = b64decode(data.split(',')[1])
        with open(filename, 'wb') as f:
            f.write(binary)
        return filename

    def capture_image(self, filename='capture.jpg'):
        """Grab one frame from the local camera via OpenCV and save it.

        Args:
            filename: Path of the image file to write.

        Returns:
            True if a frame was read and written, False otherwise.
        """
        if not self.initialize_camera():
            return False
        try:
            time.sleep(2)  # Camera warm-up
            for _ in range(10):
                self.cap.read()  # Autoexposure adjustment
            ret, frame = self.cap.read()
            # Only report success when the frame was actually written.
            if ret and cv2.imwrite(filename, frame):
                print("Image captured successfully")
                ret = True
            else:
                print("Failed to capture image")
                ret = False
        finally:
            # Release the device even if reading or writing raised.
            self.cap.release()
            cv2.destroyAllWindows()
        return ret

    def encode_image(self, image_path):
        """Return the contents of ``image_path`` as a base64 text string."""
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    def send_request(self, image_data, custom_prompt=None, timeout=60):
        """Send the image and a prompt to the chat completions endpoint.

        Args:
            image_data: Base64-encoded JPEG, as produced by ``encode_image``.
            custom_prompt: Text prompt sent with the image. When None, a
                default prompt asking for a short description followed by
                one question is used.
            timeout: Seconds to wait for the HTTP request before
                ``requests`` raises a timeout error.

        Returns:
            The decoded JSON response body. If it contains an ``error``
            key, the error is also printed.
        """
        headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}

        # Default prompt asks a question based on the image
        if custom_prompt is None:
            custom_prompt = """Analyze this image and provide a brief description (2-3 sentences),
    then ask me ONE thoughtful question about what you see. Your question should be directly
    related to something specific in the image and conversational. Keep your entire response
    concise (50-80 words) as it will be converted to speech. End with your question."""

        payload = {
            "model": "gpt-4o",  # vision-capable model
            "messages": [{
                "role": "user",
                "content": [
                    {"type": "text", "text": custom_prompt},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}}
                ]
            }],
            "max_tokens": 300
        }

        # Without a timeout, requests waits indefinitely on a stalled connection.
        response = requests.post(
            "https://api.openai.com/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=timeout,
        )
        response_json = response.json()

        # Surface API-level failures; the caller still receives the full body.
        if 'error' in response_json:
            print("API Error:", response_json['error'])

        return response_json

    def save_response_as_audio(self, response_data, audio_output="output.mp3"):
        """Convert the model's answer to speech and save it as an audio file.

        Uses the module-level ``client`` for the text-to-speech request.

        Args:
            response_data: Response dict returned by ``send_request``.
            audio_output: Path of the audio file to write.

        Returns:
            The answer text that was spoken, or None when the response
            holds no usable text (missing or empty ``choices``, or empty
            content); the response is printed in that case.
        """
        try:
            text = response_data['choices'][0]['message']['content']
        except (KeyError, IndexError, TypeError):
            text = None

        if not text:
            print("Error: Unexpected response structure")
            print("Full response:", response_data)
            return None

        response = client.audio.speech.create(
            model="tts-1",
            voice="alloy",
            input=text,
        )
        response.write_to_file(audio_output)
        return text

    def openai_api_calculate_cost(self, usage, model="gpt-4o"):
        """Print and return the cost of one API call.

        Args:
            usage: Mapping with ``prompt_tokens`` and ``completion_tokens``.
                ``total_tokens`` is used for the printed summary when
                present; otherwise the sum of the two counts is shown.
            model: Key into the pricing table below.

        Returns:
            The total cost as a float, in the same currency unit as the
            pricing table (printed with a ``$`` prefix).

        Raises:
            ValueError: If ``model`` is not in the pricing table.
        """
        # Prices per 1,000 tokens.
        pricing = {
            'gpt-3.5-turbo': {
                'prompt': 0.0005,
                'completion': 0.0015,
            },
            'gpt-4': {
                'prompt': 0.03,
                'completion': 0.06,
            },
            'gpt-4o': {
                'prompt': 0.0025,
                'completion': 0.01,
            },
            'gpt-4-turbo': {
                'prompt': 0.01,
                'completion': 0.03,
            },
        }

        try:
            model_pricing = pricing[model]
        except KeyError:
            raise ValueError(f"Invalid model specified: {model}") from None

        prompt_tokens = usage['prompt_tokens']
        completion_tokens = usage['completion_tokens']
        total_tokens = usage.get('total_tokens', prompt_tokens + completion_tokens)

        prompt_cost = prompt_tokens * model_pricing['prompt'] / 1000
        completion_cost = completion_tokens * model_pricing['completion'] / 1000

        total_cost = prompt_cost + completion_cost
        print(f"\nTokens used:  {prompt_tokens:,} prompt + {completion_tokens:,} completion = {total_tokens:,} tokens")
        print(f"Total cost for {model}: ${total_cost:.4f}\n")

        return total_cost




Now let's run the application: capture an image from the camera, save the model's response to an MP3 audio file, and play the audio.

In [45]:
from IPython.display import Audio
from IPython.display import display

# Run the full pipeline once: photo -> vision request -> speech -> playback.
camera_app = CameraApp(camera_index=1, api_key=client.api_key)

# take_photo returns the path it wrote; reuse it so the file name is
# specified in exactly one place.
photo_path = camera_app.take_photo(filename='capture.jpg')
if photo_path:
    encoded_image = camera_app.encode_image(photo_path)
    response_data = camera_app.send_request(encoded_image)

    print("Response data:", response_data)

    # Returns None (after printing the response) when there is no answer text.
    interpretation = camera_app.save_response_as_audio(response_data, audio_output="output.mp3")

    if interpretation:
        print("AI Question: %s" % interpretation)
        # A response without usage data should not prevent audio playback.
        if 'usage' in response_data:
            total_cost = camera_app.openai_api_calculate_cost(response_data['usage'])
        wn = Audio("output.mp3", autoplay=True)
        display(wn)

<IPython.core.display.Javascript object>

Response data: {'id': 'chatcmpl-COBx0fVQAvWevj0ynMgd8aWEecPq0', 'object': 'chat.completion', 'created': 1759881846, 'model': 'gpt-4o-2024-08-06', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': 'The image shows a person holding a "Honey Stinger Energy Waffle" in a vanilla flavor, with a focused expression. The setting appears to be indoors, with a window and some wall decor visible in the background. How do you usually use energy products like this—before workouts, during activities, or just for an energy boost?', 'refusal': None, 'annotations': []}, 'logprobs': None, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 502, 'completion_tokens': 67, 'total_tokens': 569, 'prompt_tokens_details': {'cached_tokens': 0, 'audio_tokens': 0}, 'completion_tokens_details': {'reasoning_tokens': 0, 'audio_tokens': 0, 'accepted_prediction_tokens': 0, 'rejected_prediction_tokens': 0}}, 'service_tier': 'default', 'system_fingerprint': 'fp_cbf1785567'}
AI Question: The image show