In [17]:
import requests
from icecream import ic
import os
from dotenv import load_dotenv

load_dotenv()

def ask_text_query(text_prompt, model_name='gpt-4o-mini', api_base="https://api.openai.com/v1", timeout=5):
    """Send one text prompt to a chat-completions endpoint and return the reply text.

    Args:
        text_prompt: Prompt sent as the single ``user`` message.
        model_name: Value placed in the ``model`` field of the request payload.
        api_base: Base URL of the API; ``/chat/completions`` is appended to it
            (a trailing slash on the base URL is tolerated).
        timeout: Timeout forwarded to ``requests.post``.

    Returns:
        The ``content`` of the first choice's message on success. This function
        never raises: on any failure it returns a string of the form
        ``"Exception: <details>"``. When the server answers with an HTTP error
        status or an ``error`` object, the details contain the status code and
        the API's error message instead of an opaque ``'choices'`` KeyError.
    """
    try:
        payload = {
            "model": model_name,
            "messages": [
                {
                    "role": "user",
                    "content": text_prompt
                }
            ],
        }

        headers = {
            "Content-Type": "application/json",
            # Raises KeyError (reported as "Exception: 'OPENAI_API_KEY'") when the key is not set.
            "Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}",
        }

        # Avoid a double slash when the caller passes a base URL ending in "/".
        url = f"{api_base.rstrip('/')}/chat/completions"

        # Set a specific timeout for the request
        response = requests.post(url, json=payload, headers=headers, timeout=timeout)
        ic(response.status_code)

        try:
            data = response.json()
        except ValueError:
            # Body is not JSON (e.g. a gateway error page): report the status rather than a parser message.
            raise RuntimeError(f"HTTP {response.status_code}: response body is not valid JSON") from None
        ic(data)

        # Error replies carry no 'choices'; surface the server's own message.
        error = data.get('error') if isinstance(data, dict) else None
        if response.status_code >= 400 or error:
            detail = error.get('message', error) if isinstance(error, dict) else (error or data)
            raise RuntimeError(f"HTTP {response.status_code}: {detail}")

        output = data['choices'][0]['message']['content']

    except Exception as e:
        # Deliberate best-effort: failures are reported in-band as a string.
        output = f"Exception: {str(e)}"

    return output

In [None]:
# Natural-language command that the robot has to interpret.
user_query = "go outside"

# Single f-string template: the instructions interpolate the user query, followed by
# a worked example. Literal JSON braces are doubled ({{ }}) so that the f-string
# emits them verbatim.
prompt1 = f"""
Given is user query: "{user_query}".
We are currently in an indoor environment that can be a warehouse, office, factory. Commands are given to a robot to navigate the environment.
Which objects or entities could the user be referring to when they say "{user_query}"? The robot would then need to go to that object or entity.
Remember that the robot should be able to go to the possible object and then perform an action suitable to the user query.
Return the possible objects in a json format.

Eg if the query is "go upstairs", the possible objects could be "stairs", "staircase", "steps". Hence the json output would be:
{{
    "possible_objects": [
        "stairs",
        "staircase",
        "steps"
    ]
}}
"""

In [13]:
# Query the model; `out` is the reply text, or an "Exception: ..." string if the call failed.
out = ask_text_query(prompt1)

ic| response.status_code: 200
ic| data: {'choices': [{'finish_reason': 'stop',
                        'index': 0,
                        'logprobs': None,
                        'message': {'content': '```json
          '
                                               '{
          '
                                               '    "possible_objects": [
          '
                                               '        "exit door",
          '
                                               '        "main entrance",
          '
                                               '        "loading dock",
          '
                                               '        "emergency exit",
          '
                                               '        "window",
          '
                                               '        "outdoor area",
          '
                                               '        "patio"
          '
                                               '    ]

In [16]:
from IPython.display import Markdown

# Render the raw model reply as Markdown so its fenced JSON block is displayed formatted.
Markdown(out)

```json
{
    "possible_objects": [
        "exit door",
        "main entrance",
        "loading dock",
        "emergency exit",
        "window",
        "outdoor area",
        "patio"
    ]
}
```

In [37]:
# parse json output and check with pydantic model
import json
import re
from pydantic import BaseModel
from typing import List

class PossibleObjects(BaseModel):
    """Pydantic schema for the parsed model reply: one field, ``possible_objects``, a list of strings."""
    possible_objects: List[str]
    
    
def postprocess_llm(response):
    """Extract the JSON object from an LLM reply and validate it as ``PossibleObjects``.

    The JSON text is located by trying, in order:
      1. a strict "```json" fenced block (the original behaviour, kept first so
         replies that parsed before yield exactly the same result);
      2. a more tolerant fence: optional/any-case ``json`` tag, trailing spaces
         after the tag, CRLF line endings;
      3. no fence at all: the span from the first ``{`` to the last ``}``, or
         the whole reply if no such span exists.

    Args:
        response: Raw text returned by the model.

    Returns:
        A ``PossibleObjects`` instance built from the decoded JSON object.

    Raises:
        json.JSONDecodeError: If the located text is not valid JSON (for
            example when ``response`` is an error string rather than a reply).
        Whatever ``PossibleObjects(**fields)`` raises when the decoded fields
        do not fit the model.
    """
    match = (
        re.search(r'```json\n(.*?)\n```', response, re.DOTALL)
        or re.search(r'```(?:json)?[ \t]*\r?\n(.*?)\r?\n[ \t]*```', response, re.DOTALL | re.IGNORECASE)
    )

    if match:
        json_string = match.group(1)
    else:
        # Models do not always fence their JSON; fall back to the outermost braces.
        start, end = response.find('{'), response.rfind('}')
        json_string = response[start:end + 1] if 0 <= start < end else response

    return PossibleObjects(**json.loads(json_string))


In [40]:
# Extract the JSON from the model reply and validate it against PossibleObjects.
obj = postprocess_llm(out)
print(obj)

possible_objects=['exit door', 'main entrance', 'loading dock', 'emergency exit', 'window', 'outdoor area', 'patio']
