## Setup dog

The aim of this notebook is to simulate the decision-making of a robot dog in search of food. This is still very basic testing, so we are using a limited set of actions:

In [30]:
# Discrete actions the robot dog may pick from on a turn. The list is used as
# the `enum` of the JSON schemas and is interpolated into the VQA prompts.
actions = ['go forwards', 'turn left', 'turn right', 'turn around', 'jump', 'beep', 'eat']

#### Why not have stuff like `take x steps`?

A dog cannot work out how many steps to take unless it has some "sense of physical self": How long is its stride? What is too high to step over? And so on. That could be rather difficult to "teach" it.

## Set up basic functions

In [16]:
import http.client
import json
import base64
from pprint import pprint
from glob import glob
import os
import random

In [21]:
# Load API keys from a local .env file when python-dotenv is available.
try:
    from dotenv import load_dotenv
    load_dotenv()
    print("Environment variables loaded from .env")
except ImportError:
    # python-dotenv is not installed: fall back to placeholders, but never
    # clobber keys that are already exported in the real environment.
    os.environ.setdefault('OPENAI_API_KEY', "<your OpenAI key>")
    os.environ.setdefault('SCENEX_SECRET', "<your SceneXplain key>")

Environment variables loaded from .env


In [22]:
# SceneXplain API secret, read from the environment (None when unset).
SCENEX_SECRET = os.environ.get('SCENEX_SECRET')

# Algorithm name sent with every image row of a SceneXplain request.
ALGO = "Jelly"

# HTTP headers attached to every SceneXplain request.
scenex_headers = dict([
    ("x-api-key", "token {}".format(SCENEX_SECRET)),
    ("content-type", "application/json"),
])

def image_to_data_uri(file_path):
    """
    Read an image file and return it as a base64 `data:` URI.

    file_path: path to the image file

    The MIME type is guessed from the file extension, so a `.jpg`/`.jpeg`
    file is labelled `image/jpeg` instead of always being labelled PNG.
    Files whose extension is unknown or not an image type fall back to
    `image/png`.
    """
    import mimetypes  # local import: only needed by this helper

    mime_type, _ = mimetypes.guess_type(file_path)
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/png"

    with open(file_path, "rb") as image_file:
        encoded_image = base64.b64encode(image_file.read()).decode("utf-8")

    return f"data:{mime_type};base64,{encoded_image}"
        
def generate_scenex_data(image_files, json_schema=None, question=None, features=None):
    """
    Build the request payload for the SceneX describe endpoint.

    image_files: iterable of image file paths; one row is produced per file
    json_schema: optional schema attached to every row as "json_schema"
    question: optional question attached to every row as "question"
    features: optional list of feature names sent with every row
              (defaults to an empty list)

    Returns a dict of the form {"data": [row, ...]}.
    """
    # A `[]` default would be one shared list aliased into every row ever
    # built; normalise to a fresh list instead.
    features = list(features) if features is not None else []

    rows = []
    for file in image_files:
        row = {
            "image": image_to_data_uri(file),
            # each row gets its own copy so editing one row cannot affect another
            "features": list(features),
            "algorithm": ALGO,
            # file name only; basename also copes with OS-specific separators
            "cid": os.path.basename(file),
        }

        if question:
            row["question"] = question

        if json_schema:
            row["json_schema"] = json_schema

        rows.append(row)

    return {"data": rows}

def process_scenex(data):
    """
    POST a payload to the SceneX describe endpoint and return the first result.

    data: request payload as built by generate_scenex_data

    Returns the first element of the "result" list in the decoded JSON
    response. Raises KeyError/IndexError if the response has no such entry.
    """
    connection = http.client.HTTPSConnection("api.scenex.jina.ai")
    try:
        connection.request("POST", "/v1/describe", json.dumps(data), scenex_headers)
        response = connection.getresponse()
        response_data = response.read().decode("utf-8")
    finally:
        # always release the socket, even if the request or the read fails
        connection.close()

    return json.loads(response_data)['result'][0]

## Load images

In [86]:
def load_images(folder_name):
    """
    Collect the jpg, jpeg and png files directly inside a folder.

    folder_name: path of the folder, with or without a trailing slash

    Returns a list of file paths grouped by extension (jpg, jpeg, png) and
    sorted by name within each group, so the order is reproducible.
    """
    filetypes = ['jpg', 'jpeg', 'png']
    image_files = []

    for filetype in filetypes:
        # os.path.join inserts the separator when folder_name lacks a trailing
        # slash; plain concatenation would silently match nothing in that case.
        pattern = os.path.join(folder_name, f'*.{filetype}')
        image_files.extend(sorted(glob(pattern)))

    return image_files

In [87]:
# Folder containing the food-maze test images.
image_folder = "street_images/food_maze/"

In [88]:
# Convert every .webp file under street_images to JPEG.
# Requires ImageMagick's `mogrify` to be available on PATH.
!find street_images -type f -name "*.webp" -exec mogrify -format jpeg {} \;

In [89]:
# Gather the jpg/jpeg/png files that sit directly inside ./street_images/.
file_path = './street_images/'
filetypes = ['jpg', 'jpeg', 'png']

image_files = []
for filetype in filetypes:
    image_files += glob(file_path + '*.' + filetype)

### JSON Schema

In [233]:
# Schema sent to SceneX for the memory-enabled mission. The `description`
# strings double as instructions to the model for each output field.
base_schema = {
    "type": "object",
    "properties": {
        "image_description": {
            "type": "string",
            "description": "Describe the image in under 10 words"
        },
        "action": {
            "type": "array",
            "description": "The image shows what is in front of you. What action do you take? Base this action on the image and your previous actions in the `history` field of this JSON",
            "enum": actions,
            # NOTE(review): presumably meant to limit the answer to a single action — confirm the API honours it.
            "maxContains": 1,
        },
        "history": {
            # NOTE(review): None serialises to JSON null, which is not a standard JSON Schema type;
            # presumably this field only carries text for the model to read — confirm.
            "type": None,
            # take_step writes the summarised history into this description before each request.
            "description": "You are a robot dog. You just woke up and you want to find food by exploring your surroundings."
        },
        "context": {
            "type": "string",
            "description": "Explain the decision you just made and why. Use the past tense and output under 50 words",
        },
        "history_recap": {
            "type": "string",
            "description": "recap the `[history][description]` field of this JSON. Incorporate the contents of the context field"
        },
    }
}

## Simple mission

No history/memory

In [248]:
# Reuse the folder constant defined in the "Load images" section instead of
# repeating the path literal.
maze_images = load_images(image_folder)

In [250]:
# Schema sent to SceneX for the memory-less mission. The `description`
# strings double as instructions to the model for each output field.
simple_schema = {
    "type": "object",
    "properties": {
        "mission": {
            # NOTE(review): None serialises to JSON null, which is not a standard JSON Schema type;
            # presumably this field only carries the mission text for the model — confirm.
            "type": None,
            "description": "You are a robot dog whose mission is to find food"
        },
        "image_description": {
            "type": "string",
            "description": "Describe the image in under 10 words"
        },
        "action": {
            "type": "array",
            "description": "The image shows what is in front of you. What action do you take?",
            "enum": actions,
            # NOTE(review): presumably meant to limit the answer to a single action — confirm the API honours it.
            "maxContains": 1,
        },
        "context": {
            "type": "string",
            "description": "Explain the action you took and why"
        }
    }
}

In [237]:
def take_simple_step(
    image: str,
    schema: dict,
    ):
    """
    Ask SceneX for one memory-less decision about a single image.

    image: path to image file
    schema: json schema

    Returns a dict holding the image path under "image" and the parsed JSON
    answer under "result".
    """
    print(f"Selected image {image}")

    payload = generate_scenex_data([image], json_schema=schema, features=['json'])
    answer_text = process_scenex(payload)['text']

    return {"image": image, "result": json.loads(answer_text)}

In [247]:
def start_simple_loop(images, schema, max_turns=5, debug=False):
    """
    Run the memory-less mission for a fixed number of turns.

    images: list of image paths; one is picked at random every turn
    schema: json schema passed unchanged to take_simple_step
    max_turns: number of turns to play
    debug: when true, print the turn number and pretty-print each step

    Returns the list of step dicts in the order they were taken.
    """
    steps = []

    for turn in range(max_turns):
        if debug:
            print(f"Turn {turn}")

        # pick a random image and let the dog decide
        this_step = take_simple_step(random.choice(images), schema)

        if debug:
            pprint(this_step)
            print("-"*20)

        steps.append(this_step)

    return steps

In [252]:
# Play three turns of the memory-less mission over the maze images.
simple_output = start_simple_loop(maze_images, simple_schema, 3)

Selected image street_images/food_maze/left_right.png
Selected image street_images/food_maze/left_right.png
Selected image street_images/food_maze/danger_food.png


In [253]:
simple_output

[{'image': 'street_images/food_maze/left_right.png',
  'result': {'mission': 'find food',
   'image_description': 'Tunnel with railway, guiding sign towards food',
   'action': ['go forwards'],
   'context': "There is a light at the end of the tunnel and a sign reading 'FOOD' pointing the way, suggesting the direction to find food."}},
 {'image': 'street_images/food_maze/left_right.png',
  'result': {'mission': 'find food',
   'image_description': "Tunnel with 'FOOD' sign and lights at the end",
   'action': ['go forwards'],
   'context': "The 'FOOD' sign indicates that food is in the direction of the tunnel's end, hence the decision to go forwards."}},
 {'image': 'street_images/food_maze/danger_food.png',
  'result': {'mission': 'Find food',
   'image_description': "Wooden signs saying 'DANGER' and 'FOOD' in cave",
   'action': ['go forwards'],
   'context': "Proceeding forwards to the sign labeled 'FOOD', because my mission is to find food."}}]

## VQA attempt

In [31]:
# Output template embedded (as JSON) in the VQA prompt: each key is a field the
# model is asked to return and each value describes what belongs in it.
vqa_output_schema = {
    "description": "short description of the image",
    "action": "action you chose to take",
    "reason": "why did you take that action?",
    # "history": "recap of the history of your experiences",
    "context": "explain your environment, what action you took, and why you took that action"
}

In [43]:
# Base VQA prompt: the mission, the allowed actions and the expected output
# format. The loop cell below passes `base_text` to vqa_loop, so the prompt is
# defined under that name (the stray ")" after the action list is removed).
base_text = f"""
You are a robot dog. Your mission is to explore your environment to find and eat food. For each turn, you can choose one action from {str(actions)}.

Choose your action based on:
- The contents of the image
- Your mission
- Your previous experience

Return your output in the following JSON Schema:

{json.dumps(vqa_output_schema)}
""".strip()

# Keep the previous variable name available for any cell that still uses it.
text = base_text

In [90]:
# Reuse the folder constant defined in the "Load images" section instead of
# repeating the path literal.
images = load_images(image_folder)

In [91]:
def take_vqa_step(
    image: str,
    question: str,
    ):
    """
    Ask SceneX a visual question about one image and parse the JSON answer.

    image: path to image file
    question: vqa question

    Returns a dict holding the image path under "image" and the parsed JSON
    answer under "result".
    """
    print(f"Selected image {image}")

    payload = generate_scenex_data([image], question=question, features=['question_answer'])
    answer_text = process_scenex(payload)['text']

    return {"image": image, "result": json.loads(answer_text)}

### Running in loop

In [92]:
def vqa_loop(images: list, base_question: str, turns: int=5):
    """
    Play the VQA mission for at most `turns` turns.

    images: list of image paths; one is picked at random every turn
    base_question: prompt sent with every image
    turns: maximum number of turns

    The same question is sent every turn; no history is accumulated here.
    The loop stops early as soon as the returned action equals "eat".

    Returns the list of step dicts in the order they were taken.
    """
    steps = []

    for _ in range(turns):
        step = take_vqa_step(image=random.choice(images), question=base_question)

        chosen_action = step['result']['action']
        print(f"I decide to {chosen_action}")

        steps.append(step)

        if chosen_action == "eat":
            print("nom nom nom")
            break

    return steps

In [93]:
steps = vqa_loop(images, base_question=base_text, turns=10)

Selected image street_images/food_maze/cat_dog.jpeg
I decide to turn left
Selected image street_images/food_maze/confusing_sign.jpeg
I decide to turn left
Selected image street_images/food_maze/food_right.jpeg
I decide to turn right
Selected image street_images/food_maze/game_over.jpeg
I decide to beep
Selected image street_images/food_maze/food_right.jpeg
I decide to turn right
Selected image street_images/food_maze/cat_food.jpeg
I decide to turn right
Selected image street_images/food_maze/monster.jpeg
I decide to turn around
Selected image street_images/food_maze/food_right.jpeg
I decide to turn right
Selected image street_images/food_maze/danger_or_less.jpeg
I decide to turn right
Selected image street_images/food_maze/cat_dog.jpeg
I decide to turn left


In [94]:
pprint(steps)

[{'image': 'street_images/food_maze/cat_dog.jpeg',
  'result': {'action': 'turn left',
             'context': 'Inside a dark cave, I came across a signpost with two '
                        'directional arrows. One of the arrows, pointing to '
                        "the left, was labeled 'Dog Food'. This prompted me to "
                        'take a turn to the left, aiming to reach the dog food '
                        'as it is my designated food source.',
             'description': 'A dark cave with a wooden signpost indicating '
                            "directions for 'Dog Food' and 'Cat Food'.",
             'reason': "The left arrow is labeled 'Dog Food' which is my "
                       'intended food source.'}},
 {'image': 'street_images/food_maze/confusing_sign.jpeg',
  'result': {'action': 'turn left',
             'context': 'In a dark and mysterious environment, I came across a '
                        "traffic sign instructing me to 'TURN LEFT NOW'. Even "

## VQA simpler

Try it via VQA first

In [6]:
# Minimal VQA prompt: ask for exactly one action from the allowed list.
# ("missing" was a typo for "mission" in the text sent to the model.)
question = f"""
You are a robotic dog exploring a maze. Your mission is to find food and avoid danger. For each image you can choose one action out of the following:

```
{str(actions)}
```

Choose only one option and return only that option as your output
""".strip()

In [8]:
print(question)

You are a robotic dog exploring a maze. Your missing is to find food and avoid danger. For each image you can choose one action out of the following:

```
['go forwards', 'go backards', 'turn left', 'turn right', 'turn around', 'jump', 'beep', 'eat']
```

Choose only one option and return only that option as your output


## Advanced mission

With history/memory

### Set up memory

After each step the dog will:

- Add context (what did it do and why) to the `context` field
- Incorporate that context into its memory (stored in the `history` field) by combining/summarizing context with past history

In [166]:
# Chat model used by summarize() to merge the history with the latest context.
model = "gpt-3.5-turbo-1106"

from openai import OpenAI
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment prepared in the setup cell — confirm.
client = OpenAI()

def summarize(history, context):
    """
    Merge the dog's past actions and its latest context into one summary.

    history: text describing the past actions
    context: text describing the most recent decision

    Returns the content of the first choice of the chat completion.
    """
    system_message = {
        "role": "system",
        "content": "You are a summarizer bot. You take current context and prior actions and summarize the two things into one.",
    }
    user_message = {
        "role": "user",
        "content": f"Past actions: {history}. Current context: {context}",
    }

    completion = client.chat.completions.create(
        model=model,
        messages=[system_message, user_message],
    )

    return completion.choices[0].message.content

### Update schema

In [112]:
import copy

def update_schema(schema: dict, context: str, field: str = 'action'):
    """
    Return a copy of `schema` whose `field` description has `context` appended.

    schema: json schema with a `properties` mapping
    context: text appended (after one space) to the field's description
    field: name of the property to extend

    The input schema is left untouched; a deep copy is modified and returned.
    """
    updated = copy.deepcopy(schema)
    target = updated['properties'][field]
    target['description'] = target['description'] + f" {context}"

    return updated

### Take a step

In [217]:
def take_step(
    image: str,
    schema: dict,
    context: str,
    history: str,
    debug: bool = False
    ):
    """
    Take one memory-enabled step: summarise the past, then ask SceneX to act.

    image: path to image file
    schema: json schema; it is not modified, a copy carrying the new history is used
    context: prior context from previous actions
    history: summary of everything that happened before `context`
    debug: accepted for symmetry with start_loop; currently unused

    Returns a dict with the image path ("image"), the parsed JSON answer
    ("result") and the schema that was actually sent ("schema").
    """

    # 1. merge history and latest context via summarization
    summary = summarize(history, context)

    # 2. write that to a description field in JSON schema.
    # Work on a private copy: writing into the caller's dict would rewrite the
    # base schema and make every recorded step share one schema object that
    # only shows the latest history.
    schema = copy.deepcopy(schema)
    schema['properties']['history']['description'] = summary

    # 3. return JSON schema (we already do this at the end)
    # 4. (later): in a different JSON schema field, ask to refer back to field from step 2 as memory

    # send data to SceneX
    print(f"Selected image {image}")
    data = generate_scenex_data([image], json_schema=schema, features=['json'])
    result = json.loads(process_scenex(data)['text'])

    output = {
        "image": image,
        "result": result,
        "schema": schema
    }

    return output

## Event loop

In [230]:
def start_loop(image_files, schema, start_context, max_turns=5, debug=False):
    """
    Run the memory-enabled mission for a fixed number of turns.

    image_files: list of image paths; one is picked at random every turn
    schema: json schema used for the first step
    start_context: context text handed to the first step
    max_turns: number of turns to play
    debug: when true, print the turn number and pretty-print each step

    After every step the context, schema and history for the next turn are
    taken from that step's output.

    Returns the list of step dicts in the order they were taken.
    """
    steps = []
    context = start_context
    history = "you just woke up"

    for turn in range(max_turns):
        if debug:
            print(f"Turn {turn}")

        # pick a random image and take the step
        chosen_image = random.choice(image_files)
        this_step = take_step(chosen_image, schema, context, history, debug=debug)

        if debug:
            pprint(this_step)
            print("-"*20)

        steps.append(this_step)

        # carry the step's output into the next turn
        context = this_step['result']['context']
        schema = this_step['schema']
        history = schema['properties']['history']['description']

    return steps

In [224]:
start_context = "you just woke up."  # populate with summarized journey notes

In [228]:
base_schema

{'type': 'object',
 'properties': {'image_description': {'type': 'string',
   'description': 'Describe the image in under 10 words'},
  'action': {'type': 'array',
   'description': 'The image shows what is in front of you. What action do you take? Base this action on the image and your previous actions in the `history` field of this JSON',
   'enum': ['go forwards',
    'go backards',
    'turn left',
    'turn right',
    'turn around',
    'jump',
    'beep'],
   'maxContains': 1},
  'history': {'type': None,
   'description': 'You are waking up for the first time today.'},
  'context': {'type': 'string',
   'description': 'Explain the decision you just made and why. Use the past tense and output under 50 words'},
  'history_recap': {'type': 'string',
   'description': 'recap the `[history][description]` field of this JSON. Incorporate the contents of the context field'}}}

In [229]:
# Play four turns of the memory-enabled mission over the general street images.
run = start_loop(image_files, base_schema, start_context, 4, debug=False)

Selected image ./street_images/f5c8f561967137.5a8058c42849b.jpg
Selected image ./street_images/pexels-photo-3333923.jpeg
Selected image ./street_images/pexels-photo-6039188.jpeg
Selected image ./street_images/pexels-photo-3333923.jpeg


In [231]:
for step in run:
    pprint(step)
    print("-"*20)

{'image': './street_images/f5c8f561967137.5a8058c42849b.jpg',
 'result': {'action': ['go forwards'],
            'context': 'Decided to move forwards to join the meeting table.',
            'history': None,
            'history_recap': 'Woke up and decided to move forwards to join the '
                             'meeting table.',
            'image_description': 'Wooden conference room with white chairs and '
                                 'TV'},
 'schema': {'properties': {'action': {'description': 'The image shows what is '
                                                     'in front of you. What '
                                                     'action do you take? Base '
                                                     'this action on the image '
                                                     'and your previous '
                                                     'actions in the `history` '
                                                     'field of this 