In [None]:
from IPython.display import display, Image, Audio
import json
import cv2  # We're using OpenCV to read video, to install !pip install opencv-python
import base64
import time
import random
from openai import OpenAI
import os
import requests
from utils import chat_vision
from constants import *
from tqdm.auto import tqdm

from openai import AzureOpenAI
from utils import *


# --- Model / endpoint configuration -----------------------------------------
# Everything a reader may want to tune for the API connection lives here.
MODEL = "gpt-4-turbo-2024-04-09"
REGION = "eastus2"
# Never commit a real key: prefer the AZURE_OPENAI_API_KEY environment
# variable and only fall back to the placeholder when it is not set.
API_KEY = os.environ.get("AZURE_OPENAI_API_KEY", "YOUR_API_KEY")
API_BASE = "https://api.openai.com"
# The endpoint is the base URL with the region appended as a path segment.
ENDPOINT = f"{API_BASE}/{REGION}"

# Shared client used by every request issued from this notebook.
client = AzureOpenAI(
    api_key=API_KEY,
    api_version="2024-02-01",
    azure_endpoint=ENDPOINT,
)

In [None]:
from tokencost import count_message_tokens, count_string_tokens

# Quick sanity check of tokencost: count the tokens of a minimal prompt
# expressed in the chat "message list" format.
message_prompt = [dict(role="user", content="Hello world")]
print(count_message_tokens(message_prompt, model="gpt-4-turbo"))

In [None]:
def read_video(path):
    """Read a video and return it as a short list of base64-encoded JPEG images.

    Every frame is decoded, the sequence is thinned with an integer stride of
    ``len(frames) // 8`` (which leaves between 8 and 15 frames for longer
    clips), and the remaining frames are stacked vertically four at a time so
    a clip fits into a handful of images (the model accepts at most ~10
    images per request).

    Args:
        path: Location of the video; passed unchanged to ``cv2.VideoCapture``.

    Returns:
        list[str]: One base64 string (decoded as UTF-8) per complete group of
        four sampled frames, each holding a JPEG of the stacked group. The
        list is empty when fewer than four frames could be read. Trailing
        frames that do not fill a whole group are dropped.
    """
    video = cv2.VideoCapture(path)
    buffers = []
    try:
        while video.isOpened():
            success, frame = video.read()
            if not success:
                break
            buffers.append(frame)
    finally:
        # Always free the capture handle, even if decoding raises.
        video.release()

    # Subsample long clips with a fixed stride; short clips are kept as is.
    if len(buffers) > 8:
        buffers = buffers[::len(buffers) // 8]

    base64Frames = []
    # Stop at len - 3 (not len - 4) so the final complete group of four is
    # included: with exactly 8 frames this yields 2 stacked images, not 1.
    for i in range(0, len(buffers) - 3, 4):
        stacked = cv2.vconcat(buffers[i:i + 4])
        _, encoded = cv2.imencode(".jpg", stacked)
        base64Frames.append(base64.b64encode(encoded).decode("utf-8"))

    return base64Frames

In [None]:
from utils import chat_vision

# --- Few-shot preference evaluation -----------------------------------------
# For each sampled episode the model is shown three demonstration videos with
# their labelled preference, followed by the query video, and is asked to name
# the query's preference. A reply counts as correct when every word of the
# ground-truth preference occurs in it (case-insensitive).

data_dir = "../../dataset/level_0"
train_file = f"{data_dir}/train.json"

NUM_EVAL = 100          # number of shuffled episodes to evaluate
MAX_RETRIES = 10        # attempts per request before the error is re-raised
FRAMES_PER_VIDEO = 2    # stacked images sent to the model for each video
IMAGE_RESIZE = 512      # "resize" value attached to every image part

total_cost = 0
false = 0
evaluated = 0


def _image_parts(frames):
    """Wrap the first FRAMES_PER_VIDEO base64 images as image content parts."""
    return [{"image": frame, "resize": IMAGE_RESIZE}
            for frame in frames[:FRAMES_PER_VIDEO]]


# The file holds one JSON object per line; blank lines are skipped.
with open(train_file, "r") as f:
    test_files = [json.loads(line) for line in f if line.strip()]

# shuffle
random.shuffle(test_files)
num_files = len(test_files)

for n, test_file in enumerate(tqdm(test_files[:NUM_EVAL])):

    # NOTE(review): the text log is read but not included in the prompt below.
    with open(test_file["text"], 'r') as log_file:
        text_log = log_file.readlines()

    base64Frames = read_video(test_file["camera"])

    # Naive sampling code. You can replace it with your own sampling code to test different combinations.
    preference = test_file["preference"]

    # Use the next two entries as demonstrations; wrap around so the last
    # episodes of a small dataset do not index past the end of the list.
    neighbour_1 = test_files[(n + 1) % num_files]
    neighbour_2 = test_files[(n + 2) % num_files]

    video_path_3, preference_3 = get_same_demo_video(
        test_file["preference"], test_files)

    # shuffle the video logs and corresponding preferences
    demos = [
        [neighbour_1["camera"], neighbour_1["preference"]],
        [neighbour_2["camera"], neighbour_2["preference"]],
        [video_path_3, preference_3],
    ]
    random.shuffle(demos)

    # Each demonstration contributes its images followed by its label.
    demo_content = []
    for demo_path, demo_preference in demos:
        demo_content.extend(_image_parts(read_video(demo_path)))
        demo_content.append(f"the preference is {demo_preference}")

    instructions = "You are a robot assistant that can help summarize the host's preference. Please read the text log file and summarize the user's preference."
    # possible_preferences = Rearrangement[0]['Level0'] + Rearrangement[2]['Level2']
    possible_preferences = Sequence_Preferences['name']

    instructions += f"Choose from following preference: \n{parse_concat(possible_preferences, replace=', ')}.\n"

    PROMPT_MESSAGES = [
        {
            "role": "user",
            "content": [
                instructions,
                "Quesiton: What's the user's preference? Choose from the preference listed before:",
            ],
        },
        {
            "role": "user",
            "content": [
                *demo_content,
                *_image_parts(base64Frames),
                "the preference is?",
            ],
        },
    ]

    # Retry transient failures, but give up after MAX_RETRIES so a permanent
    # error (e.g. an invalid key) cannot spin forever.
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            answer, messages, cost = chat_vision(
                client, MODEL, None, PROMPT_MESSAGES, role="assistant")
            break
        except Exception as e:
            print(e)
            if attempt == MAX_RETRIES:
                raise
            time.sleep(1)

    # assumes chat_vision's third return value is a numeric cost (or None)
    # — TODO confirm against utils.chat_vision
    total_cost += cost or 0

    gt = test_file["preference"]

    answer = answer.lower()

    # The answer is wrong as soon as one ground-truth word is missing.
    if not all(keyword.lower() in answer for keyword in gt.split(" ")):
        false += 1
        print(f"False: {answer} vs {gt}")

    evaluated = n + 1
    print(f"True: {evaluated-false}/{evaluated}, total cost: {total_cost}")

# Report accuracy over the episodes actually evaluated, not the dataset size.
print(f"True: {evaluated-false}/{evaluated}")