## Setup

In [None]:
import os

!rm -rf notebooks
!git clone https://github.com/alexcg1/notebooks

if os.getcwd() != 'scenex-gpt4v':
    os.chdir('notebooks/scenex/scenex-gpt4v')

### Load API keys

If you have `python-dotenv` installed and a `.env` file, the code below loads your keys from it. Otherwise, set the `SCENEX_SECRET` and `OPENAI_API_KEY` environment variables manually.

In [None]:
try:
    from dotenv import load_dotenv
except ImportError:
    # python-dotenv is optional. Fall back to editable placeholders, but never
    # overwrite a key that was already exported in the environment.
    os.environ.setdefault('OPENAI_API_KEY', "<your OpenAI key>")
    os.environ.setdefault('SCENEX_SECRET', "<your SceneXplain key>")
else:
    # load_dotenv() returns False when it found nothing to load, so only
    # report success when variables were actually read.
    if load_dotenv():
        print("Environment variables loaded from .env")
    else:
        print("No .env file found; using existing environment variables")

## Pre-process images

Convert all `webp` files to `jpeg`

**Note:** There is no need to do this for the images that already exist in the repo.

In [None]:
# Needs ImageMagick's `mogrify` on the PATH; it is invoked once for every regular *.webp file found under ./images.
!find ./images -type f -name "*.webp" -exec mogrify -format jpeg {} \;

## Generate image list

In [None]:
from glob import glob

filetypes = ['jpg', 'jpeg', 'png']
file_path = './images/**/'


def find_image_files(pattern_prefix, extensions):
    """Return a sorted list of files matching ``<pattern_prefix>*.<ext>``.

    Args:
        pattern_prefix: Glob prefix, e.g. ``'./images/**/'``.
        extensions: File extensions to collect, without the leading dot.

    Returns:
        Matching paths in sorted order, so repeated runs list the images in
        the same sequence.
    """
    matches = []
    for extension in extensions:
        # Without recursive=True, `**` behaves like `*` and matches exactly one
        # directory level, skipping files directly in the root and in deeper
        # folders.
        matches.extend(glob(f'{pattern_prefix}*.{extension}', recursive=True))
    return sorted(matches)


image_files = find_image_files(file_path, filetypes)

In [None]:
# Display the collected paths to confirm the glob patterns matched something.
image_files

## Define functions for SceneXplain and OpenAI I/O

In [None]:
import http.client
import json
import base64
from pprint import pprint

#### SceneXplain

In [None]:
# SceneX setup and functions

# API secret read from the environment; None when the variable is unset.
SCENEX_SECRET = os.environ.get('SCENEX_SECRET')

# Value sent as the "algorithm" field of every request row.
ALGO = "Jelly"

# Headers attached to every SceneXplain HTTP request.
scenex_headers = {
    "x-api-key": "token {}".format(SCENEX_SECRET),
    "content-type": "application/json",
}

def image_to_data_uri(file_path):
    """Encode an image file as a base64 ``data:`` URI.

    The MIME type is guessed from the file extension so that JPEG files are
    no longer mislabelled as PNG. ``image/png`` (the previous fixed label) is
    used when the extension is unknown or not an image type.

    Args:
        file_path: Path of the image file to read.

    Returns:
        A string of the form ``data:<mime type>;base64,<payload>``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    import mimetypes  # local import: not part of the notebook's import cell

    mime_type, _ = mimetypes.guess_type(file_path)
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/png"

    with open(file_path, "rb") as image_file:
        encoded_image = base64.b64encode(image_file.read()).decode("utf-8")

    return f"data:{mime_type};base64,{encoded_image}"
        
def generate_scenex_data(image_files, json_schema=None, question=None, features=None):
    """Build the request payload for a batch of images.

    Args:
        image_files: Iterable of image file paths.
        json_schema: Optional schema added to every row as ``json_schema``.
        question: Optional question added to every row as ``question``.
        features: Optional list of feature names sent with every row.
            Defaults to an empty list.

    Returns:
        A dict ``{"data": [row, ...]}`` with one row per image. Each row holds
        the image as a data URI, the feature list, the ``ALGO`` algorithm name
        and a ``cid`` equal to the file's base name.
    """
    # Use None as the default and copy per row: a mutable default list would be
    # shared between calls, and a single list object would be shared by every
    # row of the payload.
    requested_features = list(features) if features else []

    rows = []
    for file in image_files:
        row = {
            "image": image_to_data_uri(file),
            "features": list(requested_features),
            "algorithm": ALGO,
            # basename also handles platform-specific path separators
            "cid": os.path.basename(file),
        }

        if question:
            row["question"] = question

        if json_schema:
            row["json_schema"] = json_schema

        rows.append(row)

    return {"data": rows}

def process_scenex(data):
    """POST a payload to the SceneXplain describe endpoint.

    Args:
        data: JSON-serialisable request payload.

    Returns:
        The ``result`` field of the decoded JSON response.

    Raises:
        RuntimeError: If the server answers with an HTTP status of 400 or
            above; the message includes the status and the response body.
        KeyError: If the decoded response has no ``result`` field.
    """
    connection = http.client.HTTPSConnection("api.scenex.jina.ai")
    try:
        connection.request("POST", "/v1/describe", json.dumps(data), scenex_headers)
        response = connection.getresponse()
        status, reason = response.status, response.reason
        response_data = response.read().decode("utf-8")
    finally:
        # Release the socket even when the request or the read fails.
        connection.close()

    if status >= 400:
        # Surface the server's explanation instead of an opaque KeyError below.
        raise RuntimeError(
            f"SceneXplain request failed with HTTP {status} {reason}: {response_data}"
        )

    return json.loads(response_data)['result']

#### OpenAI

In [None]:
!pip install openai

In [None]:
# OpenAI functions

from openai import OpenAI
# No API key is passed explicitly; the client is expected to pick up OPENAI_API_KEY from the environment.
client = OpenAI()

def encode_openai_image(image_path):
    """Read ``image_path`` in binary mode and return its contents base64-encoded as text."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()

    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')

def generate_openai_data(image_files, text=""):
    """Build one chat message per image for the OpenAI vision model.

    Args:
        image_files: Iterable of image file paths.
        text: Prompt text attached to every image.

    Returns:
        A list with one ``{"role": "user", "content": [...]}`` message per
        image, in input order. Each message's content holds an ``image_url``
        part (the image as a base64 data URI) followed by a ``text`` part.
    """
    import mimetypes  # local import: not part of the notebook's import cell

    messages = []

    for file in image_files:
        # Label the payload with the file's real type; fall back to the
        # previous fixed "image/jpeg" label when the extension is unknown.
        mime_type, _ = mimetypes.guess_type(file)
        if not mime_type or not mime_type.startswith("image/"):
            mime_type = "image/jpeg"

        data_uri = f"data:{mime_type};base64,{encode_openai_image(file)}"

        messages.append(
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        # The documented shape of this part is an object with a "url" key.
                        "image_url": {"url": data_uri},
                    },
                    {
                        "type": "text",
                        "text": text,
                    },
                ],
            }
        )

    return messages


def process_openai(data, model="gpt-4-vision-preview", max_tokens=1000):
    """Send each message to the chat completions endpoint, one request per message.

    Args:
        data: Iterable of chat messages, e.g. the list built by
            ``generate_openai_data``.
        model: Name of the model to query. Defaults to the value that was
            previously hard-coded.
            NOTE(review): preview model names tend to be retired; pass
            ``model=`` to switch without editing this function.
        max_tokens: Upper bound on generated tokens per request.

    Returns:
        A list holding the first choice of every response, in input order.
    """
    output = []

    for record in data:
        # Each image is sent in its own single-message conversation.
        response = client.chat.completions.create(
            model=model,
            messages=[record],
            max_tokens=max_tokens,
        )

        output.append(response.choices[0])

    return output

## Basic captioning

### SceneXplain

In [None]:
# Build the request payload with default options: no question, no JSON schema, empty feature list.
scenex_data = generate_scenex_data(image_files)

In [None]:
# POST the payload; process_scenex returns the `result` field of the decoded JSON response.
scenex_response = process_scenex(scenex_data)

In [None]:
# Display the SceneXplain results for the captioning request.
scenex_response

### OpenAI

In [None]:
# Build one user message per image, each pairing the image with the same text prompt.
openai_data = generate_openai_data(image_files, "what is in this image?")

In [None]:
# process_openai issues one request per message and keeps the first choice of each response.
openai_response = process_openai(openai_data)

In [None]:
# Display the collected OpenAI choices for the captioning prompt.
openai_response

## Visual question answering

In [None]:
# Single prompt reused for both services in this section.
question = 'what does the text say in this image?'

### SceneXplain

In [None]:
# Feature list forwarded in every SceneXplain request row of this section.
features = ['question_answer']

In [None]:
# Same payload builder as for captioning, now attaching the question and feature list to every row.
scenex_data = generate_scenex_data(image_files, question=question, features=features)

In [None]:
# Overwrites the earlier scenex_response with the question-answering results.
scenex_response = process_scenex(scenex_data)

In [None]:
# Display the SceneXplain results for the question.
scenex_response

### OpenAI

In [None]:
# Pair every image with the same question text.
openai_data = generate_openai_data(image_files, question)

In [None]:
# Overwrites the earlier openai_response with the answers to the question.
openai_response = process_openai(openai_data)

In [None]:
# Display the collected OpenAI choices for the question.
openai_response

## Extract JSON from image

We'll use SceneXplain's "Extract JSON from Image" feature to extract each text string and output to JSON that fits a strict schema.

We'll also try a kludgy way to do this with OpenAI, in the interest of fairness.

In [None]:
# Schema with a single array property meant to hold every text string found in the image.
json_schema = dict(
    type="object",
    properties=dict(
        text_strings=dict(
            type="array",
            description="Every text string contained in the image. Consider all languages",
        ),
    ),
)

### SceneXplain

In [None]:
# Feature list forwarded with every request row in the JSON-extraction sections.
features = ["json"]

In [None]:
# The schema is serialised with json.dumps, so every row carries it as a JSON string.
scenex_data = generate_scenex_data(image_files, json_schema=json.dumps(json_schema), features=features)

In [None]:
# Overwrites scenex_response with the schema-driven results used in the comparison below.
scenex_response = process_scenex(scenex_data)

### OpenAI

We have to kluge it a bit by asking for JSON output as part of the question. This may not always provide reliably strict output.

In [None]:
# json.dumps() renders the schema as real JSON (double-quoted keys); str() would embed the Python dict repr instead.
question = f"Extract the text strings from this image and populate a JSON that follows this schema:\n\n{json.dumps(json_schema)}. Return just the output JSON. Do not put it in a code block"

In [None]:
# Pair every image with the schema-carrying prompt.
openai_data = generate_openai_data(image_files, question)

In [None]:
# Overwrites openai_response with the replies that the comparison below tries to parse as JSON.
openai_response = process_openai(openai_data)

### Compare results

In [None]:
# Print both services' parsed answers side by side, one image at a time.
for file, sx_record, oa_record in zip(image_files, scenex_response, openai_response):
    print(file)
    try:
        sx_answer = json.loads(sx_record['i18n']['en'])
        oa_answer = json.loads(oa_record.message.content)
    except (AttributeError, KeyError, TypeError, ValueError) as err:
        # The OpenAI reply is free text and is not guaranteed to be valid JSON
        # (json.JSONDecodeError is a ValueError). Report the failure and move
        # on instead of aborting the whole comparison.
        print(f"{file} failed: {err!r}")
    else:
        print(f"SceneXplain: \t{sx_answer}")
        print(f"OpenAI:\t\t{oa_answer}")
    print("-"*10)

## Extract more complex JSON

Now we'll extract:

- A general description of the scene
- All text objects. Each object will contain:
    - The text string itself
    - The (human-readable) language of the string
    - The ISO 639-1 language code (e.g. `en`)
    - An English translation of the string

In [None]:
# Richer schema: two free-text descriptions plus three arrays describing the text found in the image.
json_schema = dict(
    type="object",
    properties=dict(
        alt_tag=dict(
            type="string",
            description="a short and concise alt-tag for the image, so visually-impaired users can know what they're looking at",
        ),
        long_description=dict(
            type="string",
            description="description of the image. Up to 20 words.",
        ),
        text_strings=dict(
            type="array",
            description="list of all text strings in the image",
        ),
        languages=dict(
            type="array",
            description="all the languages used in the image",
        ),
        iso_codes=dict(
            type="array",
            description="the iso-639-1 code for the language of every string in the image",
        ),
    ),
)

### SceneXplain

In [None]:
# Reuses the `features` list defined in the previous JSON-extraction section together with the new schema.
scenex_data = generate_scenex_data(image_files, json_schema=json.dumps(json_schema), features=features)

In [None]:
# Overwrites scenex_response with the results for the richer schema.
scenex_response = process_scenex(scenex_data)

In [None]:
# Pretty-print the parsed SceneXplain answer for every image.
for file, sx_record in zip(image_files, scenex_response):
    print(file)
    try:
        sx_answer = json.loads(sx_record['i18n']['en'])
    except (KeyError, TypeError, ValueError) as err:
        # Report records that are missing the expected field or do not hold
        # valid JSON instead of silently skipping them. A bare `except` would
        # also swallow KeyboardInterrupt.
        print(f"{file} failed: {err!r}")
        continue

    pprint(sx_answer)
    print("-"*10)

### OpenAI

We have to kluge it a bit by asking for JSON output as part of the question. This may not always provide reliably strict output.

In [None]:
# json.dumps() renders the schema as real JSON (double-quoted keys); str() would embed the Python dict repr instead.
question = f"Extract the text strings from this image and populate a JSON that follows this schema:\n\n{json.dumps(json_schema)}. Return just the output JSON. Do not put it in a code block"

In [None]:
# Pair every image with the prompt that carries the richer schema.
openai_data = generate_openai_data(image_files, question)

In [None]:
# Overwrites openai_response with the replies for the richer schema.
openai_response = process_openai(openai_data)

### Compare results

In [None]:
# Pretty-print both services' full answers for every image that parsed cleanly.
for file, sx_record, oa_record in zip(image_files, scenex_response, openai_response):
    try:
        sx_answer = json.loads(sx_record['i18n']['en'])
        oa_answer = json.loads(oa_record.message.content)
    except (AttributeError, KeyError, TypeError, ValueError) as err:
        # Catch only the errors that malformed or missing output can raise
        # (json.JSONDecodeError is a ValueError). A bare `except` would also
        # swallow KeyboardInterrupt and hide the reason for the failure.
        print(f"{file} failed: {err!r}")
    else:
        print(file)

        print("SceneXplain")
        pprint(sx_answer)

        print('\n')

        print("OpenAI")
        pprint(oa_answer)

    print("-"*10)

In [None]:
# Compare only the extracted `text_strings` lists of both services.
for file, sx_record, oa_record in zip(image_files, scenex_response, openai_response):
    try:
        sx_answer = json.loads(sx_record['i18n']['en'])['text_strings']
        oa_answer = json.loads(oa_record.message.content)['text_strings']
    except (AttributeError, KeyError, TypeError, ValueError) as err:
        # Covers invalid JSON (a ValueError), a missing `text_strings` key and
        # replies that are not JSON objects. A bare `except` would also swallow
        # KeyboardInterrupt and hide the reason for the failure.
        print(f"{file} failed: {err!r}")
    else:
        print(file)

        print(f"SceneXplain:\t{sx_answer}")
        print(f"OpenAI:\t\t{oa_answer}")

    print("-"*10)