## Setup

1. Put the images you want to process in `./images/`. The folder is already prepopulated with several images, organized by language.
2. Run the notebook

In [140]:
import os

!rm -rf scenex-tests
!git clone https://github.com/alexcg1/scenex-tests

# os.getcwd() returns an absolute path, so compare only its final component;
# this keeps a re-run of the cell from descending into a nested clone.
if os.path.basename(os.getcwd()) != 'scenex-tests':
    os.chdir('scenex-tests')

Cloning into 'scenex-tests'...
remote: Enumerating objects: 68, done.[K
remote: Counting objects: 100% (68/68), done.[K
remote: Compressing objects: 100% (56/56), done.[K
remote: Total 68 (delta 21), reused 59 (delta 12), pack-reused 0[K
Receiving objects: 100% (68/68), 5.51 MiB | 11.81 MiB/s, done.
Resolving deltas: 100% (21/21), done.


### Load API keys

If you have `python-dotenv` installed and a `.env` file, the code below will load the keys from there. Otherwise you'll need to set the `SCENEX_SECRET` and `OPENAI_API_KEY` environment variables manually.

In [122]:
try:
    from dotenv import load_dotenv
except ImportError:
    # python-dotenv is not installed: fall back to the placeholders below, but
    # never overwrite a key that is already set in the environment.
    os.environ.setdefault('OPENAI_API_KEY', "<your OpenAI key>")
    os.environ.setdefault('SCENEX_SECRET', "<your SceneXplain key>")
else:
    # load_dotenv() returns False when it found nothing to load
    # (for example when there is no .env file), so only claim success on True.
    if load_dotenv():
        print("Environment variables loaded from .env")
    else:
        print("No .env values found; set OPENAI_API_KEY and SCENEX_SECRET manually")

Environment variables loaded from .env


## Pre-process images

Convert all `webp` files to `jpeg`

**Note:** No need to do this for existing images in repo

In [123]:
# Runs `mogrify -format jpeg` on every *.webp file found under ./images.
# NOTE(review): needs the `mogrify` command (ImageMagick) on PATH — confirm it is installed.
!find ./images -type f -name "*.webp" -exec mogrify -format jpeg {} \;

## Generate image list

In [143]:
from glob import glob

# File extensions that are collected from the image folder
filetypes = ['jpg', 'jpeg', 'png']
file_path = './images/**/'

# recursive=True makes `**` match zero or more directory levels, so images
# placed directly in ./images/ (or nested deeper than one folder) are found
# too. Without it, `**` behaves like `*` and matches exactly one level.
image_files = [
    path
    for filetype in filetypes
    for path in glob(f'{file_path}*.{filetype}', recursive=True)
]

In [144]:
image_files

['./images/ar/artworks-000043896167-z3x71u-t500x500.jpg',
 './images/ar/ad_34496406_ce0f9beaaf3797bf_web.jpg',
 './images/ar/0a6b502ffc7c748cc918cbb80d73f960.jpg',
 './images/ar/create-arabic-food-poster-banner-or-social-media-ads.jpg',
 './images/ar/d46l5mu-22863ee5-ff85-4f18-ac95-2a76e16e9f77.jpg',
 './images/ko/pexels-photo-5051602.jpeg',
 './images/ko/pexels-photo-6314649.jpeg',
 './images/cn/pexels-photo-704379.jpeg',
 './images/cn/pexels-photo-5765624.jpeg',
 './images/cn/pexels-photo-2670327.jpeg',
 './images/cn/pexels-photo-2599543.jpeg',
 './images/cn/pexels-photo-3603453.jpeg',
 './images/ja/free-photo-of-prohibition-sign-in-japanese.jpeg',
 './images/ja/free-photo-of-illuminated-lantern-in-the-street.jpeg',
 './images/ja/pexels-photo-8366393.jpeg',
 './images/ar/pexels-photo-11127113.jpeg',
 './images/ar/free-photo-of-signs-around-entrance-on-seashore.jpeg']

## Define functions for SceneXplain and OpenAI I/O

In [125]:
import http.client
import json
import base64
from pprint import pprint

#### SceneXplain

In [126]:
# SceneX setup and functions

# SceneXplain API key, read from the environment (None when the variable is unset)
SCENEX_SECRET = os.environ.get('SCENEX_SECRET')

# Value sent as the "algorithm" field of every SceneXplain request row
ALGO = "Jelly"

# HTTP headers sent with every SceneXplain request
scenex_headers = dict([
    ("x-api-key", "token " + str(SCENEX_SECRET)),
    ("content-type", "application/json"),
])

def image_to_data_uri(file_path):
    """Return the file at ``file_path`` encoded as a base64 ``data:`` URI.

    The MIME type is guessed from the file extension, so a ``.jpg`` file is
    labelled ``image/jpeg`` instead of always being declared as PNG. When the
    extension is unknown or is not an image type, ``image/png`` (the
    previously hard-coded value) is used.

    Args:
        file_path: Path of the image file to read.

    Returns:
        A string of the form ``data:<mime>;base64,<payload>``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    import mimetypes  # local import keeps this cell self-contained

    mime_type, _ = mimetypes.guess_type(file_path)
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/png"

    with open(file_path, "rb") as image_file:
        encoded_image = base64.b64encode(image_file.read()).decode("utf-8")

    return f"data:{mime_type};base64,{encoded_image}"
        
def generate_scenex_data(image_files, json_schema=None, question=None, features=[]):
    """Build the SceneXplain request payload for a list of image files.

    Args:
        image_files: Iterable of image file paths.
        json_schema: Optional schema; added to each row as ``json_schema`` when truthy.
        question: Optional question; added to each row as ``question`` when truthy.
        features: Feature names copied into each row's ``features`` list.

    Returns:
        A dict ``{"data": [row, ...]}`` with one row per image. Each row holds
        the image as a data URI, the features, the algorithm name and a
        ``cid`` set to the file's base name.
    """
    rows = []

    for file in image_files:
        row = {
            "image": image_to_data_uri(file),
            # Fresh list per row: rows must not alias each other or the
            # shared mutable default argument.
            "features": list(features or []),
            "algorithm": ALGO,
            # basename also copes with the platform's own path separator
            "cid": os.path.basename(file),
        }

        if question:
            row["question"] = question

        if json_schema:
            row["json_schema"] = json_schema

        rows.append(row)

    return {"data": rows}

def process_scenex(data):
    """POST ``data`` to the SceneXplain describe endpoint and return its ``result``.

    Args:
        data: JSON-serialisable request payload (see ``generate_scenex_data``).

    Returns:
        The value stored under the ``result`` key of the JSON response.

    Raises:
        RuntimeError: If the server answers with a non-2xx HTTP status.
        KeyError: If a successful response has no ``result`` key.
    """
    connection = http.client.HTTPSConnection("api.scenex.jina.ai")
    try:
        connection.request("POST", "/v1/describe", json.dumps(data), scenex_headers)
        response = connection.getresponse()
        status = response.status
        response_data = response.read().decode("utf-8")
    finally:
        # Always release the socket, even when the request or read fails
        connection.close()

    if not 200 <= status < 300:
        # Surface the server's message instead of failing later on a missing key
        raise RuntimeError(
            f"SceneXplain request failed with HTTP {status}: {response_data[:500]}"
        )

    return json.loads(response_data)['result']

#### OpenAI

In [127]:
!pip install openai


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [128]:
# OpenAI functions

from openai import OpenAI
client = OpenAI()

def encode_openai_image(image_path):
    """Read the file at ``image_path`` and return its bytes as a base64 text string."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    return str(base64.b64encode(raw_bytes), 'utf-8')

def generate_openai_data(image_files, text=""):
    """Build one OpenAI chat message per image, pairing the image with ``text``.

    The data-URI MIME type is guessed from each file's extension, so PNG files
    are no longer declared as JPEG. When the extension is unknown or is not an
    image type, ``image/jpeg`` (the previously hard-coded value) is used.

    Args:
        image_files: Iterable of image file paths.
        text: Prompt text attached to every image.

    Returns:
        A list of ``{"role": "user", "content": [...]}`` message dicts, one per
        image, in the same order as ``image_files``.
    """
    import mimetypes  # local import keeps this cell self-contained

    output = []

    for file in image_files:
        mime_type, _ = mimetypes.guess_type(file)
        if mime_type is None or not mime_type.startswith("image/"):
            mime_type = "image/jpeg"

        base64_image = f"data:{mime_type};base64,{encode_openai_image(file)}"

        output.append(
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        # NOTE(review): newer OpenAI API versions may expect an
                        # object of the form {"url": ...} here — confirm.
                        "image_url": base64_image,
                    },
                    {
                        "type": "text",
                        "text": text,
                    },
                ],
            }
        )

    return output


def process_openai(data, model="gpt-4-vision-preview", max_tokens=1000):
    """Send each message in ``data`` to the OpenAI chat completions API.

    Every record is sent in its own request, so each image is answered
    independently of the others.

    Args:
        data: Iterable of chat message dicts (see ``generate_openai_data``).
        model: Name of the model to query; defaults to the value that was
            previously hard-coded.
        max_tokens: Upper bound on generated tokens per request.

    Returns:
        A list with the first choice of each response, in input order.
    """
    output = []

    for record in data:
        response = client.chat.completions.create(
            model=model,
            messages=[record],
            max_tokens=max_tokens,
        )

        output.append(response.choices[0])

    return output

## Basic captioning

### SceneXplain

In [None]:
scenex_data = generate_scenex_data(image_files)

In [None]:
scenex_response = process_scenex(scenex_data)

In [None]:
scenex_response

### OpenAI

In [None]:
openai_data = generate_openai_data(image_files, "what is in this image?")

In [None]:
openai_response = process_openai(openai_data)

In [None]:
openai_response

## Visual question answering

In [None]:
question = 'what does the text say in this image?'

### SceneXplain

In [None]:
features = ['question_answer']

In [None]:
scenex_data = generate_scenex_data(image_files, question=question, features=features)

In [None]:
scenex_response = process_scenex(scenex_data)

In [None]:
scenex_response

### OpenAI

In [None]:
openai_data = generate_openai_data(image_files, question)

In [None]:
openai_response = process_openai(openai_data)

In [None]:
openai_response

## Extract JSON from image

We'll use SceneXplain's "Extract JSON from Image" feature to extract each text string and output to JSON that fits a strict schema.

We'll also try a kludgy way to do this with OpenAI, in the interests of fairness.

In [11]:
# Schema describing the desired output: a single object with one array
# property, "text_strings", meant to hold the text found in the image.
# NOTE(review): the array declares no "items" type — presumably strings; confirm.
json_schema = {
  "type": "object",
  "properties": {
    "text_strings": {
      "type": "array",
      "description": "Every text string contained in the image. Consider all languages"
    }
  }
}

### SceneXplain

In [129]:
features = [
    "json",
]

In [92]:
scenex_data = generate_scenex_data(image_files, json_schema=json.dumps(json_schema), features=features)

In [93]:
scenex_response = process_scenex(scenex_data)

### OpenAI

We have to kluge it a bit by asking for JSON output as part of the question. This may not always provide reliably strict output.

In [95]:
# json.dumps (not str) so the prompt contains valid, double-quoted JSON rather
# than the single-quoted repr of a Python dict.
question = f"Extract the text strings from this image and populate a JSON that follows this schema:\n\n{json.dumps(json_schema)}. Return just the output JSON. Do not put it in a code block"

In [96]:
openai_data = generate_openai_data(image_files, question)

In [97]:
openai_response = process_openai(openai_data)

### Compare results

In [98]:
# Print both services' answers side by side, one image at a time.
for file, sx_record, oa_record in zip(image_files, scenex_response, openai_response):
    print(file)

    # Either service may return something that is not parseable JSON (the
    # OpenAI reply is free-form text). Show the raw value in that case instead
    # of aborting the whole comparison on the first bad record.
    try:
        sx_answer = json.loads(sx_record['i18n']['en'])
    except (KeyError, TypeError, json.JSONDecodeError):
        sx_answer = sx_record

    try:
        oa_answer = json.loads(oa_record.message.content)
    except (TypeError, json.JSONDecodeError):
        oa_answer = oa_record.message.content

    print(f"SceneXplain: \t{sx_answer}")
    print(f"OpenAI:\t\t{oa_answer}")
    print("-"*10)

./images/ko/pexels-photo-5051602.jpeg
SceneXplain: 	{'text_strings': ['삼부식육점', '号号 号, 番号 7▲ 1.244-3800', '15']}
OpenAI:		{'text_strings': ['삼부', '전구', '디오', '문구', '생활용품', '금구', '도매', '1244-3800']}
----------
./images/ko/pexels-photo-6314649.jpeg
SceneXplain: 	{'text_strings': ['|야', '좋은 날에 만나', '단이 방', '50', '좋은 나를 만나']}
OpenAI:		{'text_strings': ['삶은 날에 만나', '삶은 나를 만나', '닷이 발다']}
----------
./images/cn/pexels-photo-704379.jpeg
SceneXplain: 	{'text_strings': ['饒河街觀光夜市', 'Raohe St. Night Market', '歡', '光', '健', '媽祖廟口', '服務台', '金瑞刊', '康']}
OpenAI:		{'text_strings': ['饒河街觀光夜市', 'Raohe St. Night Market', 'GOOD', '綜合果汁', '招牌麵線', 'QQ', '臭豆腐']}
----------
./images/cn/pexels-photo-5765624.jpeg
SceneXplain: 	{'text_strings': ['宇', '川']}
OpenAI:		{'text_strings': ['京都嵐山']}
----------
./images/cn/pexels-photo-2670327.jpeg
SceneXplain: 	{'text_strings': ['金钱肚20元', '旺角牛筋腩20元', '旺角牛杂18元']}
OpenAI:		{'text_strings': ['魚丸湯', '20元', '貢丸', '20元', '丸仔湯', '18元']}
----------
./images/cn/pexels-photo-259954

## Extract more complex JSON

Now we'll extract:

- A general description of the scene
- All text objects. Each object will contain:
    - The text string itself
    - The (human-readable) language of the string
    - The ISO 639-1 language code (a two-letter code, e.g. `en`)
    - An English translation of the string

In [145]:
# Richer schema: a short description of the image plus an array of objects,
# one per text string, each carrying the string, its English translation,
# its language name and its ISO 639-1 code.
json_schema = {
  "type": "object",
  "properties": {
    # Free-text summary of the whole image
    "long_description": {
      "type": "string",
      "description": "description of the image. Up to 20 words."
    },
    # One entry per text string found in the image
    "text_objects": {
      "type": "array",
      "items": {
        "type": "object",
        "description": "array of JSON objects representing every text string in the image",
        "properties": {
          "text_string": {
            "type": "string",
            "description": "the text string"
          },
          "english_translation": {
            "type": "string",
            "description": "the english translation of text string"
          },
          "language": {
            "type": "string",
            "description": "the language of the text string"
          },
          "iso_code": {
            "type": "string",
            "description": "iso-639-1 code for the language of the string"
          }
        }
      }
    }
  }
}

### SceneXplain

In [147]:
scenex_data = generate_scenex_data(image_files, json_schema=json.dumps(json_schema), features=features)

In [148]:
scenex_response = process_scenex(scenex_data)

In [149]:
# Pretty-print the parsed SceneXplain answer for every image.
for file, sx_record in zip(image_files, scenex_response):
    print(file)
    try:
        sx_answer = json.loads(sx_record['i18n']['en'])
    except (KeyError, TypeError, json.JSONDecodeError) as err:
        # Report the record that could not be parsed instead of silently
        # skipping it; unrelated errors are no longer swallowed.
        print(f"Could not parse SceneXplain answer: {err!r}")
    else:
        pprint(sx_answer)
    print("-"*10)

./images/ar/artworks-000043896167-z3x71u-t500x500.jpg
{'long_description': 'An advertisement for 4G LTE. Two men, one in western '
                     'clothes and one in Omani clothing, engaging in a playful '
                     'fight.',
 'text_objects': [{'english_translation': 'Life became faster',
                   'iso_code': 'ar',
                   'language': 'Arabic',
                   'text_string': 'الحياة صارت أكثر سرعة'},
                  {'english_translation': 'EAG',
                   'iso_code': 'Undefined',
                   'language': 'Undefined',
                   'text_string': 'EAG'},
                  {'english_translation': 'LTE',
                   'iso_code': 'Undefined',
                   'language': 'Undefined',
                   'text_string': 'LTE'}]}
----------
./images/ar/ad_34496406_ce0f9beaaf3797bf_web.jpg
{'long_description': 'An image of an advertisement showing a woman under a '
                     'clear blue sky advertising the produc

### OpenAI

We have to kluge it a bit by asking for JSON output as part of the question. This may not always provide reliably strict output.

In [104]:
# json.dumps (not str) so the prompt contains valid, double-quoted JSON rather
# than the single-quoted repr of a Python dict.
question = f"Extract the text strings from this image and populate a JSON that follows this schema:\n\n{json.dumps(json_schema)}. Return just the output JSON. Do not put it in a code block"

In [105]:
openai_data = generate_openai_data(image_files, question)

In [106]:
openai_response = process_openai(openai_data)

InternalServerError: Error code: 503 - {'error': {'code': 503, 'message': 'Service Unavailable.', 'param': None, 'type': 'cf_service_unavailable'}}

### Compare results

In [98]:
# Print both services' answers side by side, one image at a time.
for file, sx_record, oa_record in zip(image_files, scenex_response, openai_response):
    print(file)

    # Either service may return something that is not parseable JSON (the
    # OpenAI reply is free-form text). Show the raw value in that case instead
    # of aborting the whole comparison on the first bad record.
    try:
        sx_answer = json.loads(sx_record['i18n']['en'])
    except (KeyError, TypeError, json.JSONDecodeError):
        sx_answer = sx_record

    try:
        oa_answer = json.loads(oa_record.message.content)
    except (TypeError, json.JSONDecodeError):
        oa_answer = oa_record.message.content

    print(f"SceneXplain: \t{sx_answer}")
    print(f"OpenAI:\t\t{oa_answer}")
    print("-"*10)

./images/ko/pexels-photo-5051602.jpeg
SceneXplain: 	{'text_strings': ['삼부식육점', '号号 号, 番号 7▲ 1.244-3800', '15']}
OpenAI:		{'text_strings': ['삼부', '전구', '디오', '문구', '생활용품', '금구', '도매', '1244-3800']}
----------
./images/ko/pexels-photo-6314649.jpeg
SceneXplain: 	{'text_strings': ['|야', '좋은 날에 만나', '단이 방', '50', '좋은 나를 만나']}
OpenAI:		{'text_strings': ['삶은 날에 만나', '삶은 나를 만나', '닷이 발다']}
----------
./images/cn/pexels-photo-704379.jpeg
SceneXplain: 	{'text_strings': ['饒河街觀光夜市', 'Raohe St. Night Market', '歡', '光', '健', '媽祖廟口', '服務台', '金瑞刊', '康']}
OpenAI:		{'text_strings': ['饒河街觀光夜市', 'Raohe St. Night Market', 'GOOD', '綜合果汁', '招牌麵線', 'QQ', '臭豆腐']}
----------
./images/cn/pexels-photo-5765624.jpeg
SceneXplain: 	{'text_strings': ['宇', '川']}
OpenAI:		{'text_strings': ['京都嵐山']}
----------
./images/cn/pexels-photo-2670327.jpeg
SceneXplain: 	{'text_strings': ['金钱肚20元', '旺角牛筋腩20元', '旺角牛杂18元']}
OpenAI:		{'text_strings': ['魚丸湯', '20元', '貢丸', '20元', '丸仔湯', '18元']}
----------
./images/cn/pexels-photo-259954