## Setup

1. Put the images you want to process in `./images/`. This is already prepopulated with several images organized by language
2. Run the notebook

In [30]:
import os

!rm -rf scenex-tests
!git clone https://github.com/alexcg1/scenex-tests

if os.getcwd() != 'scenex-tests':
    os.chdir('scenex-tests')

Cloning into 'scenex-tests'...
remote: Enumerating objects: 112, done.[K
remote: Counting objects: 100% (112/112), done.[K
remote: Compressing objects: 100% (90/90), done.[K
remote: Total 112 (delta 31), reused 100 (delta 19), pack-reused 0[K
Receiving objects: 100% (112/112), 8.28 MiB | 11.70 MiB/s, done.
Resolving deltas: 100% (31/31), done.


### Load API keys

If you have `dotenv` installed and a `.env` file, the below code will load it from there. Otherwise you'll need to set `SCENEX_SECRET` and `OPENAI_API_KEY` environment variables manually

In [2]:
try:
    from dotenv import load_dotenv
    load_dotenv()
    print("Environment variables loaded from .env")
except ImportError:
    os.environ['OPENAI_API_KEY'] = "<your OpenAI key>"
    os.environ['SCENEX_SECRET'] = "<your SceneXplain key>"
    # print("Please set OPENAI_API_KEY and SCENEX_SECRET environment variables")

Environment variables loaded from .env


## Pre-process images

Convert all `webp` files to `jpeg`

**Note:** No need to do this for existing images in repo

In [123]:
!find ./images -type f -name "*.webp" -exec mogrify -format jpeg {} \;

## Generate image list

In [39]:
from glob import glob

skip_langs = ['ar', 'zh', 'ko', 'ja', 'cn']

filetypes = ['jpg', 'jpeg', 'png']
file_path = './images/**/'
image_files = []

for filetype in filetypes:
    image_files.extend(glob(f'{file_path}*.{filetype}'))

for image in image_files:
    lang = image.split('/')[2]
    # print(lang)
    if lang in skip_langs:
        print(f"found {lang}")
        image_files.remove(image)

print(image_files)

found ja
found ar
found ar
found ko
found cn
found cn
found ja
found ja
['./images/he/s-l400-3959857765.jpg', './images/hi/dabur-sarso-avala-hair-oil-ad-dainik-bhaskar-jaipur-12-12-2017.jpg', './images/ka/393644331_708375714670258_5034910646992091937_n.jpg', './images/iu/image.jpg', './images/th/maggi-fusian-thailand-3740487815.jpg', './images/ar/artworks-000043896167-z3x71u-t500x500.jpg', './images/ar/0a6b502ffc7c748cc918cbb80d73f960.jpg', './images/ar/d46l5mu-22863ee5-ff85-4f18-ac95-2a76e16e9f77.jpg', './images/ko/pexels-photo-6314649.jpeg', './images/cn/pexels-photo-2670327.jpeg', './images/cn/pexels-photo-3603453.jpeg', './images/th/119143302-2914108.jpeg', './images/ar/pexels-photo-11127113.jpeg', './images/klingon/illustration-digital-art-minimalism-text-logo-Star-Trek-vector-art-brand-Klingon-font-163176-4241574032.png', './images/hi/hindi Z-hotstar(1).png', './images/nastalik/image.png']


In [32]:
image_files

['./images/he/s-l400-3959857765.jpg',
 './images/hi/dabur-sarso-avala-hair-oil-ad-dainik-bhaskar-jaipur-12-12-2017.jpg',
 './images/ka/393644331_708375714670258_5034910646992091937_n.jpg',
 './images/iu/image.jpg',
 './images/th/maggi-fusian-thailand-3740487815.jpg',
 './images/ja/1940s-japan-japanese-magazine-ad-1940-advertising-for-the-pneumonia-medicine-anti-lungen-in-the-sunday-mainichi-of-march-10-1940-showa-15-this-weekly-japanese-magazine-was-established-by-mainichi-shimbun-in-1922-taisho-11-and-is-still-being-sold-toda.jpg',
 './images/ar/artworks-000043896167-z3x71u-t500x500.jpg',
 './images/ar/ad_34496406_ce0f9beaaf3797bf_web.jpg',
 './images/ar/0a6b502ffc7c748cc918cbb80d73f960.jpg',
 './images/ar/create-arabic-food-poster-banner-or-social-media-ads.jpg',
 './images/ar/d46l5mu-22863ee5-ff85-4f18-ac95-2a76e16e9f77.jpg',
 './images/ko/pexels-photo-5051602.jpeg',
 './images/ko/pexels-photo-6314649.jpeg',
 './images/cn/pexels-photo-704379.jpeg',
 './images/cn/pexels-photo-2670327

## Define functions for SceneXplain and OpenAI I/O

In [5]:
import http.client
import json
import base64
from pprint import pprint

#### SceneXplain

In [6]:
# SceneX setup and functions

SCENEX_SECRET=os.getenv('SCENEX_SECRET')

scenex_headers = {
    "x-api-key": f"token {SCENEX_SECRET}",
    "content-type": "application/json",
}

ALGO = "Jelly"

def image_to_data_uri(file_path):
    with open(file_path, "rb") as image_file:
        encoded_image = base64.b64encode(image_file.read()).decode("utf-8")
        return f"data:image/png;base64,{encoded_image}"
        
def generate_scenex_data(image_files, json_schema=None, question=None, features=[]):
    data = {}
    data['data'] = []

    for file in image_files:
        cid = file.split('/')[-1]
        row = {
            "image": image_to_data_uri(file),
            "features": features,
            "algorithm": ALGO,
            "cid": cid
        }

        if question:
            row["question"] = question

        if json_schema:
            row["json_schema"] = json_schema

        data['data'].append(row)

    return data

def process_scenex(data):
    connection = http.client.HTTPSConnection("api.scenex.jina.ai")
    connection.request("POST", "/v1/describe", json.dumps(data), scenex_headers)
    response = connection.getresponse()
    response_data = response.read().decode("utf-8")
    
    connection.close()

    return json.loads(response_data)['result']

#### OpenAI

In [7]:
!pip install openai


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [8]:
# OpenAI functions

from openai import OpenAI
client = OpenAI()

def encode_openai_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

def generate_openai_data(image_files, text=""):
    output = []
    
    for file in image_files:
        base64_image = f"data:image/jpeg;base64,{encode_openai_image(file)}"
        data = [
            {
                "role": "user",
                # "filename": file,
                 "content": [
                    {
                    "type": "image_url",
                    "image_url": base64_image
                    },
                    {
                    "type": "text",
                    "text": text
                    }
                ]
            }
        ]
        
        output.extend(data)
        
    return output


def process_openai(data):
    output = []

    for record in data:
        response = client.chat.completions.create(
            model="gpt-4-vision-preview",
            messages=[record],
            max_tokens=1000,
        )

        output.append(response.choices[0])

    return output

## Basic captioning

### SceneXplain

In [None]:
scenex_data = generate_scenex_data(image_files)

In [None]:
scenex_response = process_scenex(scenex_data)

In [None]:
scenex_response

### OpenAI

In [None]:
openai_data = generate_openai_data(image_files, "what is in this image?")

In [None]:
openai_response = process_openai(openai_data)

In [None]:
openai_response

## Visual question answering

In [None]:
question = 'what does the text say in this image?'

### SceneXplain

In [None]:
features = ['question_answer']

In [None]:
scenex_data = generate_scenex_data(image_files, question=question, features=features)

In [None]:
scenex_response = process_scenex(scenex_data)

In [None]:
scenex_response

### OpenAI

In [None]:
openai_data = generate_openai_data(image_files, question)

In [None]:
openai_response = process_openai(openai_data)

In [None]:
openai_response

## Extract JSON from image

We'll use SceneXplain's "Extract JSON from Image" feature to extract each text string and output to JSON that fits a strict schema.

We'll also try a klugy way to do this with OpenAI, in the interests of fairness.

In [11]:
json_schema = {
  "type": "object",
  "properties": {
    "text_strings": {
      "type": "array",
      "description": "Every text string contained in the image. Consider all languages"
    }
  }
}

### SceneX

In [9]:
features = [
    "json",
]

In [92]:
scenex_data = generate_scenex_data(image_files, json_schema=json.dumps(json_schema), features=features)

In [93]:
scenex_response = process_scenex(scenex_data)

### OpenAI

We have to kluge it a bit by asking for JSON output as part of the question. This may not always provide reliably strict output.

In [95]:
question = f"Extract the text strings from this image and populate a JSON that follows this schema:\n\n{str(json_schema)}. Return just the output JSON. Do not put it in a code block"

In [96]:
openai_data = generate_openai_data(image_files, question)

In [97]:
openai_response = process_openai(openai_data)

### Compare results

In [98]:
for file, sx_record, oa_record in zip(image_files, scenex_response, openai_response):
    print(file)
    sx_answer = json.loads(sx_record['i18n']['en'])
    oa_answer = json.loads(oa_record.message.content)

    print(f"SceneXplain: \t{sx_answer}")
    print(f"OpenAI:\t\t{oa_answer}")
    print("-"*10)

./images/ko/pexels-photo-5051602.jpeg
SceneXplain: 	{'text_strings': ['삼부식육점', '号号 号, 番号 7▲ 1.244-3800', '15']}
OpenAI:		{'text_strings': ['삼부', '전구', '디오', '문구', '생활용품', '금구', '도매', '1244-3800']}
----------
./images/ko/pexels-photo-6314649.jpeg
SceneXplain: 	{'text_strings': ['|야', '좋은 날에 만나', '단이 방', '50', '좋은 나를 만나']}
OpenAI:		{'text_strings': ['삶은 날에 만나', '삶은 나를 만나', '닷이 발다']}
----------
./images/cn/pexels-photo-704379.jpeg
SceneXplain: 	{'text_strings': ['饒河街觀光夜市', 'Raohe St. Night Market', '歡', '光', '健', '媽祖廟口', '服務台', '金瑞刊', '康']}
OpenAI:		{'text_strings': ['饒河街觀光夜市', 'Raohe St. Night Market', 'GOOD', '綜合果汁', '招牌麵線', 'QQ', '臭豆腐']}
----------
./images/cn/pexels-photo-5765624.jpeg
SceneXplain: 	{'text_strings': ['宇', '川']}
OpenAI:		{'text_strings': ['京都嵐山']}
----------
./images/cn/pexels-photo-2670327.jpeg
SceneXplain: 	{'text_strings': ['金钱肚20元', '旺角牛筋腩20元', '旺角牛杂18元']}
OpenAI:		{'text_strings': ['魚丸湯', '20元', '貢丸', '20元', '丸仔湯', '18元']}
----------
./images/cn/pexels-photo-259954

## Extract more complex JSON

Now we'll extract:

- A general description of the scene
- All text objects. Each object will contain:
    - The text string itself
    - The (human-readable) language of the string
    - The ISO 639-1 language code (e.g. `en-US`)
    - An English translation of the string

In [13]:
json_schema = {
  "type": "object",
  "properties": {
    "alt_tag": {
      "type": "string",
      "description": "a short and concise alt-tag for the image, so visually-impaired users can know what they're looking at"
    },
    "long_description": {
      "type": "string",
      "description": "description of the image. Up to 20 words."
    },
    "text_strings": {
      "type": "array",
      "description": "list of all text strings in the image"
    },
    "languages": {
      "type": "array",
      "description": "all the languages used in the image"
    },
    "iso_codes": {
      "type": "array",
      "description": "the iso-639-1 code for the language of every string in the image"
    }
  }
}

### SceneX

In [40]:
scenex_data = generate_scenex_data(image_files, json_schema=json.dumps(json_schema), features=features)

In [41]:
scenex_response = process_scenex(scenex_data)

In [None]:
for file, sx_record in zip(image_files, scenex_response):
    print(file)
    try:
        sx_answer = json.loads(sx_record['i18n']['en'])

        
        pprint(sx_answer)
        print("-"*10)
    except:
        pass

### OpenAI

We have to kluge it a bit by asking for JSON output as part of the question. This may not always provide reliably strict output.

In [42]:
question = f"Extract the text strings from this image and populate a JSON that follows this schema:\n\n{str(json_schema)}. Return just the output JSON. Do not put it in a code block"

In [43]:
openai_data = generate_openai_data(image_files, question)

In [44]:
openai_response = process_openai(openai_data)

### Compare results

In [45]:
for file, sx_record, oa_record in zip(image_files, scenex_response, openai_response):
    try:
        sx_answer = json.loads(sx_record['i18n']['en'])
        oa_answer = json.loads(oa_record.message.content)
        print(file)
    
        print("SceneXplain")
        pprint(sx_answer)
    
        print('\n')
    
        print("OpenAI")
        pprint(oa_answer)
    except:
        print(f"{file} failed")
        
    print("-"*10)

./images/he/s-l400-3959857765.jpg
SceneXplain
{'alt_tag': 'Old SOYA COFFEE advertisement',
 'iso_codes': ['en', 'he'],
 'languages': ['English', 'Hebrew'],
 'long_description': 'An old piece of advertisement for SOYA COFFEE, featuring '
                     'a sketch and text in English and Hebrew.',
 'text_strings': ['SOYA', 'COFFEE', 'QU & MOKAF']}


OpenAI
{'alt_tag': 'Vintage advertisement for SOVA coffee.',
 'iso_codes': ['en', 'he'],
 'languages': ['English', 'Hebrew'],
 'long_description': 'Old-fashioned print advertisement for SOVA Coffee and '
                     'Mokar Products.',
 'text_strings': ['SOVA',
                  'COFFEE',
                  'פילטר',
                  'מוקר',
                  'בטעם קפה ערב',
                  'חוליט כפרי']}
----------
./images/hi/dabur-sarso-avala-hair-oil-ad-dainik-bhaskar-jaipur-12-12-2017.jpg
SceneXplain
{'alt_tag': 'Hair oil advertisement with a woman and two bottles',
 'iso_codes': ['hi', 'en'],
 'languages': ['Hindi', 'Engli

In [46]:
for file, sx_record, oa_record in zip(image_files, scenex_response, openai_response):
    try:
        sx_answer = json.loads(sx_record['i18n']['en'])['text_strings']
        oa_answer = json.loads(oa_record.message.content)['text_strings']
        print(file)
    
        print(f"SceneXplain:\t{sx_answer}")    
        print(f"OpenAI:\t\t{oa_answer}")
    except:
        print(f"{file} failed")
        
    print("-"*10)

./images/he/s-l400-3959857765.jpg
SceneXplain:	['SOYA', 'COFFEE', 'QU & MOKAF']
OpenAI:		['SOVA', 'COFFEE', 'פילטר', 'מוקר', 'בטעם קפה ערב', 'חוליט כפרי']
----------
./images/hi/dabur-sarso-avala-hair-oil-ad-dainik-bhaskar-jaipur-12-12-2017.jpg
SceneXplain:	['सरसों आँवला', 'केश तेल', 'सरसों और आँवला', 'का पोषण बिना', 'चिपचिपाहट', 'में', 'नया', 'पैक', '₹9/-', 'सरसों आँवला', '40ml', 'DABUR CARES: CALL OR WRITE', '+ TOLL FREE 1800-103-1644']
OpenAI:		['सच्चा आमला', 'केश तेल', 'सरसों और आमला का पावर विना चिपचिपाहट', 'सिर्फ़ ₹x9* में', 'नया वैक', '40ml', 'BABUR CARES: CALL NOW OR WRITE', 'ई ज़ोन कोलकाता PIN 700107', 'EMAIL: daborcares.feedback@dabur', 'www.dabur.com', 'TOLL FREE 1800-103-1644']
----------
./images/ka/393644331_708375714670258_5034910646992091937_n.jpg
SceneXplain:	['Happy Thanksgiving', 'თანხმდებს სამმარის ჟადრი']
OpenAI:		['თბილებული დაგეგმეტ']
----------
./images/iu/image.jpg
SceneXplain:	['Building Nunavut Together', 'Batir le Memavut ensemble', 'Acbba Abb4 (b Der Ab 66, 0