## Import

In [1]:
import anthropic
import google.generativeai as genai
import glob
import os
import base64
from time import sleep
import json

## Preprocess

In [2]:
# Root directory of the dataset; get_data() looks for "<DATA_PATH>/<slideshow>/*.jpg".
DATA_PATH = "data"

In [3]:
def get_data(dir) :
    """
    List the JPEG files of one slideshow directory, in a deterministic order.

    Input:
    dir: str-like, name of the slideshow (sub-directory of DATA_PATH)
    Output :
    fnames: list[str], paths matching "<DATA_PATH>/<dir>/*.jpg", sorted
            lexicographically; empty list when nothing matches
    """
    # glob.glob returns matches in arbitrary (filesystem-dependent) order;
    # sorting makes indexing such as fnames[1] and the order of the generated
    # Q/A pairs reproducible from one run or machine to the next.
    fnames = sorted(glob.glob(f"{DATA_PATH}/{dir}/*.jpg"))
    return fnames

In [4]:
def encode_image(image_path):
    """
    Read a file in binary mode and return its content as a base64 string.

    Input:
    image_path: path of the file to read
    Output:
    str, base64 encoding of the file bytes, decoded as UTF-8 text
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')

In [5]:
def send_prompt_to_claude(base64_image,image_type,prompt) :
    """
    Send one image plus a text prompt to Claude and return the reply text.

    Input:
    base64_image: str, base64-encoded image data
    image_type: str, media type of the image (e.g. "image/jpeg")
    prompt: str, text instruction sent after the image
    Output:
    str, text of the first content block of the reply
    """
    # The user turn carries two content blocks: the image first, then the text.
    image_block = {
        "type": "image",
        "source": {
            "type": "base64",
            "media_type": image_type,
            "data": base64_image,
        },
    }
    text_block = {"type": "text", "text": prompt}
    user_turn = {"role": "user", "content": [image_block, text_block]}

    # A new client is created on every call, as in the original code.
    client = anthropic.Anthropic()
    reply = client.messages.create(
        model="claude-3-5-sonnet-20240620",
        max_tokens=1024,
        messages=[user_turn],
    )
    return reply.content[0].text

In [6]:
def send_prompt_to_gemini(base64_image,image_type,prompt) :
    """
    Send one image plus a text prompt to Gemini and return the reply text.

    Input:
    base64_image: str, base64-encoded image data (as returned by encode_image)
    image_type: str, MIME type of the image (e.g. "image/jpeg")
    prompt: str, text instruction sent along with the image
    Output:
    str, text of the model response
    """
    genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
    model = genai.GenerativeModel(model_name="gemini-1.5-flash")
    # A bare str in the content list is sent as *text*, so passing the base64
    # string directly made the model read a long text blob instead of seeing
    # the slide. Send it as an inline image part instead: a dict with the MIME
    # type and the raw (decoded) bytes.
    image_part = {
        "mime_type": image_type,
        "data": base64.b64decode(base64_image),
    }
    response = model.generate_content([prompt, image_part])
    return response.text

In [7]:
# Index (in the sorted directory listing used by generate_qa) of the last
# slideshow whose Q/A file was written; -1 means none yet. generate_qa starts
# at last_dir + 1, so re-running it in the same kernel resumes where it stopped.
last_dir = -1

In [8]:
def _parse_qa_response(response) :
    """
    Parse a model reply into JSON, tolerating a Markdown code fence around it.

    Replies may come back wrapped as ```json ... ``` instead of bare JSON;
    the fence (and its optional "json" tag) is removed before parsing.
    Raises json.JSONDecodeError when the remaining text is not valid JSON.
    """
    text = response.strip()
    if text.startswith("```") :
        text = text[3:]
        if text.endswith("```") :
            text = text[:-3]
        if text[:4].lower() == "json" :
            text = text[4:]
    # json.loads ignores the whitespace left around the payload.
    return json.loads(text)


def generate_qa() :
    """
    Generate one question/answer pair per slide for every slideshow directory.

    For each numerically named sub-directory of DATA_PATH (in increasing
    numeric order, starting after the global last_dir), every slide image is
    sent to Gemini with a fixed prompt, the parsed replies are collected and
    written to "<DATA_PATH>/<dir>/qa_<dir>.json". last_dir is updated after
    each directory is saved, so calling the function again after a failure
    resumes with the first unsaved directory.

    Raises json.JSONDecodeError if a reply cannot be parsed as JSON.
    """
    global last_dir
    # Entries that are not purely numeric (hidden files, stray folders) would
    # make int() raise, so they are ignored.
    listdir = sorted((name for name in os.listdir(DATA_PATH) if name.isdigit()), key=int)
    image_type = "image/jpeg"
    prompt = "Generate only one question/answer pair about this slide. The question should be specific and descriptive enough that a user could find the relevant slide from a large collection of slides, for instance it should name the brand or company it relates to if that is relevant. The question should not be trivial and must need the visual information to be answered correctly. Both the question and the answer must be in French, in a json format [{\"question\": ..., \"answer\": ...}]. Your answer have to be return only in this format. Becareful to put double quotes on keyword question and answer in the json format. It's forbiden to generate more than one pair of question/answer"

    for i in range(last_dir+1, len(listdir)) :
        dirname = listdir[i]
        print(dirname)
        fnames = get_data(dirname)
        qas = []
        for fname in fnames :
            base64_image = encode_image(fname)
            response = send_prompt_to_gemini(base64_image, image_type, prompt)
            # Fixed pause between requests (presumably to stay under the API
            # rate limit -- TODO confirm the quota this was tuned for).
            sleep(12)
            qas.append(_parse_qa_response(response))
        print(qas)

        json_path = f"{DATA_PATH}/{dirname}/qa_{dirname}.json"
        # Mode "w" truncates an existing file, so no prior removal is needed.
        with open(json_path, "w") as f :
            json.dump(qas, f)
        # Only mark the directory as done once its file has been written.
        last_dir = i

In [None]:
# Process every slideshow after index last_dir; safe to re-run to resume.
generate_qa()

## Q/A generation test

In [44]:
# Prompt used for the manual tests below; a variant of the one inside
# generate_qa without the brand/company hint and the final "only one pair" reminder.
prompt = "Generate only one question/answer pair about this slide. The question should be specific and descriptive enough that a user could find the relevant slide from a large collection of slides. The question should not be trivial and must need the visual information to be answered correctly. Both the question and the answer must be in French in exactly this format [{\"question\": ..., \"answer\": ...}]. Your answer have to be return only in this format. Becareful to put double quotes on keyword question and answer."

In [45]:
# List the slide images of slideshow "955364" and display the second path.
fnames = get_data("955364")
fnames[1]

'data/955364\\slide_1.jpg'

In [40]:
# Encode the selected slide as a base64 string and set the media type
# passed to the model helpers.
base64_image = encode_image(fnames[1])
image_type = "image/jpeg"

In [41]:
# NOTE(review): displays the entire base64 string as cell output.
base64_image

'/9j/4AAQSkZJRgABAQAAAQABAAD/2wCEAAUFBQUFBQUGBgUICAcICAsKCQkKCxEMDQwNDBEaEBMQEBMQGhcbFhUWGxcpIBwcICkvJyUnLzkzMzlHREddXX0BBQUFBQUFBQYGBQgIBwgICwoJCQoLEQwNDA0MERoQExAQExAaFxsWFRYbFykgHBwgKS8nJScvOTMzOUdER11dff/CABEIAd4CfgMBIQACEQEDEQH/xAA1AAEAAwEBAQEBAAAAAAAAAAAABQYHBAMCAQgBAQABBQEAAAAAAAAAAAAAAAAGAQIEBQcD/9oADAMBAAIQAxAAAAD+ywAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAiDs6wAAAAAAAAAAAAAAAAAAAAAAAAAZmTtvAAAAAAAAAAAAAAAAAAAAAAAAABmZO28AAAAAAAAAAAAAAAAAAAAAAAAAGZk7bwAAAAAAAAAAAAAAAAAAAAAAAAAZmTtvAAAAAAAAAAAAAAAAAAAAAAGbXeOMf1Jfi9WfK2qZsya34W+1rzMnbeAAAiTPvQ1MMRNuAAAIoi5AjaEbQMQ28DD9wAAAAAAAAAfy/6YeJf2Z64NuyhW0aF5Zmd8nnm6dmhO28AAB/OJ/RwxmIJauEV1Hp/SJhfAWegHT7HaPidIuCP6UxcrkmfO+GGSJCVwsvwc34StbPT4N9mQAAAAHmpT7pWxHkgpf8eavvmZO28AABFmP3ozvsI+QEkWW9H8+2k94gqUiafnBYvoiOw2P+fCT/Dl30w/kO31PbPi9Q5L5yXWcNe+gAAAAAAABmZO28AAAAAAAGfnzaCaAAAAAMYLNoJVoEmLeAAAAAAAAAzMnbeAAAAAAAAAAAAAAAAAAAAAAAAADMydt4AAAAAAAAAAAAAAAAAAAAAAAAAM5Jq2AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA

In [46]:
# Ask Gemini for a Q/A pair on the test slide; `message` is the reply text (str).
message = send_prompt_to_gemini(base64_image, image_type, prompt)

In [47]:
# Show the raw reply string.
message

'```json\n{"question": "Quel est le nom du bâtiment représenté sur cette diapositive ?", "answer": "Le bâtiment représenté sur cette diapositive est le «\xa0Centre Pompidou\xa0». "}\n```'

In [None]:
# Same request as send_prompt_to_claude(), written inline. Unlike the helper,
# `message` here is the full response object (not the reply text), which is
# what the next cell reads via message.content[0].text.
claude = anthropic.Anthropic()
message = claude.messages.create(
    model="claude-3-5-sonnet-20240620",
    max_tokens=1024,
    messages=[
        {
            "role": "user",
            "content": [
                # Image block first, then the text instruction.
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": image_type,
                        "data": base64_image,
                    },
                },
                {
                    "type": "text",
                    "text": prompt
                }
            ],
        }
    ],
)

In [None]:
# Print the text of the first content block of the Claude response.
print(message.content[0].text)

{
  "question": "Quel est le titre principal de cette diapositive sur la gastronomie, et qui est mentionné comme auteur?",
  "answer": "Le titre principal de la diapositive est 'CUISINE FRANÇAISE', et l'auteur mentionné est Nuriyya Guliyeva."
}
