In [4]:
import base64

def encode_image_to_base64(image_path):
    """Return the contents of the file at ``image_path`` as base64 text.

    The file is read in binary mode in one go; the base64 bytes are decoded
    to a ``str`` using UTF-8.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    return base64.b64encode(raw_bytes).decode("utf-8")


In [5]:
import dotenv, os
import openai
from openai import OpenAI
# Load environment variables through python-dotenv (presumably from a local .env file, if one is found).
dotenv.load_dotenv()
# Copy OPENAI_API_KEY from the environment onto the openai module (os.getenv gives None when it is unset).
openai.api_key  = os.getenv("OPENAI_API_KEY")
# Shared client used by every API call in this notebook.
# NOTE(review): OpenAI() is built without arguments, so it presumably reads the key from the
# environment on its own rather than from openai.api_key — confirm against the installed SDK.
client = OpenAI()

In [6]:
# Helpers that build the chat messages (system prompt + one page image) for the vision model
def _guess_image_mime_type(image_base64):
    """Guess the MIME type of a base64-encoded image from its leading characters.

    Falls back to ``"image/jpeg"`` (the value this notebook always used before)
    when the signature is not recognised.
    """
    # Base64 renderings of the magic bytes at the start of each format.
    signatures = (
        ("iVBORw0KGg", "image/png"),   # \x89PNG\r\n\x1a\n
        ("/9j/", "image/jpeg"),        # \xff\xd8\xff
        ("R0lGOD", "image/gif"),       # GIF87a / GIF89a
        ("UklGR", "image/webp"),       # RIFF container
    )
    if isinstance(image_base64, str):
        for prefix, mime_type in signatures:
            if image_base64.startswith(prefix):
                return mime_type
    return "image/jpeg"


def get_message_json(image_base64, mime_type=None):
    """Build the chat ``messages`` list asking the model to transcribe one page image.

    Args:
        image_base64: Base64 text of the page image.
        mime_type: MIME type to put in the data URL. When ``None`` it is guessed
            from the base64 signature, so PNG screenshots are no longer
            labelled as JPEG.

    Returns:
        A two-element list: the system instruction, and a user message
        containing a short text part plus the image as a ``data:`` URL.
    """
    if mime_type is None:
        mime_type = _guess_image_mime_type(image_base64)

    # Each fragment ends with a space (except the last) so the sentences do not
    # run together once concatenated.
    system_prompt = (
        "You are a helpful assistant. Given an image of a PDF page, "
        "please extract the text and convert it to Markdown format, "
        "You may use LaTeX for mathematical equations, "
        "You should ignore headers, footers, page numbers, "
        "and other non text contents such as images, figures, and meaningless symbols. "
        "Keep the table and formulas as markdown format. "
        "Output the Markdown content only within <markdown> tags."
    )

    messages = [
        {
            "role": "system",
            "content": system_prompt,
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Here is the image of the page I want to extract:",
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:{mime_type};base64,{image_base64}",
                    },
                },
            ],
        },
    ]
    return messages


def get_completion_from_image(image_base64, model="gpt-4o-2024-08-06", temperature=0):
    """Send one page image to the chat completions endpoint.

    Builds the messages with ``get_message_json`` and calls the shared
    ``client``.

    Returns:
        A ``(content, token_dict)`` tuple: the text of the first choice, and a
        dict with the ``prompt_tokens``, ``completion_tokens`` and
        ``total_tokens`` counts taken from ``response.usage``.
    """
    chat_messages = get_message_json(image_base64)
    completion = client.chat.completions.create(
        model=model,
        messages=chat_messages,
        temperature=temperature,
    )

    reply_text = completion.choices[0].message.content

    usage = completion.usage
    usage_fields = ("prompt_tokens", "completion_tokens", "total_tokens")
    token_counts = {field: getattr(usage, field) for field in usage_fields}

    return reply_text, token_counts


# {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-4o-2024-08-06", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 9999}}
# The line above is an example JSONL row for a single request to the OpenAI API.
import hashlib
import random
def get_request_jsonl(image_base64, model="gpt-4o-2024-08-06", temperature=0,
                      max_tokens=9999, custom_id=None):
    """Build one request row (as a dict) for a JSONL batch input file.

    Args:
        image_base64: Base64 text of the page image.
        model: Model name placed in the request body.
        temperature: Sampling temperature placed in the request body.
        max_tokens: Completion token limit placed in the request body
            (previously fixed at 9999).
        custom_id: Identifier for the row. Pass something meaningful (e.g. the
            page number) so batch results can be matched back to their page.
            When ``None`` the original scheme is used: the SHA-256 hex digest
            of ``image_base64`` followed by ``_`` and a random integer in
            ``[0, 100000000]``.

    Returns:
        A dict with ``custom_id``, ``method``, ``url`` and ``body`` keys,
        ready to be serialised with ``json.dumps``.
    """
    messages = get_message_json(image_base64)

    if custom_id is None:
        image_hash = hashlib.sha256(image_base64.encode("utf-8")).hexdigest()
        # The random suffix keeps ids distinct even when two pages are byte-identical.
        random_number = random.randint(0, 100000000)
        custom_id = f"{image_hash}_{random_number}"

    request_jsonl = {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": model,
            "messages": messages,
            "max_tokens": max_tokens,
            "temperature": temperature,
        }
    }
    return request_jsonl

# Example of usage:
# image_path = "./textbook_screenshot/1.png"  # Your image path
# encoded_image = encode_image_to_base64(image_path)
# response, token_usage = get_completion_from_image(encoded_image)

# print("Markdown output:")
# print(response)
# print("Token usage:")
# print(token_usage)

In [8]:
import os
import re

# Function to get the largest image index from the directory
def get_max_image_count(directory):
    """Return the largest ``N`` for which a file named exactly ``N.png`` exists.

    Only names that consist solely of digits followed by ``.png`` count, so
    leftovers such as ``12.png.bak`` are ignored.

    Args:
        directory: Directory to scan (not recursive).

    Returns:
        The highest page number found, or 0 when there is no matching file.
    """
    page_numbers = []
    for filename in os.listdir(directory):
        # fullmatch anchors both ends; plain match would accept trailing junk.
        match = re.fullmatch(r"(\d+)\.png", filename)
        if match:
            page_numbers.append(int(match.group(1)))

    return max(page_numbers, default=0)

# Function to save the markdown content between <markdown> tags
def save_markdown_from_response(response, i):
    """Extract the ``<markdown>`` section of a model response and save it as page ``i``.

    The text is written to ``./textbook_markdown/{i}.md`` (the directory is
    created when missing). An existing file is never overwritten. If the
    response holds no ``<markdown>`` tag at all, an empty file is written so
    the page is still marked as processed.

    Args:
        response: Full text returned by the model.
        i: Page number used for the output filename.
    """
    output_dir = "./textbook_markdown/"
    # Create the directory up front so the "no markdown" path cannot fail on it.
    os.makedirs(output_dir, exist_ok=True)
    file_path = os.path.join(output_dir, f"{i}.md")

    if os.path.exists(file_path):
        print(f"File {file_path} already exists. Skipping.", end = "\r")
        return

    match = re.search(r"<markdown>(.*?)</markdown>", response, re.DOTALL)
    if not match:
        # Edge case: the opening tag is present but the closing tag is missing
        # (e.g. truncated output) - keep everything after the opening tag.
        match = re.search(r"<markdown>(.*)", response, re.DOTALL)

    if match:
        markdown_content = match.group(1).strip()
    else:
        print(f"No markdown content found in response for page {i}. Saving empty file.", end = "\r")
        markdown_content = ""

    # Explicit UTF-8 so non-ASCII text and math symbols are written the same on every platform.
    with open(file_path, "w", encoding="utf-8") as md_file:
        md_file.write(markdown_content)

    if match:
        if markdown_content:
            print(f"Saved markdown to {file_path}", end = "\r")
        else:
            print(f"Saved empty markdown to {file_path}", end = "\r")

import json
def save_token_usage(token_usage, i):
    """Write the token usage for page ``i`` to ``./token_usage/{i}.json``.

    Args:
        token_usage: JSON-serialisable object (the dict of token counts).
        i: Page number used for the output filename.
    """
    # Create the directory first; otherwise the first call on a fresh checkout fails.
    os.makedirs("./token_usage", exist_ok=True)
    with open(f"./token_usage/{i}.json", "w") as f:
        json.dump(token_usage, f)

# Example of usage with loop:
def process_pdf_images(screenshot_dir="./textbook_screenshot/",
                       markdown_dir="./textbook_markdown/",
                       output_path=None):
    """Queue a batch request for every page image that has no markdown file yet.

    Pages are numbered 1..N, where N comes from ``get_max_image_count``. For
    each page whose ``{i}.md`` is missing and whose ``{i}.png`` exists, one
    JSON line is appended to the request file.

    Args:
        screenshot_dir: Directory holding the ``{i}.png`` page images.
        markdown_dir: Directory checked for already produced ``{i}.md`` files.
        output_path: JSONL file to append requests to. When ``None`` the
            notebook-level ``jsonl_file_path`` variable is used, as before.

    Note:
        The request file is opened in append mode, so running this again adds
        lines to whatever the file already contains.
    """
    # Determine the image count based on available PNG files
    image_count = get_max_image_count(screenshot_dir)

    for i in range(1, image_count + 1):  # Loop through all images
        file_path = os.path.join(markdown_dir, f"{i}.md")

        # Pages that already have markdown are not queued again
        if os.path.exists(file_path):
            print(f"Markdown file {file_path} already exists. Skipping image {i}.", end = "\r")
            continue

        image_path = os.path.join(screenshot_dir, f"{i}.png")

        # Page numbers may have gaps; only existing images are queued
        if os.path.exists(image_path):
            encoded_image = encode_image_to_base64(image_path)
            request_jsonl = get_request_jsonl(encoded_image)
            # Resolved only when needed, so the global is required exactly when it was before
            destination = jsonl_file_path if output_path is None else output_path
            # append the request_jsonl to a jsonl file
            with open(destination, "a") as f:
                f.write(json.dumps(request_jsonl) + "\n")
        else:
            print(f"Image file {image_path} does not exist. Skipping.", end = "\r")

# Example call for processing a PDF with multiple pages
# process_pdf_images() reads jsonl_file_path as a global, so it has to be assigned before the call.
jsonl_file_path = "./0906_request.jsonl"
# NOTE: requests are appended to the file, so re-running this cell adds to its existing contents.
process_pdf_images()


In [9]:
# Read the request file back into memory, one list element per line.
with open(jsonl_file_path, "r") as f:
    lines = f.readlines()

In [10]:
# Number of lines read from the request file (each request was written as one line).
len(lines)

628

In [11]:


import os

# Split a JSONL file into several smaller files
def split_jsonl_file(jsonl_file_path, lines_per_file=200):
    """Split a JSONL file into consecutive chunks of at most ``lines_per_file`` lines.

    This is a generator: each chunk is written to
    ``{jsonl_file_path}_batch_{k}.jsonl`` (k = 0, 1, ...) and its path is
    yielded right after the file is written. The input is streamed, so only one
    chunk is held in memory at a time.

    Args:
        jsonl_file_path: Path of the file to split (read as UTF-8).
        lines_per_file: Maximum number of lines per chunk; must be at least 1.

    Yields:
        The path of each chunk file, in order.

    Raises:
        ValueError: If ``lines_per_file`` is smaller than 1 (raised when the
            generator is first advanced).
    """
    from itertools import islice

    if lines_per_file < 1:
        raise ValueError(f"lines_per_file must be >= 1, got {lines_per_file}")

    with open(jsonl_file_path, "r", encoding="utf-8") as infile:
        batch_index = 0
        while True:
            # Pull the next chunk of lines without reading the rest of the file
            batch_lines = list(islice(infile, lines_per_file))
            if not batch_lines:
                break
            batch_file_path = f"{jsonl_file_path}_batch_{batch_index}.jsonl"
            with open(batch_file_path, "w", encoding="utf-8") as batch_file:
                batch_file.writelines(batch_lines)
            yield batch_file_path
            batch_index += 1

# Upload the split batch files
def upload_batches(jsonl_file_path):
    """Split the request file and upload every chunk with purpose ``"batch"``.

    All chunk files are written first (the splitter is fully consumed), then
    each one is opened in binary mode and passed to ``client.files.create``.

    Returns:
        A list of the objects returned by ``client.files.create``, in chunk order.
    """
    chunk_paths = [*split_jsonl_file(jsonl_file_path)]
    uploaded_files = []
    for chunk_path in chunk_paths:
        with open(chunk_path, "rb") as chunk_handle:
            uploaded = client.files.create(file=chunk_handle, purpose="batch")
        uploaded_files.append(uploaded)

    return uploaded_files

# Call upload_batches to split the request file and upload every chunk
batch_input_files = upload_batches(jsonl_file_path)

In [12]:
# Display the uploaded file objects (upload_batches already returns a list).
list(batch_input_files)

[FileObject(id='file-WI0JlFm5TMR8ZNm1S6eysd62', bytes=82205180, created_at=1725624597, filename='0906_request.jsonl_batch_0.jsonl', object='file', purpose='batch', status='processed', status_details=None),
 FileObject(id='file-6B3ZODKvddSa89G2ru30io1o', bytes=87229431, created_at=1725624632, filename='0906_request.jsonl_batch_1.jsonl', object='file', purpose='batch', status='processed', status_details=None),
 FileObject(id='file-t2EzdNyZuUmvoJLwfSTRjuRJ', bytes=86938125, created_at=1725624667, filename='0906_request.jsonl_batch_2.jsonl', object='file', purpose='batch', status='processed', status_details=None),
 FileObject(id='file-CdHsRgzHfTiEhZe034BFnOyy', bytes=12316211, created_at=1725624675, filename='0906_request.jsonl_batch_3.jsonl', object='file', purpose='batch', status='processed', status_details=None)]

In [13]:
# Create one batch job per uploaded input file and keep the returned job objects.
jobs = []
for batch_input_file in batch_input_files:
    batch_input_file_id = batch_input_file.id

    result = client.batches.create(
        input_file_id=batch_input_file_id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={
          # Space separator so the label and the file id do not run together
          "description": "nightly eval job " + batch_input_file_id
        }
    )
    jobs.append(result)

In [17]:
# save batch job id to a json file
# NOTE(review): jsonl_file_path is interpolated with no separator, so with "./0906_request.jsonl"
# this writes "././0906_request.jsonlbatch_job_ids.json" — confirm the name is intended before
# any other code relies on it.
batch_job_ids = [job.id for job in jobs]
with open(f"./{jsonl_file_path}batch_job_ids.json", "w") as f:
    json.dump(batch_job_ids, f)

In [18]:
# List batches through the API and display the returned entries.
# NOTE: this rebinds `result`, which the job-creation loop earlier in the notebook also assigns.
result = client.batches.list()
result.data

[Batch(id='batch_k0kFYGIADP2J5F8ued9EfOzt', completion_window='24h', created_at=1725624677, endpoint='/v1/chat/completions', input_file_id='file-CdHsRgzHfTiEhZe034BFnOyy', object='batch', status='in_progress', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1725711077, failed_at=None, finalizing_at=None, in_progress_at=1725624678, metadata={'description': 'nightly eval jobfile-CdHsRgzHfTiEhZe034BFnOyy'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=28)),
 Batch(id='batch_kdwLHYNDWaVf4Rw4KDhzPhSC', completion_window='24h', created_at=1725624677, endpoint='/v1/chat/completions', input_file_id='file-t2EzdNyZuUmvoJLwfSTRjuRJ', object='batch', status='in_progress', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1725711077, failed_at=None, finalizing_at=None, in_progress_at=1725624680, metadata={'description': 'n