# Test photocatalysis reaction prompt

In [22]:
import os
import cv2
import numpy as np
import base64
import json
import glob
import requests

prompt_directory = "/mnt/c/Users/Shi Xuan/OneDrive - University of Toronto/Project_MERLIN/Project_Digital esyn corpus/TF_to_json/"
image_directory = "/mnt/c/Users/Shi Xuan/OneDrive - University of Toronto/Project_MERLIN/Project_Digital esyn corpus/manuscript preparation/datasets/photocatalysis-relevant/"
json_directory = "/mnt/c/Users/Shi Xuan/OneDrive - University of Toronto/Project_MERLIN/Project_Digital esyn corpus/TF_to_json/photocatalysis/json/"
cropped_image_directory = "/mnt/c/Users/Shi Xuan/OneDrive - University of Toronto/Project_MERLIN/Project_Digital esyn corpus/TF_to_json/photocatalysis/cropped/"
get_data_prompt = "rxn_opt_prompt_photocatal_v2"
update_dict_prompt = "rxn_opt_update_prompt_photocatal"
api_key = "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"


In [23]:
def encode_image(image_path): 
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

def reformat_json(input_file):
    """
    Clean and format JSON data by removing unwanted characters and ensuring proper JSON formatting
    """
    with open(input_file, 'r') as file:
        json_content = file.read()
        json_content = json_content.replace("\"```json\\n", '').replace('```"', '').strip()       
        json_content = json_content.replace('\\n', '').replace('\\"', '"')
        data = json.loads(json_content)
        formatted_json = json.dumps(data, indent=4)

    # Write the formatted JSON to the output file
    with open(input_file, 'w') as file:
        file.write(formatted_json)

def adaptive_get_data(api_key, prompt_directory, get_data_prompt, image_name, image_directory, json_directory):
    # Get all subfigures files 
    image_paths = glob.glob(os.path.join(image_directory, f"{image_name}_*.png"))
    if not image_paths:
        print(f"No subimages found for {image_name}")
        return

    base64_images = [encode_image(image_path) for image_path in image_paths]

    # Get user prompt file
    user_prompt_path = os.path.join(prompt_directory, f"{get_data_prompt}.txt")
    with open(user_prompt_path, "r") as file:
        user_message = file.read().strip()

    # Get image captions and response file paths
    image_caption_path = os.path.join(image_directory, f"{image_name}.txt")
    response_path = os.path.join(json_directory, f"{image_name}_response.json")
    token_path = os.path.join(f"{json_directory}/token_count/", f"{image_name}_tokencount.json") # token count is saved in a subfolder in json_directory

    # Create base message
    messages = [{
        "role": "user",
        "content": [{"type": "text", "text": user_message}]
    }]

    # Add each encoded image as a separate entry
    messages[0]["content"].extend({
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}
    } for base64_image in base64_images)
    
    # If the image caption file exists, append it to the messages content
    if os.path.exists(image_caption_path):
        with open(image_caption_path, "r") as file:
            image_caption = file.read().strip()
        messages[0]["content"].append({"type": "text","text": image_caption})
        print('Caption appended!')
    else: 
        print('No caption found!')

    # API request headers and payload
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    payload = {
        "model": "gpt-4o-2024-08-06",
        "messages": messages,
        "max_tokens": 4000
    }

    # Send API request
    try:
        response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
        response.raise_for_status()  # Raise error if the request failed
        reaction_data = response.json()['choices'][0]['message']['content']
        token_count = response.json()['usage']

        # Save responses
        with open(response_path, 'w') as json_file:
            json.dump(reaction_data, json_file)
        print(f"Reaction data saved to {response_path}!")

        with open(token_path, 'w') as token_file: 
            json.dump(token_count, token_file)
        print(f"Token count saved to {response_path}!")

        # Clean response: 
        try: 
            reformat_json(response_path)
            print(f"{response_path}: Reaction data cleaned.")

        except Exception as e: 
            print(f"{response_path}: Reaction data not cleaned. Error: {e}")
    
    except requests.exceptions.RequestException as e:
        print(f"Error during API request: {e}")

def update_dict(api_key, prompt_directory, update_dict_prompt, json_file, json_directory):
    
    # Get user prompt file
    user_prompt_path = os.path.join(prompt_directory, f"{update_dict_prompt}.txt")
    with open(user_prompt_path, "r") as file:
        user_message = file.read().strip()
    
    # Get Json file
    json_path = os.path.join(json_directory, f"{json_file}.json")
    with open(json_path, "r") as file2:
        json_dict = file2.read().strip()

    # Get response paths
    response_path = os.path.join(json_directory, f"{json_file}_updated.json")
    token_path = os.path.join(f"{json_directory}/token_count/", f"{json_file}_updated_tokencount.json")

    # Construct message
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": user_message
                },
                {
                    "type": "text",
                    "text": json_dict
                }              
            ]
        }
    ]

    # API request headers and payload
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    payload = {
        "model": "gpt-4o-2024-08-06",
        "messages": messages,
        "max_tokens": 4000
    }

    # Send API request
    try:
        response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
        response.raise_for_status()  # Raise error if the request failed
        reaction_data = response.json()['choices'][0]['message']['content']
        token_count = response.json()['usage']

        # Save response
        with open(response_path, 'w') as json_file:
            json.dump(reaction_data, json_file)

        print(f"Reaction data saved to {response_path}!")

        with open(token_path,'w') as token_file:
            json.dump(token_count, token_file)

        print(f"Token count saved to {token_path}!")
        
        # Clean response
        try:
            reformat_json(response_path)
            print(f"{response_path}: Reaction data cleaned.")
        except Exception as e:
            print(f"{response_path}: Reaction data not cleaned.Error: {e}")
    
    except requests.exceptions.RequestException as e:
        print(f"Error during API request: {e}")

In [24]:
image_name = "cs3c05785_page_1_table_1"
json_file = os.path.join(json_directory, f"{image_name}_response.json")

# adaptive_get_data(api_key, prompt_directory, get_data_prompt, image_name, cropped_image_directory, json_directory)
# update_dict(prompt_directory, update_dict_prompt, json_file, json_directory)


# Batch 
failed_images = []

for file in os.listdir(image_directory):
    if (file.endswith(".png")):
        try: 
            image_name = file.removesuffix('.png')
            adaptive_get_data(api_key, prompt_directory, get_data_prompt, image_name, cropped_image_directory, json_directory)
        except Exception as e:
            print(f"Error processing image: {file}, Error: {e}")
            failed_images.append(file)

# for file in os.listdir(json_directory):
#     if (file.endswith(".json")):
#         try: 
#             json_name = file.removesuffix('.json')
#             update_dict(api_key, prompt_directory, update_dict_prompt, json_name, json_directory)
#         except Exception as e:
#             print(f"Error processing JSON: {file}, Error: {e}")
#             failed_images.append(file)

if failed_images:
    print("Failed images:")
    for failed in failed_images:
        print(failed)

No caption found!
Reaction data saved to /mnt/c/Users/Shi Xuan/OneDrive - University of Toronto/Project_MERLIN/Project_Digital esyn corpus/TF_to_json/photocatalysis/json/adsc_201400638_page_1_table_2_response.json!
Token count saved to /mnt/c/Users/Shi Xuan/OneDrive - University of Toronto/Project_MERLIN/Project_Digital esyn corpus/TF_to_json/photocatalysis/json/adsc_201400638_page_1_table_2_response.json!
/mnt/c/Users/Shi Xuan/OneDrive - University of Toronto/Project_MERLIN/Project_Digital esyn corpus/TF_to_json/photocatalysis/json/adsc_201400638_page_1_table_2_response.json: Reaction data cleaned.
No caption found!
Reaction data saved to /mnt/c/Users/Shi Xuan/OneDrive - University of Toronto/Project_MERLIN/Project_Digital esyn corpus/TF_to_json/photocatalysis/json/anie201705333_page_0_table_1_response.json!
Token count saved to /mnt/c/Users/Shi Xuan/OneDrive - University of Toronto/Project_MERLIN/Project_Digital esyn corpus/TF_to_json/photocatalysis/json/anie201705333_page_0_table_1_