# Test photocatalysis reaction prompt

In [11]:
import os
import cv2
import numpy as np
import base64
import json
import glob
import requests
import math 

prompt_directory = "/mnt/c/Users/Shi Xuan/OneDrive - University of Toronto/Project_MERLIN/Project_Digital esyn corpus/manuscript preparation/prompts/"
image_directory = "/mnt/c/Users/Shi Xuan/OneDrive - University of Toronto/Project_MERLIN/Project_Digital esyn corpus/manuscript preparation/datasets/org_syn/relevant_images/"
json_directory = "/mnt/c/Users/Shi Xuan/OneDrive - University of Toronto/Project_MERLIN/Project_Digital esyn corpus/manuscript preparation/datasets/org_syn/jsons/"
cropped_image_directory = "/mnt/c/Users/Shi Xuan/OneDrive - University of Toronto/Project_MERLIN/Project_Digital esyn corpus/manuscript preparation/datasets/org_syn/cropped_images/"
get_data_prompt = "org_syn_get_data_prompt_v2"
update_dict_prompt = "org_syn_update_dict_prompt"
api_key = "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"


In [7]:
def encode_image(image_path): 
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

def reformat_json(input_file):
    """
    Clean and format JSON data by removing unwanted characters and ensuring proper JSON formatting
    """
    with open(input_file, 'r') as file:
        json_content = file.read()
        json_content = json_content.replace("\"```json\\n", '').replace('```"', '').strip()       
        json_content = json_content.replace('\\n', '').replace('\\"', '"')
        data = json.loads(json_content)
        formatted_json = json.dumps(data, indent=4)

    # Write the formatted JSON to the output file
    with open(input_file, 'w') as file:
        file.write(formatted_json)

def adaptive_get_data(api_key, prompt_directory, get_data_prompt, image_name, image_directory, json_directory):
    # Get all subfigures files 
    image_paths = glob.glob(os.path.join(image_directory, f"{image_name}_*.png"))
    if not image_paths:
        print(f"No subimages found for {image_name}")
        return

    base64_images = [encode_image(image_path) for image_path in image_paths]

    # Get user prompt file
    user_prompt_path = os.path.join(prompt_directory, f"{get_data_prompt}.txt")
    with open(user_prompt_path, "r") as file:
        user_message = file.read().strip()

    # Get image captions and response file paths
    image_caption_path = os.path.join(image_directory, f"{image_name}.txt")
    response_path = os.path.join(json_directory, f"{image_name}_response.json")
    token_path = os.path.join(f"{json_directory}/token_count/", f"{image_name}_tokencount.json") # token count is saved in a subfolder in json_directory

    # Create base message
    messages = [{
        "role": "user",
        "content": [{"type": "text", "text": user_message}]
    }]

    # Add each encoded image as a separate entry
    messages[0]["content"].extend({
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}
    } for base64_image in base64_images)
    
    # If the image caption file exists, append it to the messages content
    if os.path.exists(image_caption_path):
        with open(image_caption_path, "r") as file:
            image_caption = file.read().strip()
        messages[0]["content"].append({"type": "text","text": image_caption})
        print('Caption appended!')
    else: 
        print('No caption found!')

    # API request headers and payload
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    payload = {
        "model": "gpt-4o-2024-08-06",
        "messages": messages,
        "max_tokens": 4000
    }

    # Send API request
    try:
        response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
        response.raise_for_status()  # Raise error if the request failed
        reaction_data = response.json()['choices'][0]['message']['content']
        token_count = response.json()['usage']

        # Save responses
        with open(response_path, 'w') as json_file:
            json.dump(reaction_data, json_file)
        print(f"Reaction data saved to {response_path}!")

        with open(token_path, 'w') as token_file: 
            json.dump(token_count, token_file)
        print(f"Token count saved to {response_path}!")

        # Clean response: 
        try: 
            reformat_json(response_path)
            print(f"{response_path}: Reaction data cleaned.")

        except Exception as e: 
            print(f"{response_path}: Reaction data not cleaned. Error: {e}")
    
    except requests.exceptions.RequestException as e:
        print(f"Error during API request: {e}")

def update_dict(api_key, prompt_directory, update_dict_prompt, json_file, json_directory):
    
    # Get user prompt file
    user_prompt_path = os.path.join(prompt_directory, f"{update_dict_prompt}.txt")
    with open(user_prompt_path, "r") as file:
        user_message = file.read().strip()
    
    # Get Json file
    json_path = os.path.join(json_directory, f"{json_file}.json")
    with open(json_path, "r") as file2:
        json_dict = file2.read().strip()

    # Get response paths
    response_path = os.path.join(json_directory, f"{json_file}_updated.json")
    token_path = os.path.join(f"{json_directory}/token_count/", f"{json_file}_updated_tokencount.json")

    # Construct message
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": user_message
                },
                {
                    "type": "text",
                    "text": json_dict
                }              
            ]
        }
    ]

    # API request headers and payload
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    payload = {
        "model": "gpt-4o-2024-08-06",
        "messages": messages,
        "max_tokens": 4096
    }

    # Send API request
    try:
        response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
        response.raise_for_status()  # Raise error if the request failed
        reaction_data = response.json()['choices'][0]['message']['content']
        token_count = response.json()['usage']

        # Save response
        with open(response_path, 'w') as json_file:
            json.dump(reaction_data, json_file)

        print(f"Reaction data saved to {response_path}!")

        with open(token_path,'w') as token_file:
            json.dump(token_count, token_file)

        print(f"Token count saved to {token_path}!")
        
        # Clean response
        try:
            reformat_json(response_path)
            print(f"{response_path}: Reaction data cleaned.")
        except Exception as e:
            print(f"{response_path}: Reaction data not cleaned.Error: {e}")
    
    except requests.exceptions.RequestException as e:
        print(f"Error during API request: {e}")

In [19]:
image_name = "d4ra04674a_page_2_table_0"
#image_name = "d3gc02735j_page_2_table_0"
json_file = os.path.join(json_directory, f"{image_name}_response")

#adaptive_get_data(api_key, prompt_directory, get_data_prompt, image_name, cropped_image_directory, json_directory)
# #reformat_json(json_file)
update_dict(api_key, prompt_directory, update_dict_prompt, json_file, json_directory)


# # Batch 
# failed_images = []

# for file in os.listdir(image_directory):
#     if (file.endswith(".png")):
#         try: 
#             image_name = file.removesuffix('.png')
#             adaptive_get_data(api_key, prompt_directory, get_data_prompt, image_name, cropped_image_directory, json_directory)
#         except Exception as e:
#             print(f"Error processing image: {file}, Error: {e}")
#             failed_images.append(file)

# # for file in os.listdir(json_directory):
# #     if (file.endswith(".json")):
# #         try: 
# #             json_name = file.removesuffix('.json')
# #             update_dict(api_key, prompt_directory, update_dict_prompt, json_name, json_directory)
# #         except Exception as e:
# #             print(f"Error processing JSON: {file}, Error: {e}")
# #             failed_images.append(file)

# if failed_images:
#     print("Failed images:")
#     for failed in failed_images:
#         print(failed)

Reaction data saved to /mnt/c/Users/Shi Xuan/OneDrive - University of Toronto/Project_MERLIN/Project_Digital esyn corpus/manuscript preparation/datasets/org_syn/jsons/d4ra04674a_page_2_table_0_response_updated.json!
Token count saved to /mnt/c/Users/Shi Xuan/OneDrive - University of Toronto/Project_MERLIN/Project_Digital esyn corpus/manuscript preparation/datasets/org_syn/jsons/d4ra04674a_page_2_table_0_response_updated_tokencount.json!
/mnt/c/Users/Shi Xuan/OneDrive - University of Toronto/Project_MERLIN/Project_Digital esyn corpus/manuscript preparation/datasets/org_syn/jsons/d4ra04674a_page_2_table_0_response_updated.json: Reaction data cleaned.


In [None]:
for file in os.listdir(image_directory):
    image_name = file.removesuffix('.png')
    print(image_name)

In [6]:
def crop_image(image_name, image_directory, cropped_image_directory, min_segment_height=120): 
    """
    Adaptively crop a given figure into smaller subfigures before passing to VLM based on image length

    parameters: 
    image_name: base image name
    image_directory: root directory where original images are saved 
    cropped_image_directory: output directory to save cropped images 
    min_segment_height: minimum height of each segmented subfigure
    """
    def find_split_line(image, threshold, region_start, region_end, percentage_threshold, step_size):
        """
        Helper function to determine where to segment the figure
        """
        
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) # Convert the image to grayscale            
        _, thresh = cv2.threshold(gray, threshold, 255, cv2.THRESH_BINARY) # Identify white pixels
        white_pixel_count = np.count_nonzero(thresh == 255, axis=1) 

        # Find the last line with >= the specified percentage of white pixels in the specified region
        split_line = region_end
        while split_line > region_start:
            min_white_pixels = int(percentage_threshold * len(thresh[split_line]))

            if white_pixel_count[split_line] >= min_white_pixels:
                break
            split_line -= step_size

        return split_line if split_line > region_start else region_start

    def adaptive_split_lines(image, first_split_line, min_segment_height, threshold, percentage_threshold, step_size):
        """
        Helper function to identify all the split lines for an image
        """
        
        # Calculate the remaining height after the first split line
        first_region_end = int(3/8 *len(image))
        remaining_height = image.shape[0] - first_region_end
        num_segments = math.ceil(remaining_height / min_segment_height)
        segment_height = remaining_height // num_segments  # Determine the approximate height of each segment

        split_lines = [first_split_line]  # Start with the first fixed split line
        region_start_list = [first_region_end] 

        for i in range(1, num_segments):
            # Calculate dynamic region start and end for each segment
            region_start = region_start_list[-1]
            region_end = region_start + segment_height
            region_start_list.append(region_end)

            # Find the split line for the current region
            split_line = find_split_line(image, threshold, region_start, region_end, percentage_threshold, step_size)
            split_lines.append(split_line)

        return split_lines
    
    def segment_image(image, split_lines):
        """
        Helper function to crop image based on split lines
        """
        segments = []
        prev_line = 0

        for split_line in split_lines:
            segments.append(image[prev_line:split_line, :])
            prev_line = split_line

        segments.append(image[prev_line:, :]) # Add the final segment (from the last split line to the end of the image)

        return segments
    
    image_path = os.path.join(image_directory, f"{image_name}.png")
    image = cv2.imread(image_path)

    if image is None:
        print(f"Error: Image {image_name}.png not found.")
        return
    
    # Set parameters
    threshold = 254.8
    percentage_threshold = 0.995
    step_size = 10

    # Find the first split line within the first 1/4 of the image (usually the reaction diagram)
    region_start_1 = int(1/4 * len(image))
    region_end_1 = int(3/8 *len(image))
    first_split_line = find_split_line(image, threshold, region_start_1, region_end_1, percentage_threshold, step_size)

    try: 
        # Find adaptive split lines based on the remaining height after the first split line
        split_lines = adaptive_split_lines(image, first_split_line, min_segment_height, threshold, percentage_threshold, step_size)

        # Check if split lines are valid
        if len(split_lines) < 1:
            raise ValueError(f"Error: Unable to find valid split lines for {image_name}. Saving original image.")

        # Crop the image into segments
        segments = segment_image(image, split_lines)

        # Check if cropped segments have valid size
        valid_segments = 0
        for idx, segment in enumerate(segments): 
            if segment.size > 0:
                cv2.imwrite(os.path.join(cropped_image_directory, f"{image_name}_{idx+1}.png"), segment)
                valid_segments += 1
            else: 
                print(f"Warning: Segment {idx+1} of {image_name} has zero size. Skipping.")
        
        if valid_segments == 0:
            raise ValueError(f"Error: No valid segments for {image_name}. Saving original image.")

    except Exception as e: 
        print(str(e))
        cv2.imwrite(os.path.join(cropped_image_directory, f"{image_name}_original.png"), image)


In [28]:
#image_name = "1-s2.0-S2666554924000966-main_page_4_table_1"

for image in os.listdir(image_directory): 
    image_name = image.removesuffix(".png")
    crop_image(image_name, image_directory, cropped_image_directory, min_segment_height = 250)

In [34]:
def update_dict_with_smiles(image_name, json_directory): 
        """
        Combine reaction dictionary with reaction SMILES
        """
        # load optimization runs dictionary 
        dict_path = os.path.join(json_directory, f"{image_name}_response_updated.json")
        with open(dict_path, "r") as file:
            opt_dict = json.load(file)
        opt_data = opt_dict["Optimization Runs"]

        # load reaction smiles list
        smiles_path = os.path.join(json_directory, f"{image_name}_rxnsmiles.json")
        with open(smiles_path, "r") as file2: 
            smiles_list = json.load(file2)

        if not smiles_list or 'NR' in smiles_list[0].get('reactants', '') or 'NR' in smiles_list[0].get('products', ''):
            reactants = 'NR' if not smiles_list or 'NR' in smiles_list[0].get('reactants', '') else smiles_list[0]['reactants']
            products = 'NR' if not smiles_list or 'NR' in smiles_list[0].get('products', '') else smiles_list[0]['products']
        else:
            reactants = smiles_list[0].get('reactants', 'NR')
            products = smiles_list[0].get('products', 'NR')

        # Combine and save
        updated_dict = {
            "SMILES": {
                "reactants": reactants, 
                "products": products
            }, 
            "Optimization Runs": opt_data
        }

        output_path = os.path.join(json_directory, f"{image_name}_full_opt_dictionary.json")
        with open(output_path, 'w') as output_file: 
            json.dump(updated_dict, output_file, indent = 4)

        print (f"Reaction optimization dictionary updated with reaction smiles for {image_name}")


In [35]:
json_directory = "/mnt/c/Users/Shi Xuan/OneDrive - University of Toronto/Project_MERLIN/Project_Digital esyn corpus/manuscript preparation/datasets/electrosynthesis/json_responses_no SMILES/"
image_directory = "/mnt/c/Users/Shi Xuan/OneDrive - University of Toronto/Project_MERLIN/Project_Digital esyn corpus/manuscript preparation/datasets/electrosynthesis/"

for image in os.listdir(image_directory):
    if image.endswith(".png"): 
        image_name = image.removesuffix(".png")
        try: 
            update_dict_with_smiles(image_name, json_directory)
        except Exception as e:
            print(f"error {e} for {image_name}")



Reaction optimization dictionary updated with reaction smiles for RSC10_table_1
Reaction optimization dictionary updated with reaction smiles for RSC18_table_1
Reaction optimization dictionary updated with reaction smiles for RSC2_table_1
Reaction optimization dictionary updated with reaction smiles for wiley17_table_1
Reaction optimization dictionary updated with reaction smiles for wiley26_table_1
Reaction optimization dictionary updated with reaction smiles for wiley37_table_1
