In [265]:
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from together import Together
import json
import os
import re


In [272]:
# Load variables from a local .env file into the process environment so the
# API key does not have to be hardcoded in the notebook.
load_dotenv()

# Shared Together client used by the helper functions defined further down.
# The key is read from the SECRET_KEY environment variable (None when unset).
client = Together(api_key=os.environ.get("SECRET_KEY"))


In [240]:

def get_graph_metadata(graph, base_url="https://pmc.ncbi.nlm.nih.gov/articles/PMC11351064/#"):
    """Gather caption, heading and identifier metadata for one graphic element.

    A graphic nested inside a ``<figure>`` is treated as a figure; any other
    graphic is treated as a table image whose label and caption live in the
    nearest enclosing ``<section>``.

    Args:
        graph: Parsed HTML element of the image. It must support
            ``find_parent`` and ``get`` (e.g. a BeautifulSoup tag).
        base_url: Prefix to which the id of the enclosing article section is
            appended to build ``section_url``. Defaults to the article this
            notebook was written for.

    Returns:
        tuple: ``(attribution, caption, figure_flag, headings, image_url,
        label, name, referee_id, section_url)`` where

        * ``attribution`` is the figure credit wrapped in parentheses, or
          ``""`` for tables and for figures without a credit element;
        * ``caption`` is the caption text (``""`` when none is found);
        * ``figure_flag`` is ``True`` for figures and ``False`` for tables;
        * ``headings`` is the section's h2 and h3 text joined by ``" > "``
          (a missing heading is skipped);
        * ``image_url`` is the graphic's ``src`` attribute;
        * ``label`` is ``name + " " + caption``;
        * ``name`` is the text of the ``.obj_head`` element;
        * ``referee_id`` is ``"figure_<n>"`` or ``"table_<n>"``;
        * ``section_url`` is ``base_url`` followed by the section id.

    Raises:
        ValueError: If the graphic has no ancestor whose id looks like
            ``section<N>-<M>``, or no ``.obj_head`` label can be located.
    """
    section = graph.find_parent(id=re.compile(r'^section\d+-\d+$'))
    if section is None:
        raise ValueError("graphic has no ancestor with an id like 'section<N>-<M>'")
    section_url = base_url + section.get("id")

    # A section is not guaranteed to carry both heading levels; join only the
    # ones that exist instead of failing on a missing <h3>.
    heading_nodes = [section.find("h2"), section.find("h3")]
    headings = " > ".join(node.get_text() for node in heading_nodes if node is not None)

    image_url = graph.get("src")

    figure = graph.find_parent("figure")
    figure_flag = figure is not None

    if figure_flag:
        container, kind = figure, "figure"
        # The caption is the first <p> that carries no attributes at all.
        plain_paragraphs = [p.get_text() for p in figure.find_all("p") if not p.attrs]
        caption = plain_paragraphs[0] if plain_paragraphs else ""
        credit = figure.select_one('[aria-label="Attribution"]')
        attribution = "(" + credit.get_text() + ")" if credit is not None else ""
    else:
        container, kind = graph.find_parent("section"), "table"
        caption_node = container.select_one(".caption p") if container is not None else None
        caption = caption_node.get_text() if caption_node is not None else ""
        attribution = ""

    name_node = container.select_one(".obj_head") if container is not None else None
    if name_node is None:
        raise ValueError(f"{kind} graphic has no '.obj_head' label element")
    name = name_node.get_text()
    label = name + " " + caption

    # Capture the whole token in front of each period so that multi-digit
    # numbers ("Figure 12.") keep every digit instead of only the last one.
    number = "_".join(re.findall(r"(\w+)\.", name)).lower()
    referee_id = f"{kind}_{number}"

    return attribution, caption, figure_flag, headings, image_url, label, name, referee_id, section_url
        

In [257]:
def get_response_text(url, figure_flag):
    """Ask the vision model for a textual description of the image at ``url``.

    Args:
        url: Address of the image, forwarded to the model as an ``image_url``
            content part.
        figure_flag: Truthy to use the flowchart prompt, falsy to use the
            table prompt.

    Returns:
        The message content of the first choice in the completion returned by
        the module-level ``client``.
    """

    table_prompt = """
    You are a vision-language model. Your task is to extract the data from the image of a table and convert it into 
    structured natural language. Describe the content of the table in full sentences, preserving the structure and
    relationships.

    Guidelines:
    - A row should form a sentence.
    - For each cell, put column header and data point in pairs, and seperate the column header and data point with colons.
    - For each row, arrange the cells in a linear order, and seperate each cell with commas to form a sentence.
    - Ignore any symbols in the tables.

    The format should be as follow:
    
    Row 0 - column header1: data point[0][0],  column header2: data point[0][1]. 
    Row 1 - column header1: data point[1][0],  column header2: data point[1][1]. 
    
    Ensure the output is suitable for downstream language models to process as if it were read by a human. 
    Be precise and complete. 
    Do not add any additional characters beside the description.
    """

    flowchart_prompt = """
    You are a vision-language model. Your task is to extract and understand the flow of a process from a flowchart image. 
    Represent this process in a clear, linear text format using -> to denote transitions between steps.

    Guidelines:

    - Each step should be written in text, exactly as it appears in the flowchart.
    - Use -> to indicate directional flow from one step to the next.
    - Preserve loops or retries by showing steps that return to earlier points.
    - Ensure the structure is readable and maintains the original logic of the flowchart.

    Ensure the output is suitable for downstream language models to process as if it were read by a human. 
    Be precise and complete.
    Do not add any additional characters beside the description.
    """

    # Pick the instruction text that matches the kind of image being described.
    if figure_flag:
        prompt = flowchart_prompt
    else:
        prompt = table_prompt

    # One user turn carrying two content parts: the instructions and the image.
    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": prompt},
            {"type": "image_url", "image_url": {"url": url}},
        ],
    }

    # stream=False: the whole completion is returned as a single response object.
    response = client.chat.completions.create(
        model="meta-llama/Llama-Vision-Free",
        messages=[user_message],
        stream=False,
    )
    return response.choices[0].message.content


In [258]:
def to_chunk(text_block, section_url, referee_id, headings, chunk_type=None):
    """Wrap a description and its metadata into a chunk dictionary.

    Args:
        text_block: Text stored under the ``"text"`` key.
        section_url: Stored under ``metadata["section"]``.
        referee_id: Stored under ``metadata["referee_id"]``.
        headings: Stored under ``metadata["headings"]``.
        chunk_type: Value for ``metadata["type"]``. When omitted it is derived
            from ``referee_id``: ids starting with ``"figure_"`` yield
            ``"figure image"``; everything else yields ``"table image"``.
            Previously every chunk, figures included, was tagged
            ``"table image"``.

    Returns:
        dict: ``{"text": ..., "metadata": {"section", "type", "referee_id",
        "headings"}}``.
    """
    if chunk_type is None:
        # Use the id prefix so figure chunks are no longer mislabelled as tables.
        is_figure = str(referee_id).startswith("figure_")
        chunk_type = "figure image" if is_figure else "table image"

    return {
        "text": text_block,
        "metadata": {
            "section": section_url,
            "type": chunk_type,
            "referee_id": referee_id,
            "headings": headings,
        },
    }

In [236]:
# Read the saved article HTML; the file handle is only needed for the read.
with open('../data/raw/source.html', encoding="utf-8") as f:
    html = f.read()

# Name the parser explicitly. Without it BeautifulSoup picks whichever parser
# happens to be installed (and warns about it), so the resulting tree can
# differ between environments. "html.parser" ships with Python.
# NOTE(review): if earlier runs relied on lxml being installed, confirm that
# the selectors used below still match with html.parser.
soup = BeautifulSoup(html, "html.parser")

In [259]:
# Build one chunk per element matching the ".graphic" selector.
doc = []
for graph in soup.select(".graphic"):
    (
        attribution,
        caption,
        figure_flag,
        headings,
        image_url,
        label,
        name,
        referee_id,
        section_url,
    ) = get_graph_metadata(graph)

    # The model-generated description goes between the label and the attribution.
    description = get_response_text(image_url, figure_flag)
    text_block = " ".join([label, description, attribution])

    chunk = to_chunk(text_block, section_url, referee_id, headings)
    doc.append(chunk)


In [264]:
# Serialise the collected chunks first, then write them out in one go.
graphs_json = json.dumps(doc, indent=4)
with open("../data/processed/graphs.json", "w", encoding="utf-8") as out_file:
    out_file.write(graphs_json)