In [3]:
import re

# Abbreviation -> full term. The keys are the exact (case-sensitive) strings
# looked up by append_definition().
abbr_map = {
    "ACT":   "Acceptance and commitment therapy",
    "ADHD":  "Attention-deficit hyperactivity disorder",
    "AI":    "Artificial intelligence",
    "BA":    "Behavioural activation",
    "CAM":   "Complementary and alternative medicine",
    "CANMAT":"Canadian Network for Mood and Anxiety Treatments",
    "CBASP": "Cognitive behavioural analysis system of psychotherapy",
    "CBT":   "Cognitive-behavioural therapy",
    "CPD":   "Continuing professional development",
    "CYP":   "Cytochrome P450",
    "DBS":   "Deep brain stimulation",
    "DHI":   "Digital health intervention",
    "DLPFC": "Dorsolateral prefrontal cortex",
    "DSM-5-TR": "Diagnostic and Statistical Manual, 5th edition, Text Revision",
    "DSM-IV-TR":"Diagnostic and Statistical Manual, 4th edition, Text Revision",
    "DTD":   "Difficult-to-treat depression",
    "ECG":   "Electrocardiography",
    "ECT":   "Electroconvulsive therapy",
    "EEG":   "Electroencephalography",
    "GRADE": "Grading of Recommendations Assessment, Development, and Evaluation",
    "ICD":   "International Classification of Diseases",
    "IPT":   "Interpersonal therapy",
    "MAOI":  "Monoamine oxidase inhibitor",
    "MBC":   "Measurement-based care",
    "MBCT":  "Mindfulness-based cognitive therapy",
    "MCT":   "Metacognitive therapy",
    "MDD":   "Major depressive disorder",
    "MDE":   "Major depressive episode",
    "MI":    "Motivational interviewing",
    "MST":   "Magnetic seizure therapy",
    "NbN":   "Neuroscience-based nomenclature",
    "NDRI":  "Norepinephrine-dopamine reuptake inhibitor",
    "NMDA":  "N-methyl-D-aspartate",
    "NSAID": "Nonsteroidal anti-inflammatory drug",
    "PDD":   "Persistent depressive disorder",
    "PDT":   "Psychodynamic psychotherapy",
    "PHQ":   "Patient health questionnaire",
    "PST":   "Problem-solving therapy",
    "RCT":   "Randomized controlled trial",
    "rTMS":  "Repetitive transcranial magnetic stimulation",
    "SDM":   "Shared decision-making",
    "SNRI":  "Serotonin-norepinephrine reuptake inhibitor",
    "SSRI":  "Selective serotonin reuptake inhibitor",
    "STPP":  "Short-term psychodynamic psychotherapy",
    "TBS":   "Theta burst stimulation",
    "TCA":   "Tricyclic antidepressants",
    "tDCS":  "Transcranial direct current stimulation",
    "TMS":   "Transcranial magnetic stimulation",
    "TRD":   "Treatment-resistant depression",
    "USA":   "United States of America",
    "VNS":   "Vagus nerve stimulation",
    "WHO":   "World Health Organization",
}


def append_definition(match: re.Match) -> str:
    """Expand a matched abbreviation to ``"ABBR (definition)"``.

    Intended as the replacement callback for ``re.sub``.

    Args:
        match: A regex match whose group 1 is the abbreviation text.

    Returns:
        The abbreviation followed by its ``abbr_map`` definition in
        parentheses, or the abbreviation unchanged when ``abbr_map`` has no
        (non-empty) entry for it.
    """
    abbr = match.group(1)
    definition = abbr_map.get(abbr)
    if not definition:
        # Unknown abbreviation: leave the text as-is rather than emit "ABBR ()".
        return abbr
    return f"{abbr} ({definition})"







In [None]:
from together import Together
import os

# SECURITY: never hardcode the API key (or leave it in a comment) in the
# notebook. Export TOGETHER_API_KEY in the environment before starting the
# kernel; any key that has ever been committed should be rotated.
llm_client = Together(api_key=os.environ.get("TOGETHER_API_KEY"))

def img_to_text(image_path: str) -> str:
    """Ask the vision model to transcribe the text shown in an image.

    Uses the module-level ``llm_client``.

    Args:
        image_path: URL of the image; it is sent to the model as the
            ``image_url`` part of the user message.

    Returns:
        The message content of the first choice in the model's response.
    """
    response = llm_client.chat.completions.create(
        model="meta-llama/Llama-Vision-Free", #don't change the model!
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text",
                     "text": "please extract the text in this img. If there is a table, extract it to csv format."},
                    # Use the caller's image instead of a fixed URL.
                    {"type": "image_url",
                     "image_url": {"url": image_path}}
                ]
            }
        ],
        max_tokens=500
    )
    return response.choices[0].message.content


print(img_to_text("https://cdn.ncbi.nlm.nih.gov/pmc/blobs/843a/11351064/e8dc176dd369/10.1177_07067437241245384-table1.jpg"))

In [None]:
from bs4 import BeautifulSoup, Tag, NavigableString

import sys
import re
import os
import json

# Index assigned to the next chunk; the title chunk below takes index 1.
chunk_id = 1

# Input HTML file. The path is fixed here; reading it from the command line
# (sys.argv[1]) is currently disabled.
filename = "../data/raw/source.html"
with open(filename, "r", encoding="utf-8") as f:
    html = f.read()

# URLs of the four "Level" images. These are the links that get substituted
# with the text (Level 1), (Level 2), (Level 3), (Level 4).
level1 = 'https://cdn.ncbi.nlm.nih.gov/pmc/blobs/843a/11351064/62befe587468/10.1177_07067437241245384-img1.jpg'
level2 = 'https://cdn.ncbi.nlm.nih.gov/pmc/blobs/843a/11351064/b9ea5ad77490/10.1177_07067437241245384-img2.jpg'
level3 = 'https://cdn.ncbi.nlm.nih.gov/pmc/blobs/843a/11351064/5be38aafe33f/10.1177_07067437241245384-img3.jpg'
level4 = 'https://cdn.ncbi.nlm.nih.gov/pmc/blobs/843a/11351064/68e56cd87632/10.1177_07067437241245384-img4.jpg'

soup = BeautifulSoup(html, "html.parser")

# The first <h1> (if present) supplies the title text: its inner HTML with
# newlines removed. Without an <h1>, title stays None.
title = soup.find("h1")
title = title.decode_contents().replace('\n', '') if title else title

# The chunk list starts with the title chunk.
output = [
    {
        "text": title,
        "metadata": {
            "section": "title",
            "type": "title",
            "chunk_index": chunk_id,
            "headings": "Title",
            "referenced_tables": [],
        },
    }
]
chunk_id += 1

# Turn every <p> of the parsed document into one chunk appended to `output`.
# Relies on names defined by earlier cells: soup, output, chunk_id,
# level1..level4, abbr_map, append_definition and llm_client.

# How many images of each evidence level were replaced by text.
countlevel1 = 0
countlevel2 = 0
countlevel3 = 0
countlevel4 = 0

# The abbreviation pattern depends only on abbr_map, so build it once rather
# than once per paragraph: \b(ACT|ADHD|AI|…)\b
pattern = re.compile(
    r'\b(' + '|'.join(re.escape(k) for k in abbr_map.keys()) + r')\b'
)


def _is_heading(tag):
    """Return True for <h2>..<h6> tags."""
    return bool(re.match(r'^h[2-6]$', tag.name))


# parse the main body
for p in soup.find_all("p"):
    referenced_tables = set()

    #-----------------------replace the <img> tags---------------------
    # we also manually delete the duplication in first occurrence mentioning Levels
    for img in p.find_all('img'):
        src = img.get('src')
        if src == level1:
            replacement_text = "(Level 1)"
            countlevel1 += 1
        elif src == level2:
            replacement_text = "(Level 2)"
            countlevel2 += 1
        elif src == level3:
            replacement_text = "(Level 3)"
            countlevel3 += 1
        elif src == level4:
            replacement_text = "(Level 4)"
            countlevel4 += 1
        else:
            continue

        img.replace_with(NavigableString(replacement_text))
        referenced_tables.add('Table A')

    # ----------------------get section id----------------------------------
    parent_sec = p.find_parent(["section", 'figure'], id=True)
    # A paragraph outside any id'd <section>/<figure> gets an empty id, so the
    # substring checks and the URL concatenation below never see None.
    sec_id = parent_sec.get("id") if parent_sec else ""

    #TODO: I'm currently ignoring the HTML tables, because AIKA's processing them.

    #-----------------------get the headings---------------------
    # special case: manually finding the "No heading" in the html file to fix the No heading issue
    # delete the <div></div> outside this to get the correct heading: <p>Protracted Discontinuation Symptoms and Hyperbolic Tapering Schedules.</p>

    # get the closest heading
    heading = p.find_previous_sibling(_is_heading)
    headings = heading.get_text(strip=True) if heading else 'No heading'
    if 'fig' in sec_id:
        referenced_tables.add(headings)
    # Walk up through the enclosing sections, prepending each one's heading.
    while parent_sec:
        heading = parent_sec.find_previous_sibling(_is_heading)
        if heading:
            headings = heading.get_text(strip=True) + ' > ' + headings
        parent_sec = parent_sec.find_parent("section", id=True)

    headings = headings.strip().replace('\n', ' ')

    #-----------------------get the text---------------------
    text = p.get_text(separator=' ', strip=True) # get only text

    #----------------------- mark Type of the text ------------------------
    # Named chunk_type (not `type`) so the builtin is not shadowed for the
    # rest of the notebook.
    chunk_type = 'paragraph'
    is_img_box = 'img-box' in (p.get('class') or [])

    if 'table' in sec_id:  # either table image or table in HTML format
        if not is_img_box:
            continue
        img_tag = p.find('img')
        if img_tag is None:
            # img-box without an <img>: nothing to send to the vision model.
            continue
        # NOTE(review): the trailing space in 'table image ' is kept as-is;
        # confirm nothing downstream matches on it before normalizing.
        chunk_type = 'table image '
        img_link = img_tag.get('src')

        # --------------------- use vision model to get the text from the image ------------------------
        response = llm_client.chat.completions.create(
            model="meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo",
            messages=[
                {
                    "role": "user",
                    "content": [
                        # Change prompt here?
                        {"type": "text", "text": "Extract the text in this img. If there is a table, extract it to csv format. Only return the text in csv format, no other explanation."},
                        # Send this paragraph's image (img_link), not an unrelated `link`.
                        {"type": "image_url", "image_url": {"url": img_link}}
                    ]
                }
            ],
            max_tokens=3000,
            temperature=0.05
        )
        parsed_results = response.choices[0].message.content
        print("parsed_results: ", parsed_results)

        # put the response directly to json
        text = str(parsed_results)

    # I only have 2 here with parent = <figure>, most of img's parent are <section id = 'table...'>
    elif is_img_box:
        chunk_type = 'figure image'
        img_tag = p.find('img')
        if img_tag is not None:
            img_link = img_tag.get('src')
            text = str(img_link)
    elif 'box' in sec_id:
        chunk_type = 'box'

    #----------------------- get referenced tables ------------------------
    for anchor in p.find_all('a'):
        href = anchor.get('href')
        # <a> without an href has href None; only in-page links ("#...") count.
        if href and href.startswith('#'):
            referenced_tables.add(anchor.get_text(strip=True))

    # ----------------------- replace abbreviation with definition ------------------------
    text = pattern.sub(append_definition, text)

    #----------------------- format the chunks ------------------------
    chunk = {
        "text": "From section: "+ headings + " > paragraph id: " + str(chunk_id) + "\n"+ text,
        "metadata": {
        "section": "https://pmc.ncbi.nlm.nih.gov/articles/PMC11351064/#" + sec_id,
        "type": chunk_type,
        "chunk_index": chunk_id,
        "headings": headings,
        # Sorted so the JSON is identical from run to run (set order is not).
        "referenced_tables": sorted(referenced_tables),
        }
    }

    output.append(chunk)

    chunk_id += 1


print("count of levels: ", countlevel1, countlevel2, countlevel3, countlevel4)
    
# ----------------------- write to json ------------------------
# Dump all chunks as pretty-printed UTF-8 JSON (non-ASCII characters kept as-is).
output_path = "../data/guideline_db.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(output, f, ensure_ascii=False, indent=4)
# Report the path that was actually written.
print(f"{output_path} created with {len(output)} chunks.")

In [4]:
from together import Together
import os

# SECURITY: never hardcode the API key (or leave it in a comment) in the
# notebook. Export TOGETHER_API_KEY in the environment before starting the
# kernel; any key that has ever been committed should be rotated.
llm_client = Together(api_key=os.environ.get("TOGETHER_API_KEY"))

# Vision model and the image to transcribe.
model = "meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo"
link = "https://cdn.ncbi.nlm.nih.gov/pmc/blobs/843a/11351064/eb6b31476a11/10.1177_07067437241245384-table12.jpg"

# One user message made of a text instruction plus the image URL.
# The temperature of the model is set to 0.05.
response = llm_client.chat.completions.create(
    model=model,
    temperature=0.05,
    max_tokens=3000,
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Extract the text in this img. If there is a table, extract it to csv format. Only return the text in csv format, no other explanation.",
            },
            {
                "type": "image_url",
                "image_url": {"url": link},
            },
        ],
    }],
)

# Show the content of the first choice returned by the model.
print(response.choices[0].message.content)

**Text Extraction:**

The image contains a table with various medications and their corresponding information. The text in the image is as follows:

*   Line of treatment
*   Antidepressant
*   Daily dose
*   Mechanism
*   Level of evidence

**Table Extraction in CSV Format:**

Here is the table extracted in CSV format:

"Line of treatment","Antidepressant","Daily dose","Mechanism","Level of evidence"
"First line","Citalopram","20–40 mg","SSRI",""
"First line","Escitalopram","10–20 mg","SSRI",""
"First line","Fluoxetine","20–60 mg","SSRI",""
"First line","Fluvoxamine","100–300 mg","SSRI",""
"First line","Paroxetine","20–50 mg","SSRI",""
"First line","Sertraline","50–200 mg","SSRI",""
"First line","Desvenlafaxine","50–100 mg","SNRI",""
"First line","Duloxetine","60–120 mg","SNRI",""
"First line","Levomilnacipran*","40–120 mg","SNRI",""
"First line","Venlafaxine-XR","75–225 mg","SNRI",""
"First line","Bupropion","150–450 mg","NDRI",""
"First line","Mirtazapine","30–60 mg","α2 antagonist;

In [None]:
import pandas as pd

# Load the JSON export into a DataFrame ...
df = pd.read_json(path_or_buf='../data/to_excel.json')

# ... and write it to an Excel workbook without the index column.
df.to_excel(excel_writer='output.xlsx', index=False)

In [None]:
import pdfplumber

# Tables found in the PDF, in page order (left column before right column).
all_tables = []

with pdfplumber.open("../guideline.pdf") as pdf:
    for page in pdf.pages:
        width = page.width
        mid = width / 2  # assumes two equal-width columns

        # Crop the left and right columns into two independent regions
        left = page.within_bbox((0, 0, mid, page.height))
        right = page.within_bbox((mid, 0, width, page.height))

        for region in [left, right]:
            # Extract the tables of this column and keep them; without this
            # the result would be discarded and all_tables would stay empty.
            tables = region.extract_tables()
            all_tables.extend(tables)

IndexError: list index out of range

In [None]:
import os
from PyPDF2 import PdfReader, PdfWriter
import camelot

def split_and_parse(pdf_path):
    """Split every page into left/right halves and run Camelot on each half.

    Each half is written as a single-page PDF whose media box covers that
    half of the original page, then parsed with Camelot's "lattice" flavor.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        A list with the tables Camelot returned, in page order, with the
        left half of a page before its right half.
    """
    import tempfile  # local import: only this function needs it

    tables = []
    reader = PdfReader(pdf_path)

    # The half-page PDFs live in a private temporary directory, so they cannot
    # overwrite files in the working directory and are removed even when
    # writing or parsing raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        for i, page in enumerate(reader.pages):
            box = page.mediabox
            # Use the real box corners; the origin is not assumed to be (0, 0).
            x0, y0 = float(box.left), float(box.bottom)
            x1, y1 = float(box.right), float(box.top)
            mid = (x0 + x1) / 2

            halves = (
                ("left", (x0, y0), (mid, y1)),
                ("right", (mid, y0), (x1, y1)),
            )
            for side, lower_left, upper_right in halves:
                writer = PdfWriter()
                # add_page() returns the writer's own copy of the page
                # (PyPDF2 3.x), so shrinking its media box leaves the
                # reader's page untouched for the other half.
                half_box = writer.add_page(page).mediabox
                half_box.lower_left = lower_left
                half_box.upper_right = upper_right

                half_path = os.path.join(tmp_dir, f"{side}_{i}.pdf")
                with open(half_path, "wb") as f:
                    writer.write(f)

                # Parse the half page with Camelot
                parsed = camelot.read_pdf(half_path, flavor="lattice", pages="1")
                tables.extend(parsed)

    return tables

# Usage: parse a PDF, then print each table's index followed by its DataFrame.
pdf_file = "your_file.pdf"
tables = split_and_parse(pdf_file)
for idx, table in enumerate(tables):
    print(f"Table {idx}", table.df, sep="\n")
