In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import sys
import os

# Replace 'Your_Folder_Name' with the actual name of the folder containing your .py files
folder_path = '/content/drive/MyDrive/ColabNotebooks/pdfPipeline'


# Add the folder to sys.path
if folder_path not in sys.path:
    sys.path.append(folder_path)

print(f"'{folder_path}' added to sys.path: {folder_path in sys.path}")

# You can optionally list the files in the directory to confirm they are there
print(f"Files in {folder_path}: {os.listdir(folder_path)}")

'/content/drive/MyDrive/ColabNotebooks/pdfPipeline' added to sys.path: True
Files in /content/drive/MyDrive/ColabNotebooks/pdfPipeline: ['README.md', 'openAlexSort.py', 'getPDF.ipynb', 'getPDFidsWithBioentrez.py', '__pycache__']


In [3]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/3.3 MB[0m [31m7.6 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━[0m [32m2.3/3.3 MB[0m [31m33.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [4]:
from getPDFidsWithBioentrez import get_articles
from openAlexSort import fetch_openalex_metadata
import pandas as pd

In [5]:
# getPDF.py

# 1. Kullanıcıdan parametre alma
topic = "NAFLD"
article_count = 25  # BioEntrez ile çekilecek makale sayısı
top_n = 10          # En çok atıf alan kaç makale seçilsin

# 2. BioEntrez ile DOI'leri al
df_pubmed, doi_list = get_articles(topic, article_count)
filtered_doi_list = [doi for doi in df_pubmed['DOI'] if pd.notna(doi) and doi != "NA"]
print(filtered_doi_list)


# İlk kontrol çıktısı
print(f"***Toplam BioEntrez ile {len(df_pubmed)} makale bulundu.")
print("***filtered_doi_list length after filtering by valid doi link: " + str(len(filtered_doi_list)))

# 3. OpenAlex üzerinden metadata çek
df_openalex = fetch_openalex_metadata(filtered_doi_list)
print("\n\n***df_openalex length after filtering by open access: " + str(len(df_openalex)))

# 4. Cited_By_Count değerine göre sırala ve ilk N tanesini al
df_top = df_openalex.sort_values(by="Cited_By_Count", ascending=False).head(top_n)

# 5. Okunabilir bağlantılar (Read_Link) listesi oluştur
readable_links = df_top["Read_Link"].tolist()

# 6. Çıktılar
print("\nEn çok atıf alan makaleler:")
print(df_top[["DOI", "Title", "Cited_By_Count", "Read_Link"]])

print(f"\nOkunabilir linkler ({len(readable_links)} adet):")
for i, link in enumerate(readable_links, 1):
    print(f"{i}. {link}")




Bioentrez df: 
       PMID       PMCID                           DOI  \
0  25920090          NA    10.1016/j.jhep.2014.12.012   
1  29967350  PMC6553468     10.1038/s41591-018-0104-9   
2  35287643  PMC8919523    10.1186/s12902-022-00980-1   
3  28930295          NA     10.1038/nrgastro.2017.109   
4  38301798          NA  10.1016/j.drudis.2024.103910   

                                               Title  \
0                      NAFLD: a multisystem disease.   
1  Mechanisms of NAFLD development and therapeuti...   
2  Non-alcoholic fatty liver disease (NAFLD): a r...   
3  Global burden of NAFLD and NASH: trends, predi...   
4  NAFLD and NASH: etiology, targets and emerging...   

                                            Abstract  \
0  Non-alcoholic fatty liver disease (NAFLD) is t...   
1  There has been a rise in the prevalence of non...   
2  Given the increasing prevalence of diabetes an...   
3  NAFLD is one of the most important causes of l...   
4  Non-alcoholic fatty li

OpenAlex'ten veri çekiliyor: 100%|██████████| 25/25 [00:18<00:00,  1.37it/s]



***df_openalex length after filtering by open access: 19

En çok atıf alan makaleler:
                              DOI  \
1       10.1038/s41591-018-0104-9   
0      10.1016/j.jhep.2014.12.012   
6      10.1136/gutjnl-2020-320622   
2      10.1186/s12902-022-00980-1   
5   10.1080/03602532.2017.1293683   
15          10.1210/er.2019-00034   
17  10.1080/17461391.2019.1571114   
7                    10.2741/4892   
16          10.3390/ijms140611963   
18     10.1136/gutjnl-2018-318146   

                                                Title  Cited_By_Count  \
1   Mechanisms of NAFLD development and therapeuti...            3453   
0                        NAFLD: A multisystem disease            2671   
6   NAFLD and increased risk of cardiovascular dis...             626   
2   Non-alcoholic fatty liver disease (NAFLD): a r...             600   
5   Non-alcoholic fatty liver disease (NAFLD) – pa...             564   
15  The Liver as an Endocrine Organ—Linking NAFLD ...             




In [6]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def extract_pdf_link(article_url):
    try:
        response = requests.get(article_url, timeout=10)
        soup = BeautifulSoup(response.content, 'html.parser')

        for link in soup.find_all('a', href=True):
            href = link['href']
            if '.pdf' in href.lower():
                return urljoin(article_url, href)  # Göreli linkse tamamla
        return None
    except Exception as e:
        print(f"Hata oluştu: {e}")
        return None


In [7]:
import requests, re, time
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor, as_completed

# DOI -> PDF link (bulursa string, yoksa None)
def scihub_pdf_link_from_doi(doi: str, timeout=20, threads=4) -> str | None:
    mirrors = [
        "https://sci-hub.se",
        "https://sci-hub.st",
        "https://sci-hub.ru",
        "https://sci-hub.et-fine.com",
    ]
    # aynı DOI için birden fazla mirror’ı paralel dene
    with ThreadPoolExecutor(max_workers=min(threads, len(mirrors))) as ex:
        futs = [ex.submit(_try_mirror_get_pdf, m, doi, timeout) for m in mirrors]
        for fut in as_completed(futs):
            url = fut.result()
            if url:
                return url
    return None

def _try_mirror_get_pdf(mirror: str, doi: str, timeout: int) -> str | None:
    url = f"{mirror}/https://doi.org/{doi}"
    s = requests.Session()
    s.headers.update({
        "User-Agent": "Mozilla/5.0",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://doi.org/",
    })
    try:
        r = s.get(url, timeout=timeout, allow_redirects=True)
        # bazı mirror’lar doğrudan PDF’e yönlendirebilir
        if _head_is_pdf(r.url, session=s):
            return r.url
        # HTML içinden PDF’i ayıkla
        pdf = _extract_pdf_from_html(r.text, r.url, session=s, timeout=timeout)
        return pdf
    except requests.RequestException:
        return None

def _head_is_pdf(url: str, session: requests.Session | None = None, timeout=12) -> bool:
    try:
        sess = session or requests
        h = sess.head(url, allow_redirects=True, timeout=timeout)
        return "pdf" in (h.headers.get("Content-Type","").lower())
    except Exception:
        return False

def _extract_pdf_from_html(html: str, base_url: str, session: requests.Session, timeout: int) -> str | None:
    soup = BeautifulSoup(html, "html.parser")

    # 1) <iframe src=...>
    tag = soup.find("iframe", src=True)
    if tag:
        cand = urljoin(base_url, tag["src"])
        if _head_is_pdf(cand, session=session, timeout=timeout): return cand

    # 2) <embed src=...>
    tag = soup.find("embed", src=True)
    if tag:
        cand = urljoin(base_url, tag["src"])
        if _head_is_pdf(cand, session=session, timeout=timeout): return cand

    # 3) <object data=...>
    tag = soup.find("object", data=True)
    if tag:
        cand = urljoin(base_url, tag["data"])
        if _head_is_pdf(cand, session=session, timeout=timeout): return cand

    # 4) meta refresh (bazı mirror’lar bunu kullanır)
    meta = soup.find("meta", attrs={"http-equiv": re.compile("^refresh$", re.I)})
    if meta and meta.get("content"):
        m = re.search(r'url=([^;]+)', meta["content"], flags=re.I)
        if m:
            cand = urljoin(base_url, m.group(1).strip())
            if _head_is_pdf(cand, session=session, timeout=timeout): return cand

    # 5) link tahminleri
    for a in soup.find_all("a", href=True):
        href = a["href"]
        text = (a.get_text(strip=True) or "").lower()
        if (".pdf" in href.lower()) or ("pdf" in text):
            cand = urljoin(base_url, href)
            if _head_is_pdf(cand, session=session, timeout=timeout): return cand

    return None


In [8]:
from urllib.parse import urlparse

pdf_links = []   # Direkt PDF URL’leri
miss_dois = []   # Link bulunamayan DOI'ler

for i, link in enumerate(readable_links, 1):
    # 1) Sayfadan PDF bul
    pdf_url = extract_pdf_link(link)

    # 2) Eğer sayfada yoksa Sci-Hub dene
    if not pdf_url:
        doi = None
        try:
            u = urlparse(link)
            if 'doi.org' in u.netloc.lower():
                doi = u.path.lstrip('/')
        except:
            pass

        if doi:
            pdf_url = scihub_pdf_link_from_doi(doi, timeout=20, threads=4)
            if not pdf_url:
                miss_dois.append(doi)  # Daha sonra SciDownl ile indirilecek

    # 3) PDF link bulunduysa ekle
    if pdf_url:
        pdf_links.append(pdf_url)
        print(f"{i}. PDF bulundu (link): {pdf_url}")
    else:
        print(f"{i}. PDF link bulunamadı: {link}")

print("\nÖZET")
print("Direkt link bulunanlar:", len(pdf_links))
print("SciDownl’a kalacak DOI sayısı:", len(miss_dois))


1. PDF bulundu (link): https://2024.sci-hub.se/6995/2320f964eca07b2110b5e1575572b9d9/10.1038@s41591-018-0104-9.pdf#navpanes=0&view=FitH
2. PDF bulundu (link): https://moscow.sci-hub.se/3881/ed58e2382f406605593ba5608b051c28/10.1016@j.jhep.2014.12.012.pdf#navpanes=0&view=FitH
3. PDF bulundu (link): https://2024.sci-hub.se/8111/f916ad9136c99b1f2f33e2fbc3138a85/10.1136@gutjnl-2020-320622.pdf#navpanes=0&view=FitH
4. PDF bulundu (link): https://bmcendocrdisord.biomedcentral.com/counter/pdf/10.1186/s12902-022-00980-1.pdf
5. PDF bulundu (link): https://2024.sci-hub.se/6277/4ccca2ac67f37396d709cc6ef597fa2e/10.1080@03602532.2017.1293683.pdf#navpanes=0&view=FitH
6. PDF bulundu (link): https://2024.sci-hub.se/7590/5331502c1bc485c7c0054720c2a4fc39/10.1210@er.2019-00034.pdf#navpanes=0&view=FitH
7. PDF bulundu (link): https://2024.sci-hub.se/7332/4a45ca3764a581657522ab7c8600cffb/10.1080@17461391.2019.1571114.pdf#navpanes=0&view=FitH
8. PDF link bulunamadı: https://doi.org/10.2741/4892
9. PDF bulundu 

In [9]:
# === PDF Linklerinden Doğrudan İndirme Bloğu ===
import os, re, time, random, shutil
import requests
from urllib.parse import urlparse, unquote
from concurrent.futures import ThreadPoolExecutor, as_completed

# Geçici indirme klasörü
OUT_DIR = "/content/pdfs"
os.makedirs(OUT_DIR, exist_ok=True)

# Nihai hedef klasör
FINAL_DIR = "downloaded_pdfs"
os.makedirs(FINAL_DIR, exist_ok=True)

def _slug_filename(url: str, idx: int) -> str:
    """URL'den geçici dosya adı türetir"""
    path = urlparse(url).path
    name = os.path.basename(path) or f"file_{idx}.pdf"
    name = unquote(name)
    if not name.lower().endswith(".pdf"):
        name += ".pdf"
    name = re.sub(r"[^A-Za-z0-9._-]+", "_", name).strip("_")
    return name

def download_pdf(url: str, idx: int, timeout=2) -> str | None:
    try:
        r = requests.get(url, stream=True, timeout=timeout, headers={"User-Agent": "Mozilla/5.0"})
        r.raise_for_status()
        if "pdf" not in r.headers.get("Content-Type", "").lower():
            return None
        out_path = os.path.join(OUT_DIR, _slug_filename(url, idx))
        with open(out_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
        if os.path.getsize(out_path) < 1024:  # 1KB altı bozuksa sil
            os.remove(out_path)
            return None
        return out_path
    except Exception:
        return None

# 1) Paralel indirme
results = {}
with ThreadPoolExecutor(max_workers=8) as ex:
    futs = {ex.submit(download_pdf, url, i): url for i, url in enumerate(pdf_links, 1)}
    for fut in as_completed(futs):
        url = futs[fut]
        path = fut.result()
        if path:
            results[url] = path
            print(f"[OK] {url} -> {path}")
        else:
            results[url] = None
            print(f"[FAIL] {url}")

# 2) Başarılı olanları yeniden adlandırıp taşı
success_files = [p for p in results.values() if p]
for idx, file_path in enumerate(success_files, 1):
    new_name = os.path.join(FINAL_DIR, f"article_{idx}.pdf")
    shutil.move(file_path, new_name)
    print(f"[MOVE] {file_path} -> {new_name}")

print("\nİndirme özeti:")
print(f"Başarılı: {len(success_files)}")
print(f"Başarısız: {sum(1 for p in results.values() if not p)}")
print(f"PDF'ler {FINAL_DIR} klasöründe.")


[OK] https://bmcendocrdisord.biomedcentral.com/counter/pdf/10.1186/s12902-022-00980-1.pdf -> /content/pdfs/s12902-022-00980-1.pdf
[OK] https://2024.sci-hub.se/6277/4ccca2ac67f37396d709cc6ef597fa2e/10.1080@03602532.2017.1293683.pdf#navpanes=0&view=FitH -> /content/pdfs/10.1080_03602532.2017.1293683.pdf
[OK] https://2024.sci-hub.se/7332/4a45ca3764a581657522ab7c8600cffb/10.1080@17461391.2019.1571114.pdf#navpanes=0&view=FitH -> /content/pdfs/10.1080_17461391.2019.1571114.pdf
[OK] https://2024.sci-hub.se/8111/f916ad9136c99b1f2f33e2fbc3138a85/10.1136@gutjnl-2020-320622.pdf#navpanes=0&view=FitH -> /content/pdfs/10.1136_gutjnl-2020-320622.pdf
[OK] https://moscow.sci-hub.se/3881/ed58e2382f406605593ba5608b051c28/10.1016@j.jhep.2014.12.012.pdf#navpanes=0&view=FitH -> /content/pdfs/10.1016_j.jhep.2014.12.012.pdf
[OK] https://moscow.sci-hub.se/4224/da4bb7e499e48da3c4335634c2e9a90d/10.3390@ijms140611963.pdf#navpanes=0&view=FitH -> /content/pdfs/10.3390_ijms140611963.pdf
[OK] https://2024.sci-hub.se/

In [None]:
!pip install --upgrade pip
!pip install "mineru[core]>=2.0.0"

Collecting pip
  Downloading pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-25.1.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.1.1
Collecting mineru>=2.0.0 (from mineru[core]>=2.0.0)
  Downloading mineru-2.1.1-py3-none-any.whl.metadata (56 kB)
Collecting boto3>=1.28.43 (from mineru>=2.0.0->mineru[core]>=2.0.0)
  Downloading boto3-1.39.9-py3-none-any.whl.metadata (6.7 kB)
Collecting loguru>=0.7.2 (from mineru>=2.0.0->mineru[core]>=2.0.0)
  Downloading loguru-0.7.3-py3-none-any.whl.metadata (22 kB)
Collecting pdfminer.six==20250506 (from mineru>=2.0.0->mineru[core]>=2.0.0)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4

In [None]:
!mineru -v

2025-07-21 12:50:38.145090: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753102238.389418    6715 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753102238.455345    6715 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-07-21 12:50:38.968354: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
mineru, version 2.1.1


In [None]:
# Copyright (c) Opendatalab. All rights reserved.
import copy
import json
import os
from pathlib import Path

from loguru import logger

from mineru.cli.common import convert_pdf_bytes_to_bytes_by_pypdfium2, prepare_env, read_fn
from mineru.data.data_reader_writer import FileBasedDataWriter
from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox
from mineru.utils.enum_class import MakeMode
from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
from mineru.backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc_analyze
from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as pipeline_union_make
from mineru.backend.pipeline.model_json_to_middle_json import result_to_middle_json as pipeline_result_to_middle_json
from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
from mineru.utils.models_download_utils import auto_download_and_get_model_root_path


def do_parse(
    output_dir,  # Output directory for storing parsing results
    pdf_file_names: list[str],  # List of PDF file names to be parsed
    pdf_bytes_list: list[bytes],  # List of PDF bytes to be parsed
    p_lang_list: list[str],  # List of languages for each PDF, default is 'ch' (Chinese)
    backend="pipeline",  # The backend for parsing PDF, default is 'pipeline'
    parse_method="auto",  # The method for parsing PDF, default is 'auto'
    p_formula_enable=True,  # Enable formula parsing
    p_table_enable=True,  # Enable table parsing
    server_url=None,  # Server URL for vlm-sglang-client backend
    f_draw_layout_bbox=True,  # Whether to draw layout bounding boxes
    f_draw_span_bbox=True,  # Whether to draw span bounding boxes
    f_dump_md=True,  # Whether to dump markdown files
    f_dump_middle_json=True,  # Whether to dump middle JSON files
    f_dump_model_output=True,  # Whether to dump model output files
    f_dump_orig_pdf=True,  # Whether to dump original PDF files
    f_dump_content_list=True,  # Whether to dump content list files
    f_make_md_mode=MakeMode.MM_MD,  # The mode for making markdown content, default is MM_MD
    start_page_id=0,  # Start page ID for parsing, default is 0
    end_page_id=None,  # End page ID for parsing, default is None (parse all pages until the end of the document)
):

    if backend == "pipeline":
        for idx, pdf_bytes in enumerate(pdf_bytes_list):
            new_pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id, end_page_id)
            pdf_bytes_list[idx] = new_pdf_bytes

        infer_results, all_image_lists, all_pdf_docs, lang_list, ocr_enabled_list = pipeline_doc_analyze(pdf_bytes_list, p_lang_list, parse_method=parse_method, formula_enable=p_formula_enable,table_enable=p_table_enable)

        for idx, model_list in enumerate(infer_results):
            model_json = copy.deepcopy(model_list)
            pdf_file_name = pdf_file_names[idx]
            local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
            image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)

            images_list = all_image_lists[idx]
            pdf_doc = all_pdf_docs[idx]
            _lang = lang_list[idx]
            _ocr_enable = ocr_enabled_list[idx]
            middle_json = pipeline_result_to_middle_json(model_list, images_list, pdf_doc, image_writer, _lang, _ocr_enable, p_formula_enable)

            pdf_info = middle_json["pdf_info"]

            pdf_bytes = pdf_bytes_list[idx]
            if f_draw_layout_bbox:
                draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf")

            if f_draw_span_bbox:
                draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf")

            if f_dump_orig_pdf:
                md_writer.write(
                    f"{pdf_file_name}_origin.pdf",
                    pdf_bytes,
                )

            if f_dump_md:
                image_dir = str(os.path.basename(local_image_dir))
                md_content_str = pipeline_union_make(pdf_info, f_make_md_mode, image_dir)
                md_writer.write_string(
                    f"{pdf_file_name}.md",
                    md_content_str,
                )

            if f_dump_content_list:
                image_dir = str(os.path.basename(local_image_dir))
                content_list = pipeline_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
                md_writer.write_string(
                    f"{pdf_file_name}_content_list.json",
                    json.dumps(content_list, ensure_ascii=False, indent=4),
                )

            if f_dump_middle_json:
                md_writer.write_string(
                    f"{pdf_file_name}_middle.json",
                    json.dumps(middle_json, ensure_ascii=False, indent=4),
                )

            if f_dump_model_output:
                md_writer.write_string(
                    f"{pdf_file_name}_model.json",
                    json.dumps(model_json, ensure_ascii=False, indent=4),
                )

            logger.info(f"local output dir is {local_md_dir}")
    else:
        if backend.startswith("vlm-"):
            backend = backend[4:]

        f_draw_span_bbox = False
        parse_method = "vlm"
        for idx, pdf_bytes in enumerate(pdf_bytes_list):
            pdf_file_name = pdf_file_names[idx]
            pdf_bytes = convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id, end_page_id)
            local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
            image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
            middle_json, infer_result = vlm_doc_analyze(pdf_bytes, image_writer=image_writer, backend=backend, server_url=server_url)

            pdf_info = middle_json["pdf_info"]

            if f_draw_layout_bbox:
                draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf")

            if f_draw_span_bbox:
                draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf")

            if f_dump_orig_pdf:
                md_writer.write(
                    f"{pdf_file_name}_origin.pdf",
                    pdf_bytes,
                )

            if f_dump_md:
                image_dir = str(os.path.basename(local_image_dir))
                md_content_str = vlm_union_make(pdf_info, f_make_md_mode, image_dir)
                md_writer.write_string(
                    f"{pdf_file_name}.md",
                    md_content_str,
                )

            if f_dump_content_list:
                image_dir = str(os.path.basename(local_image_dir))
                content_list = vlm_union_make(pdf_info, MakeMode.CONTENT_LIST, image_dir)
                md_writer.write_string(
                    f"{pdf_file_name}_content_list.json",
                    json.dumps(content_list, ensure_ascii=False, indent=4),
                )

            if f_dump_middle_json:
                md_writer.write_string(
                    f"{pdf_file_name}_middle.json",
                    json.dumps(middle_json, ensure_ascii=False, indent=4),
                )

            if f_dump_model_output:
                model_output = ("\n" + "-" * 50 + "\n").join(infer_result)
                md_writer.write_string(
                    f"{pdf_file_name}_model_output.txt",
                    model_output,
                )

            logger.info(f"local output dir is {local_md_dir}")


def parse_doc(
        path_list: list[Path],
        output_dir,
        lang="en",
        backend="pipeline",
        method="auto",
        server_url=None,
        start_page_id=0,  # Start page ID for parsing, default is 0
        end_page_id=None  # End page ID for parsing, default is None (parse all pages until the end of the document)
):
    """
        Parameter description:
        path_list: List of document paths to be parsed, can be PDF or image files.
        output_dir: Output directory for storing parsing results.
        lang: Language option, default is 'ch', optional values include['ch', 'ch_server', 'ch_lite', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']。
            Input the languages in the pdf (if known) to improve OCR accuracy.  Optional.
            Adapted only for the case where the backend is set to "pipeline"
        backend: the backend for parsing pdf:
            pipeline: More general.
            vlm-transformers: More general.
            vlm-sglang-engine: Faster(engine).
            vlm-sglang-client: Faster(client).
            without method specified, pipeline will be used by default.
        method: the method for parsing pdf:
            auto: Automatically determine the method based on the file type.
            txt: Use text extraction method.
            ocr: Use OCR method for image-based PDFs.
            Without method specified, 'auto' will be used by default.
            Adapted only for the case where the backend is set to "pipeline".
        server_url: When the backend is `sglang-client`, you need to specify the server_url, for example:`http://127.0.0.1:30000`
    """
    try:
        file_name_list = []
        pdf_bytes_list = []
        lang_list = []
        for path in path_list:
            file_name = str(Path(path).stem)
            pdf_bytes = read_fn(path)
            file_name_list.append(file_name)
            pdf_bytes_list.append(pdf_bytes)
            lang_list.append(lang)
        do_parse(
            output_dir=output_dir,
            pdf_file_names=file_name_list,
            pdf_bytes_list=pdf_bytes_list,
            p_lang_list=lang_list,
            backend=backend,
            parse_method=method,
            server_url=server_url,
            start_page_id=start_page_id,
            end_page_id=end_page_id
        )
    except Exception as e:
        logger.exception(e)


if __name__ == '__main__':
    # args
    pdf_files_dir = "./downloaded_pdfs"
    output_dir = "/content/drive/MyDrive/Mineru_Output_pipeline"
    pdf_suffixes = [".pdf"]
    image_suffixes = [".png", ".jpeg", ".jpg"]

    doc_path_list = []
    for doc_path in Path(pdf_files_dir).glob('*'):
        if doc_path.suffix in pdf_suffixes + image_suffixes:
            doc_path_list.append(doc_path)

    """如果您由于网络问题无法下载模型，可以设置环境变量MINERU_MODEL_SOURCE为modelscope使用免代理仓库下载模型"""
    # os.environ['MINERU_MODEL_SOURCE'] = "modelscope"

    """Use pipeline mode if your environment does not support VLM"""
    parse_doc(doc_path_list, output_dir, backend="pipeline", lang="en")

    """To enable VLM mode, change the backend to 'vlm-xxx'"""
    # parse_doc(doc_path_list, output_dir, backend="vlm-transformers")  # more general.
    # parse_doc(doc_path_list, output_dir, backend="vlm-sglang-engine")  # faster(engine).
    # parse_doc(doc_path_list, output_dir, backend="vlm-sglang-client", server_url="http://127.0.0.1:30000"）  # faster(client).



Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


[32m2025-07-21 12:51:19.748[0m | [1mINFO    [0m | [36mmineru.backend.pipeline.pipeline_analyze[0m:[36mdoc_analyze[0m:[36m124[0m - [1mBatch 1/1: 40 pages/40 pages[0m
[32m2025-07-21 12:51:19.750[0m | [1mINFO    [0m | [36mmineru.backend.pipeline.pipeline_analyze[0m:[36mbatch_image_analyze[0m:[36m187[0m - [1mgpu_memory: 15 GB, batch_ratio: 8[0m
[32m2025-07-21 12:51:19.751[0m | [1mINFO    [0m | [36mmineru.backend.pipeline.model_init[0m:[36m__init__[0m:[36m137[0m - [1mDocAnalysis init, this may take some times......[0m
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

yolo_v8_ft.pt:   0%|          | 0.00/350M [00:00<?, ?B/s]

Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/810M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

(…)ut_yolo_docstructbench_imgsz1280_2501.pt:   0%|          | 0.00/39.8M [00:00<?, ?B/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

ch_PP-OCRv5_det_infer.pth:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

ch_PP-OCRv4_rec_server_doc_infer.pth:   0%|          | 0.00/101M [00:00<?, ?B/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

slanet-plus.onnx:   0%|          | 0.00/7.76M [00:00<?, ?B/s]

[32m2025-07-21 12:51:43.693[0m | [1mINFO    [0m | [36mmineru.backend.pipeline.model_init[0m:[36m__init__[0m:[36m182[0m - [1mDocAnalysis init done![0m
[32m2025-07-21 12:51:43.698[0m | [1mINFO    [0m | [36mmineru.backend.pipeline.pipeline_analyze[0m:[36mcustom_model_init[0m:[36m64[0m - [1mmodel init cost: 23.946722507476807[0m
Layout Predict: 100%|██████████| 40/40 [00:08<00:00,  4.95it/s]
MFD Predict: 100%|██████████| 40/40 [00:12<00:00,  3.11it/s]
MFR Predict: 100%|██████████| 238/238 [00:05<00:00, 41.89it/s]


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

en_PP-OCRv3_det_infer.pth:   0%|          | 0.00/2.54M [00:00<?, ?B/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

en_PP-OCRv4_rec_infer.pth:   0%|          | 0.00/23.8M [00:00<?, ?B/s]

OCR-det en: 100%|██████████| 112/112 [00:10<00:00, 10.47it/s]
Table Predict:   0%|          | 0/5 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Table Predict: 100%|██████████| 5/5 [00:03<00:00,  1.53it/s]
Processing pages:   0%|          | 0/9 [00:00<?, ?it/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/713M [00:00<?, ?B/s]

Processing pages: 100%|██████████| 9/9 [00:08<00:00,  1.00it/s]
OCR-rec Predict: 100%|██████████| 15/15 [00:00<00:00, 64.52it/s]
[32m2025-07-21 12:52:36.650[0m | [1mINFO    [0m | [36m__main__[0m:[36mdo_parse[0m:[36m105[0m - [1mlocal output dir is /content/drive/MyDrive/Mineru_Output_pipeline/article_1/auto[0m
Processing pages: 100%|██████████| 31/31 [00:09<00:00,  3.16it/s]
OCR-rec Predict: 100%|██████████| 4/4 [00:00<00:00, 39.35it/s]
[32m2025-07-21 12:52:53.808[0m | [1mINFO    [0m | [36m__main__[0m:[36mdo_parse[0m:[36m105[0m - [1mlocal output dir is /content/drive/MyDrive/Mineru_Output_pipeline/article_2/auto[0m


In [None]:
import os
import pathlib
from google import genai
from google.genai import types

# Ana klasör yolu
root_folder = '/content/drive/MyDrive/Mineru_Output_pipeline'

# Gemini istemcisi
client = genai.Client(api_key="****")

# Tüm article_* klasörlerini sırayla işle
for article_name in os.listdir(root_folder):
    article_path = os.path.join(root_folder, article_name)

    # Sadece klasörleri al
    if not os.path.isdir(article_path):
        continue

    # auto klasörü ve images alt klasörü
    auto_folder = os.path.join(article_path, 'auto')
    image_folder = os.path.join(auto_folder, 'images')

    if not os.path.exists(image_folder):
        print(f"Images folder not found for {article_name}, skipping.")
        continue

    # İlgili .md dosyasını bul (article klasörünün içinde, benzersiz adla)
    md_file = None
    for file in os.listdir(auto_folder):
        if file.endswith('.md'):
            md_file = os.path.join(auto_folder, file)
            break

    if not md_file:
        print(f"No markdown file found for {article_name}, skipping.")
        continue

    # .md dosyasını oku
    with open(md_file, 'r', encoding='utf-8') as f:
        article_text = f.read()

    # Görsel dosyalarını al
    image_files = [f for f in os.listdir(image_folder) if f.lower().endswith('.jpg')]

    # Her bir görseli işle
    for image_file in image_files:
        image_path = os.path.join(image_folder, image_file)

        with open(image_path, 'rb') as f:
            image_bytes = f.read()

        # Görsel + md bağlamı ile açıklama üret
        response = client.models.generate_content(
            model='gemini-2.5-flash',
            contents=[
                types.Part.from_bytes(
                    data=image_bytes,
                    mime_type='image/jpeg',
                ),
                f"Here is the related article content:\n\n{article_text}",
                "As a researcher, analyze this scientific chart in the context of the article. Explain its meaning, implications, and key insights briefly."
            ]
        )

        # TXT dosyasına yaz
        txt_filename = os.path.splitext(image_file)[0] + '.txt'
        txt_path = os.path.join(image_folder, txt_filename)

        with open(txt_path, 'w', encoding='utf-8') as f:
            f.write(response.text)

    print(f"Processed all images in {article_name}")
