# Mathpix 本地 PDF 转换（Jupyter 版）

在本 Notebook 中，你可以：
1. 通过**路径**或**上传**选择一份本地 PDF（如考试卷）。
2. 输入你的 `APP_ID` / `APP_KEY`（建议使用**新生成的密钥**，不要把旧密钥写入文件中）。
3. 选择输出格式（DOCX / LaTeX ZIP / MMD / 行级 JSON 等）。
4. 一键上传、轮询直至完成，并自动下载结果到指定目录。

> 说明：
> - 该 Notebook 在**你的本地环境**调用 Mathpix API；本文件不会保存你的密钥值。
> - 若你只想测试部分页，可设置 `page_ranges`（如 `1-8,10`）。
> - 如果按钮无响应，尝试重启内核或运行安装 ipywidgets 的单元格。


In [6]:
import os, json, time, pathlib, requests, tempfile, shutil
from typing import List

API_BASE = "https://api.mathpix.com/v3/pdf"

# Supported output formats. Each key is the format name accepted by this
# notebook; each value is the extension used for the download.
FORMAT_EXTENSIONS = {
    "md": "md",      # plain Markdown (not the same thing as mmd)
    "mmd": "mmd",    # Mathpix Markdown (needs no conversion_formats entry)
    "docx": "docx",
    "tex.zip": "tex.zip",
    "html": "html",  # note: not html.zip
    "pdf": "pdf",
    "latex.pdf": "latex.pdf",
    "pptx": "pptx",
    "lines.json": "lines.json",
    "lines.mmd.json": "lines.mmd.json",
}


def parse_formats(fmts: List[str]) -> List[str]:
    """Normalise and validate a list of output format names.

    Each entry is stripped of surrounding whitespace; entries that are empty
    after stripping are dropped. Order and duplicates are preserved.

    Raises:
        ValueError: if an entry is not a key of ``FORMAT_EXTENSIONS``.
    """
    cleaned = []
    for raw in fmts:
        name = raw.strip()
        if name == "":
            continue
        if name in FORMAT_EXTENSIONS:
            cleaned.append(name)
            continue
        raise ValueError(f"未知格式: {name} | 支持: {', '.join(FORMAT_EXTENSIONS)}")
    return cleaned

def convert_with_mathpix(pdf_path: pathlib.Path, app_id: str, app_key: str,
                         fmts: List[str], page_ranges: str = None,
                         include_equation_tags: bool = True,
                         timeout: int = 900, poll_interval: float = 2.0,
                         out_dir: pathlib.Path = None,
                         verbose: bool = True,
                         request_timeout: float = 60.0):
    """Upload a local PDF to Mathpix, wait for processing, download the results.

    Args:
        pdf_path: Path of the PDF to upload.
        app_id: Mathpix ``app_id`` request header value.
        app_key: Mathpix ``app_key`` request header value.
        fmts: Output formats to download; each must be a key of
            ``FORMAT_EXTENSIONS``.
        page_ranges: Optional page selection string sent to the API as-is
            (for example ``"1-8,10"``); omitted from the request when falsy.
        include_equation_tags: Sent as the ``include_equation_tags`` option.
        timeout: Maximum number of seconds to poll for completion.
        poll_interval: Seconds to sleep between two status requests.
        out_dir: Directory for the downloaded files; defaults to the folder
            containing ``pdf_path``. Created if missing.
        verbose: Print upload and status progress when true.
        request_timeout: Timeout in seconds passed to every individual HTTP
            request, so a stalled connection cannot block forever.

    Returns:
        A list of the saved file paths as strings, in the order of ``fmts``.
        A format whose download does not return HTTP 200 is reported with a
        ``[warn]`` line and left out of the list.

    Raises:
        ValueError: ``fmts`` is empty or contains an unknown format.
        RuntimeError: the upload response has no ``pdf_id`` or the job status
            is ``"error"``.
        TimeoutError: the job did not complete within ``timeout`` seconds.
        requests.HTTPError: the upload or a status request returned an HTTP
            error status.
    """
    # Validate before any network traffic, so a typo does not cost an upload.
    fmts = list(fmts)
    if not fmts:
        raise ValueError("未选择有效输出格式")
    unknown = [f for f in fmts if f not in FORMAT_EXTENSIONS]
    if unknown:
        raise ValueError(f"未知格式: {unknown[0]} | 支持: {', '.join(FORMAT_EXTENSIONS)}")

    pdf_path = pathlib.Path(pdf_path)
    headers = {"app_id": app_id, "app_key": app_key}

    # Only these formats have to be switched on in `conversion_formats`.
    # "mmd", "lines.json" and "lines.mmd.json" can be downloaded without an
    # entry, so asking only for those is a valid request.
    declared_formats = {"docx", "tex.zip", "md", "html", "pdf", "latex.pdf", "pptx"}
    options = {"include_equation_tags": bool(include_equation_tags)}
    conversion_formats = {f: True for f in fmts if f in declared_formats}
    if conversion_formats:
        options["conversion_formats"] = conversion_formats
    if page_ranges:
        options["page_ranges"] = page_ranges

    if verbose:
        print("[info] 上传:", pdf_path)
    with open(pdf_path, "rb") as fh:
        resp = requests.post(
            API_BASE,
            headers=headers,
            files={"file": (pdf_path.name, fh, "application/pdf")},
            data={"options_json": json.dumps(options)},
            timeout=request_timeout,
        )
    resp.raise_for_status()
    j = resp.json()
    pdf_id = j.get("pdf_id")
    if not pdf_id:
        raise RuntimeError(f"意外响应: {j}")

    if verbose:
        print("[info] pdf_id:", pdf_id)

    # Poll the job status until it completes, fails, or the deadline passes.
    # NOTE(review): Mathpix also exposes per-format conversion progress under
    # /v3/converter/{pdf_id}; confirm whether "completed" here guarantees that
    # every requested conversion is already downloadable.
    status_url = f"{API_BASE}/{pdf_id}"
    deadline = time.monotonic() + timeout  # monotonic: immune to clock changes
    last_status = None
    while True:
        r = requests.get(status_url, headers=headers, timeout=request_timeout)
        r.raise_for_status()
        s = r.json()
        status = s.get("status")
        if verbose and status != last_status:
            print("[info] status=", status)
            last_status = status
        if status == "completed":
            break
        if status == "error":
            raise RuntimeError(f"任务错误: {s}")
        if time.monotonic() > deadline:
            raise TimeoutError(f"超时 {timeout}s，最后状态: {s}")
        time.sleep(poll_interval)

    # Download every requested format next to the PDF (or into out_dir).
    out_dir = pdf_path.parent if out_dir is None else pathlib.Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    stem = pdf_path.stem

    saved = []
    for f in fmts:
        ext = FORMAT_EXTENSIONS[f]
        r = requests.get(f"{status_url}.{ext}", headers=headers, timeout=request_timeout)
        if r.status_code != 200:
            # Best effort: report the failure and carry on with other formats.
            print(f"[warn] 下载失败: {ext} | status={r.status_code} body={r.text[:200]}")
            continue
        out_path = out_dir / f"{stem}.{ext}"
        out_path.write_bytes(r.content)
        saved.append(str(out_path))

    return saved


## 纯函数式用法（可选）
如果你不想用上面的交互式控件，也可以直接调用函数：
```python
saved = convert_with_mathpix(
    pdf_path=pathlib.Path('exam_2003_12_16_danish.pdf'),
    app_id=os.environ['MATHPIX_APP_ID'],
    app_key=os.environ['MATHPIX_APP_KEY'],
    fmts=['docx','tex.zip','mmd','lines.json','lines.mmd.json'],
    page_ranges='1-8',
    include_equation_tags=True,
    timeout=900,
    poll_interval=2.0,
    out_dir=pathlib.Path('./outputs')
)
saved
```


In [12]:
# Convert one exam PDF to a LaTeX archive, from page 2 through the last page.
# Credentials are read from the environment so that no key is stored in the
# notebook; a missing variable raises KeyError before anything is uploaded.
saved = convert_with_mathpix(
    pdf_path=pathlib.Path('exam_2003_12_16_danish.pdf'),
    app_id=os.environ['MATHPIX_APP_ID'],
    app_key=os.environ['MATHPIX_APP_KEY'],
    fmts=['tex.zip'],
    page_ranges='2--1',
    include_equation_tags=True,
    timeout=900,
    poll_interval=2.0,
    out_dir=pathlib.Path('./outputs')
)
saved

[info] 上传: exam_2003_12_16_danish.pdf
[info] pdf_id: 2025_08_21_49d08aca80a4c7cda77bg
[info] status= loaded
[info] status= split
[info] status= completed


['outputs\\exam_2003_12_16_danish.tex.zip']

### 小贴士
- **安全**：不要把密钥硬编码进 Notebook；使用输入框或环境变量。
- **加速**：为大文件设置 `page_ranges`，以及必要的输出格式。
- **LaTeX 输出**：选择 `tex.zip`，适合需要严格公式与可复现排版的场景。
- **结构化处理**：选 `mmd`/`lines.json`/`lines.mmd.json`，方便后续程序解析。


In [13]:
# 批量处理文件夹中的 PDF（基于你已定义的 convert_with_mathpix 函数）
from pathlib import Path
import os, time, pandas as pd

# === Configuration ===
# Folder scanned for input PDFs. With OUT_DIR = None the results are written
# next to each source PDF, so a folder that also holds generated `pdf` /
# `latex.pdf` outputs will have those files submitted again on the next run.
FOLDER = Path("outputs")
RECURSIVE = True               # Also scan sub-directories
FMTS = ['tex.zip']  # Any of: docx, tex.zip, mmd, lines.json, lines.mmd.json, md, html, pdf, latex.pdf, pptx
PAGE_RANGES = '2--1'           # Skip the first page; set to None to process every page
INCLUDE_EQ = True              # Keep equation tags
TIMEOUT = 900                  # Per-PDF timeout in seconds
POLL = 2.0                     # Polling interval in seconds
SLEEP_BETWEEN = 1.0            # Pause between PDFs to stay clear of rate limits
OUT_DIR = None                 # None = save next to the source PDF; or e.g. Path("./outputs")
# Credentials come from the environment only. Never commit real keys here:
# any value ever stored in this notebook should be treated as leaked and
# rotated. Empty values make every conversion fail with an HTTP error.
APP_ID = os.getenv("MATHPIX_APP_ID", "")
APP_KEY = os.getenv("MATHPIX_APP_KEY", "")
SKIP_IF_ALL_EXIST = True       # Skip a PDF when all of its expected outputs already exist
MAX_FILES = None               # Process only the first N files; None = all

# === Extension map (used to decide whether the outputs already exist) ===
EXT_MAP = {
    fmt: fmt
    for fmt in (
        "md", "mmd", "docx", "tex.zip",
        "html", "pdf", "latex.pdf", "pptx",
        "lines.json", "lines.mmd.json",
    )
}


def expected_outputs_for(pdf_path: Path, fmts, out_dir):
    """Return the output paths a conversion of ``pdf_path`` should produce.

    One path per entry of ``fmts``, named ``<pdf stem>.<extension>`` and
    located in ``out_dir``, or in the PDF's own folder when ``out_dir`` is
    falsy. An entry missing from ``EXT_MAP`` raises ``KeyError``.
    """
    target_dir = out_dir if out_dir else pdf_path.parent
    base = pdf_path.stem
    expected = []
    for fmt in fmts:
        expected.append(target_dir / f"{base}.{EXT_MAP[fmt]}")
    return expected

def list_pdfs(folder: Path, recursive=True):
    """Return the sorted ``*.pdf`` files in ``folder``.

    Sub-directories are searched as well when ``recursive`` is true. Entries
    that match the pattern but are not regular files are left out.
    """
    finder = folder.rglob if recursive else folder.glob
    found = [entry for entry in finder("*.pdf") if entry.is_file()]
    found.sort()
    return found

# === Main flow ===
# Fail fast instead of uploading every PDF only to be rejected by the API.
if not (APP_ID and APP_KEY):
    raise RuntimeError("缺少 Mathpix 凭据：请设置环境变量 MATHPIX_APP_ID / MATHPIX_APP_KEY")

pdfs = list_pdfs(FOLDER, RECURSIVE)
if MAX_FILES:
    pdfs = pdfs[:MAX_FILES]

# One result row per PDF: its path, a status, the saved files, any error text.
rows = []
for i, pdf in enumerate(pdfs, 1):
    try:
        # Skip PDFs whose expected outputs are all present already.
        if SKIP_IF_ALL_EXIST:
            exp_outs = expected_outputs_for(pdf, FMTS, OUT_DIR)
            if all(p.exists() for p in exp_outs):
                rows.append({"pdf": str(pdf), "status": "skipped (exists)", "saved": [str(p) for p in exp_outs], "error": ""})
                print(f"[{i}/{len(pdfs)}] SKIP  {pdf}")
                continue

        print(f"[{i}/{len(pdfs)}] RUN   {pdf}")
        saved = convert_with_mathpix(
            pdf_path=pdf,
            app_id=APP_ID,
            app_key=APP_KEY,
            fmts=FMTS,
            page_ranges=PAGE_RANGES,
            include_equation_tags=INCLUDE_EQ,
            timeout=TIMEOUT,
            poll_interval=POLL,
            out_dir=OUT_DIR,
            verbose=True,
        )
        # convert_with_mathpix only warns when a download fails and leaves
        # that file out of its result, so fewer saved files than requested
        # formats means the run was not fully successful.
        if len(saved) == len(FMTS):
            rows.append({"pdf": str(pdf), "status": "ok", "saved": saved, "error": ""})
        else:
            rows.append({"pdf": str(pdf), "status": "partial", "saved": saved,
                         "error": f"{len(FMTS) - len(saved)} 个格式下载失败"})
        time.sleep(SLEEP_BETWEEN)

    except Exception as e:
        # Batch boundary: record the failure and continue with the next PDF.
        rows.append({"pdf": str(pdf), "status": "error", "saved": [], "error": str(e)})
        print(f"[{i}/{len(pdfs)}] ERR   {pdf} -> {e}")

df = pd.DataFrame(rows)
df


[1/1] RUN   outputs\exam_2003_12_16_danish.latex.pdf
[info] 上传: outputs\exam_2003_12_16_danish.latex.pdf
[info] pdf_id: 2025_08_21_7bbc7cbd03a7e83c531ag
[info] status= loaded
[info] status= split
[info] status= completed


Unnamed: 0,pdf,status,saved,error
0,outputs\exam_2003_12_16_danish.latex.pdf,ok,[outputs\exam_2003_12_16_danish.latex.tex.zip],


In [15]:
# 批量：从 *.tex.zip 提取主 .tex -> 转为 .txt
from pathlib import Path
import zipfile, shutil, subprocess, tempfile, sys

# ===== Configuration =====
SRC = Path("outputs")     # Directory holding the *.tex.zip files (could be "pdf", etc.)
RECURSIVE = True          # Also scan sub-directories
OUT_DIR = Path("only-txt")  # Single directory for the txt files; set to None to write next to each zip
KEEP_ZIP = True           # Keep the zip after extraction
KEEP_TEX = False          # Keep the intermediate .tex (no effect below: the .tex is never written to disk)
OVERWRITE = False         # Overwrite the target .txt when it already exists
USE_PANDOC_FIRST = True   # Prefer Pandoc; fall back to pylatexenc when it is not found
PANDOC_ARGS = ["-f","latex","-t","plain"]  # Pandoc arguments for LaTeX -> plain text

# ====== 工具函数 ======
def pick_main_tex(zf: zipfile.ZipFile):
    """Pick the ``.tex`` member of ``zf`` that looks most like the main document.

    Every member whose name ends in ``.tex`` (case-insensitive) is decoded and
    scored: ``\\documentclass`` +3, ``\\begin{document}`` +4,
    ``\\end{document}`` +2, ``\\section`` +1, an ``equation`` or ``align``
    environment +1, and +1 each for exceeding 1000 and 3000 characters. The
    highest score wins; on a tie the member listed first is kept.

    Returns:
        ``(member_name, decoded_text)``, or ``(None, None)`` when the archive
        contains no ``.tex`` member.
    """
    # Substring markers and the score each one contributes.
    markers = (
        ("\\documentclass", 3),
        ("\\begin{document}", 4),
        ("\\end{document}", 2),
        ("\\section", 1),
    )

    best_name, best_text, best_score = None, None, -1
    for member in zf.namelist():
        if not member.lower().endswith(".tex"):
            continue
        raw = zf.read(member)
        try:
            # utf-8-sig decodes plain UTF-8 too and strips a leading BOM, so
            # the BOM does not end up in the text handed to the converter.
            text = raw.decode("utf-8-sig")
        except UnicodeDecodeError:
            # latin-1 maps every byte, so this fallback cannot fail.
            text = raw.decode("latin-1")

        score = sum(weight for marker, weight in markers if marker in text)
        if "\\begin{equation}" in text or "\\begin{align}" in text:
            score += 1
        if len(text) > 1000:
            score += 1
        if len(text) > 3000:
            score += 1

        # Strict comparison keeps the first member when scores are equal.
        if score > best_score:
            best_name, best_text, best_score = member, text, score
    return best_name, best_text

def iter_zip_files(root: Path, recursive=True):
    """Yield the ``*.tex.zip`` files under ``root`` in sorted order.

    Sub-directories are searched as well when ``recursive`` is true; matches
    that are not regular files are skipped.
    """
    pattern = "*.tex.zip"
    matches = root.rglob(pattern) if recursive else root.glob(pattern)
    yield from (path for path in sorted(matches) if path.is_file())

def ensure_out_path(zip_path: Path, out_dir: Path|None, suffix: str):
    if out_dir is None:
        return zip_path.with_suffix("")  # *.tex.zip -> *.tex 先用这个，再改后缀
    rel = zip_path.parent.relative_to(SRC)
    target_dir = out_dir / rel
    target_dir.mkdir(parents=True, exist_ok=True)
    return (target_dir / zip_path.with_suffix("").name).with_suffix(suffix)

def have_pandoc():
    """Report whether a ``pandoc`` executable can be found on the PATH."""
    import shutil

    pandoc_path = shutil.which("pandoc")
    return pandoc_path is not None

def tex_to_txt_via_pandoc(tex_text: str) -> str:
    """Convert LaTeX source to text by piping it through ``pandoc``.

    The source is sent on stdin as UTF-8. Pandoc is run with ``PANDOC_ARGS``
    and ``-o -``, so the result comes back on stdout.

    Raises:
        RuntimeError: pandoc exited with a non-zero status; the message
            carries its stderr output.
    """
    command = ["pandoc"] + list(PANDOC_ARGS) + ["-o", "-"]
    result = subprocess.run(
        command,
        input=tex_text.encode("utf-8"),
        capture_output=True,
    )
    if result.returncode == 0:
        # Undecodable bytes in the output are dropped rather than raising.
        return result.stdout.decode("utf-8", "ignore")
    raise RuntimeError("pandoc failed: " + result.stderr.decode("utf-8", "ignore"))

def tex_to_txt_via_pylatexenc(tex_text: str) -> str:
    """Convert LaTeX source to text with ``pylatexenc``.

    If the package cannot be imported it is installed with pip into the
    running interpreter first, then imported again.
    """
    try:
        from pylatexenc.latex2text import LatexNodes2Text
    except ImportError:
        # Install pylatexenc on demand.
        print("[info] 正在安装 pylatexenc ...")
        pip_command = [sys.executable, "-m", "pip", "install", "-q", "pylatexenc"]
        subprocess.check_call(pip_command)
        from pylatexenc.latex2text import LatexNodes2Text
    # math_mode='verbatim' keeps math as TeX snippets; 'remove' drops it.
    converter = LatexNodes2Text(math_mode='verbatim')
    return converter.latex_to_text(tex_text)

# ====== Main flow ======
# Counters for converted, skipped and failed archives.
ok = skip = err = 0
use_pandoc = USE_PANDOC_FIRST and have_pandoc()
if USE_PANDOC_FIRST and not use_pandoc:
    print("[warn] 未发现 pandoc，可安装后再跑；本次将使用 pylatexenc 兜底。")

for zp in iter_zip_files(SRC, RECURSIVE):
    try:
        # Read the most likely main .tex; the archive is closed right after.
        with zipfile.ZipFile(zp, "r") as archive:
            member_name, latex_source = pick_main_tex(archive)
        if not member_name:
            print("[warn] 未找到 .tex：", zp)
            err += 1
            continue

        # Target path (.txt); leave existing files alone unless OVERWRITE.
        out_txt = ensure_out_path(zp, OUT_DIR, ".txt")
        if not OVERWRITE and out_txt.exists():
            print("[skip] 已存在：", out_txt)
            skip += 1
            continue

        # Convert to text: pandoc when available, otherwise (or when pandoc
        # fails) pylatexenc.
        plain_text = None
        if use_pandoc:
            try:
                plain_text = tex_to_txt_via_pandoc(latex_source)
            except Exception as e:
                print("[warn] pandoc 转换失败，改用 pylatexenc：", e)
        if plain_text is None:
            plain_text = tex_to_txt_via_pylatexenc(latex_source)

        out_txt.parent.mkdir(parents=True, exist_ok=True)
        out_txt.write_text(plain_text, encoding="utf-8")
        print("[ok] ", zp.name, "->", out_txt)

        ok += 1

        # Intermediate artefacts.
        if not KEEP_TEX:
            # The .tex is never written to disk, so nothing needs cleaning up.
            pass
        if not KEEP_ZIP:
            try:
                zp.unlink()
            except Exception as e:
                print("[warn] 删除 zip 失败：", zp, "->", e)

    except Exception as e:
        # Per-archive boundary: report the failure and move on.
        print("[err]", zp, "->", e)
        err += 1

print(f"\n完成：ok={ok}, skip={skip}, err={err}")


[ok]  exam_2003_12_16_danish.latex.tex.zip -> only-txt\exam_2003_12_16_danish.latex.txt
[ok]  exam_2003_12_16_danish.tex.zip -> only-txt\exam_2003_12_16_danish.txt

完成：ok=2, skip=0, err=0
