In [1]:
import os
import json
import fitz           # PyMuPDF
from typing import List

# Source folder (PDFs to read) and destination folder (JSON to write).
# expanduser() only rewrites a leading "~", so these absolute paths pass
# through unchanged.
INPUT_DIR = os.path.expanduser(r"C:\Users\offic\AGENT\data\raw_papers")
OUTPUT_DIR = os.path.expanduser(r"C:\Users\offic\AGENT\data\text_json")

# Create the destination folder, including missing parents; an already
# existing folder is not an error.
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("Will read PDFs from:", INPUT_DIR)
print("JSON output to:", OUTPUT_DIR)


Will read PDFs from: C:\Users\offic\AGENT\data\raw_papers
JSON output to: C:\Users\offic\AGENT\data\text_json


In [2]:
def extract_text_from_pdf(pdf_path: str) -> List[str]:
    """
    Open a PDF and return its text, one string per page.

    Args:
        pdf_path: Path of the PDF file; handed to ``fitz.open`` as-is.

    Returns:
        A list with one entry per page, in page order, each entry being the
        result of ``page.get_text("text")``. A document with no pages gives
        an empty list.

    Raises:
        Any exception raised by ``fitz.open`` or by loading/extracting a
        page propagates to the caller. The document is closed first.
    """
    # Use the document as a context manager so it is closed even when a page
    # fails to load or extract; otherwise a bad PDF would keep its file
    # handle open for the rest of the kernel session.
    with fitz.open(pdf_path) as doc:
        return [
            doc.load_page(page_index).get_text("text")
            for page_index in range(doc.page_count)
        ]


In [3]:
# Gather the names in INPUT_DIR that end in ".pdf" (any letter case).
pdf_files = [name for name in os.listdir(INPUT_DIR) if name.lower().endswith(".pdf")]

if pdf_files:
    # Each PDF is converted independently: a failure is reported and the
    # loop carries on with the next file.
    for fn in pdf_files:
        in_path = os.path.join(INPUT_DIR, fn)
        try:
            pages = extract_text_from_pdf(in_path)
            data = dict(
                filename=fn,
                num_pages=len(pages),
                pages=pages,
                full_text="\n\n".join(pages),
            )
            # Output name: everything before the final dot, plus ".json".
            out_name = fn.rpartition(".")[0] + ".json"
            out_path = os.path.join(OUTPUT_DIR, out_name)
            with open(out_path, "w", encoding="utf-8") as f:
                # ensure_ascii=False keeps non-ASCII characters as-is
                # instead of \uXXXX escapes.
                json.dump(data, f, ensure_ascii=False, indent=2)
            print(f"✅ {fn} → {out_name} ({len(pages)} pages)")
        except Exception as e:
            print(f"❌ Failed {fn}: {e}")
else:
    print("❗ No PDFs found in", INPUT_DIR)


✅ 1312.6114v11.pdf → 1312.6114v11.json (14 pages)
✅ 1361_a_closer_look_at_few_shot_clas.pdf → 1361_a_closer_look_at_few_shot_clas.json (16 pages)
✅ 1406.2661v1.pdf → 1406.2661v1.json (9 pages)
✅ 1503.02531v1.pdf → 1503.02531v1.json (9 pages)
✅ 1512.00567v3.pdf → 1512.00567v3.json (10 pages)
✅ 1512.03385v1.pdf → 1512.03385v1.json (12 pages)
✅ 1605.07146v4.pdf → 1605.07146v4.json (15 pages)
✅ 1608.06993v5.pdf → 1608.06993v5.json (9 pages)
✅ 1686_a_baseline_for_few_shot_image_.pdf → 1686_a_baseline_for_few_shot_image_.json (20 pages)
✅ 1703.00837v2.pdf → 1703.00837v2.json (11 pages)
✅ 1703.03400v3.pdf → 1703.03400v3.json (13 pages)
✅ 1703.05175v2.pdf → 1703.05175v2.json (13 pages)
✅ 1706.03762v7.pdf → 1706.03762v7.json (15 pages)
✅ 1707.03141v3.pdf → 1707.03141v3.json (17 pages)
✅ 1707.09835v2.pdf → 1707.09835v2.json (11 pages)
✅ 1709.00340v4.pdf → 1709.00340v4.json (12 pages)
✅ 1710.09412v2.pdf → 1710.09412v2.json (13 pages)
✅ 1711.04043v3.pdf → 1711.04043v3.json (13 pages)
✅ 1711.05101v

In [4]:
# Sanity check: report the output folder, then show at most the first ten
# entries that os.listdir returns for it.
print("JSON files now in:", OUTPUT_DIR)
output_entries = os.listdir(OUTPUT_DIR)
print(output_entries[:10])


JSON files now in: C:\Users\offic\AGENT\data\text_json
['1312.6114v11.json', '1361_a_closer_look_at_few_shot_clas.json', '1406.2661v1.json', '1503.02531v1.json', '1512.00567v3.json', '1512.03385v1.json', '1605.07146v4.json', '1608.06993v5.json', '1686_a_baseline_for_few_shot_image_.json', '1703.00837v2.json']
