In [1]:
#!/usr/bin/env python3
# ============================================================
# Batch Workflow: Extract Ancient Place‑Names from Turkish "Kazı Sonuçları" PDFs
#   (with PDF filename + original URL in output)
# Prerequisites (run once):
#   pip install pandas requests beautifulsoup4 PyMuPDF PyPDF2 zemberek-python
# ============================================================

import json
import logging
import os
import re
from urllib.parse import unquote, urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader
from zemberek import TurkishMorphology

# ─── Logger setup ─────────────────────────────────────────────
# Root logger: INFO and above, rendered as "<LEVEL>: <message>".
# Module logger used by every step below.
LOG_FORMAT = "%(levelname)s: %(message)s"
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)
log = logging.getLogger(__name__)

# ─── Step 1 – Download & prepare Pleiades gazetteer ────────────
# Produces:
#   ancient_places – set of lower-cased place titles inside the bounding box
#   place_lookup   – lower-cased title → {"pid", "lat", "lon"}
PLEIADES_CSV = "places.csv"
PLEIADES_URL = (
    "https://raw.githubusercontent.com/isawnyu/pleiades.datasets"
    "/main/data/gis/places.csv"
)
if not os.path.exists(PLEIADES_CSV):
    log.info("⬇️  Downloading Pleiades CSV…")
    # Bound the wait so a stalled connection cannot hang the run forever.
    r = requests.get(PLEIADES_URL, timeout=60)
    r.raise_for_status()
    with open(PLEIADES_CSV, "wb") as f:
        f.write(r.content)
    log.info("✅ Saved places.csv")

df = pd.read_csv(PLEIADES_CSV)
for col in ("representative_latitude", "representative_longitude", "title", "id"):
    if col not in df.columns:
        raise ValueError(f"❌ Missing column {col} in Pleiades CSV")

# filter for modern-Turkey bounding box (rows with unparsable coordinates
# become NaN and are excluded by `between`)
df["reprLat"]  = pd.to_numeric(df["representative_latitude"],  errors="coerce")
df["reprLong"] = pd.to_numeric(df["representative_longitude"], errors="coerce")
df_tr = df[
    df["reprLat"].between(36, 42) &
    df["reprLong"].between(26, 45)
].copy()

# Lower-cased titles; rows without a usable title are skipped for BOTH the
# membership set and the lookup so the two can never disagree.
title_keys = df_tr["title"].str.lower()
has_title  = title_keys.notna()

ancient_places = set(title_keys[has_title])
log.info(f"✅ Loaded {len(ancient_places):,} place‑names inside Turkey")

# build lookup for coords & PID (when a title repeats, the last row wins)
place_lookup = {
    key: {
        "pid": int(pid),
        "lat": float(lat),
        "lon": float(lon),
    }
    for key, pid, lat, lon in zip(
        title_keys[has_title],
        df_tr.loc[has_title, "id"],
        df_tr.loc[has_title, "reprLat"],
        df_tr.loc[has_title, "reprLong"],
    )
}

# ─── Step 2 – Scrape Ministry site for PDF URLs ───────────────
# Produces pdf_url_map: local PDF filename → absolute URL it was linked from.
BASE      = "https://ukaas.ktb.gov.tr"
INDEX_URL = f"{BASE}/TR-389973/kazi-sonuclari-toplantisi.html"

log.info("🔎 Scraping excavation‐report PDF links …")
# Bound the wait so a stalled connection cannot hang the run forever.
response = requests.get(INDEX_URL, timeout=30)
if not response.ok:
    # The URL is only extra metadata for the output, so keep going, but make
    # it visible that an error page (not the index) is being parsed.
    log.warning(f"⚠️ Index page returned HTTP {response.status_code}; PDF URLs may be missing")
html       = response.text
soup       = BeautifulSoup(html, "html.parser")
# Test the extension on the path only: an href may carry a query string or
# fragment after ".pdf", which a plain endswith() on the whole href rejects.
hrefs      = [
    a["href"]
    for a in soup.find_all("a", href=True)
    if a["href"].split("#")[0].split("?")[0].lower().endswith(".pdf")
]
# urljoin resolves root-relative, page-relative and already-absolute hrefs.
pdf_links  = [urljoin(INDEX_URL, href) for href in hrefs]
log.info(f"✅ Found {len(pdf_links)} PDF URLs on the site")

# build a map: local_filename → pdf_url
pdf_url_map = {}
for url in pdf_links:
    fn = os.path.basename(url.split("#")[0].split("?")[0])
    pdf_url_map[fn] = url
    # Also register the percent-decoded name (e.g. "%2C" → ","), since files
    # saved to disk normally carry the decoded characters.
    pdf_url_map[unquote(fn)] = url

# ─── Step 3 – Gather your local PDFs ──────────────────────────
pdf_dir   = "pdf_reports"
# A missing directory is treated like an empty one so the user gets the
# actionable message below instead of a bare FileNotFoundError.
dir_entries = os.listdir(pdf_dir) if os.path.isdir(pdf_dir) else []
# Sorted so the processing order (and the order of the JSON output) is the
# same on every run; os.listdir returns entries in arbitrary order.
pdf_files = sorted(
    os.path.join(pdf_dir, fn)
    for fn in dir_entries
    if fn.lower().endswith(".pdf")
)
if not pdf_files:
    raise RuntimeError(f"❌ No PDFs found in `{pdf_dir}/` – please drop them there.")
log.info(f"🔎 Found {len(pdf_files)} local PDFs to process")

# ─── Step 4 – Zemberek helper & PDF page extractor ───────────
# Shared morphology analyser, created once at start-up.
zemb = TurkishMorphology.create_with_defaults()

def get_lemmas(token):
    """Return the stem of every Zemberek analysis of `token`.

    When the analyser produces no analysis at all, the token itself is
    returned as the only candidate, i.e. `[token]`.
    """
    results = zemb.analyze(token).analysis_results
    if not results:
        return [token]
    return [analysis.get_stem() for analysis in results]

def extract_pages(pdf_path):
    """Return a list of (page_number, text) tuples, page numbers 1-based.

    Text is extracted with PyPDF2 first. If that yields no non-blank text on
    any page (including a document with zero pages), the file is re-read with
    PyMuPDF (`fitz`) and its text is returned instead.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        list[tuple[int, str]]: one entry per page, in page order.
    """
    reader = PdfReader(pdf_path)
    pages = [
        (number, page.extract_text() or "")  # extract_text() may yield nothing
        for number, page in enumerate(reader.pages, start=1)
    ]
    if any(txt.strip() for _, txt in pages):
        return pages

    # fallback to PyMuPDF if all pages empty; imported lazily so it is only
    # required when the fallback is actually needed
    import fitz
    # The context manager closes the document, releasing its file handle.
    with fitz.open(pdf_path) as doc:
        return [
            (number, page.get_text("text"))
            for number, page in enumerate(doc, start=1)
        ]

# ─── Step 5 – Batch loop over each PDF ───────────────────────
# Builds batch_results: one report dict per PDF with the gazetteer places
# mentioned in it and, per place, every occurrence (token, page, snippet).

# Candidate token: one upper-case letter followed by lower-case letters/hyphens.
TOKEN_RE = re.compile(r"\b([A-ZÇĞİÖŞÜ][a-zçğıöşü\-]+)\b")
# Characters of context kept on each side of a hit.
SNIPPET_RADIUS = 30

# token → tuple of gazetteer keys it resolves to. Shared across all PDFs so
# the morphological analysis runs once per distinct token, not once per hit.
matched_keys_by_token = {}

batch_results = []
for pdf_path in pdf_files:
    pdf_fn  = os.path.basename(pdf_path)
    pdf_url = pdf_url_map.get(pdf_fn)  # None when the site did not link this file
    log.info(f"📄 Processing {pdf_fn}")
    pages = extract_pages(pdf_path)

    report = {
        "pdf_file": pdf_fn,
        "pdf_url":  pdf_url,
        "places":   {}
    }

    for page_num, text in pages:
        # finditer gives the position of THIS occurrence; text.find(tok)
        # would always return the first one on the page.
        for match in TOKEN_RE.finditer(text):
            tok = match.group(1)

            keys = matched_keys_by_token.get(tok)
            if keys is None:
                lowered = (lemma.lower() for lemma in get_lemmas(tok))
                # dict.fromkeys de-duplicates while keeping order: several
                # analyses sharing a stem must not record one hit twice.
                keys = tuple(dict.fromkeys(k for k in lowered if k in ancient_places))
                matched_keys_by_token[tok] = keys
            if not keys:
                continue

            # capture ±30 chars snippet around this occurrence
            start   = max(0, match.start() - SNIPPET_RADIUS)
            snippet = text[start : match.end() + SNIPPET_RADIUS].replace("\n", " ")

            for key in keys:
                info = place_lookup[key]
                ent = report["places"].setdefault(key, {
                    "pid":          info["pid"],
                    "pleiades_url": f"https://pleiades.stoa.org/places/{info['pid']}",
                    "lat":          info["lat"],
                    "lon":          info["lon"],
                    "occurrences":  []
                })
                ent["occurrences"].append({
                    "token":   tok,
                    "page":    page_num,
                    "snippet": snippet
                })

    log.info(f"  → found {len(report['places'])} distinct places")
    batch_results.append(report)

# ─── Step 6 – Write out a single JSON ─────────────────────────
# Every per-PDF report goes into one UTF-8 file; non-ASCII characters are
# written as-is (not \u-escaped) and the document is indented by two spaces.
output = "all_reports_places.json"
serialized = json.dumps(batch_results, ensure_ascii=False, indent=2)
with open(output, mode="w", encoding="utf-8") as f:
    f.write(serialized)

log.info(f"🎉 Done – wrote `{output}`")


2025-07-16 10:39:33,467 - __main__ - INFO
Msg: ✅ Loaded 4,174 place‑names inside Turkey

2025-07-16 10:39:33,693 - __main__ - INFO
Msg: 🔎 Scraping excavation‐report PDF links …

2025-07-16 10:39:36,313 - __main__ - INFO
Msg: ✅ Found 0 PDF URLs on the site

2025-07-16 10:39:36,331 - __main__ - INFO
Msg: 🔎 Found 73 local PDFs to process

2025-07-16 10:39:46,008 - zemberek.morphology.turkish_morphology - INFO
Msg: TurkishMorphology instance initialized in 9.676278352737427

2025-07-16 10:39:46,008 - __main__ - INFO
Msg: 📄 Processing 130346,02kazipdf.pdf

2025-07-16 10:40:33,028 - __main__ - INFO
Msg:   → found 28 distinct places

2025-07-16 10:40:33,028 - __main__ - INFO
Msg: 📄 Processing 130347,03kazipdf.pdf

2025-07-16 10:41:08,045 - __main__ - INFO
Msg:   → found 19 distinct places

2025-07-16 10:41:08,045 - __main__ - INFO
Msg: 📄 Processing 130348,04kazipdf.pdf

2025-07-16 10:42:41,036 - __main__ - INFO
Msg:   → found 35 distinct places

2025-07-16 10:42:41,036 - __main__ - INFO
Msg: 

2025-07-16 11:55:49,916 - __main__ - INFO
Msg: 📄 Processing 130395,29kazi1pdf.pdf

2025-07-16 11:56:07,065 - __main__ - INFO
Msg:   → found 53 distinct places

2025-07-16 11:56:07,065 - __main__ - INFO
Msg: 📄 Processing 130396,29kazi2pdf.pdf

2025-07-16 11:56:23,792 - __main__ - INFO
Msg:   → found 76 distinct places

2025-07-16 11:56:23,792 - __main__ - INFO
Msg: 📄 Processing 130397,29kazi3pdf.pdf

2025-07-16 11:56:43,142 - __main__ - INFO
Msg:   → found 74 distinct places

2025-07-16 11:56:43,142 - __main__ - INFO
Msg: 📄 Processing 130398,30kazi2pdf.pdf

2025-07-16 11:56:59,262 - __main__ - INFO
Msg:   → found 59 distinct places

2025-07-16 11:56:59,262 - __main__ - INFO
Msg: 📄 Processing 130399,30kazi1pdf.pdf

2025-07-16 11:57:14,879 - __main__ - INFO
Msg:   → found 61 distinct places

2025-07-16 11:57:14,879 - __main__ - INFO
Msg: 📄 Processing 130400,30kazi3pdf.pdf

2025-07-16 11:57:29,046 - __main__ - INFO
Msg:   → found 71 distinct places

2025-07-16 11:57:29,046 - __main__ - INF