In [1]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [2]:
!pip install vllm

Collecting vllm
  Downloading vllm-0.8.5-cp38-abi3-manylinux1_x86_64.whl.metadata (14 kB)
Collecting blake3 (from vllm)
  Downloading blake3-1.0.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting fastapi>=0.115.0 (from fastapi[standard]>=0.115.0->vllm)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting prometheus-fastapi-instrumentator>=7.0.0 (from vllm)
  Downloading prometheus_fastapi_instrumentator-7.1.0-py3-none-any.whl.metadata (13 kB)
Collecting tiktoken>=0.6.0 (from vllm)
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting lm-format-enforcer<0.11,>=0.10.11 (from vllm)
  Downloading lm_format_enforcer-0.10.11-py3-none-any.whl.metadata (17 kB)
Collecting llguidance<0.8.0,>=0.7.9 (from vllm)
  Downloading llguidance-0.7.19-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting outlines==0.1.11 (from vllm)
  Downloading outli

In [2]:
%%capture
# Install Unsloth and vLLM. The branch taken depends on whether any environment
# variable name contains "COLAB_"; %%capture hides the pip output of this cell.
import os
if "COLAB_" not in "".join(os.environ.keys()):
    # No "COLAB_" variable found: plain install, letting pip resolve dependencies.
    !pip install unsloth vllm
else:
    # "COLAB_" variable found: install the packages themselves without their dependencies.
    !pip install --no-deps unsloth vllm
    import sys, re, requests; modules = list(sys.modules.keys())
    # Remove every already-imported module whose name contains "PIL" or "google" from sys.modules.
    # NOTE(review): presumably so that later imports pick up freshly installed versions - confirm.
    for x in modules: sys.modules.pop(x) if "PIL" in x or "google" in x else None
    # Hand-picked dependency set (again without transitive dependencies), with xformers and trl pinned.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft "trl==0.15.2" triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer

    # Download vLLM's common requirements file from its main branch, delete every match of
    # transformers/numpy/xformers through the end of its line, and install what remains.
    f = requests.get("https://raw.githubusercontent.com/vllm-project/vllm/refs/heads/main/requirements/common.txt").content
    with open("vllm_requirements.txt", "wb") as file:
        file.write(re.sub(rb"(transformers|numpy|xformers)[^\n]{1,}\n", b"", f))
    !pip install -r vllm_requirements.txt

In [32]:
import os
import re
import json
import logging
import time
from pathlib import Path
from typing import Optional, List, Dict, Any, Set
from concurrent.futures import ThreadPoolExecutor, as_completed

import torch
from tqdm.auto import tqdm
from rich.console import Console
from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeElapsedColumn
from rich.table import Table
from PyPDF2 import PdfReader
from unsloth import FastModel
from unsloth.chat_templates import get_chat_template
from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeElapsedColumn, TimeRemainingColumn
import gc

In [46]:
def _env_int(name: str, default: int) -> int:
    """Return environment variable `name` as an int, or `default` when it is unset or blank.

    :param name: Environment variable to read.
    :param default: Value used when the variable is missing, empty, or whitespace only.
    :raises ValueError: If the variable is set to something that is not an integer.
    """
    raw = os.getenv(name)
    if raw is None or raw.strip() == "":
        return default
    return int(raw)


# Run-time settings. Every value can be overridden through an environment variable;
# an unset or blank variable falls back to the default shown here.
CONFIG = {
    "pdf_path": Path(os.getenv("PDF_PATH", "./books/kohgiloye.pdf")),
    "province": os.getenv("PROVINCE", "کهگیلویه‌و‌بویراحمد"),
    # Page numbers are 1-based. An end page of 0 is falsy, which makes
    # extract_text_from_pdf read through the last page of the document.
    "start_page": _env_int("START_PAGE", 10),
    "end_page": _env_int("END_PAGE", 103),
    # Chunk and overlap sizes are measured in characters (see chunk_text).
    "chunk_size": _env_int("CHUNK_SIZE", 2000),
    "overlap_size": _env_int("OVERLAP_SIZE", 50),
    "max_seq_length": _env_int("MAX_SEQ_LENGTH", 2048),
    "max_new_tokens": _env_int("MAX_NEW_TOKENS", 2048),
    "model_name": os.getenv("MODEL_NAME", "unsloth/gemma-3-4b-it-unsloth-bnb-4bit"),
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "log_file": os.getenv("LOG_FILE", "extraction.log"),
    "workers": _env_int("WORKERS", 1),
    "partial_save_interval": _env_int("PARTIAL_SAVE_INTERVAL", 10),
}

In [34]:
# Log to both a file and the notebook output.
# force=True replaces any handlers already attached to the root logger; without it
# basicConfig() silently does nothing when handlers exist (e.g. when this cell is re-run).
# The file handler is opened as UTF-8 because the province name and source text are Persian.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler(CONFIG["log_file"], encoding="utf-8"),
        logging.StreamHandler(),
    ],
    force=True,
)
logger = logging.getLogger(__name__)
console = Console()

In [35]:
def clean_json_string(s: str) -> str:
    """Remove JavaScript-style comments and trailing commas so the JSON can be parsed reliably.

    The scan is string-aware: anything inside a double-quoted JSON string is left
    untouched, so values such as "http://example.com" or "a,]" are not damaged.

    :param s: Raw JSON-like text, possibly containing `//` or `/* */` comments and
              commas directly before a closing `}` or `]`.
    :return: The text with comments and trailing commas removed.
    """
    # A single-line JSON string literal, including backslash escapes.
    string_literal = r'"(?:\\.|[^"\\\n])*"'

    def keep_strings(match):
        # String literals are returned unchanged; every other match is deleted.
        token = match.group(0)
        return token if token.startswith('"') else ""

    # Pass 1: drop line comments and block comments found outside strings.
    s = re.sub(string_literal + r"|//[^\n]*|/\*[\s\S]*?\*/", keep_strings, s)
    # Pass 2: drop commas followed only by whitespace and a closing bracket.
    # This runs after pass 1 so that `, // note\n}` is handled as well.
    s = re.sub(string_literal + r"|,(?=\s*[}\]])", keep_strings, s)
    return s


def extract_json_block(text: str) -> str | None:
    """Scan through a string of mixed content and return the longest balanced `{…}` JSON snippet, or None if none found."""
    blocks = []
    start = None
    depth = 0
    for i, ch in enumerate(text):
        if ch == '{':
            if depth == 0:
                start = i
            depth += 1
        elif ch == '}' and depth > 0:
            depth -= 1
            if depth == 0 and start is not None:
                blocks.append(text[start : i + 1])
                start = None
    if not blocks:
        return None
    return max(blocks, key=len)


def clean_json_block(raw: str) -> str:
    """Remove a surrounding Markdown code fence (```…```) from a raw JSON block.

    The input is stripped of outer whitespace first. If it does not begin with a
    fence it is returned as is; otherwise the opening fence line is dropped, and
    the last line is dropped too when it is a closing fence.
    """
    body = raw.strip()
    if not body.startswith("```"):
        return body

    # The first line is the opening fence (possibly with a language tag).
    rows = body.splitlines()[1:]
    if rows and rows[-1].startswith("```"):
        rows.pop()
    return "\n".join(rows)


def extract_text_from_pdf(path: Path, start: int, end: Optional[int]) -> str:
    """Read text from pages `start` through `end` of a PDF file and concatenate them into one string.

    :param path: Location of the PDF file.
    :param start: First page to read, 1-based. Values below 1 are treated as 1
                  rather than being turned into a negative slice index.
    :param end: Last page to read (1-based, inclusive); a falsy value (None or 0)
                reads through the final page.
    :return: The extracted page texts joined with newlines. A page whose extraction
             raises is logged with its page number and skipped.
    """
    reader = PdfReader(str(path))
    first = max(start, 1)
    pages = reader.pages[first - 1 : end] if end else reader.pages[first - 1 :]
    texts = []
    for page_number, page in enumerate(pages, start=first):
        try:
            texts.append(page.extract_text() or "")
        except Exception as e:
            # Best effort: one unreadable page must not abort the whole document.
            logger.warning(f"Failed to extract text from page {page_number}: {e}")
    return "\n".join(texts)


def normalize_text(text: str) -> str:
    """Strip out all digits, then collapse every run of whitespace into a single space.

    Digits are removed first so that a standalone number ("a 12 b") does not leave
    a double space behind. `\\d` matches any Unicode decimal digit, so Persian
    digits are removed as well.

    :param text: Raw text.
    :return: The cleaned text with no leading or trailing whitespace.
    """
    text = re.sub(r"\d+", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()


def _overlap_tail(words: List[str], overlap: int) -> List[str]:
    """Return the longest run of whole trailing words whose joined length is at most `overlap` characters."""
    kept: List[str] = []
    used = 0
    for word in reversed(words):
        # Every word after the first one kept also costs one joining space.
        cost = len(word) + (1 if kept else 0)
        if used + cost > overlap:
            break
        kept.append(word)
        used += cost
    kept.reverse()
    return kept


def chunk_text(text: str, max_chars: int, overlap: int) -> List[str]:
    """Break `text` into word-aligned chunks, carrying up to `overlap` characters of context.

    :param text: Text to split; words are separated on whitespace.
    :param max_chars: Size budget per chunk. Each word is counted as its length plus
                      one, so a chunk only exceeds the budget when a single word (or
                      the carried overlap plus one word) is too long by itself.
    :param overlap: Maximum number of characters repeated from the end of one chunk at
                    the start of the next. Only whole words are carried over; nothing
                    is carried when the whole chunk would fit inside the overlap.
    :return: The list of chunks, never containing an empty string.
    """
    words = text.split()
    if not words:
        return []

    chunks: List[str] = []
    current: List[str] = []
    length = 0

    for w in words:
        # `current` must be non-empty, otherwise an oversized first word would emit "".
        if current and length + len(w) + 1 > max_chars:
            chunks.append(" ".join(current))
            carry = _overlap_tail(current, overlap) if overlap > 0 else []
            if len(carry) == len(current):
                # The whole chunk fits in the overlap window; repeating it adds nothing.
                carry = []
            current = carry
            length = sum(len(x) + 1 for x in current)
        current.append(w)
        length += len(w) + 1

    if current:
        chunks.append(" ".join(current))

    return chunks

In [47]:
# The schema and example below are str.format templates: every literal brace is
# doubled ({{ }}) and only becomes a single brace once .format() is applied.
# build_prompt() renders them before inserting them into PROMPT_TEMPLATE.
JSON_SCHEMA_EXACT = """
نمونه ساختار دقیق JSON برای هر استان:

{{
  "title": "string",
  "location": {{ "province":"string","city":"string" }},
  "geographical_features":[
    {{ "name":"string","items":[{{ "name":"string","images":["string"] }}]}}
  ],
  "natural_resources":[
    {{ "name":"string","description":["string"] }}
  ],
  "vegetation":["string"],
  "topography":[
    {{ "name":"string","description":["string"] }}
  ],
  "tourist_attractions":[
    {{ "name":"string","images":["string"],"year_built":"string",
       "constructor":"string","architect":"string","description":"string" }}
  ],
  "climate_impacts":[
    {{ "impact":"string","description":["string"] }}
  ],
  "additional_info":{{
    "books_source":"string","other_sources":["string"]
  }}
}}
"""

EXAMPLE_JSON = """
مثال پرشده برای «ویژگی‌های جغرافیایی کهگیلویه‌و‌بویراحمد:

{{
  "title": "ویژگی‌های جغرافیایی کهگیلویه‌و‌بویراحمد",
  "location": {{ "province":"کهگیلویه‌و‌بویراحمد","city":"یاسوج" }},
  "geographical_features":[
    {{
      "name": "رودخانه‌ها",
      "items":[{{ "name":"رود مارون","images": [] }}]
    }},
    {{
      "name": "کوه‌ها",
      "items":[{{ "name":"کوه سرخ","images": [] }}]
    }}
  ],
  "natural_resources": [],
  "vegetation": [],
  "topography": [],
  "tourist_attractions": [],
  "climate_impacts": [],
  "additional_info": {{ "books_source":"","other_sources":[] }}
}}
"""

# Instruction block sent ahead of the source text; {schema} and {example} are
# filled in by build_prompt().
PROMPT_TEMPLATE = """
شما مدل Gemma-3 هستید و **تنها** باید یک شیء JSON یکتا و **معتبر** تولید کنید.
۱. کلیدها و مقدارهای رشته‌ای حتماً با گیومهٔ دوگانه ("") باشند.
۲. اگر داده‌ای وجود ندارد، از "" یا [] استفاده کنید؛ **هرگز** {{}} خالی ننویسید.
۳. اگر برای فیلدی اطمینان کمتر از ۹۰٪ دارید یا داده نیست، آن را "" یا [] بگذارید.
۴. حتماً حداقل یک مورد واقعی برای هر لیست استخراج‌شده در متن بیاورید.
۵. **هرگز** JSON را داخل code fence (```…```) یا تگ Markdown قرار ندهید—فقط جسم خالص JSON را برگردانید!
۶. فقط JSON خالص، بدون توضیح یا کامنت.
۷. **هرگز** تنها مجموعهٔ نمونه (EXAMPLE_JSON) را به‌عنوان خروجی نهایی برنگردانید؛ حتماً داده‌های استخراج‌شده از متن ورودی را نمایش دهید.

{schema}

{example}

حال با متن زیر، دقیقاً یک شیء JSON منطبق بر ساختار فوق برگردانید:
"""

def build_prompt(province: str, chunk: str) -> str:
    """
    Construct the model's input prompt for a given province and text chunk.

    The schema and example templates are rendered first, so the model sees valid
    single-brace JSON instead of the doubled `{{ }}` escapes, and are then inserted
    into the instruction template. The province name and the source text follow
    under a 'متن منبع' header. Neither `province` nor `chunk` is passed through
    str.format, so braces in the source text are safe.

    :param province: The name of the province to contextualize the prompt.
    :param chunk: The segment of source text to include.
    :return: A ready-to-send prompt string combining template, example, and text.
    """
    instructions = PROMPT_TEMPLATE.format(
        schema=JSON_SCHEMA_EXACT.format(),
        example=EXAMPLE_JSON.format(),
    )
    return (
        instructions
        + "\nاستان: "
        + province
        + "\n--- متن منبع ---\n"
        + chunk
        + "\n"
    )

In [37]:
# Load the model named in CONFIG as a 4-bit quantized checkpoint (no 8-bit loading,
# no full fine-tuning), with the sequence-length limit taken from CONFIG and
# device placement left to device_map="auto".
console.log(f"Loading model {CONFIG['model_name']} on {CONFIG['device']}...")
model, tokenizer = FastModel.from_pretrained(
    model_name=CONFIG['model_name'],
    max_seq_length=CONFIG['max_seq_length'],
    load_in_4bit=True,
    load_in_8bit=False,
    full_finetuning=False,
    device_map="auto"
)
# Rebind `tokenizer` to the version configured with the "gemma-3" chat template.
tokenizer = get_chat_template(tokenizer, chat_template="gemma-3")
console.log("Model loaded Succesfully!")

==((====))==  Unsloth 2025.4.4: Fast Gemma3 patching. Transformers: 4.51.3. vLLM: 0.8.5.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3 won't work! Using float32.


In [49]:
# Cross-chunk de-duplication state used by process_chunk():
# one set of already-recorded keys per output section, plus a single set for the
# names of sub-items nested under geographical features.
seen_entries = {
    section: set()
    for section in (
        "geographical_features",
        "natural_resources",
        "vegetation",
        "topography",
        "tourist_attractions",
        "climate_impacts",
    )
}
seen_subitems = set()

def _dedupe_parsed_sections(parsed: Dict[str, Any]) -> None:
    """Remove entries already recorded in `seen_entries` / `seen_subitems` from `parsed`, in place.

    Every known section ends up as a list, even when the model returned "" or
    another non-list value for it. Entries that are not shaped as expected (not a
    dict, or without a usable string identifier) are dropped instead of raising.
    """
    # Field that identifies an entry within each section. In JSON_SCHEMA_EXACT the
    # climate_impacts entries carry "impact" rather than "name".
    id_fields = {
        "geographical_features": "name",
        "natural_resources": "name",
        "topography": "name",
        "tourist_attractions": "name",
        "climate_impacts": "impact",
    }
    for section, id_field in id_fields.items():
        entries = parsed.get(section)
        if not isinstance(entries, list):
            entries = []
        kept = []
        for item in entries:
            if not isinstance(item, dict):
                continue
            name = item.get(id_field)
            if not isinstance(name, str) or not name or name in seen_entries[section]:
                continue
            seen_entries[section].add(name)

            if section == "geographical_features":
                # NOTE(review): a category name seen in an earlier chunk is skipped
                # entirely by the check above, including any new sub-items it
                # carries here - confirm this is intended.
                subs = item.get("items")
                if not isinstance(subs, list):
                    subs = []
                kept_subs = []
                for sub in subs:
                    if not isinstance(sub, dict):
                        continue
                    subname = sub.get("name")
                    if isinstance(subname, str) and subname and subname not in seen_subitems:
                        seen_subitems.add(subname)
                        kept_subs.append(sub)
                item["items"] = kept_subs
                if kept_subs:
                    kept.append(item)
            else:
                kept.append(item)
        parsed[section] = kept

    # Vegetation is a flat list of values; de-duplicate on the JSON form of each value.
    values = parsed.get("vegetation")
    if not isinstance(values, list):
        values = []
    kept_values = []
    for val in values:
        if not val:
            continue
        val_key = json.dumps(val, sort_keys=True)
        if val_key not in seen_entries["vegetation"]:
            seen_entries["vegetation"].add(val_key)
            kept_values.append(val)
    parsed["vegetation"] = kept_values


def process_chunk(chunk: str, idx: int) -> Optional[Dict[str, Any]]:
    """Send one text chunk to the model and return its parsed, de-duplicated JSON.

    :param chunk: Source text for this step.
    :param idx: 1-based chunk number; the first ten chunks are echoed to the
                console in full (prompt, raw output, extracted and parsed JSON).
    :return: The parsed JSON object with entries already seen in earlier chunks
             removed, or None when the reply contains no JSON block or the block
             cannot be parsed.
    """
    from rich.markup import escape  # local import: only needed to log untrusted text safely

    verbose = idx <= 10
    prompt = build_prompt(CONFIG['province'], chunk)
    if verbose:
        console.rule(f"[bold green]Chunk {idx} Prompt[/]")
        # markup=False: PDF text and model output may contain "[...]" sequences
        # that Rich would otherwise try to interpret as style tags.
        console.print(prompt, overflow="fold", markup=False)
        console.rule()

    inp = tokenizer.apply_chat_template(
        [{"role":"user","content":[{"type":"text","text":prompt}]}],
        add_generation_prompt=True
    )
    inputs = tokenizer([inp], return_tensors="pt").to(CONFIG['device'])
    out = None
    try:
        with torch.no_grad():
            out = model.generate(
                **inputs,
                # Use the configured budget; a smaller hard-coded limit can cut
                # the JSON object off before its closing brace.
                max_new_tokens=CONFIG['max_new_tokens'],
                temperature=1.0,
                top_p=0.9,
                top_k=50,
            )
        decoded = tokenizer.batch_decode(out)[0]
    finally:
        # Drop this frame's references first so the tensors can actually be
        # released, then collect and empty the CUDA cache - also on errors.
        del inputs, out
        cleanup_gpu(None, None)

    # Keep only the text after the model-turn marker, without the end-of-turn token.
    marker = "<start_of_turn>model"
    content = decoded.split(marker,1)[1] if marker in decoded else decoded
    content = content.replace("<end_of_turn>", "").strip()

    if verbose:
        console.rule(f"[bold blue]Chunk {idx} Raw Output[/]")
        console.print(content, overflow="fold", markup=False)
        console.rule()

    raw_json = extract_json_block(content)
    if not raw_json:
        console.log(f"[red]⚠️ Chunk {idx}: no JSON block. Snippet: {escape(content[:200])}...")
        return None

    raw_json = clean_json_block(raw_json)
    raw_json = clean_json_string(raw_json)

    if verbose:
        console.rule(f"[bold yellow]Chunk {idx} Extracted JSON Block[/]")
        console.print(raw_json, overflow="fold", markup=False)
        console.rule()

    try:
        parsed = json.loads(raw_json)
    except json.JSONDecodeError as e:
        console.log(f"[red]Chunk {idx} JSON parse error:[/] {e}")
        parsed = None

    if isinstance(parsed, dict) and parsed:
        _dedupe_parsed_sections(parsed)

    if verbose and parsed is not None:
        console.rule(f"[bold magenta]Chunk {idx} Parsed & Deduped JSON[/]")
        console.print(json.dumps(parsed, ensure_ascii=False, indent=2), markup=False)
        console.rule()

    return parsed

def cleanup_gpu(inputs, out):
    """Run a garbage-collection pass and, on CUDA, empty PyTorch's CUDA cache.

    `inputs` and `out` are unbound inside this function only; references the
    caller still holds to the same objects are not affected by that.
    """
    import gc

    del inputs
    del out
    gc.collect()
    if not CONFIG['device'].startswith("cuda"):
        return
    torch.cuda.empty_cache()

In [44]:
def main():
    """
    Orchestrate the end-to-end extraction for a given province:
      1. Reset the cross-chunk de-duplication state.
      2. Load and normalize text from the configured PDF pages.
      3. Split that text into overlapping chunks.
      4. Send each chunk to the model, filtering out any duplicate entries.
      5. Aggregate all unique results into one JSON file.
      6. Display progress during processing and a summary table at the end.
    """
    start_time = time.time()
    console.rule("[bold green]Starting Province Extraction[/]")

    # The dedupe sets live at module level. Without this reset, a second run in the
    # same kernel would treat every entry as already seen and write an almost empty file.
    for seen_names in seen_entries.values():
        seen_names.clear()
    seen_subitems.clear()

    text = extract_text_from_pdf(CONFIG['pdf_path'], CONFIG['start_page'], CONFIG['end_page'])
    text = normalize_text(text)
    chunks = chunk_text(text, CONFIG['chunk_size'], CONFIG['overlap_size'])

    console.print(f"[bold]Province:[/] {CONFIG['province']}")
    console.print(f"[bold]Total chunks to process:[/] {len(chunks)}\n")

    combined: Dict[str, Any] = {"province": CONFIG['province']}
    partials: List[Dict[str, Any]] = []

    with Progress(
        SpinnerColumn(style="bold green"),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(bar_width=None),
        TextColumn("[bold magenta]{task.completed}/{task.total} chunks"),
        TimeElapsedColumn(),
        TimeRemainingColumn(),
        console=console
    ) as progress:
        task_id = progress.add_task("Extracting chunks", total=len(chunks))
        for idx, chunk in enumerate(chunks, start=1):
            result = process_chunk(chunk, idx)
            if result:
                partials.append(result)
            progress.advance(task_id)

    # Merge the per-chunk results: the first value seen for a key wins, and later
    # list values are appended to it. Non-list values from later chunks are ignored.
    for part in partials:
        for key, value in part.items():
            if key == "province":
                continue
            if key not in combined:
                # Copy lists so that extending `combined` never mutates a partial result.
                combined[key] = list(value) if isinstance(value, list) else value
            elif isinstance(value, list) and isinstance(combined[key], list):
                combined[key].extend(value)

    out_path = Path(f"./{CONFIG['province']}_dataset.json")
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(combined, f, ensure_ascii=False, indent=2)

    elapsed = time.time() - start_time
    # Lists count by length; every other top-level value counts as one item.
    total_items = sum(
        len(v) if isinstance(v, list) else 1
        for k, v in combined.items()
        if k != "province"
    )

    console.rule("[bold green]Extraction Complete[/]")
    console.print(f"• Saved to: [bold]{out_path}[/]")
    console.print(f"• Items extracted: [bold]{total_items}[/]")
    console.print(f"• Chunks processed: [bold]{len(chunks)}[/]")
    console.print(f"• Elapsed time: [bold]{elapsed:.2f}s[/]\n")

    table = Table(title="✅ Extraction Summary")
    table.add_column("Province", style="cyan")
    table.add_column("Items", justify="right", style="magenta")
    table.add_column("Chunks", justify="right", style="magenta")
    table.add_column("Time (s)", justify="right", style="magenta")
    table.add_row(
        CONFIG["province"],
        str(total_items),
        str(len(chunks)),
        f"{elapsed:.2f}"
    )
    console.print(table)

In [None]:
# Start the extraction only when this code runs as the top-level module, not when it is imported.
if __name__ == '__main__':
    main()

Output()