In [1]:
# ==== Kaggle setup (run once per session) ====
# Make sure Internet is ON in Kaggle notebook settings.

# System deps: Poppler (for pdf2image) + Tesseract with Arabic data
!apt-get -y update >/dev/null
!apt-get -y install -qq poppler-utils tesseract-ocr tesseract-ocr-ara >/dev/null

# Python libs
!pip -q install pdfplumber PyMuPDF arabic-reshaper python-bidi pytesseract pdf2image

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m63.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m61.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m62.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25h

In [2]:
# ===== Robust Arabic PDF extractor (ensemble, batch, headers learned) =====
# What it does
# - Processes all PDFs in INPUT_DIR (and you can attach multiple datasets)
# - For each page, tries pdfplumber → PyMuPDF(text) → PyMuPDF(blocks), then OCR if needed
# - Picks best candidate by Arabic quality
# - Learns repeating headers/footers (per document) and strips them
# - Applies gentle OCR fixes (percent signs, digits)
# - Saves per-PDF:
#       /kaggle/working/outputs/<pdf-stem>/
#           ├─ pages.jsonl              (one JSON per page: page, source, text)
#           ├─ output_logical.txt       (NLP-friendly with [[PAGE | SOURCE]] markers)
#           └─ output_readable.txt      (reshaped RTL for human reading)
# - Also writes:
#       /kaggle/working/outputs/corpus_pages.jsonl   (all docs aggregated)

import os, re, json, unicodedata, glob, shutil
import pdfplumber, fitz
import pytesseract
from pdf2image import convert_from_path
import arabic_reshaper
from bidi.algorithm import get_display
from pathlib import Path
from collections import Counter

# ========= Configure your paths here =========
# Attach your dataset(s) in the right panel → Add data. Then set INPUT_DIR to that path.
INPUT_DIR  = "/kaggle/input/tax-data"   # <-- change to your dataset folder
OUTPUT_DIR = "/kaggle/working/outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ========= OCR and extraction settings =========
OCR_LANG        = "ara"
OCR_CONFIG      = "--oem 3 --psm 4"     # try --psm 4 for multi-column pages
MIN_LEN         = 30                    # below this we assume page failed
AR_RATIO_THRESH = 0.15                  # minimal Arabic ratio to accept non-OCR text
USE_PDF2IMAGE   = True                  # If apt-get is not allowed, set to False (uses PyMuPDF rasterization)

# ========= Arabic helpers =========
def has_arabic_ratio(s: str, thresh: float = AR_RATIO_THRESH) -> bool:
    if not s:
        return False
    ar = re.findall(r'[\u0600-\u06FF]', s)
    return (len(ar) / max(1, len(s))) >= thresh

def normalize_for_search(s: str) -> str:
    s = unicodedata.normalize("NFKC", s)
    s = re.sub(r'[ـ]+', '', s)
    s = re.sub(r'[ \t]+', ' ', s)
    s = re.sub(r'[\r\n]+', '\n', s)
    s = s.translate(str.maketrans('٠١٢٣٤٥٦٧٨٩', '0123456789'))
    return s

def make_readable_arabic(s: str) -> str:
    reshaped = arabic_reshaper.reshape(s)
    return get_display(reshaped)

def fix_common_ocr(text: str) -> str:
    # unify percent variants and tidy spaces: "60 %" -> "60%"
    text = text.replace("٪","%").replace("﹪","%")
    text = re.sub(r"(\d)\s*%", r"\1%", text)          # 60 % -> 60%
    text = re.sub(r"(?<=\d)\s+(?=\d)", "", text)      # 6 0 -> 60
    # Domain-specific safe fix: misread "60%" as "9060" before "من مبلغ/قيمة/الضريبة"
    text = re.sub(r"\b90?60\b(?=\s+(?:من|عن)\s+(?:مبلغ|قيمة|الضريبة))", "60%", text)
    # Harmonize digits
    text = text.translate(str.maketrans('٠١٢٣٤٥٦٧٨٩','0123456789'))
    return text

def score_arabic_quality(s: str) -> float:
    """Simple score: length-weighted Arabic ratio."""
    if not s: return 0.0
    return (len(s) / 1000.0) * (1.0 if has_arabic_ratio(s) else 0.0)

# ========= Extraction per page (multi-method) =========
def extract_page_pdfplumber(pg) -> str:
    try:
        t = pg.extract_text() or ""
        return t.strip()
    except Exception:
        return ""

def extract_page_pymupdf_text(p: fitz.Page) -> str:
    try:
        t = p.get_text("text") or ""
        return t.strip()
    except Exception:
        return ""

def extract_page_pymupdf_blocks(p: fitz.Page) -> str:
    # Sort blocks top-to-bottom then left-to-right; join lines
    try:
        blocks = p.get_text("blocks") or []
        blocks = sorted(blocks, key=lambda b: (round(b[1],2), round(b[0],2)))
        lines = []
        for b in blocks:
            txt = (b[4] or "").strip()
            if txt:
                lines.append(txt)
        return "\n".join(lines).strip()
    except Exception:
        return ""

def rasterize_pymupdf(doc: fitz.Document, i: int, dpi: int = 300):
    """Rasterize page i with PyMuPDF; returns PIL Image."""
    # PyMuPDF returns pixmaps; convert to PIL
    page = doc[i]
    zoom = dpi / 72.0
    mat = fitz.Matrix(zoom, zoom)
    pix = page.get_pixmap(matrix=mat, alpha=False)
    try:
        from PIL import Image
        mode = "RGB" if pix.n < 4 else "RGBA"
        im = Image.frombytes(mode, [pix.width, pix.height], pix.samples)
        return im
    except Exception:
        return None

def ocr_page_image(images_or_doc, idx, via_pdf2image=True) -> str:
    if via_pdf2image:
        im = images_or_doc[idx]
    else:
        im = rasterize_pymupdf(images_or_doc, idx, dpi=300)
        if im is None:
            return ""
    txt = pytesseract.image_to_string(im, lang=OCR_LANG, config=OCR_CONFIG) or ""
    return txt.strip()

# ========= Learn & strip headers/footers per document =========
def learn_repeating_edge_lines(page_texts, k=2, freq_threshold=0.5):
    """
    Collect first/last k non-empty lines per page, count frequency,
    and return sets of lines that appear in >= freq_threshold of pages.
    """
    first_counts, last_counts = Counter(), Counter()
    n_pages = len(page_texts)

    for t in page_texts:
        lines = [ln.strip() for ln in t.splitlines() if ln.strip()]
        if not lines: 
            continue
        first_counts.update(lines[:min(k, len(lines))])
        last_counts.update(lines[-min(k, len(lines)):])

    first_common = {ln for ln, c in first_counts.items() if c >= freq_threshold * n_pages}
    last_common  = {ln for ln, c in last_counts.items()  if c >= freq_threshold * n_pages}
    return first_common, last_common

def strip_learned_headers_footers(text, first_common, last_common):
    out = []
    for ln in text.splitlines():
        s = ln.strip()
        if s in first_common or s in last_common:
            continue
        out.append(ln)
    return "\n".join(out)

# ========= Process a single PDF =========
def process_pdf(pdf_path: str, out_dir_root: str):
    stem = Path(pdf_path).stem
    out_dir = os.path.join(out_dir_root, stem)
    os.makedirs(out_dir, exist_ok=True)

    # Open once for both engines
    doc_pl = pdfplumber.open(pdf_path)
    doc_fz = fitz.open(pdf_path)

    # Render images lazily only if needed
    images = None

    records_raw = []  # before header/footer stripping
    max_pages = max(len(doc_pl.pages), doc_fz.page_count)

    for i in range(max_pages):
        page_num = i + 1
        cand = []  # [(source, text, score)]

        # pdfplumber
        if i < len(doc_pl.pages):
            t1 = extract_page_pdfplumber(doc_pl.pages[i])
            cand.append(("pdfplumber", t1, score_arabic_quality(t1)))

        # PyMuPDF text & blocks
        if i < doc_fz.page_count:
            p = doc_fz[i]
            t2 = extract_page_pymupdf_text(p)
            cand.append(("pymupdf_text", t2, score_arabic_quality(t2)))
            t3 = extract_page_pymupdf_blocks(p)
            cand.append(("pymupdf_blocks", t3, score_arabic_quality(t3)))

        # Choose best non-OCR candidate
        cand.sort(key=lambda x: x[2], reverse=True)
        best_src, best_txt, best_score = (cand[0] if cand else ("", "", 0.0))

        # If too short or poor Arabic ratio, OCR
        if (len(best_txt.strip()) < MIN_LEN) or (not has_arabic_ratio(best_txt, AR_RATIO_THRESH)):
            if USE_PDF2IMAGE:
                if images is None:
                    images = convert_from_path(pdf_path, dpi=300)
                best_txt = ocr_page_image(images, i, via_pdf2image=True)
            else:
                best_txt = ocr_page_image(doc_fz, i, via_pdf2image=False)
            best_src = "ocr"

        # Cleanup OCR/common artifacts
        best_txt = fix_common_ocr(best_txt)

        records_raw.append({"page": page_num, "source": best_src, "text": best_txt})

    # Learn headers/footers and strip
    first_common, last_common = learn_repeating_edge_lines([r["text"] for r in records_raw])
    records = []
    for r in records_raw:
        cleaned = strip_learned_headers_footers(r["text"], first_common, last_common).strip()
        records.append({"page": r["page"], "source": r["source"], "text": cleaned})

    # Save per-pdf JSONL
    pages_jsonl = os.path.join(out_dir, "pages.jsonl")
    with open(pages_jsonl, "w", encoding="utf-8") as f:
        for r in records:
            f.write(json.dumps({"doc": os.path.basename(pdf_path), **r}, ensure_ascii=False) + "\n")

    # Save logical & readable
    logical_path  = os.path.join(out_dir, "output_logical.txt")
    readable_path = os.path.join(out_dir, "output_readable.txt")

    logical_text = "\n\n".join(f"[[PAGE {r['page']} | {r['source'].upper()}]]\n{r['text']}" for r in records).strip()
    with open(logical_path, "w", encoding="utf-8") as f:
        f.write(logical_text)

    readable_text = make_readable_arabic(logical_text)
    with open(readable_path, "w", encoding="utf-8") as f:
        f.write(readable_text)

    # Close docs
    doc_pl.close()
    doc_fz.close()

    return out_dir, records

# ========= Batch over a folder =========
def process_folder(input_dir: str, output_dir: str):
    pdfs = sorted(glob.glob(os.path.join(input_dir, "*.pdf")))
    if not pdfs:
        print(f"⚠️ No PDFs found in: {input_dir}")
        return

    corpus_path = os.path.join(output_dir, "corpus_pages.jsonl")
    with open(corpus_path, "w", encoding="utf-8") as corpus:
        for pdf in pdfs:
            print(f"📄 Processing: {os.path.basename(pdf)}")
            out_dir, recs = process_pdf(pdf, output_dir)
            for r in recs:
                corpus.write(json.dumps({"doc": os.path.basename(pdf), **r}, ensure_ascii=False) + "\n")
            print(f"   ✅ Saved → {out_dir}")

    print("\n🎉 Done. Corpus JSONL:", corpus_path)

# ========= RUN =========
process_folder(INPUT_DIR, OUTPUT_DIR)


📄 Processing: -    -  .pdf
   ✅ Saved → /kaggle/working/outputs/-    -  
📄 Processing: 2017 .pdf
   ✅ Saved → /kaggle/working/outputs/2017 
📄 Processing: 2020_0.pdf
   ✅ Saved → /kaggle/working/outputs/2020_0

🎉 Done. Corpus JSONL: /kaggle/working/outputs/corpus_pages.jsonl


In [5]:
# ==== FIXED unit builder: raw-span extraction + broader headers (circulars) ====
# Inputs:  /kaggle/working/outputs/corpus_pages.jsonl
# Outputs: /kaggle/working/outputs/units.jsonl, per-doc units.jsonl, units_summary.csv

import os, re, json, csv, unicodedata
from pathlib import Path
from collections import defaultdict

OUTPUT_ROOT = "/kaggle/working/outputs"
CORPUS_JSONL = f"{OUTPUT_ROOT}/corpus_pages.jsonl"
UNITS_JSONL  = f"{OUTPUT_ROOT}/units.jsonl"
UNITS_CSV    = f"{OUTPUT_ROOT}/units_summary.csv"
assert os.path.exists(CORPUS_JSONL), f"Missing {CORPUS_JSONL}. Run the extractor first."

def normalize_for_search(s: str) -> str:
    s = unicodedata.normalize("NFKC", s)
    s = re.sub(r'[ـ]+', '', s)
    s = re.sub(r'[ \t]+', ' ', s)
    s = re.sub(r'[\r\n]+', '\n', s)
    s = s.translate(str.maketrans('٠١٢٣٤٥٦٧٨٩','0123456789'))
    s = s.replace("أ","ا").replace("إ","ا").replace("آ","ا")
    return s

# --- Regex on RAW text (no normalization in spans) ---
ART_RE_RAW = re.compile(
    r'(?m)^(?:المادة|مادة)\s*[\(]?\s*(\d+)\s*[\)]?\s*(.*?)(?=^(?:المادة|مادة)\s*[\(]?\d+|\Z)',
    flags=re.DOTALL
)

# Broader family of admin headers (very common in ETA PDFs)
CIRC_FAMILY = r'(?:تعليمات(?:\s+تنفيذية)?|منشور(?:\s+عام)?|كتاب\s+دوري|قرار|بيان)'
CIRC_RE_RAW = re.compile(
    rf'(?ms)^\s*{CIRC_FAMILY}\s*(?:رقم\s*\(([^)]+)\))?\s*(?:لسنة\s*([0-9]{{3,4}}))?.*?(?=^\s*{CIRC_FAMILY}\b|^\s*(?:المادة|مادة)\b|\Z)'
)

# Some docs have headings like: "تعليمات رقم 12 لسنة 2020 بشأن ..." without parentheses
CIRC_ALT_RE_RAW = re.compile(
    rf'(?ms)^\s*{CIRC_FAMILY}\s*(?:رقم\s*([0-9]+))?\s*(?:لسنة\s*([0-9]{{3,4}}))?.*?(?=^\s*{CIRC_FAMILY}\b|^\s*(?:المادة|مادة)\b|\Z)'
)

def guess_doc_type_raw(raw: str) -> str:
    art_hits  = len(re.findall(r'(?m)^(?:المادة|مادة)\s*\(?\d+', raw))
    circ_hits = len(re.findall(rf'(?m)^\s*{CIRC_FAMILY}', raw))
    if art_hits >= 5 and art_hits >= circ_hits * 2: return "law_or_reg"
    if circ_hits >= 1 and circ_hits >= art_hits:    return "circulars"
    return "unknown"

def join_pages_to_text(pages):
    if not pages: return ""
    pages = sorted(set(pages))
    if len(pages) == 1: return f"{pages[0]}"
    ranges, start, prev = [], pages[0], pages[0]
    for p in pages[1:]:
        if p == prev + 1:
            prev = p
        else:
            ranges.append((start, prev)); start = prev = p
    ranges.append((start, prev))
    return ", ".join(f"{a}" if a==b else f"{a}-{b}" for a,b in ranges)

def pages_covered(span_start, span_end, page_markers):
    # page_markers: list[(pos_after_marker_line, page_number)]
    pages = []
    for pos, pg in page_markers:
        if span_start <= pos < span_end:
            pages.append(pg)
    if not pages and page_markers:
        # include nearest previous page
        prev = None
        for pos, pg in page_markers:
            if pos <= span_start: prev = pg
            else: break
        if prev is not None: pages = [prev]
    return sorted(set(pages))

def safe_stem(name: str) -> str:
    # sanitize folder names like "-    -  .pdf"
    stem = Path(name).stem.strip()
    stem = re.sub(r'[\\/:*?"<>|]+', '_', stem)   # windows-forbidden
    stem = re.sub(r'\s+', ' ', stem).strip()
    return stem or "doc"

# ---- Load corpus ----
docs = defaultdict(list)
with open(CORPUS_JSONL, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip(): continue
        rec = json.loads(line)
        docs[rec["doc"]].append((rec.get("page"), rec.get("text","")))

# ---- Build units per doc ----
all_units = []
summary_rows = []

for doc, page_blocks in docs.items():
    page_blocks = sorted(page_blocks, key=lambda x: (x[0] if x[0] else 0))
    stem = safe_stem(doc)
    per_doc_dir = f"{OUTPUT_ROOT}/{stem}"
    os.makedirs(per_doc_dir, exist_ok=True)

    # Build RAW full text with page markers for span→page mapping
    combined, marker_positions, pos = [], [], 0
    for pg, txt in page_blocks:
        marker = f"[[PAGE {pg}]]\n"
        combined.append(marker); pos += len(marker)
        marker_positions.append((pos, pg))
        combined.append(txt or ""); pos += len(txt or "")
        combined.append("\n\n"); pos += 2
    raw_text = "".join(combined)

    dtype = guess_doc_type_raw(raw_text)
    units = []

    if dtype == "law_or_reg":
        for m in ART_RE_RAW.finditer(raw_text):
            art_no = m.group(1)
            s, e = m.span()
            chunk_raw = raw_text[s:e].strip()
            pg_list   = pages_covered(s, e, marker_positions)
            units.append({
                "doc_id": doc,
                "unit_type": "article",
                "unit_id": f"المادة {art_no}",
                "text": chunk_raw,
                "search_text": normalize_for_search(chunk_raw),
                "pages": pg_list,
                "pages_human": join_pages_to_text(pg_list),
                "effective_from": None, "effective_to": None, "version_note": None
            })

    elif dtype == "circulars":
        # Try strict pattern first, then relaxed
        matches = list(CIRC_RE_RAW.finditer(raw_text))
        if not matches:
            matches = list(CIRC_ALT_RE_RAW.finditer(raw_text))
        for m in matches:
            num = (m.group(1) or "").strip()
            yr  = (m.group(2) or "").strip()
            s, e = m.span()
            chunk_raw = raw_text[s:e].strip()
            pg_list   = pages_covered(s, e, marker_positions)
            unit_id = f"{'تعليمات/منشور/كتاب/قرار'}"
            if num and yr:
                unit_id = f"{unit_id} رقم {num} لسنة {yr}"
            elif num:
                unit_id = f"{unit_id} رقم {num}"
            units.append({
                "doc_id": doc,
                "unit_type": "circular",
                "unit_id": unit_id,
                "text": chunk_raw,
                "search_text": normalize_for_search(chunk_raw),
                "pages": pg_list,
                "pages_human": join_pages_to_text(pg_list),
                "effective_from": None, "effective_to": None, "version_note": None
            })

    # Fallback if nothing matched or dtype unknown: big sections
    if not units:
        # Split on large gaps or major headings
        blocks = re.split(r"\n{3,}", raw_text)
        for idx, blk in enumerate(blocks, 1):
            b = blk.strip()
            if len(b) < 200: 
                continue
            s = raw_text.find(blk)
            e = s + len(blk)
            pg_list = pages_covered(s, e, marker_positions)
            units.append({
                "doc_id": doc, "unit_type": "section",
                "unit_id": f"section-{idx}",
                "text": b, "search_text": normalize_for_search(b),
                "pages": pg_list, "pages_human": join_pages_to_text(pg_list),
                "effective_from": None, "effective_to": None, "version_note": None
            })

    # Save per-doc units
    per_doc_units = f"{per_doc_dir}/units.jsonl"
    with open(per_doc_units, "w", encoding="utf-8") as out:
        for u in units:
            out.write(json.dumps(u, ensure_ascii=False) + "\n")

    all_units.extend(units)
    summary_rows.append({"doc": doc, "type": dtype, "pages": len(page_blocks), "units": len(units)})

# ---- Write global files ----
with open(UNITS_JSONL, "w", encoding="utf-8") as f:
    for u in all_units:
        f.write(json.dumps(u, ensure_ascii=False) + "\n")

with open(UNITS_CSV, "w", encoding="utf-8", newline="") as f:
    w = csv.DictWriter(f, fieldnames=["doc","type","pages","units"])
    w.writeheader()
    for row in summary_rows:
        w.writerow(row)

print(f"✅ Rebuilt units. Total = {len(all_units)}")
print(f"• All units  : {UNITS_JSONL}")
print(f"• Summary CSV: {UNITS_CSV}")
print("• Per-doc units under /kaggle/working/outputs/<doc-stem>/units.jsonl")


✅ Rebuilt units. Total = 24
• All units  : /kaggle/working/outputs/units.jsonl
• Summary CSV: /kaggle/working/outputs/units_summary.csv
• Per-doc units under /kaggle/working/outputs/<doc-stem>/units.jsonl


In [6]:
import os, json, csv, glob, random, pandas as pd
from pathlib import Path

ROOT = "/kaggle/working/outputs"
UNITS_JSONL = f"{ROOT}/units.jsonl"
SUMMARY_CSV = f"{ROOT}/units_summary.csv"

assert os.path.exists(SUMMARY_CSV), "units_summary.csv not found"
assert os.path.exists(UNITS_JSONL), "units.jsonl not found"

# 1) Overview
summary = pd.read_csv(SUMMARY_CSV)
display(summary)

print("\nDocs with 0 units (should be none):")
print(summary[summary["units"] == 0])

# 2) Load all units
units = []
with open(UNITS_JSONL, "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            units.append(json.loads(line))
print(f"\nTotal units: {len(units)}")

df = pd.DataFrame(units)

# 3) Basic health checks
print("\nUnit types distribution:")
print(df["unit_type"].value_counts())

print("\nEmpty or too-short texts (<120 chars):")
short = df[df["text"].str.len().fillna(0) < 120]
print(short[["doc_id","unit_type","unit_id"]].head(10))

print("\nUnits missing pages:")
missing_pages = df[df["pages"].apply(lambda x: not x)]
print(missing_pages[["doc_id","unit_type","unit_id"]].head(10))

# 4) Per-doc spot checks (3 samples each)
for doc, sub in df.groupby("doc_id"):
    print(f"\n===== {doc} | {len(sub)} units =====")
    sample = sub.sample(min(3, len(sub)), random_state=42)
    for _, r in sample.iterrows():
        print(f"- [{r['unit_type']}] {r['unit_id']}  | pages: {r['pages']}  | text_len: {len(r['text'])}")

# 5) Ensure per-doc files exist
stems = set(Path(p).stem for p in glob.glob(f"{ROOT}/*/*.pdf"))  # not used (no PDFs here), so check by folder name
perdoc_units = glob.glob(f"{ROOT}/*/units.jsonl")
print(f"\nPer-doc units files found: {len(perdoc_units)}")
for p in perdoc_units[:10]:
    print("•", p)


Unnamed: 0,doc,type,pages,units
0,- - .pdf,circulars,130,2
1,2017 .pdf,circulars,26,1
2,2020_0.pdf,circulars,41,21



Docs with 0 units (should be none):
Empty DataFrame
Columns: [doc, type, pages, units]
Index: []

Total units: 24

Unit types distribution:
unit_type
circular    24
Name: count, dtype: int64

Empty or too-short texts (<120 chars):
        doc_id unit_type                  unit_id
3   2020_0.pdf  circular  تعليمات/منشور/كتاب/قرار
11  2020_0.pdf  circular  تعليمات/منشور/كتاب/قرار
17  2020_0.pdf  circular  تعليمات/منشور/كتاب/قرار

Units missing pages:
Empty DataFrame
Columns: [doc_id, unit_type, unit_id]
Index: []

===== -    -  .pdf | 2 units =====
- [circular] تعليمات/منشور/كتاب/قرار  | pages: [88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108]  | text_len: 16896
- [circular] تعليمات/منشور/كتاب/قرار  | pages: [59, 60, 61, 62, 63, 64, 65, 66, 67]  | text_len: 11075

===== 2017 .pdf | 1 units =====
- [circular] تعليمات/منشور/كتاب/قرار  | pages: [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]  | text_len: 3

In [7]:
# ==== Load & Preview Your Legal Units Dataset ====
import pandas as pd
import json

# Path to your structured dataset
DATA_PATH = "/kaggle/working/outputs/units.jsonl"

# Load JSONL into a DataFrame
df = pd.read_json(DATA_PATH, lines=True)

# Basic info
print(f"✅ Loaded {len(df):,} units from {DATA_PATH}")
print("Columns:", list(df.columns))

# Show basic stats
print("\n📊 Document distribution:")
display(df.groupby("doc_id")["unit_id"].count().reset_index(name="num_units"))

print("\n🧩 Unit types distribution:")
display(df["unit_type"].value_counts())

# Show a random sample of 3 units
print("\n🔎 Sample of extracted units:")
display(df[["doc_id", "unit_type", "unit_id", "pages_human", "text"]].sample(3, random_state=42))

# Optional: view average text length per document
df["text_len"] = df["text"].apply(lambda x: len(x or ""))
avg_len = df.groupby("doc_id")["text_len"].mean().reset_index()
avg_len.columns = ["doc_id", "avg_text_len"]
print("\n📏 Average unit text length by document:")
display(avg_len)


✅ Loaded 24 units from /kaggle/working/outputs/units.jsonl
Columns: ['doc_id', 'unit_type', 'unit_id', 'text', 'search_text', 'pages', 'pages_human', 'effective_from', 'effective_to', 'version_note']

📊 Document distribution:


Unnamed: 0,doc_id,num_units
0,- - .pdf,2
1,2017 .pdf,1
2,2020_0.pdf,21



🧩 Unit types distribution:


unit_type
circular    24
Name: count, dtype: int64


🔎 Sample of extracted units:


Unnamed: 0,doc_id,unit_type,unit_id,pages_human,text
8,2020_0.pdf,circular,تعليمات/منشور/كتاب/قرار,12,تعليمات\nرقم ر / )لسنة ‎0111‏\n\nفي إطار السعي...
16,2020_0.pdf,circular,تعليمات/منشور/كتاب/قرار,26,تعليمات ‏\nرقم ( كدب ) لسئة ‎3101‏\n‏بشاأن\n‏ا...
0,- - .pdf,circular,تعليمات/منشور/كتاب/قرار,59-67,بيانات الفواتير الضريبية الصادرة من المسجلين\n...



📏 Average unit text length by document:


Unnamed: 0,doc_id,avg_text_len
0,- - .pdf,13985.5
1,2017 .pdf,31960.0
2,2020_0.pdf,2213.142857


In [8]:
import re
bad = df[df["text"].str.contains(r'[A-Za-z0-9]{20,}', na=False)]
print(f"Gibberish-like segments: {len(bad)}")


Gibberish-like segments: 1
