In [None]:
# ===============================
# Stage 1 — Text Extraction from PDF files
# Read both native PDF and scanned PDF with OCR
# ===============================
!apt-get -qq update && apt-get -qq install -y poppler-utils tesseract-ocr >/dev/null
!pip -q install pypdf pdf2image pytesseract nltk pandas pyarrow tqdm

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import os, re, hashlib
from collections import defaultdict
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from pypdf import PdfReader
from pdf2image import convert_from_path
import pytesseract
import nltk; nltk.download('punkt', quiet=True)
import nltk; nltk.download('punkt_tab', quiet=True)
from nltk.tokenize import sent_tokenize

# ---------- CONFIG ----------
# Root folder on the mounted Google Drive that holds the PDFs; each company has its own sub-folder.
IN_DIR           = "<<Google Drive Link>>"  # placeholder: replace with the real Drive path before running
SKIP_EXISTING    = True                         # resumable: PDFs already listed in a folder's manifest are skipped
NATIVE_MIN_CHARS = 120                          # if native extraction yields fewer characters → treat as scanned
OCR_DPI          = 300                          # DPI passed to pdf2image when rasterising pages for OCR
SENT_MINLEN      = 25                           # sentences shorter than this many characters are dropped

# ---------- Regex & helpers ----------
# "_" + a 19xx/20xx year immediately before the ".pdf" extension (case-insensitive).
YEAR_RE     = re.compile(r'_(19|20)\d{2}(?=\.pdf$)', re.I)
# Words that cue pagination / cross-references.
# NOTE(review): because of the trailing \b, the "pp." and "p." alternatives only match when
# a word character follows the dot ("p.12"), not "p. 12" — confirm this is intended.
PAGINATION  = re.compile(r"\b(page|pp\.|p\.|appendix|annex|exhibit|figure|table|chapter|section|see page)\b", re.I)
# Dot leaders (3+ dots) followed by a 1-4 digit number at the end of the line.
TOC_LINE    = re.compile(r"\.{3,}\s*\d{1,4}$")
# A line consisting only of a 1-3 digit number.
BARE_NUM    = re.compile(r"^\s*\d{1,3}\s*$")
# A line consisting only of "FY" + 4 digits, a 19xx/20xx year, or Q1..Q4.
DATE_ONLY   = re.compile(r"^\s*(?:FY\s*\d{4}|(19|20)\d{2}|Q[1-4])\s*$", re.I)
# A line that is exactly "contents", "table of contents" or "index".
SHORT_HEAD  = re.compile(r"^(?:contents|table of contents|index)$", re.I)

def extract_year(p: Path):
    """Return the year embedded in the file name as an int, or None if absent.

    YEAR_RE matches the underscore plus the four digits, so the leading
    underscore is sliced off before the conversion.
    """
    found = YEAR_RE.search(p.name)
    if found is None:
        return None
    return int(found.group(0)[1:])

def extract_type(p: Path):
    """Classify a report by markers in its (upper-cased) file name.

    Returns "AR" when the name contains "_AR_", otherwise "10K" when it
    contains "_10K_", otherwise "ESG".
    """
    upper_name = p.name.upper()
    for marker, label in (("_AR_", "AR"), ("_10K_", "10K")):
        if marker in upper_name:
            return label
    return "ESG"

def sha256_16(p: Path):
    """Return the first 16 hex characters of the file's SHA-256 digest.

    The file is read in 1 MiB chunks so large PDFs are never held in memory
    in full.
    """
    digest = hashlib.sha256()
    with open(p, 'rb') as fh:
        while True:
            chunk = fh.read(1 << 20)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()[:16]

def _keep_line(line: str) -> bool:
    """Return True when a line should be kept, False when it is noise.

    A line is discarded when, after stripping, it is empty, a bare 1-3 digit
    number, ends in a dot-leader + number, is a contents/index heading, is a
    date-only line, or has at most six words and contains a pagination cue.
    """
    s = line.strip()
    if not s:
        return False
    is_noise = (
        BARE_NUM.match(s)
        or TOC_LINE.search(s)
        or SHORT_HEAD.match(s)
        or (PAGINATION.search(s) and len(s.split()) <= 6)
        or DATE_ONLY.match(s)
    )
    return not is_noise

def clean_lines(text: str) -> str:
    """Drop noise lines and collapse all whitespace runs to single spaces.

    Lines rejected by _keep_line are removed; the survivors are joined and
    every run of whitespace (including newlines) becomes one space.
    """
    kept = filter(_keep_line, text.splitlines())
    joined = "\n".join(kept)
    collapsed = re.sub(r"\s+", " ", joined)
    return collapsed.strip()

def to_sentences(txt: str):
    """Clean the text, split it with NLTK and keep sentences of at least SENT_MINLEN chars."""
    sentences = []
    for raw in sent_tokenize(clean_lines(txt)):
        sent = raw.strip()
        if len(sent) >= SENT_MINLEN:
            sentences.append(sent)
    return sentences

def native_text(pdf: Path) -> str:
    """Extract the embedded text layer of a PDF with pypdf and clean it.

    Args:
        pdf: Path of the PDF file.

    Returns:
        The cleaned text (via clean_lines), or "" when extraction fails.
        The caller treats "" as "fall back to OCR", so failures stay
        non-fatal; they are reported as a warning instead of being silently
        swallowed.
    """
    import warnings  # local import keeps this block self-contained

    try:
        pages = PdfReader(str(pdf)).pages
        parts = [(pg.extract_text() or "") for pg in pages]
        raw = "\n".join(t for t in parts if t).strip()
        return clean_lines(raw)
    except Exception as exc:
        # Best-effort by design, but surface the reason (corrupt/encrypted file, ...).
        warnings.warn(f"Native text extraction failed for {pdf}: {exc!r}")
        return ""

def ocr_fulldoc(pdf: Path) -> str:
    """OCR every page of a PDF with Tesseract and return the cleaned text.

    Pages are rasterised in small batches rather than all at once: holding
    every page of a long report as a 300-DPI image can exhaust memory.

    Args:
        pdf: Path of the PDF file.

    Returns:
        The cleaned OCR text (via clean_lines), or "" if any step fails.
        Failures are reported as a warning.
    """
    import warnings  # local imports keep this block self-contained
    from pdf2image import pdfinfo_from_path

    batch = 8  # pages rasterised per conversion call; bounds peak memory
    try:
        n_pages = int(pdfinfo_from_path(str(pdf))["Pages"])
        txts = []
        for first in range(1, n_pages + 1, batch):
            last = min(first + batch - 1, n_pages)
            imgs = convert_from_path(str(pdf), dpi=OCR_DPI, first_page=first, last_page=last)
            for im in imgs:
                txts.append(
                    pytesseract.image_to_string(im, lang="eng", config="--oem 1 --psm 6").strip()
                )
        raw = "\n".join(t for t in txts if t).strip()
        return clean_lines(raw)
    except Exception as exc:
        # Best-effort by design; the caller records "error_empty" for "".
        warnings.warn(f"OCR failed for {pdf}: {exc!r}")
        return ""

# ---------- Discover PDFs & group by company folder ----------
IN_ROOT = Path(IN_DIR)
# Fail loudly on a wrong/unmounted path instead of reporting "Done" with 0 PDFs.
if not IN_ROOT.is_dir():
    raise FileNotFoundError(f"IN_DIR does not exist or is not a folder: {IN_ROOT}")
# Match the extension case-insensitively so "*.PDF" files are not missed.
all_pdfs = sorted(
    p for p in IN_ROOT.rglob("*")
    if p.suffix.lower() == ".pdf" and p.is_file()
)
# Group PDFs by their immediate parent folder (one folder per company).
groups = defaultdict(list)
for pdf in all_pdfs:
    groups[pdf.parent].append(pdf)

# Running totals across all folders, plus one summary dict per folder.
total_processed = 0
total_skipped = 0
per_folder_summaries = []

# ---------- Process per folder----------
# For each company folder: extract text for every PDF not yet in the manifest,
# write <hash>.txt and <hash>.parquet, then merge the new rows into the manifest.
for folder, pdfs in groups.items():
    out_root = folder / "esg_stage1"
    texts_dir = out_root / "texts"
    sents_dir = out_root / "sentences"
    texts_dir.mkdir(parents=True, exist_ok=True)
    sents_dir.mkdir(parents=True, exist_ok=True)

    manifest_path = out_root / "manifest.parquet"
    existing = pd.read_parquet(manifest_path) if (SKIP_EXISTING and manifest_path.exists()) else pd.DataFrame()
    # Build the skip set once instead of scanning the manifest column for every PDF.
    already_done = set(existing["filename"]) if not existing.empty else set()

    rows = []
    skipped_here = 0
    processed_here = 0

    try:
        for pdf in tqdm(pdfs, desc=f"Processing PDFs in {folder.name}", leave=False):
            filename = pdf.name
            if SKIP_EXISTING and filename in already_done:
                skipped_here += 1
                continue

            year = extract_year(pdf); rtype = extract_type(pdf); hid = sha256_16(pdf)
            text_path = texts_dir / f"{hid}.txt"
            sent_path = sents_dir / f"{hid}.parquet"

            # 1) Native extraction
            txt = native_text(pdf)
            if txt and len(txt) >= NATIVE_MIN_CHARS:
                status = "ok_native"
            else:
                # 2) Full-document OCR
                txt = ocr_fulldoc(pdf)
                status = "ok_ocr" if txt and len(txt) >= NATIVE_MIN_CHARS else "error_empty"

            # Save extracted text and its sentences
            text_path.write_text(txt or "", encoding="utf-8")
            pd.DataFrame({"text": to_sentences(txt) if txt else []}).to_parquet(sent_path, index=False)

            rows.append({
                "sha256_16": hid,
                "filename": filename,
                "pdf_path": str(pdf),
                "text_path": str(text_path),
                "sentences_path": str(sent_path),
                "status": status,
                "doc_year": year,
                "report_type": rtype
            })
            processed_here += 1
    finally:
        # Merge+save manifest in the same folder. Done in `finally` so that an
        # interrupted run still records the PDFs it completed (keeps the run resumable).
        if rows:
            manifest = (pd.concat([existing, pd.DataFrame(rows)], ignore_index=True)
                        if not existing.empty else pd.DataFrame(rows))
            manifest = manifest.drop_duplicates(subset=["filename"], keep="last")
            manifest.to_parquet(manifest_path, index=False)

    if not rows:
        # If nothing new, keep existing manifest as-is
        manifest = existing if not existing.empty else pd.DataFrame(columns=[
            "sha256_16","filename","pdf_path","text_path","sentences_path","status","doc_year","report_type"
        ])

    per_folder_summaries.append({
        "folder": str(folder),
        "pdfs_found": len(pdfs),
        "processed_now": processed_here,
        "skipped_existing": skipped_here,
        "manifest_path": str(manifest_path),
        "manifest_rows": len(manifest)
    })
    total_processed += processed_here
    total_skipped += skipped_here

# ---------- Result ----------
# Print overall totals, then a per-folder line for the first 10 folders only.
print("\n✅ Done.")
print(f"Folders: {len(groups)}  |  PDFs total: {len(all_pdfs)}")
print(f"Processed now: {total_processed}  |  Skipped (existing): {total_skipped}\n")
for s in per_folder_summaries[:10]:  # cap the listing at 10 folders to keep the output short
    print(f"- {Path(s['folder']).name}: found={s['pdfs_found']}, processed={s['processed_now']}, "
          f"skipped={s['skipped_existing']}, manifest_rows={s['manifest_rows']}")
    print(f"  → {s['manifest_path']}")


## AFTER REVISING THE TEXT EXTRACTION

In [None]:
# ===============================
# Stage 1 — Text Extraction from PDF files (layout-aware, bullet-safe)
# Read both native PDF and scanned PDF with OCR
# Key changes:
# - Preserve line breaks & bullets (no global whitespace collapse before tokenization)
# - Strip common headers/footers/TOC lines per page
# - Fix hyphenation at line ends
# - Add defensive "list-aware" sentence splitting (bullets, long-lines fallback)
# - Configurable Tesseract PSM for better list/column OCR
# ===============================
!apt-get -qq update && apt-get -qq install -y poppler-utils tesseract-ocr >/dev/null
!pip -q install pypdf pdf2image pytesseract nltk pandas pyarrow tqdm

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import os, re, hashlib, unicodedata
from collections import defaultdict
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from pypdf import PdfReader
from pdf2image import convert_from_path
import pytesseract
import nltk; nltk.download('punkt', quiet=True)
import nltk; nltk.download('punkt_tab', quiet=True)
from nltk.tokenize import sent_tokenize

# ---------- CONFIG ----------
# Root folder on the mounted Google Drive; each company has its own sub-folder of PDFs.
IN_DIR           = "/content/drive/MyDrive/Australia Award Scholarship/USYD/Dissertation/DATA/FORTUNE 500/"
SKIP_EXISTING    = True                  # resumable: PDFs already listed in a folder's manifest are skipped
NATIVE_MIN_CHARS = 120                   # if below → treat as scanned
OCR_DPI          = 300                   # DPI passed to pdf2image when rasterising pages for OCR
OCR_PSM          = 6                     # 6: Assume a single uniform block of text; try 4 for columns if needed
SENT_MINLEN      = 25                    # sentences shorter than this many characters are dropped
MAX_SENT_LEN_CH  = 1200                  # if a "sentence" exceeds this, apply secondary splitting
KEEP_PAGE_BREAKS = True                  # insert blank line between pages for native & OCR

# ---------- Regex & helpers ----------
# "_" + a 19xx/20xx year immediately before the ".pdf" extension (case-insensitive).
YEAR_RE     = re.compile(r'_(19|20)\d{2}(?=\.pdf$)', re.I)
# Words that cue pagination / cross-references.
# NOTE(review): because of the trailing \b, the "pp." and "p." alternatives only match when
# a word character follows the dot ("p.12"), not "p. 12" — confirm this is intended.
PAGINATION  = re.compile(r"\b(page|pp\.|p\.|appendix|annex|exhibit|figure|table|chapter|section|see page)\b", re.I)
# Dot leaders (3+ dots) followed by a 1-4 digit number at the end of the line.
TOC_LINE    = re.compile(r"\.{3,}\s*\d{1,4}$")
# A line consisting only of a 1-3 digit number.
BARE_NUM    = re.compile(r"^\s*\d{1,3}\s*$")
# A line consisting only of "FY" + 4 digits, a 19xx/20xx year, or Q1..Q4.
DATE_ONLY   = re.compile(r"^\s*(?:FY\s*\d{4}|(19|20)\d{2}|Q[1-4])\s*$", re.I)
# A line that is exactly "contents", "table of contents" or "index".
SHORT_HEAD  = re.compile(r"^(?:contents|table of contents|index)$", re.I)
# Line starts with a typical running-head word, or with "<word> <4 digits>".
HEADER_FOOT_HINT = re.compile(r"^(?:esg|sustainability|highlights|introduction|company|report|\w+\s+\d{4})\b", re.I)

# Characters treated as list-item markers by the bullet helpers below.
BULLETS = ("•", "▪", "‣", "‧", "・", "·", "-", "–")  # last two treated cautiously

def extract_year(p: Path):
    """Return the year embedded in the file name as an int, or None if absent.

    YEAR_RE matches the underscore plus the four digits, so the leading
    underscore is sliced off before the conversion.
    """
    found = YEAR_RE.search(p.name)
    if found is None:
        return None
    return int(found.group(0)[1:])

def extract_type(p: Path):
    """Classify a report by markers in its (upper-cased) file name.

    Returns "AR" when the name contains "_AR_", otherwise "10K" when it
    contains "_10K_", otherwise "ESG".
    """
    upper_name = p.name.upper()
    for marker, label in (("_AR_", "AR"), ("_10K_", "10K")):
        if marker in upper_name:
            return label
    return "ESG"

def sha256_16(p: Path):
    """Return the first 16 hex characters of the file's SHA-256 digest.

    The file is read in 1 MiB chunks so large PDFs are never held in memory
    in full.
    """
    digest = hashlib.sha256()
    with open(p, 'rb') as fh:
        while True:
            chunk = fh.read(1 << 20)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()[:16]

def _likely_header_footer(line: str) -> bool:
    """Return True when a line looks like header/footer/TOC noise.

    Blank lines count as noise. So do bare page numbers, dot-leader TOC
    lines, contents/index headings, date-only lines, lines of at most six
    words containing a pagination cue, and any line of at most 40 characters
    that is all upper-case or matches HEADER_FOOT_HINT.
    """
    s = line.strip()
    if not s or BARE_NUM.match(s) or TOC_LINE.search(s) or SHORT_HEAD.match(s):
        return True
    # very short lines with a pagination cue
    if PAGINATION.search(s) and len(s.split()) <= 6:
        return True
    if DATE_ONLY.match(s):
        return True
    # short upper-case lines or lines opening with a running-head hint
    is_short = len(s) <= 40
    return bool(is_short and (s.isupper() or HEADER_FOOT_HINT.match(s)))

def _normalize_bullets(text: str) -> str:
    """Move mid-line bullets onto their own line so list items become separate segments.

    For every marker in BULLETS, " <b> " and "\\t<b> " become "\\n<b> ".
    True bullet glyphs are also split when no space follows them (" <b>").
    The hyphen and en dash are excluded from that last rule: a bare " -" also
    appears in negative numbers ("fell by -5%") and dash-prefixed tokens, and
    splitting there would break ordinary sentences.

    Args:
        text: Page text with newlines preserved.

    Returns:
        The text with bullets at line starts and runs of 3+ newlines
        collapsed to exactly two.
    """
    dash_like = ("-", "–")
    txt = text
    for b in BULLETS:
        # bullet surrounded by whitespace: always starts a new line
        txt = txt.replace(f" {b} ", f"\n{b} ")
        txt = txt.replace(f"\t{b} ", f"\n{b} ")
        # bullet glued to the following word: only safe for real bullet glyphs
        if b not in dash_like:
            txt = txt.replace(f" {b}", f"\n{b}")
    # de-duplicate excessive newlines
    return re.sub(r"\n{3,}", "\n\n", txt)

def _fix_hyphenation(text: str) -> str:
    """Re-join words split by line-end hyphenation: 'sustain-\\nability' -> 'sustainability'.

    The hyphen and the newline are both removed, and only when a letter
    stands on each side. Numeric ranges ('2020-\\n2021') and spaced dashes
    ('Scope 1 -\\nemissions') are left untouched so they are not fused.
    Genuinely hyphenated compounds broken at the hyphen cannot be told apart
    from hyphenation and are merged too ('carbon-\\nneutral' -> 'carbonneutral').

    Args:
        text: Raw page text with newlines preserved.

    Returns:
        The text with line-end hyphenation repaired.
    """
    # [^\W\d_] is "any Unicode letter": a word character that is not a digit or underscore.
    return re.sub(r"(?<=[^\W\d_])-\n(?=[^\W\d_])", "", text)

def clean_page_text(page_text: str) -> str:
    """Clean one page of text while keeping its line structure.

    Steps: NFKC-normalise, repair line-end hyphenation, drop every line that
    _likely_header_footer flags, put bullets on their own lines, and strip
    each line. Returns "" for empty input or when no line survives.
    """
    if not page_text:
        return ""

    # Unicode normalisation first, then hyphen repair so split words are whole
    # before the per-line filters look at them.
    normalized = _fix_hyphenation(unicodedata.normalize("NFKC", page_text))

    # One pass is enough: the predicate is True for blank lines as well, and
    # filtering every line also covers the first and last line of the page.
    kept = [ln for ln in normalized.splitlines() if not _likely_header_footer(ln)]
    if not kept:
        return ""

    # Bullets may introduce new line breaks, so strip per line afterwards.
    bulleted = _normalize_bullets("\n".join(kept))
    return "\n".join(ln.strip() for ln in bulleted.splitlines()).strip()

def to_sentences_structured(txt: str):
    """Split text into sentences, treating every newline as a hard boundary.

    Each non-blank line is a segment. A segment longer than MAX_SENT_LEN_CH,
    or one containing any BULLETS marker, is first split at whitespace that
    precedes a bullet glyph or a dash followed by a space. Every resulting
    piece goes through NLTK's sent_tokenize, and sentences shorter than
    SENT_MINLEN characters are discarded.
    """
    if not txt:
        return []

    bullet_boundary = r"(?<=\S)\s+(?=(?:[•▪‣‧・·]|- |– )\s*)"
    sentences = []
    for seg in txt.splitlines():
        if not seg.strip():
            continue
        too_long = len(seg) > MAX_SENT_LEN_CH
        list_like = any(mark in seg for mark in BULLETS)
        pieces = re.split(bullet_boundary, seg) if (too_long or list_like) else [seg]

        for piece in pieces:
            piece = piece.strip()
            if not piece:
                continue
            sentences.extend(
                cand
                for cand in (s.strip() for s in sent_tokenize(piece))
                if len(cand) >= SENT_MINLEN
            )
    return sentences

def native_text(pdf: Path) -> str:
    """Extract per-page text with pypdf and clean it page by page.

    Newlines are kept; pages are joined with a blank line when
    KEEP_PAGE_BREAKS is set, otherwise with a single newline.

    Args:
        pdf: Path of the PDF file.

    Returns:
        The cleaned text, or "" when no page yields text or extraction fails.
        The caller treats "" as "fall back to OCR", so failures stay
        non-fatal; they are reported as a warning instead of being silently
        swallowed.
    """
    import warnings  # local import keeps this block self-contained

    try:
        cleaned_pages = []
        for pg in PdfReader(str(pdf)).pages:
            cp = clean_page_text(pg.extract_text() or "")
            if cp:
                cleaned_pages.append(cp)
        if not cleaned_pages:
            return ""
        glue = "\n\n" if KEEP_PAGE_BREAKS else "\n"
        return glue.join(cleaned_pages).strip()
    except Exception as exc:
        # Best-effort by design, but surface the reason (corrupt/encrypted file, ...).
        warnings.warn(f"Native text extraction failed for {pdf}: {exc!r}")
        return ""

def ocr_fulldoc(pdf: Path) -> str:
    """OCR every page of a PDF with Tesseract and clean the text page by page.

    Pages are rasterised in small batches rather than all at once: holding
    every page of a long report as a 300-DPI image can exhaust memory.
    Pages are joined with a blank line when KEEP_PAGE_BREAKS is set,
    otherwise with a single newline.

    Args:
        pdf: Path of the PDF file.

    Returns:
        The cleaned OCR text, or "" when nothing was recognised or any step
        fails. Failures are reported as a warning.
    """
    import warnings  # local imports keep this block self-contained
    from pdf2image import pdfinfo_from_path

    batch = 8  # pages rasterised per conversion call; bounds peak memory
    try:
        n_pages = int(pdfinfo_from_path(str(pdf))["Pages"])
        cleaned_pages = []
        for first in range(1, n_pages + 1, batch):
            last = min(first + batch - 1, n_pages)
            imgs = convert_from_path(str(pdf), dpi=OCR_DPI, first_page=first, last_page=last)
            for im in imgs:
                txt = pytesseract.image_to_string(
                    im, lang="eng",
                    config=f"--oem 1 --psm {OCR_PSM}"
                ).strip()
                cp = clean_page_text(txt)
                if cp:
                    cleaned_pages.append(cp)
        if not cleaned_pages:
            return ""
        glue = "\n\n" if KEEP_PAGE_BREAKS else "\n"
        return glue.join(cleaned_pages).strip()
    except Exception as exc:
        # Best-effort by design; the caller records "error_empty" for "".
        warnings.warn(f"OCR failed for {pdf}: {exc!r}")
        return ""

# ---------- Discover PDFs & group by company folder ----------
IN_ROOT = Path(IN_DIR)
# Fail loudly on a wrong/unmounted path instead of reporting "Done" with 0 PDFs.
if not IN_ROOT.is_dir():
    raise FileNotFoundError(f"IN_DIR does not exist or is not a folder: {IN_ROOT}")
# Match the extension case-insensitively so "*.PDF" files are not missed.
all_pdfs = sorted(
    p for p in IN_ROOT.rglob("*")
    if p.suffix.lower() == ".pdf" and p.is_file()
)
# Group PDFs by their immediate parent folder (one folder per company).
groups = defaultdict(list)
for pdf in all_pdfs:
    groups[pdf.parent].append(pdf)

# Running totals across all folders, plus one summary dict per folder.
total_processed = 0
total_skipped = 0
per_folder_summaries = []

# ---------- Process per folder ----------
# For each company folder: extract text for every PDF not yet in the manifest,
# write <hash>.txt and <hash>.parquet, then merge the new rows into the manifest.
for folder, pdfs in groups.items():
    out_root = folder / "esg_stage1(ver2)"
    texts_dir = out_root / "texts"
    sents_dir = out_root / "sentences"
    texts_dir.mkdir(parents=True, exist_ok=True)
    sents_dir.mkdir(parents=True, exist_ok=True)

    manifest_path = out_root / "manifest.parquet"
    existing = pd.read_parquet(manifest_path) if (SKIP_EXISTING and manifest_path.exists()) else pd.DataFrame()
    # Build the skip set once instead of scanning the manifest column for every PDF.
    already_done = set(existing["filename"]) if not existing.empty else set()

    rows = []
    skipped_here = 0
    processed_here = 0

    try:
        for pdf in tqdm(pdfs, desc=f"Processing PDFs in {folder.name}", leave=False):
            filename = pdf.name
            if SKIP_EXISTING and filename in already_done:
                skipped_here += 1
                continue

            year = extract_year(pdf); rtype = extract_type(pdf); hid = sha256_16(pdf)
            text_path = texts_dir / f"{hid}.txt"
            sent_path = sents_dir / f"{hid}.parquet"

            # 1) Native extraction (layout-preserving)
            txt = native_text(pdf)
            if txt and len(txt) >= NATIVE_MIN_CHARS:
                status = "ok_native"
            else:
                # 2) Full-document OCR (page-by-page cleaning; layout-friendly PSM)
                txt = ocr_fulldoc(pdf)
                status = "ok_ocr" if txt and len(txt) >= NATIVE_MIN_CHARS else "error_empty"

            # Save cleaned text and robust sentences
            text_path.write_text(txt or "", encoding="utf-8")
            sentences = to_sentences_structured(txt) if txt else []
            pd.DataFrame({"text": sentences}).to_parquet(sent_path, index=False)

            rows.append({
                "sha256_16": hid,
                "filename": filename,
                "pdf_path": str(pdf),
                "text_path": str(text_path),
                "sentences_path": str(sent_path),
                "status": status,
                "doc_year": year,
                "report_type": rtype
            })
            processed_here += 1
    finally:
        # Merge+save manifest in the same folder. Done in `finally` so that an
        # interrupted run still records the PDFs it completed (keeps the run resumable).
        if rows:
            manifest = (pd.concat([existing, pd.DataFrame(rows)], ignore_index=True)
                        if not existing.empty else pd.DataFrame(rows))
            manifest = manifest.drop_duplicates(subset=["filename"], keep="last")
            manifest.to_parquet(manifest_path, index=False)

    if not rows:
        # Nothing new: keep the existing manifest, or an empty frame with the expected columns.
        manifest = existing if not existing.empty else pd.DataFrame(columns=[
            "sha256_16","filename","pdf_path","text_path","sentences_path","status","doc_year","report_type"
        ])

    per_folder_summaries.append({
        "folder": str(folder),
        "pdfs_found": len(pdfs),
        "processed_now": processed_here,
        "skipped_existing": skipped_here,
        "manifest_path": str(manifest_path),
        "manifest_rows": len(manifest)
    })
    total_processed += processed_here
    total_skipped += skipped_here

# ---------- Result ----------
# Print overall totals, then a per-folder line for the first 10 folders only.
print("\n✅ Done.")
print(f"Folders: {len(groups)}  |  PDFs total: {len(all_pdfs)}")
print(f"Processed now: {total_processed}  |  Skipped (existing): {total_skipped}\n")
for s in per_folder_summaries[:10]:  # cap the listing at 10 folders to keep the output short
    print(f"- {Path(s['folder']).name}: found={s['pdfs_found']}, processed={s['processed_now']}, "
          f"skipped={s['skipped_existing']}, manifest_rows={s['manifest_rows']}")
    print(f"  → {s['manifest_path']}")


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m323.9/323.9 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive


                                                                             
KeyboardInterrupt



### STAGE 1 WITH IMPROVEMENT

In [None]:
# ===============================
# Stage 1 — Text Extraction from PDF files (improved)
# - Native PDF + OCR
# - Soft line-break joining, hyphenation repair, header/TOC filtering
# - Safer sentence tokenization (paragraph-aware)
# ===============================
!apt-get -qq update && apt-get -qq install -y poppler-utils tesseract-ocr >/dev/null
!pip -q install pypdf pdf2image pytesseract nltk pandas pyarrow tqdm

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import os, re, hashlib, unicodedata, string
from collections import defaultdict
from pathlib import Path

import pandas as pd
from tqdm import tqdm

from pypdf import PdfReader
from pdf2image import convert_from_path
import pytesseract

import nltk; nltk.download('punkt', quiet=True)
import nltk; nltk.download('punkt_tab', quiet=True)
from nltk.tokenize import sent_tokenize

# ---------- CONFIG ----------
# Root folder on the mounted Google Drive; each company has its own sub-folder of PDFs.
IN_DIR           = "/content/drive/MyDrive/Australia Award Scholarship/USYD/Dissertation/DATA/FORTUNE 500/"
SKIP_EXISTING    = True   # resumable: PDFs already listed in a folder's manifest are skipped
NATIVE_MIN_CHARS = 120    # native text shorter than this → fall back to OCR
OCR_DPI          = 300    # DPI passed to pdf2image when rasterising pages for OCR
SENT_MINLEN      = 25     # sentences shorter than this many characters are dropped

# Soft line-break joiner knobs
JOIN_SOFT_BREAKS     = True    # turn on the joiner
KEEP_DOUBLE_NEWLINES = True    # preserve paragraph gaps
MAX_JOIN_LINE_LEN    = 2000    # guardrail against runaway joins

# Tesseract config (PSM 4 handles multi-column pages better than default 6 in many ESG PDFs)
OCR_CONFIG = "--oem 1 --psm 4 -c preserve_interword_spaces=1"

# ---------- Regex & helpers ----------
# "_" + a 19xx/20xx year immediately before the ".pdf" extension (case-insensitive).
YEAR_RE     = re.compile(r'_(19|20)\d{2}(?=\.pdf$)', re.I)
# Words that cue pagination / cross-references.
# NOTE(review): because of the trailing \b, the "pp." and "p." alternatives only match when
# a word character follows the dot ("p.12"), not "p. 12" — confirm this is intended.
PAGINATION  = re.compile(r"\b(page|pp\.|p\.|appendix|annex|exhibit|figure|table|chapter|section|see page)\b", re.I)
# Dot leaders (3+ dots) followed by a 1-4 digit number at the end of the line.
TOC_LINE    = re.compile(r"\.{3,}\s*\d{1,4}$")
# A line consisting only of a 1-3 digit number.
BARE_NUM    = re.compile(r"^\s*\d{1,3}\s*$")
# A line consisting only of "FY" + 4 digits, a 19xx/20xx year, or Q1..Q4.
DATE_ONLY   = re.compile(r"^\s*(?:FY\s*\d{4}|(19|20)\d{2}|Q[1-4])\s*$", re.I)
# A line that is exactly "contents", "table of contents" or "index".
SHORT_HEAD  = re.compile(r"^(?:contents|table of contents|index)$", re.I)

# sentence-final punctuation (also covers quotes and ellipsis)
SENT_FINAL  = re.compile(r'[.?!…:;)"’%]\s*$')
# bullet/heading detectors
# BULLET_START: a bullet glyph/dash, "1." / "12)" style numbering, or "A." / "AB)" lettering, then whitespace.
BULLET_START = re.compile(r"^\s*(?:[-–•‣●◦▪]|[0-9]{1,3}[.)]|[A-Z]{1,2}[.)])\s+")
# ALLCAPS_LINE: a line of 3+ characters made only of capitals, digits, spaces and & / – - . ,
ALLCAPS_LINE = re.compile(r"^[A-Z0-9][A-Z0-9 &/–\-.,]{2,}$")

_SOFT_HYPHEN = "\u00AD"  # Unicode SOFT HYPHEN, removed by normalize_soft_hyphens
# Translation table that deletes ASCII punctuation.
# NOTE(review): not referenced anywhere in the visible part of this cell — confirm it is still needed.
PUNCT_TABLE  = str.maketrans("", "", string.punctuation)

def extract_year(p: Path):
    """Return the year embedded in the file name as an int, or None if absent.

    YEAR_RE matches the underscore plus the four digits, so the leading
    underscore is sliced off before the conversion.
    """
    found = YEAR_RE.search(p.name)
    if found is None:
        return None
    return int(found.group(0)[1:])

def extract_type(p: Path):
    """Classify a report by markers in its (upper-cased) file name.

    Returns "AR" when the name contains "_AR_", otherwise "10K" when it
    contains "_10K_", otherwise "ESG".
    """
    upper_name = p.name.upper()
    for marker, label in (("_AR_", "AR"), ("_10K_", "10K")):
        if marker in upper_name:
            return label
    return "ESG"

def sha256_16(p: Path):
    """Return the first 16 hex characters of the file's SHA-256 digest.

    The file is read in 1 MiB chunks so large PDFs are never held in memory
    in full.
    """
    digest = hashlib.sha256()
    with open(p, 'rb') as fh:
        while True:
            chunk = fh.read(1 << 20)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()[:16]

def _keep_line(line: str) -> bool:
    """Return True when a line should be kept, False when it is noise.

    A line is discarded when, after stripping, it is empty, a bare 1-3 digit
    number, ends in a dot-leader + number, is a contents/index heading, is a
    date-only line, or has at most six words and contains a pagination cue.
    """
    s = line.strip()
    if not s:
        return False
    is_noise = (
        BARE_NUM.match(s)
        or TOC_LINE.search(s)
        or SHORT_HEAD.match(s)
        # very short pagination-only lines
        or (PAGINATION.search(s) and len(s.split()) <= 6)
        or DATE_ONLY.match(s)
    )
    return not is_noise

def normalize_soft_hyphens(text: str) -> str:
    """Strip soft-hyphen characters, then apply Unicode NFKC normalisation."""
    without_soft = text.replace(_SOFT_HYPHEN, "")
    return unicodedata.normalize("NFKC", without_soft)

def is_bullet_or_heading(line: str) -> bool:
    """Return True when the stripped line starts like a list item or is a short all-caps heading.

    Headings are limited to 80 characters so that upper-case paragraphs are
    not mistaken for headings. Blank lines return False.
    """
    s = line.strip()
    if not s:
        return False
    looks_bullet = BULLET_START.match(s) is not None
    looks_heading = len(s) <= 80 and ALLCAPS_LINE.match(s) is not None
    return looks_bullet or looks_heading

def should_join(prev: str, nxt: str) -> bool:
    """Decide whether two consecutive lines belong to one sentence (a visual wrap).

    Rules, applied in order:
      1. an empty line, or either line being a bullet/heading → no join;
      2. prev ends with "-" and next starts with a letter → join;
      3. prev ends with sentence-final punctuation (SENT_FINAL) → no join;
      4. next starts with a lowercase letter or continuation punctuation → join;
      5. otherwise → no join.

    Args:
        prev: The line accumulated so far.
        nxt: The candidate continuation line.

    Returns:
        True if the lines should be merged, False otherwise.
    """
    if not prev or not nxt:
        return False
    if is_bullet_or_heading(prev) or is_bullet_or_heading(nxt):
        return False

    prev_stripped = prev.rstrip()
    nxt_stripped = nxt.lstrip()

    # hyphenated word wrap always joins
    if prev_stripped.endswith("-") and nxt_stripped[:1].isalpha():
        return True

    # already ends a sentence? keep the break
    if SENT_FINAL.search(prev_stripped):
        return False

    # Continuation cues: lowercase start, or comma/semicolon/colon, closing
    # bracket, quote or dash. The former extra check for "of/and/to/for/..."
    # was unreachable: it is case-sensitive, so every word it listed starts
    # with a lowercase letter and is already accepted here.
    return bool(re.match(r'^[a-z“’)\],;:—–-]', nxt_stripped))

def join_soft_linebreaks(text: str) -> str:
    """Merge visually wrapped lines inside each paragraph block.

    The text is split into blocks at blank lines. Within a block, lines
    rejected by _keep_line are dropped, line-end hyphenation is repaired and
    consecutive lines are merged when should_join says so. Blocks in which
    no line survives are dropped, so they cannot reintroduce blank gaps.

    Args:
        text: Text with single newlines for line breaks and blank lines
            between paragraphs/pages.

    Returns:
        The rebuilt text; blocks are separated by a blank line when
        KEEP_DOUBLE_NEWLINES is set, otherwise by a single newline.
    """
    if not text:
        return ""

    fixed_blocks = []
    # keep paragraph blocks separated by blank lines
    for block in re.split(r'\n{2,}', text):
        lines = [ln for ln in block.splitlines() if _keep_line(ln)]
        if not lines:
            continue

        rebuilt = []
        cur = lines[0].rstrip()

        for raw in lines[1:]:
            nxt = raw.strip()

            # hyphenated join: the trailing hyphen is removed and the next line
            # is appended without a space ("sustain-" + "ability" -> "sustainability")
            if cur.endswith("-") and nxt[:1].isalpha():
                cur = cur[:-1] + nxt
                continue

            if JOIN_SOFT_BREAKS and should_join(cur, nxt) and len(cur) < MAX_JOIN_LINE_LEN:
                cur = cur + " " + nxt
            else:
                rebuilt.append(cur)
                cur = nxt

        rebuilt.append(cur)
        fixed_blocks.append("\n".join(rebuilt))

    separator = "\n\n" if KEEP_DOUBLE_NEWLINES else "\n"
    return separator.join(fixed_blocks).strip()

def clean_lines_keep_structure(text: str) -> str:
    """Clean text while preserving paragraph gaps.

    Removes soft hyphens and NFKC-normalises, filters and re-joins wrapped
    lines, limits runs of newlines to two, and strips trailing whitespace
    from every line.
    """
    joined = join_soft_linebreaks(normalize_soft_hyphens(text))
    # collapse 3+ consecutive newlines down to a single blank line
    squeezed = re.sub(r'\n{3,}', '\n\n', joined)
    return "\n".join(map(str.rstrip, squeezed.splitlines())).strip()

def paragraphs_to_sentences(txt: str):
    """Tokenise each blank-line-separated paragraph into sentences with NLTK.

    Newlines remaining inside a paragraph are flattened to spaces before
    tokenising. Sentences shorter than SENT_MINLEN characters are dropped.
    """
    if not txt:
        return []

    sents = []
    for chunk in re.split(r'\n{2,}', txt):
        para = chunk.strip()
        if not para:
            continue
        # safety: any leftover single newlines become spaces
        flat = re.sub(r'\n+', ' ', para).strip()
        for cand in sent_tokenize(flat):
            cand = cand.strip()
            if len(cand) >= SENT_MINLEN:
                sents.append(cand)
    return sents

def native_text(pdf: Path) -> str:
    """Extract the embedded text layer of a PDF with pypdf and clean it.

    Non-empty pages are joined with a blank line so that page gaps act as
    paragraph gaps, then passed through clean_lines_keep_structure.

    Args:
        pdf: Path of the PDF file.

    Returns:
        The cleaned text, or "" when extraction fails. The caller treats ""
        as "fall back to OCR", so failures stay non-fatal; they are reported
        as a warning instead of being silently swallowed.
    """
    import warnings  # local import keeps this block self-contained

    try:
        pages = PdfReader(str(pdf)).pages
        parts = [(pg.extract_text() or "") for pg in pages]
        raw = "\n\n".join(t for t in parts if t).strip()  # keep page gaps as paragraph gaps
        return clean_lines_keep_structure(raw)
    except Exception as exc:
        # Best-effort by design, but surface the reason (corrupt/encrypted file, ...).
        warnings.warn(f"Native text extraction failed for {pdf}: {exc!r}")
        return ""

def ocr_fulldoc(pdf: Path) -> str:
    """OCR every page of a PDF with Tesseract (OCR_CONFIG) and clean the result.

    Pages are rasterised in small batches rather than all at once: holding
    every page of a long report as a 300-DPI image can exhaust memory.
    Non-empty page texts are joined with a blank line and passed through
    clean_lines_keep_structure.

    Args:
        pdf: Path of the PDF file.

    Returns:
        The cleaned OCR text, or "" if any step fails. Failures are reported
        as a warning.
    """
    import warnings  # local imports keep this block self-contained
    from pdf2image import pdfinfo_from_path

    batch = 8  # pages rasterised per conversion call; bounds peak memory
    try:
        n_pages = int(pdfinfo_from_path(str(pdf))["Pages"])
        txts = []
        for first in range(1, n_pages + 1, batch):
            last = min(first + batch - 1, n_pages)
            imgs = convert_from_path(str(pdf), dpi=OCR_DPI, first_page=first, last_page=last)
            for im in imgs:
                txts.append(pytesseract.image_to_string(im, lang="eng", config=OCR_CONFIG).strip())
        raw = "\n\n".join(t for t in txts if t).strip()
        return clean_lines_keep_structure(raw)
    except Exception as exc:
        # Best-effort by design; the caller records "error_empty" for "".
        warnings.warn(f"OCR failed for {pdf}: {exc!r}")
        return ""

def to_sentences(txt: str):
    """Clean the text with clean_lines_keep_structure, then split it into sentences per paragraph."""
    cleaned = clean_lines_keep_structure(txt)
    sentences = paragraphs_to_sentences(cleaned)
    return sentences

# ---------- Discover PDFs & group by company folder ----------
IN_ROOT = Path(IN_DIR)
# Fail loudly on a wrong/unmounted path instead of reporting "Done" with 0 PDFs.
if not IN_ROOT.is_dir():
    raise FileNotFoundError(f"IN_DIR does not exist or is not a folder: {IN_ROOT}")
# Match the extension case-insensitively so "*.PDF" files are not missed.
all_pdfs = sorted(
    p for p in IN_ROOT.rglob("*")
    if p.suffix.lower() == ".pdf" and p.is_file()
)
# Group PDFs by their immediate parent folder (one folder per company).
groups = defaultdict(list)
for pdf in all_pdfs:
    groups[pdf.parent].append(pdf)

# Running totals across all folders, plus one summary dict per folder.
total_processed = 0
total_skipped = 0
per_folder_summaries = []

# ---------- Process per folder ----------
# For each company folder: extract text for every PDF not yet in the manifest,
# write <hash>.txt and <hash>.parquet, then merge the new rows into the manifest.
for folder, pdfs in groups.items():
    out_root = folder / "esg_stage1(ver3)"
    texts_dir = out_root / "texts"
    sents_dir = out_root / "sentences"
    texts_dir.mkdir(parents=True, exist_ok=True)
    sents_dir.mkdir(parents=True, exist_ok=True)

    manifest_path = out_root / "manifest.parquet"
    existing = pd.read_parquet(manifest_path) if (SKIP_EXISTING and manifest_path.exists()) else pd.DataFrame()
    # Build the skip set once instead of scanning the manifest column for every PDF.
    already_done = set(existing["filename"]) if not existing.empty else set()

    rows = []
    skipped_here = 0
    processed_here = 0

    try:
        for pdf in tqdm(pdfs, desc=f"Processing PDFs in {folder.name}", leave=False):
            filename = pdf.name
            if SKIP_EXISTING and filename in already_done:
                skipped_here += 1
                continue

            year = extract_year(pdf); rtype = extract_type(pdf); hid = sha256_16(pdf)
            text_path = texts_dir / f"{hid}.txt"
            sent_path = sents_dir / f"{hid}.parquet"

            # 1) Native extraction
            txt = native_text(pdf)
            if txt and len(txt) >= NATIVE_MIN_CHARS:
                status = "ok_native"
            else:
                # 2) Full-document OCR
                txt = ocr_fulldoc(pdf)
                status = "ok_ocr" if txt and len(txt) >= NATIVE_MIN_CHARS else "error_empty"

            # Save extracted text and its sentences
            text_path.write_text(txt or "", encoding="utf-8")
            pd.DataFrame({"text": to_sentences(txt) if txt else []}).to_parquet(sent_path, index=False)

            rows.append({
                "sha256_16": hid,
                "filename": filename,
                "pdf_path": str(pdf),
                "text_path": str(text_path),
                "sentences_path": str(sent_path),
                "status": status,
                "doc_year": year,
                "report_type": rtype
            })
            processed_here += 1
    finally:
        # Merge+save manifest in the same folder. Done in `finally` so that an
        # interrupted run still records the PDFs it completed (keeps the run resumable).
        if rows:
            manifest = (pd.concat([existing, pd.DataFrame(rows)], ignore_index=True)
                        if not existing.empty else pd.DataFrame(rows))
            manifest = manifest.drop_duplicates(subset=["filename"], keep="last")
            manifest.to_parquet(manifest_path, index=False)

    if not rows:
        # Nothing new: keep the existing manifest, or an empty frame with the expected columns.
        manifest = existing if not existing.empty else pd.DataFrame(columns=[
            "sha256_16","filename","pdf_path","text_path","sentences_path","status","doc_year","report_type"
        ])

    per_folder_summaries.append({
        "folder": str(folder),
        "pdfs_found": len(pdfs),
        "processed_now": processed_here,
        "skipped_existing": skipped_here,
        "manifest_path": str(manifest_path),
        "manifest_rows": len(manifest)
    })
    total_processed += processed_here
    total_skipped += skipped_here

# ---------- Result ----------
# Print overall totals, then a per-folder line for the first 10 folders only.
print("\n✅ Done.")
print(f"Folders: {len(groups)}  |  PDFs total: {len(all_pdfs)}")
print(f"Processed now: {total_processed}  |  Skipped (existing): {total_skipped}\n")
for s in per_folder_summaries[:10]:  # cap the listing at 10 folders to keep the output short
    print(f"- {Path(s['folder']).name}: found={s['pdfs_found']}, processed={s['processed_now']}, "
          f"skipped={s['skipped_existing']}, manifest_rows={s['manifest_rows']}")
    print(f"  → {s['manifest_path']}")


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Mounted at /content/drive


                                                                               


✅ Done.
Folders: 493  |  PDFs total: 2278
Processed now: 0  |  Skipped (existing): 2278

- 000. Mini Trial: found=5, processed=0, skipped=5, manifest_rows=5
  → /content/drive/MyDrive/Australia Award Scholarship/USYD/Dissertation/DATA/FORTUNE 500/000. Mini Trial/esg_stage1(ver3)/manifest.parquet
- 001. Walmart: found=6, processed=0, skipped=6, manifest_rows=6
  → /content/drive/MyDrive/Australia Award Scholarship/USYD/Dissertation/DATA/FORTUNE 500/001. Walmart/esg_stage1(ver3)/manifest.parquet
- 002. Amazon: found=5, processed=0, skipped=5, manifest_rows=5
  → /content/drive/MyDrive/Australia Award Scholarship/USYD/Dissertation/DATA/FORTUNE 500/002. Amazon/esg_stage1(ver3)/manifest.parquet
- 003. United Health: found=5, processed=0, skipped=5, manifest_rows=5
  → /content/drive/MyDrive/Australia Award Scholarship/USYD/Dissertation/DATA/FORTUNE 500/003. United Health/esg_stage1(ver3)/manifest.parquet
- 004. Apple: found=5, processed=0, skipped=5, manifest_rows=5
  → /content/drive/MyDr



## STAGE 1 WITH TEXT IMPROVEMENT (PAGE NUMBER)

In [None]:
# ===============================
# Stage 1 — Text Extraction (improved)
# Native PDF via pdftotext(-layout) → fallback PyPDF → OCR (layout-aware)
# - Header/footer stripping via page-line frequency
# - ToC page/line detection and suppression
# - Fix broken hyphens & wrapped lines
# - Bullet/leader normalization
# - Safe sentence joining before NLTK sent_tokenize
# ===============================
!apt-get -qq update && apt-get -qq install -y poppler-utils tesseract-ocr >/dev/null
!pip -q install pypdf pdf2image pytesseract nltk pandas pyarrow tqdm

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import os, re, io, subprocess, hashlib, unicodedata, statistics, string
from collections import defaultdict, Counter
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from pypdf import PdfReader
from pdf2image import convert_from_path
import pytesseract
import nltk; nltk.download('punkt', quiet=True)
import nltk; nltk.download('punkt_tab', quiet=True)
from nltk.tokenize import sent_tokenize

# ---------- CONFIG ----------
# Root folder searched recursively for *.pdf. Outputs are written into an
# "esg_stage1(ver4)" subfolder of each PDF's parent folder.
IN_DIR            = "/content/drive/MyDrive/Australia Award Scholarship/USYD/Dissertation/DATA/FORTUNE 500/"
SKIP_EXISTING     = True   # skip PDFs whose filename is already listed in the folder's manifest.parquet
NATIVE_MIN_CHARS  = 120    # minimum amount of extracted text; below it the next extractor (PyPDF, then OCR) is tried
OCR_DPI           = 300    # dpi passed to convert_from_path when rasterising pages for OCR
SENT_MINLEN       = 25     # sentences shorter than this many characters are discarded

# Cleanup toggles
DROP_TOC_PAGES            = True  # blank out whole pages flagged by is_toc_page()
DROP_TOC_LINES_EVERYWHERE = True  # also drop single lines flagged by is_toc_like_line() on any page
STRIP_HEADERS_FOOTERS     = True  # drop first/last page lines that repeat across pages
HEADER_FOOTER_MIN_PAGES   = 4     # if a top/bottom line repeats on >= this many pages → drop
WRAP_JOIN_WIDTH_CHARS     = 64    # if a short line (no terminal punctuation) followed by lowercase start → join
MAX_PAGE_NUMBER           = 400   # integers above this are ignored when counting small ints in is_toc_like_line()

# ---------- Regex & helpers ----------
# "_YYYY" (19xx or 20xx) immediately before the ".pdf" extension; case-insensitive.
YEAR_RE   = re.compile(r'_(19|20)\d{2}(?=\.pdf$)', re.I)
# Navigation/layout keywords; a line of <= 6 words containing one is dropped during cleaning.
# NOTE(review): the trailing \b after "pp\." / "p\." only matches when a word character
# follows the dot, so "p.12" matches but "p. 12" does not — confirm this is intended.
PAGINATION= re.compile(r"\b(page|pp\.|p\.|appendix|annex|exhibit|figure|table|chapter|section|see page)\b", re.I)
# Dot leaders (three or more dots) followed by a 1-4 digit number at end of line.
TOC_LEADER= re.compile(r"\.{3,}\s*\d{1,4}$")
# A line containing nothing but a 1-3 digit number (presumably a page number).
BARE_NUM  = re.compile(r"^\s*\d{1,3}\s*$")
# A line containing only "FY" + four digits, a 19xx/20xx year, or a quarter label Q1-Q4.
DATE_ONLY = re.compile(r"^\s*(?:FY\s*\d{4}|(19|20)\d{2}|Q[1-4])\s*$", re.I)
# A line that is exactly a contents/index heading.
SHORT_HEAD= re.compile(r"^(?:contents|table of contents|index)$", re.I)
SMALL_INT = re.compile(r"\b(?:[1-9]\d{0,2})\b")  # standalone integers 1..999 (no leading zero)
END_PUNCT = re.compile(r"[.!?…]\s*$")  # text ends with . ! ? or … (trailing whitespace allowed)
BULLET_CH = "•●▪◦‣∙·"  # characters treated as bullet glyphs
BULLET_RE = re.compile(rf"\s*[{re.escape(BULLET_CH)}]\s*")  # one bullet glyph plus surrounding whitespace (newlines included)
SOFT_HYPH = re.compile(r"(\w)-\s*\n\s*(\w)")  # hyphen wrap across newline: word char, "-", line break, word char

def extract_year(p: Path):
    """Return the report year encoded in a PDF file name, or None.

    The name must end in ``_YYYY.pdf`` (case-insensitive extension) with a
    year in the 1900s or 2000s, as defined by ``YEAR_RE``.
    """
    found = YEAR_RE.search(p.name)
    if found is None:
        return None
    # The matched text is "_YYYY"; skip the leading underscore.
    return int(found.group()[1:])

def extract_type(p: Path):
    """Classify a report from a marker in its file name.

    Returns "AR" when the upper-cased name contains "_AR_", "10K" when it
    contains "_10K_", and "ESG" otherwise. "_AR_" is checked first.
    """
    upper_name = p.name.upper()
    for marker, label in (("_AR_", "AR"), ("_10K_", "10K")):
        if marker in upper_name:
            return label
    return "ESG"

def sha256_16(p: Path):
    """Return the first 16 hex characters of the file's SHA-256 digest.

    The file is read in 1 MiB chunks so large PDFs are never held in memory
    as a whole.
    """
    digest = hashlib.sha256()
    with open(p, "rb") as fh:
        while True:
            chunk = fh.read(1 << 20)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()[:16]

def normalize_ws(text: str) -> str:
    """Apply Unicode NFKC normalisation and squeeze horizontal whitespace.

    Runs of spaces and tabs become a single space. Newlines and form feeds
    are left alone because later steps rely on them for line/page logic.
    """
    folded = unicodedata.normalize("NFKC", text)
    return re.sub(r"[ \t]+", " ", folded)

def is_toc_like_line(line: str) -> bool:
    """Heuristically decide whether a line looks like a table-of-contents entry.

    A line is flagged when any of the following holds:
      * it ends in dot leaders plus a page number (``TOC_LEADER``);
      * it contains three or more "title <number>" pairs;
      * it contains five or more small integers (<= ``MAX_PAGE_NUMBER``)
        and at most one punctuation character;
      * it mentions a typical contents keyword and has at least two pairs
        or at least four small integers.
    Blank lines are never flagged.
    """
    s = line.strip()
    if not s:
        return False

    # Classic dot leaders.
    if TOC_LEADER.search(s):
        return True

    # "Title 12" pairs: text followed by a 1-3 digit number, then either a
    # gap of two or more whitespace characters or the end of the line.
    pair_count = len(re.findall(r"[A-Za-z][\w’&/ -]{2,}\s{1,}\d{1,3}(?:\s{2,}|\s*$)", s))
    if pair_count >= 3:
        return True

    small_int_count = sum(1 for tok in SMALL_INT.findall(s) if int(tok) <= MAX_PAGE_NUMBER)
    punct_count = sum(1 for ch in s if ch in ",;:.!?()")
    if small_int_count >= 5 and punct_count <= 1:
        return True

    lowered = s.lower()
    toc_keywords = ("contents","data tables","endnotes","assurance","disclaimer","overview","goals summary")
    has_keyword = any(k in lowered for k in toc_keywords)
    return has_keyword and (pair_count >= 2 or small_int_count >= 4)

def is_toc_page(text: str) -> bool:
    """Return True when enough lines of a page look like contents entries.

    The page is flagged once the number of ToC-like lines reaches the larger
    of 5 and 30% (rounded down) of its non-blank lines.
    """
    content_lines = [ln for ln in text.splitlines() if ln.strip()]
    if not content_lines:
        return False
    threshold = max(5, int(0.3 * len(content_lines)))
    hits = 0
    for ln in content_lines:
        if is_toc_like_line(ln):
            hits += 1
    return hits >= threshold

def fingerprint_header_footer_per_page(pages: list[str]):
    """Collect the first and last non-blank line of every page.

    Returns two lists aligned with ``pages``: header candidates and footer
    candidates, each stripped of surrounding whitespace. A page without any
    non-blank line contributes "" to both lists.
    """
    headers, footers = [], []
    for page in pages:
        stripped = [s for s in map(str.strip, page.splitlines()) if s]
        first, last = (stripped[0], stripped[-1]) if stripped else ("", "")
        headers.append(first)
        footers.append(last)
    return headers, footers

def compute_drop_lines(headers, footers, min_pages=HEADER_FOOTER_MIN_PAGES):
    """Return the set of header/footer lines that repeat often enough to drop.

    ``headers`` and ``footers`` are counted independently; a non-empty line
    qualifies when it occurs at least ``min_pages`` times in either list.
    """
    repeated = set()
    for candidates in (headers, footers):
        for line, seen in Counter(candidates).items():
            if line and seen >= min_pages:
                repeated.add(line)
    return repeated

def strip_headers_footers(text: str, drop_lines: set[str]) -> str:
    """Remove every line whose stripped content is in ``drop_lines``.

    With an empty ``drop_lines`` the text is returned untouched; otherwise
    the surviving lines are re-joined with "\\n" (a trailing newline is not
    preserved).
    """
    if not drop_lines:
        return text
    return "\n".join(ln for ln in text.splitlines() if ln.strip() not in drop_lines)

def fix_broken_hyphens(text: str) -> str:
    """Undo hyphenation introduced by line wrapping.

    First re-joins "word-<newline>word" splits matched by ``SOFT_HYPH``
    (hyphen and line break are both removed), then deletes Unicode soft
    hyphens (U+00AD).
    """
    rejoined = SOFT_HYPH.sub(lambda m: m.group(1) + m.group(2), text)
    return rejoined.replace("\u00AD", "")

def join_wrapped_lines(text: str) -> str:
    """Merge lines that look like the continuation of the previous line.

    A line absorbs the following one (joined with a single space) while the
    accumulated line is non-empty, shorter than ``WRAP_JOIN_WIDTH_CHARS``,
    does not end in sentence punctuation, is not a contents/index heading,
    and the following line starts with a lowercase letter.
    """
    merged = []
    pending = None  # line currently being accumulated; None before the first line
    for raw in text.splitlines():
        follower = raw.lstrip()
        if (
            pending  # neither "nothing yet" nor an empty line can absorb text
            and len(pending) < WRAP_JOIN_WIDTH_CHARS
            and not END_PUNCT.search(pending)
            and follower[:1].islower()
            and not SHORT_HEAD.match(pending)
        ):
            pending = pending + " " + follower
            continue
        if pending is not None:
            merged.append(pending)
        pending = raw.rstrip()
    if pending is not None:
        merged.append(pending)
    return "\n".join(merged)

def normalize_bullets_and_leaders(text: str) -> str:
    """Replace bullet glyphs with plain-text separators.

    * A bullet (or run of bullets) at the start of a line becomes " - ".
      Only spaces/tabs before it are consumed, so the preceding line break
      survives and each list item stays on its own line.
    * Any bullet left after that is a mid-line separator and becomes " ; ",
      which helps sentence tokenisation.

    Previously a single pattern with leading ``\\s*`` handled both cases: it
    swallowed the newline in front of a line-leading bullet (gluing the item
    onto the previous line), and because it had already replaced every
    bullet, the " ; " pass could never match.

    Args:
        text: Text that may contain any of the glyphs in ``BULLET_CH``.

    Returns:
        The text with all bullet glyphs replaced.
    """
    bullet = f"[{re.escape(BULLET_CH)}]"
    # Pass 1: line-leading bullets. Whitespace after the glyph (including a
    # line break, for bullets printed on their own line) is consumed so the
    # marker ends up directly in front of the item text.
    text = re.sub(rf"(?m)^[ \t]*(?:{bullet}\s*)+", " - ", text)
    # Pass 2: whatever is left sits inside a line; keep line breaks intact.
    text = re.sub(rf"[ \t]*{bullet}[ \t]*", " ; ", text)
    return text

def clean_lines_keep_structure(text: str, drop_toc_pages=True, drop_toc_lines=True, strip_hf=True):
    """Remove layout noise from extracted text while keeping line structure.

    Steps, in order:
      1. normalise whitespace and split into pages on form feeds;
      2. optionally find header/footer lines that repeat across pages;
      3. per page: keep blank pages as they are, blank out ToC pages, strip
         repeated headers/footers, and drop non-content lines (bare page
         numbers, dot-leader entries, contents headings, short pagination
         lines, date-only lines and, optionally, ToC-like lines);
      4. re-join pages with newlines, repair hyphenation and wrapped lines,
         normalise bullets, and squeeze runs of 3+ newlines to 2.

    Args:
        text: Raw extracted text; pages are expected to be separated by "\\f".
        drop_toc_pages: Blank out pages flagged by ``is_toc_page``.
        drop_toc_lines: Drop lines flagged by ``is_toc_like_line``.
        strip_hf: Strip repeated first/last page lines.

    Returns:
        The cleaned text with leading/trailing whitespace removed.
    """
    # NOTE(review): whitespace is collapsed here, before the ToC heuristics
    # run, so the "\s{2,}" gap that is_toc_like_line() looks for between
    # title/page pairs can presumably no longer occur within a line —
    # confirm whether ToC detection should see the original spacing.
    text = normalize_ws(text)
    pages = text.split("\f")

    repeated = set()
    if strip_hf:
        head_lines, foot_lines = fingerprint_header_footer_per_page(pages)
        repeated = compute_drop_lines(head_lines, foot_lines, HEADER_FOOTER_MIN_PAGES)

    def is_noise(s):
        # True for a stripped, non-empty line that carries no real content.
        return bool(
            BARE_NUM.match(s)
            or TOC_LEADER.search(s)
            or SHORT_HEAD.match(s)
            or (PAGINATION.search(s) and len(s.split()) <= 6)
            or DATE_ONLY.match(s)
            or (drop_toc_lines and is_toc_like_line(s))
        )

    cleaned_pages = []
    for page in pages:
        if not page.strip():
            # Whitespace-only page: pass through unchanged.
            cleaned_pages.append(page)
        elif drop_toc_pages and is_toc_page(page):
            # Keep a placeholder so the page count stays traceable.
            cleaned_pages.append("")
        else:
            if repeated:
                page = strip_headers_footers(page, repeated)
            survivors = [
                ln for ln in page.splitlines()
                if not ln.strip() or not is_noise(ln.strip())
            ]
            cleaned_pages.append("\n".join(survivors))

    result = "\n".join(cleaned_pages)
    result = fix_broken_hyphens(result)
    result = join_wrapped_lines(result)
    result = normalize_bullets_and_leaders(result)
    # Squeeze 3+ newlines to 2 so paragraph boundaries remain visible.
    return re.sub(r"\n{3,}", "\n\n", result).strip()

def to_sentences(txt: str):
    """Clean extracted text and split it into sentences.

    The text is cleaned with ``clean_lines_keep_structure`` (using the
    module-level cleanup toggles), line breaks that precede a capitalised
    line are turned into sentence boundaries, and the result is tokenised
    with NLTK's ``sent_tokenize``.

    Args:
        txt: Raw text of one document.

    Returns:
        List of stripped sentences of at least ``SENT_MINLEN`` characters.
    """
    cleaned = clean_lines_keep_structure(
        txt,
        drop_toc_pages=DROP_TOC_PAGES,
        drop_toc_lines=DROP_TOC_LINES_EVERYWHERE,
        strip_hf=STRIP_HEADERS_FOOTERS,
    )
    # Give unterminated lines/paragraphs a full stop when the next line starts
    # with a capital letter. The whole run of line breaks is consumed and the
    # lookbehind also rejects "\n": otherwise the second newline of a paragraph
    # break ("end.\n\nNext") is itself replaced, leaving a stray ". " in front
    # of the next sentence.
    cleaned = re.sub(r"(?<![.!?;\n])\n+(?=[A-Z])", ". ", cleaned)
    stripped = (s.strip() for s in sent_tokenize(cleaned))
    return [s for s in stripped if len(s) >= SENT_MINLEN]

# ---------- Native extraction ----------
def native_text_pdftotext(pdf: Path) -> str:
    """Extract text with the ``pdftotext`` command-line tool.

    Runs ``pdftotext -layout -q <pdf> -`` and decodes its stdout as UTF-8,
    ignoring undecodable bytes. Best effort: any failure (tool missing,
    non-zero exit status, ...) yields "" so the caller can fall back to
    another extractor.
    """
    try:
        # "-layout" keeps the physical layout, "-q" silences messages and the
        # final "-" sends the text to stdout.
        command = ["pdftotext", "-layout", "-q", str(pdf), "-"]
        raw_bytes = subprocess.check_output(command, stderr=subprocess.DEVNULL)
        return raw_bytes.decode("utf-8", errors="ignore")
    except Exception:
        return ""

def native_text_pypdf(pdf: Path) -> str:
    """Extract text page by page with PyPDF.

    Page texts are joined with "\\n\\f\\n" so page boundaries remain
    detectable; a page without extractable text contributes "". Best
    effort: any exception yields "".
    """
    try:
        reader = PdfReader(str(pdf))
        page_texts = []
        for page in reader.pages:
            page_texts.append(page.extract_text() or "")
        # Form feed between pages keeps page separators.
        return "\n\f\n".join(page_texts)
    except Exception:
        return ""

def native_text(pdf: Path) -> str:
    """Return natively embedded text, preferring pdftotext over PyPDF.

    The pdftotext result is used when, after stripping, it holds at least
    ``NATIVE_MIN_CHARS`` characters; otherwise whatever PyPDF extracts is
    returned (which may itself be short or empty).
    """
    primary = native_text_pdftotext(pdf)
    if len(primary.strip()) >= NATIVE_MIN_CHARS:
        return primary
    return native_text_pypdf(pdf)

# ---------- OCR extraction ----------
def ocr_fulldoc(pdf: Path) -> str:
    """OCR every page of a PDF and return the combined text.

    Pages are rasterised at ``OCR_DPI`` and recognised with Tesseract
    (English, "--oem 1 --psm 6"). Page texts are joined with "\\n\\f\\n".
    Best effort: any exception yields "".
    """
    try:
        page_images = convert_from_path(str(pdf), dpi=OCR_DPI)
        page_texts = [
            pytesseract.image_to_string(img, lang="eng", config="--oem 1 --psm 6") or ""
            for img in page_images
        ]
        return "\n\f\n".join(page_texts)
    except Exception:
        return ""

# ---------- Discover PDFs & group by company folder ----------
# Every *.pdf below IN_DIR, in sorted order (the glob pattern is lowercase).
IN_ROOT = Path(IN_DIR)
all_pdfs = sorted(IN_ROOT.rglob("*.pdf"))

# Bucket the PDFs by the folder that directly contains them.
groups = defaultdict(list)
for pdf in all_pdfs:
    groups[pdf.parent].append(pdf)

# Run-wide counters and one summary dict per processed folder.
total_processed = total_skipped = 0
per_folder_summaries = []

# ---------- Process per folder ----------
# For every folder: extract text from each PDF (native first, OCR as a
# fallback), write the normalised raw text and the cleaned sentences, and
# record one manifest row per PDF.
for folder, pdfs in groups.items():
    out_root  = folder / "esg_stage1(ver4)"
    texts_dir = out_root / "texts"
    sents_dir = out_root / "sentences"
    texts_dir.mkdir(parents=True, exist_ok=True)
    sents_dir.mkdir(parents=True, exist_ok=True)

    manifest_path = out_root / "manifest.parquet"
    existing = pd.read_parquet(manifest_path) if (SKIP_EXISTING and manifest_path.exists()) else pd.DataFrame()
    # File names already recorded in the manifest; built once per folder
    # instead of scanning the column for every PDF.
    already_done = set(existing["filename"]) if (SKIP_EXISTING and not existing.empty) else set()

    rows = []
    skipped_here = 0
    processed_here = 0

    for pdf in tqdm(pdfs, desc=f"Processing PDFs in {folder.name}", leave=False):
        filename = pdf.name
        if filename in already_done:
            skipped_here += 1
            continue

        year = extract_year(pdf)
        rtype = extract_type(pdf)
        hid = sha256_16(pdf)
        text_path = texts_dir / f"{hid}.txt"
        sent_path = sents_dir / f"{hid}.parquet"

        # 1) Native (pdftotext → PyPDF)
        raw = native_text(pdf)
        # 2) Fall back to OCR when the native text is too short. The length is
        #    measured after stripping whitespace: for a scanned PDF the native
        #    extractors return only page separators ("\f", "\n\f\n"), which on
        #    long documents would otherwise pass the threshold and skip OCR.
        if len(raw.strip()) >= NATIVE_MIN_CHARS:
            status = "ok_native"
        else:
            raw = ocr_fulldoc(pdf)
            status = "ok_ocr" if len(raw.strip()) >= NATIVE_MIN_CHARS else "error_empty"
        txt_to_save = raw

        # Save raw (normalized a bit) and sentences (post-clean)
        text_path.write_text(normalize_ws(txt_to_save) if txt_to_save else "", encoding="utf-8")
        sents = to_sentences(txt_to_save) if txt_to_save else []
        pd.DataFrame({"text": sents}).to_parquet(sent_path, index=False)

        rows.append({
            "sha256_16": hid,
            "filename": filename,
            "pdf_path": str(pdf),
            "text_path": str(text_path),
            "sentences_path": str(sent_path),
            "status": status,
            "doc_year": year,
            "report_type": rtype,
            "dropped_toc_pages": DROP_TOC_PAGES,
            "dropped_toc_lines": DROP_TOC_LINES_EVERYWHERE,
            "strip_headers_footers": STRIP_HEADERS_FOOTERS
        })
        processed_here += 1

    # Merge+save manifest (only rewritten when something new was processed)
    if rows:
        manifest = (pd.concat([existing, pd.DataFrame(rows)], ignore_index=True)
                    if not existing.empty else pd.DataFrame(rows))
        # If a file name appears twice, the most recent row wins.
        manifest = manifest.drop_duplicates(subset=["filename"], keep="last")
        manifest.to_parquet(manifest_path, index=False)
    else:
        manifest = existing if not existing.empty else pd.DataFrame(columns=[
            "sha256_16","filename","pdf_path","text_path","sentences_path","status",
            "doc_year","report_type","dropped_toc_pages","dropped_toc_lines","strip_headers_footers"
        ])

    per_folder_summaries.append({
        "folder": str(folder),
        "pdfs_found": len(pdfs),
        "processed_now": processed_here,
        "skipped_existing": skipped_here,
        "manifest_path": str(manifest_path),
        "manifest_rows": len(manifest)
    })
    total_processed += processed_here
    total_skipped += skipped_here

# ---------- Result ----------
# Run totals, followed by a per-folder line for the first ten folders only.
print("\n✅ Done.")
print(f"Folders: {len(groups)}  |  PDFs total: {len(all_pdfs)}")
print(f"Processed now: {total_processed}  |  Skipped (existing): {total_skipped}\n")
for summary in per_folder_summaries[:10]:
    folder_name = Path(summary['folder']).name
    print(f"- {folder_name}: found={summary['pdfs_found']}, processed={summary['processed_now']}, "
          f"skipped={summary['skipped_existing']}, manifest_rows={summary['manifest_rows']}")
    print(f"  → {summary['manifest_path']}")


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Mounted at /content/drive





✅ Done.
Folders: 493  |  PDFs total: 2278
Processed now: 2278  |  Skipped (existing): 0

- 000. Mini Trial: found=5, processed=5, skipped=0, manifest_rows=5
  → /content/drive/MyDrive/Australia Award Scholarship/USYD/Dissertation/DATA/FORTUNE 500/000. Mini Trial/esg_stage1(ver4)/manifest.parquet
- 001. Walmart: found=6, processed=6, skipped=0, manifest_rows=6
  → /content/drive/MyDrive/Australia Award Scholarship/USYD/Dissertation/DATA/FORTUNE 500/001. Walmart/esg_stage1(ver4)/manifest.parquet
- 002. Amazon: found=5, processed=5, skipped=0, manifest_rows=5
  → /content/drive/MyDrive/Australia Award Scholarship/USYD/Dissertation/DATA/FORTUNE 500/002. Amazon/esg_stage1(ver4)/manifest.parquet
- 003. United Health: found=5, processed=5, skipped=0, manifest_rows=5
  → /content/drive/MyDrive/Australia Award Scholarship/USYD/Dissertation/DATA/FORTUNE 500/003. United Health/esg_stage1(ver4)/manifest.parquet
- 004. Apple: found=5, processed=5, skipped=0, manifest_rows=5
  → /content/drive/MyDr