In [None]:
import xml.etree.ElementTree as ET
import requests
import csv
import fitz  # PyMuPDF
from io import BytesIO
from pdf2image import convert_from_bytes
import pytesseract
from bs4 import BeautifulSoup
from tqdm import tqdm

# Constants
# Endpoint that harvest_oai_records() sends its ListRecords requests to.
OAI_BASE = "https://digital.library.unt.edu/oai/"
# Value passed as the OAI "set" parameter on the first ListRecords request.
COLLECTION_SET = "collection:IIPCM"
# Prefix -> namespace URI map handed to ElementTree find()/findall() so the
# "oai:", "oai_dc:" and "dc:" prefixes in the XPath expressions resolve.
NAMESPACES = {
    "oai": "http://www.openarchives.org/OAI/2.0/",
    "oai_dc": "http://www.openarchives.org/OAI/2.0/oai_dc/",
    "dc": "http://purl.org/dc/elements/1.1/",
}
# Path of the CSV file that main() writes the processed records to.
OUTPUT_FILE = "iipcm_extracted_content.csv"

def get_first(elements):
    """Return the stripped text of the first element that has non-blank text.

    Args:
        elements: Iterable of objects exposing a ``text`` attribute, such as
            the list returned by ``Element.findall``.

    Returns:
        The first non-empty, whitespace-stripped text found, or ``""`` when
        no element carries any text.
    """
    for el in elements:
        # Strip before testing: a whitespace-only element is truthy, and
        # would otherwise be returned as "" ahead of a later real value.
        text = (el.text or "").strip()
        if text:
            return text
    return ""

def extract_pdf_text(pdf_url):
    """Download a PDF and return its text, falling back to OCR.

    The embedded text layer is read with PyMuPDF first. If that yields
    nothing (or PyMuPDF fails), every page is rendered to an image and run
    through Tesseract.

    Args:
        pdf_url: URL of the PDF to download.

    Returns:
        The stripped extracted text, or ``""`` when the response is not a
        200 ``application/pdf`` response or when extraction fails.
    """
    try:
        response = requests.get(pdf_url, timeout=15)
        if response.status_code != 200 or "application/pdf" not in response.headers.get("Content-Type", ""):
            return ""
        pdf_bytes = response.content

        # Try extracting using PyMuPDF
        try:
            # The context manager closes the document even if a page fails.
            with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
                text = "\n".join(page.get_text() for page in doc).strip()
            if text:
                return text
        except Exception:
            # Best effort: an unreadable text layer falls through to OCR.
            pass

        # Fallback to OCR
        images = convert_from_bytes(pdf_bytes)
        return "\n".join(pytesseract.image_to_string(img) for img in images).strip()

    except Exception as e:
        # Still best-effort, but report the failure instead of hiding it.
        print(f"❌ Error extracting PDF text from {pdf_url}: {e}")
        return ""

def extract_vtt_transcript(vtt_url):
    """Download a WebVTT file and return its cue text as a single string.

    Lines are dropped when they are blank, purely numeric, contain a
    ``-->`` timing arrow, start with ``WEBVTT`` or ``NOTE``, or start with
    ``vtt_`` (case-insensitive) and contain a colon. The remaining lines are
    joined with single spaces.

    Args:
        vtt_url: URL of the ``.vtt`` file.

    Returns:
        The joined transcript text, or ``""`` if the download fails.
    """
    try:
        # Bounded wait so one stalled server cannot hang the whole harvest.
        response = requests.get(vtt_url, timeout=15)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"❌ Error parsing VTT from {vtt_url}: {e}")
        return ""

    transcript = []
    for raw_line in response.text.splitlines():
        line = raw_line.strip()
        is_noise = (
            not line
            or line.isdigit()
            or "-->" in line
            or line.startswith(("WEBVTT", "NOTE"))
            or (":" in line and line.lower().startswith("vtt_"))
        )
        if not is_noise:
            transcript.append(line)

    return " ".join(transcript)

def harvest_oai_records():
    """Harvest Dublin Core metadata for the collection over OAI-PMH.

    Issues ``ListRecords`` requests against ``OAI_BASE`` and keeps following
    the ``resumptionToken`` until the response carries none. Records whose
    header is missing or marked ``deleted``, that have no ``oai_dc:dc``
    payload, or that have no identifier containing ``ark:/67531/`` are
    skipped.

    Returns:
        A list of dicts with the keys ``ark_url``, ``title``, ``date``,
        ``creator``, ``subject``, ``description`` and ``item_type``.

    Raises:
        requests.RequestException: If a request fails, times out, or returns
            an HTTP error status.
        xml.etree.ElementTree.ParseError: If a response body is not valid XML.
    """
    print("🔁 Harvesting metadata using: oai_dc")
    records = []
    token = None

    while True:
        if token:
            # Follow-up pages are requested with the resumption token only.
            params = {"verb": "ListRecords", "resumptionToken": token}
        else:
            params = {
                "verb": "ListRecords",
                "metadataPrefix": "oai_dc",
                "set": COLLECTION_SET,
            }

        # Bounded wait, and fail on HTTP errors here rather than letting an
        # error page surface later as an opaque XML ParseError.
        resp = requests.get(OAI_BASE, params=params, timeout=30)
        resp.raise_for_status()
        root = ET.fromstring(resp.content)

        for r in root.findall(".//oai:record", NAMESPACES):
            header = r.find("oai:header", NAMESPACES)
            if header is None or header.attrib.get("status") == "deleted":
                continue

            meta = r.find(".//oai_dc:dc", NAMESPACES)
            if meta is None:
                continue

            # The first identifier containing the ARK prefix is the item URL.
            ark_url = next(
                (
                    ident.text.strip()
                    for ident in meta.findall("dc:identifier", NAMESPACES)
                    if ident.text and "ark:/67531/" in ident.text
                ),
                "",
            )
            if not ark_url:
                continue

            records.append({
                "ark_url": ark_url,
                "title": get_first(meta.findall("dc:title", NAMESPACES)),
                "date": get_first(meta.findall("dc:date", NAMESPACES)),
                "creator": get_first(meta.findall("dc:creator", NAMESPACES)),
                "subject": "; ".join([el.text.strip() for el in meta.findall("dc:subject", NAMESPACES) if el.text]),
                "description": get_first(meta.findall("dc:description", NAMESPACES)),
                "item_type": get_first(meta.findall("dc:type", NAMESPACES)),
            })

        # A missing or empty resumptionToken ends the harvest.
        token_el = root.find(".//oai:resumptionToken", NAMESPACES)
        token = token_el.text.strip() if token_el is not None and token_el.text else None
        if not token:
            break

    print(f"✅ Retrieved {len(records)} metadata records.\n")
    return records

def resolve_pdf_link(folder_url):
    """Find the first ``.pdf`` link on the page at ``folder_url``.

    Args:
        folder_url: URL of the page to fetch (redirects are followed).

    Returns:
        The absolute URL of the first anchor whose ``href`` ends with
        ``.pdf``, or ``""`` if there is none or the lookup raises.
    """
    try:
        page = requests.get(folder_url, timeout=15, allow_redirects=True)
        anchors = BeautifulSoup(page.text, "html.parser").find_all("a")
        targets = (anchor.get("href", "") for anchor in anchors)
        first_pdf = next((target for target in targets if target.endswith(".pdf")), None)
        if first_pdf is not None:
            # Resolve relative hrefs against the page they were found on.
            return requests.compat.urljoin(folder_url, first_pdf)
    except Exception as e:
        print(f"❌ Error resolving PDF from folder: {folder_url} - {e}")
    return ""

def process_record(record):
    """Attach the source file URL and extracted full text to a record.

    Fetches the record's ARK page and scans its anchors in document order.
    For ``video`` items the first ``.vtt`` link is used as a transcript; for
    every other item type the first usable PDF link is used.

    Args:
        record: Dict with at least ``ark_url`` and ``item_type`` keys. It is
            modified in place.

    Returns:
        The same dict with ``source_url`` and ``full_text`` added. Both are
        ``""`` when nothing could be extracted. Carriage returns are removed
        from ``full_text`` and newlines are replaced by a literal ``\\n``.
    """
    ark_url = record["ark_url"]
    item_type = record["item_type"].lower()
    text = ""
    file_url = ""

    try:
        res = requests.get(ark_url, timeout=10)
        soup = BeautifulSoup(res.text, "html.parser")

        for link in soup.find_all("a"):
            href = link.get("href", "")
            full_url = requests.compat.urljoin(ark_url, href)

            if item_type == "video":
                if href.endswith(".vtt"):
                    text = extract_vtt_transcript(full_url)
                    file_url = full_url
                    break
                continue

            if href.endswith(".pdf"):
                text = extract_pdf_text(full_url)
                file_url = full_url
                break

            # Previously this folder check sat inside the ".pdf" test, where
            # an href can never also end with "/", so it never ran.
            # NOTE(review): the "/high_res_d/" suffix comes from the original
            # comment's example — confirm against live item pages.
            if href.endswith("/high_res_d/"):
                pdf_resolved = resolve_pdf_link(full_url)
                if pdf_resolved:
                    text = extract_pdf_text(pdf_resolved)
                    file_url = pdf_resolved
                    break

    except Exception as e:
        # Best effort per record: report and carry on with empty content.
        print(f"❌ Error processing record {ark_url}: {e}")

    record["source_url"] = file_url
    record["full_text"] = text.replace("\r", "").replace("\n", "\\n")  # Clean for CSV
    return record

def main():
    """Harvest the collection, extract full text, and export it as CSV.

    Calls harvest_oai_records(), runs process_record() on each result under
    a tqdm progress bar, then writes all rows to OUTPUT_FILE with a header.
    """
    harvested = harvest_oai_records()
    print("🔍 Extracting full text from associated files...\n")
    rows = [process_record(item) for item in tqdm(harvested, desc="📄 Processing")]

    columns = [
        "ark_url",
        "title",
        "date",
        "creator",
        "subject",
        "description",
        "item_type",
        "source_url",
        "full_text",
    ]
    with open(OUTPUT_FILE, "w", newline="", encoding="utf-8") as out:
        sheet = csv.DictWriter(out, fieldnames=columns)
        sheet.writeheader()
        for row in rows:
            sheet.writerow(row)

    print(f"\n✅ Exported {len(rows)} records with extracted content to: {OUTPUT_FILE}")

# Run the harvest only when executed as a script, not when imported.
if __name__ == "__main__":
    main()


🔁 Harvesting metadata using: oai_dc
✅ Retrieved 587 metadata records.

🔍 Extracting full text from associated files...



📄 Processing: 100%|██████████| 587/587 [14:06<00:00,  1.44s/it]


✅ Exported 587 records with extracted content to: iipcm_extracted_content.csv



