In [None]:
# Jupyter cell: test notebook for Docling pipeline integration

# DO NOT OVERWRITE LOGS, CREATE NEW LOG FILE BEFORE EACH NEW RUN WITH TIME AND DATE 
import os
import re
import io
import sys
import json
import ujson
import ast
import time
import logging
LOG_DIR = "/mnt/c/Users/WSTATION/Desktop/docling_mods/docling_test_logs"
os.makedirs(LOG_DIR, exist_ok=True)  # [QA CHANGE] Ensure directory exists
# /mnt/c/Users/WSTATION/Desktop/NEW_ETL

# One log file per run: the start time (local clock, to the second) is part of
# the file name, so an earlier run's log is never truncated. The previous fixed
# name "docling_testing.log" combined with filemode='w' overwrote the log on
# every run.
LOG_FILE = os.path.join(
    LOG_DIR, time.strftime("docling_testing_%Y%m%d_%H%M%S.log")
)

# NOTE: logging.basicConfig() does nothing when the root logger already has
# handlers, so re-running this cell in the same kernel keeps writing to the
# file chosen by the first run. Restart the kernel to start a new log file.
logging.basicConfig(
    filename=LOG_FILE,
    filemode='a',  # append: never truncate, even if two runs share a timestamp
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# set loggers for docling modules
docling_logger = logging.getLogger("docling")
docling_logger.setLevel(logging.DEBUG)

# Ensure logs propagate to the root logger (and therefore into LOG_FILE)
docling_logger.propagate = True
_log = logging.getLogger(__name__)

# Smoke-test records so a fresh log file shows that DEBUG and INFO both reach
# it. The text names this notebook as the emitter; it previously claimed to
# come from LayoutPostprocessor.py, which mis-attributed these lines.
_log.debug("Test debug message from docling test notebook")
_log.info("Test info message from docling test notebook")
import traceback  # [QA CHANGE] For logging full tracebacks
import requests
import pandas as pd

from docling_test_single_GPU import do_docling_extraction

import tiktoken
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock


In [None]:
# jupyter cell
# Path configuration for this notebook. All three values are absolute paths
# under /mnt/c/Users/WSTATION/... (looks like a WSL mount of a Windows drive -
# adjust them when running on another machine).

#-----------Directories----------------

# Folder for the PDFs; it is created at the bottom of this cell if missing.
PDF_SAVE_FOLDER = '/mnt/c/Users/WSTATION/Desktop/docling_mods/data/NEW_ETL_PDF'
# Feather file that pipeline() reads and hands to do_docling_extraction.
INTERMEDIATE_PATH = "/mnt/c/Users/WSTATION/Desktop/docling_mods/data/pre_docling.feather"


# Feather file that pipeline() writes its final DataFrame to.
OUTPUT_FEATHER = "/mnt/c/Users/WSTATION/Desktop/docling_mods/data/ETL_output_docling_ts.feather"

os.makedirs(PDF_SAVE_FOLDER, exist_ok=True) # [QA CHANGE] Ensure directory exists


In [None]:
# ─── patch PDFPath to relative repo paths ──────────────────────────────────────
import os
import pandas as pd

# The notebook is expected to run from docling_mods/scripts/, so the parent of
# the current working directory is treated as the repository root.
repo_root    = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
pdf_folder   = os.path.join(repo_root, "data", "NEW_ETL_PDF")
feather_file = os.path.join(repo_root, "data", "pre_docling.feather")

# Read the frame and replace the directory part of every PDFPath entry with
# pdf_folder (the file name itself is kept), then write the frame back over
# the very same Feather file.
df = pd.read_feather(feather_file).assign(
    PDFPath=lambda frame: frame["PDFPath"].apply(
        lambda pdf_path: os.path.join(pdf_folder, os.path.basename(pdf_path))
    )
)
df.to_feather(feather_file)

print(f"[+] PDFPath column now points at {pdf_folder}")
# ───────────────────────────────────────────────────────────────────────────────

In [None]:
# Main test cell
# ------------------------------------------------------------------------------
def pipeline():
    """
    Tests the integration of Docling.

    Reads the DataFrame stored at ``INTERMEDIATE_PATH``, passes it through
    ``do_docling_extraction`` and writes the result to ``OUTPUT_FEATHER``.

    Returns:
        pandas.DataFrame: the frame returned by ``do_docling_extraction``.
        It is returned even when writing the Feather file fails, so the
        extraction output is not lost together with the failed save.

    Raises:
        Any exception raised while reading the input or during extraction
        propagates to the caller. Failures while casting the citation columns
        or writing the output are logged (with traceback) and printed, but
        not re-raised.
    """
    print("\n=== Starting Docling test run ===")
    logging.info("Starting pipeline")

    # ------------------------------------------------
    # Docling extraction
    # ------------------------------------------------
    combined_df = pd.read_feather(INTERMEDIATE_PATH)
    combined_df = do_docling_extraction(combined_df)

    # if running debug, comment the docling function call above and uncomment the one below, BE SURE TO CHANGE THE OUTPUT DIRECTORY:
    # combined_df = do_docling_extraction(combined_df, output_dir="/mnt/c/Users/WSTATION/Desktop/docling_mods/data/docling_debug")
    # ------------------------------------------------
    # Save final results
    # ------------------------------------------------
    try:
        # Both citation columns are stored as plain strings
        # (presumably so mixed-type values serialize to Feather - TODO confirm).
        for column in ("InTextCitation", "FullCitation"):
            combined_df[column] = combined_df[column].astype(str)
        combined_df.to_feather(OUTPUT_FEATHER)
    except Exception as e:
        # logging.exception logs at ERROR level and appends the traceback,
        # which the previous logging.error call dropped.
        logging.exception("Error saving Feather: %s", e)
        print(f"[!] Error saving Feather file: {e}")
    else:
        logging.info("Results saved to %s", OUTPUT_FEATHER)
        print(f"[*] Results saved to {OUTPUT_FEATHER}")

    # Hand the frame back so a failed save does not throw away the extraction.
    return combined_df

# -----------------------------------------------------
# if run directly
# -----------------------------------------------------
if __name__ == "__main__":
    pipeline()

In [None]:
# inspect the results
# Load the frame that pipeline() wrote. OUTPUT_FEATHER (set in the directories
# cell) is reused instead of repeating the path literal, so this cell always
# reads the same file the pipeline saved to.
result_df = pd.read_feather(OUTPUT_FEATHER)