# Processing the TOCs of the PDFs for the TOC-based PageParser, the LLM-Refined PageParser and the GT_TOC Data

## Extracting TOC Metadata from the PDFs

In [None]:
#%pip install pdfminer.six
from enum import Enum, auto
from pathlib import Path
from typing import Any, Optional
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
from pdfminer.pdfpage import PDFPage, LITERAL_PAGE
from pdfminer.pdfparser import PDFParser, PDFSyntaxError
from pdfminer.pdftypes import PDFObjRef


class PDFRefType(Enum):
    """Kinds of reference that the page-number resolver distinguishes."""

    PDF_OBJ_REF = 1  # a ``PDFObjRef`` instance
    DICTIONARY = 2  # a dict that carries a "D" entry
    LIST = 3  # a list holding at least one ``PDFObjRef``
    NAMED_REF = 4  # a ``bytes`` destination name
    UNK = 5  # fallback


class RefPageNumberResolver:
    """PDF Reference to page number resolver.

    .. note::

       Remote Go-To Actions (see 12.6.4.3 in
       `https://www.adobe.com/go/pdfreference/`__)
       are out of the scope of this resolver.

    Attributes:
        document (:obj:`pdfminer.pdfdocument.PDFDocument`):
            The document that contains the references.
        objid_to_pagenum (:obj:`dict[int, int]`):
            Mapping from an object id to the number of the page that contains
            that object.
    """

    def __init__(self, document: PDFDocument):
        """Index every page of ``document`` by its object id.

        Args:
            document (:obj:`pdfminer.pdfdocument.PDFDocument`):
                The document whose references will be resolved.
        """
        self.document = document
        # obj_id -> page_number (page numbers start at 1)
        self.objid_to_pagenum: dict[int, int] = {
            page.pageid: page_num
            for page_num, page in enumerate(PDFPage.create_pages(document), 1)
        }

    @classmethod
    def get_ref_type(cls, ref: Any) -> PDFRefType:
        """Get the type of a PDF reference.

        Args:
            ref (:obj:`Any`):
                The PDF reference.

        Returns:
            :obj:`PDFRefType`: The detected kind, or ``PDFRefType.UNK`` when
            the value matches none of the supported shapes.
        """
        if isinstance(ref, PDFObjRef):
            return PDFRefType.PDF_OBJ_REF
        elif isinstance(ref, dict) and "D" in ref:
            return PDFRefType.DICTIONARY
        elif isinstance(ref, list) and any(isinstance(e, PDFObjRef) for e in ref):
            return PDFRefType.LIST
        elif isinstance(ref, bytes):
            return PDFRefType.NAMED_REF
        else:
            return PDFRefType.UNK

    @classmethod
    def is_ref_page(cls, ref: Any) -> bool:
        """Check whether a reference is of type '/Page'.

        Args:
            ref (:obj:`Any`):
                The PDF reference.

        Returns:
            :obj:`bool`: :obj:`True` if the reference references
            a page, :obj:`False` otherwise.
        """
        return isinstance(ref, dict) and "Type" in ref and ref["Type"] is LITERAL_PAGE

    def resolve(self, ref: Any) -> Optional[int]:
        """Resolve a PDF reference to a page number recursively.

        Args:
            ref (:obj:`Any`):
                The PDF reference.

        Returns:
            :obj:`Optional[int]`: The page number or :obj:`None`
            if the reference could not be resolved (e.g., remote Go-To
            Actions or malformed references).
        """
        ref_type = self.get_ref_type(ref)

        if ref_type is PDFRefType.PDF_OBJ_REF:
            # Dereference once and reuse the result for both checks.
            target = ref.resolve()
            if self.is_ref_page(target):
                return self.objid_to_pagenum.get(ref.objid)
            return self.resolve(target)

        if ref_type is PDFRefType.DICTIONARY:
            return self.resolve(ref["D"])

        if ref_type is PDFRefType.LIST:
            # Get the PDFObjRef in the list (usually first element).
            return self.resolve(next(e for e in ref if isinstance(e, PDFObjRef)))

        if ref_type is PDFRefType.NAMED_REF:
            # Imported here so the fix does not depend on the cell's import list.
            from pdfminer.pdfdocument import PDFDestinationNotFound

            try:
                destination = self.document.get_dest(ref)
            except (KeyError, PDFDestinationNotFound):
                # A name that is not defined in the document is a malformed
                # reference; report it as unresolved instead of raising.
                return None
            return self.resolve(destination)

        return None  # PDFRefType.UNK

In [None]:
import os
import re
import csv
import shutil
from string import digits
from pdfminer.pdfparser import PDFParser
import traceback
import unicodedata
from pdfminer.pdfdocument import PDFDocument
from collections import defaultdict

def remove_control_characters(s):
    """Return ``s`` without the characters whose Unicode category starts with "C"."""
    kept = []
    for char in s:
        if not unicodedata.category(char).startswith("C"):
            kept.append(char)
    return "".join(kept)

# a stopwordlist may be used, but we did not do that
#stopwordlist=["cover page", "cover", "front cover", "back cover", "half title", "series page", "title page", "dedication", "table of contents", "content", "contents", "acknowledgments","list of figures", "list of tables", "list of abbreviations","copyright", "copyright page","title page", "title", "imprint and copyright information", "credits", "references", "bibliography", "literature", "pagina vuota"]
stopwordlist=[]

pdf_dir = "../data/PDFs/"
toc_dir = "./TOCs/"

# Walk over every PDF and store its outline as CSV rows of level, title, page.
for f in os.listdir(pdf_dir):
    toc_file = toc_dir + f[:-4] + ".toc"

    # Checking if the file is a PDF and does not already have a corresponding TOC file
    if not f.endswith(".pdf") or os.path.exists(toc_file):
        continue

    try:
        rows = []
        # The context manager closes the PDF handle even when parsing fails.
        with open(pdf_dir + f, 'rb') as fp:
            parser = PDFParser(fp)
            document = PDFDocument(parser)
            ref_pagenum_resolver = RefPageNumberResolver(document)
            print(f)

            # Get the outlines of the document.
            try:
                outlines = document.get_outlines()
            except Exception as e:
                print(e)
                outlines = None

            if outlines:
                # Resolving the page number based on different attributes
                for (level, title, dest, a, se) in outlines:
                    if dest:
                        page_num = ref_pagenum_resolver.resolve(dest)
                    elif a:
                        page_num = ref_pagenum_resolver.resolve(a)
                    elif se:
                        page_num = ref_pagenum_resolver.resolve(se)
                    else:
                        page_num = None

                    # Cleaning and stripping control characters from the title
                    res = remove_control_characters(title).strip('"').strip("\r\n").strip("\n").strip(" ")
                    if res.lower() not in stopwordlist:
                        rows.append([level, res, page_num])

        # Writing TOC elements to a file. The rows are collected first so that
        # an error while reading the outline cannot leave a partial .toc file
        # behind, which the existence check above would skip on the next run.
        if outlines:
            if rows:
                # newline="" lets the csv module control the line endings.
                with open(toc_file, "w", encoding="utf-8-sig", newline="") as r:
                    csv.writer(r).writerows(rows)
            else:
                print("No TOC in the metadata of file: "+f)

    except Exception:
        # Report the failure and carry on with the next PDF.
        print(traceback.format_exc())

## Segmentation with the ground truth GT_TOC data using the XML file representation

In [None]:
#%pip install lxml
#%pip install wordsegment
#%pip install --upgrade ipywidgets

from collections import defaultdict, OrderedDict
import logging
import os
import csv
import json
import string
import signal
import traceback
from lxml import etree
from fuzzywuzzy import fuzz
from wordsegment import load, segment
import tqdm.notebook as tq
import chardet
import re
import logging

# Configure logging
# force=True replaces handlers left over from an earlier basicConfig() call in
# the same interpreter. Without it a repeated call is ignored and warnings keep
# going to whichever log file was configured first.
logging.basicConfig(
    filename='./log/unmatched_headings_gt.log',
    level=logging.WARNING,
    format='%(asctime)s - %(message)s',
    force=True,
)

# Initialize wordsegment
load()

# Custom exception for timeout handling
class TimeoutException(Exception):
    """Raised by the SIGALRM handler to abort work that ran past its alarm."""

# Custom OrderedDict subclass with default list values
class OrderedDictWithDefaultList(OrderedDict):
    """Ordered dictionary that creates an empty list for any missing key."""

    def __missing__(self, key):
        """Store a new empty list under ``key`` and return it."""
        self[key] = []
        return self[key]

# Signal handler for timeout
def timeout_handler(signum, frame):
    """Abort the interrupted code by raising ``TimeoutException``.

    Both arguments are part of the signal-handler signature and are not used.
    """
    raise TimeoutException()


# Install the handler for SIGALRM.
signal.signal(signal.SIGALRM, timeout_handler)

# Function to detect encoding of a file
def detect_encoding(file_path):
    """Return the encoding chardet reports for the file at ``file_path``.

    The file is read in binary mode and fed to the detector line by line;
    reading stops as soon as the detector reports that it is done.
    """
    with open(file_path, 'rb') as stream:
        sniffer = chardet.universaldetector.UniversalDetector()
        for raw_line in stream:
            sniffer.feed(raw_line)
            if sniffer.done:
                break
        sniffer.close()
    report = sniffer.result
    return report['encoding']

# Function to parse TOC files and return a defaultdict of lists
def parse_toc_files(toc_path, failed_files):
    """Read every ``.toc`` file in ``toc_path`` into a dict of TOC rows.

    Args:
        toc_path: Directory that holds the ``.toc`` CSV files.
        failed_files: Names (file name without the ``.toc`` suffix) that must
            be skipped. Entries may still carry a trailing line ending, as
            they do when the list comes straight from ``readlines()``.

    Returns:
        A ``defaultdict(list)`` mapping each file name without suffix to a
        list of ``[level, heading, page]`` rows. All three values are strings
        as read from the CSV; rows without exactly three fields are ignored.
    """
    # Strip line endings so that names read with readlines() actually match.
    # Previously "name\n" never equalled "name" and nothing was ever skipped.
    failed = {name.rstrip("\r\n") for name in failed_files}

    toc = defaultdict(list)
    for f in sorted(os.listdir(toc_path)):
        key = f[:-4]
        if not f.endswith(".toc") or key in failed:
            continue
        path = os.path.join(toc_path, f)
        enc = detect_encoding(path)
        with open(path, "r", encoding=enc) as r:
            for d in csv.reader(r, escapechar='\\'):
                if len(d) == 3:
                    level, heading, page = d
                    toc[key].append([level, heading, page])
    return toc

# Function to load XML content from a file path
def load_xml(xml_path):
    """Read ``xml_path`` as bytes and return what ``etree.fromstring`` parses from it."""
    with open(xml_path, 'rb') as handle:
        raw = handle.read()
    root = etree.fromstring(raw)
    return root

from collections import defaultdict
from lxml import etree
import re
from fuzzywuzzy import fuzz

# Function to group <text> elements by their top attribute
def group_text_by_top(text_elements):
    """Bucket the elements by the value of their ``top`` attribute.

    Returns a ``defaultdict(list)`` keyed by ``element.get("top")``. Buckets
    appear in first-seen order and keep the elements in input order.
    """
    by_top = defaultdict(list)
    for element in text_elements:
        by_top[element.get("top")].append(element)
    return by_top

# Function to combine text from nodes grouped by top attribute
def combine_text_by_top(lines):
    """Join the text of each ``top`` group into a single string.

    ``lines`` maps a ``top`` value to a list of nodes. Only ``etree._Element``
    instances contribute; their non-empty texts are joined with single spaces
    and the result is stripped.
    """
    def joined(nodes):
        pieces = [
            concatenate_text_from_node(node)
            for node in nodes
            if isinstance(node, etree._Element)
        ]
        return ' '.join(piece for piece in pieces if piece).strip()

    return {top: joined(nodes) for top, nodes in lines.items()}

# Function to concatenate text content from nodes, handling special formatting tags
def concatenate_text_from_node(node):
    """Return the text of ``node`` as one space-separated string.

    The node's own leading text comes first, followed by the ``text`` and
    ``tail`` of every formatting element (``b``, ``i``, ``u``, ``font``,
    ``a``) met while iterating over the node, in document order. Each piece
    is stripped and empty pieces are dropped.

    Earlier versions skipped the node's own text, so ``<text>Chapter
    <b>One</b></text>`` produced only "One". They also tested for
    ``etree._ElementUnicodeResult``, which ``iter()`` never yields.

    Args:
        node: An element offering ``iter()``, ``tag``, ``text`` and ``tail``.

    Returns:
        The concatenated text, or an empty string when there is none.
    """
    formatting_tags = ('b', 'i', 'u', 'font', 'a')  # Adjust as per your XML structure
    texts = []
    for child in node.iter():
        if child.tag in formatting_tags:
            if child.text:
                texts.append(child.text.strip())
            if child.tail:
                texts.append(child.tail.strip())
        elif child is node:
            # Text placed directly inside the node, before its first child.
            if child.text:
                texts.append(child.text.strip())
    return ' '.join(filter(None, texts)).strip()

# Function to check exact match between text and heading
def matches_heading_exact(text, heading):
    """Tell whether ``text`` equals ``heading`` once case and spaces are ignored.

    Returns ``False`` when either argument is ``None``.
    """
    if text is None or heading is None:
        return False

    def squash(value):
        return value.lower().strip().replace(" ", "")

    return squash(text) == squash(heading)

# Function to find substring match using regex between text and heading
def matches_heading(text, heading):
    """Tell whether ``heading`` occurs inside ``text`` after normalisation.

    Both strings are stripped, cleared of every character that is neither a
    word character nor whitespace, lower-cased and freed of spaces before the
    substring test.

    The function used to build both cleaned strings and then fall off the end,
    so it returned ``None`` for every input and this matching stage never fired.

    Args:
        text: Candidate text, or ``None``.
        heading: Heading to look for, or ``None``.

    Returns:
        ``True`` if the cleaned heading is non-empty and contained in the
        cleaned text, ``False`` otherwise (including when either is ``None``).
    """
    if text is None or heading is None:
        return False
    clean_text = re.sub(r'[^\w\s]', '', text.strip()).lower().replace(" ", "")
    clean_heading = re.sub(r'[^\w\s]', '', heading.strip()).lower().replace(" ", "")
    # An empty needle is a substring of everything; do not count it as a match.
    return bool(clean_heading) and clean_heading in clean_text
  
# Function to check fuzzy match between text and heading
def fuzzy_match_heading(text, heading):
    """Tell whether ``heading`` matches ``text`` via partial ratio or containment.

    Both strings lose every character that is neither a word character nor
    whitespace, then are stripped and lower-cased. The result is ``True`` when
    ``fuzz.partial_ratio`` of the two is 100 or when the cleaned heading is a
    substring of the cleaned text. ``None`` for either argument gives ``False``.
    """
    if text is None or heading is None:
        return False

    def normalise(value):
        return re.sub(r'[^\w\s]', '', value).strip().lower()

    haystack = normalise(text)
    needle = normalise(heading)
    if fuzz.partial_ratio(haystack, needle) == 100:
        return True
    return needle in haystack



def _text_matches_heading(text, heading):
    """Tell whether ``text`` matches ``heading`` under any of the three matchers."""
    return bool(
        matches_heading_exact(text, heading)
        or matches_heading(text, heading)
        or fuzzy_match_heading(text, heading)
    )


def segment_text(xml, toc, file_key):
    """Cut the document into sections at the TOC headings found on their pages.

    Args:
        xml: Parsed XML root. It is queried with ``xpath`` for ``page``
            elements (integer ``number`` attribute) and their ``text``
            descendants.
        toc: Iterable of ``[level, heading, page]`` rows; ``level`` and
            ``page`` must be convertible with ``int``.
        file_key: Name of the document, used only in log messages.

    Returns:
        A list of section dicts with the keys ``level``, ``heading`` and
        ``content`` (a list of strings). The first item is always the initial
        empty dict, which is why ``main`` tests ``len(segments) > 1``.

    Raises:
        ValueError: If a level or page cannot be converted to ``int`` or the
            XML has no ``page`` element.

    Changes from the earlier version: the combined-line fallback no longer
    raises ``KeyError`` when no section has been opened yet, the blocking
    ``input()`` after an unmatched heading is gone (it ran while the caller's
    alarm was counting), and an "Unmatched heading" message is written for
    exactly the headings that were not found rather than for all headings of
    a page whenever its last heading was not found.
    """
    # NOTE(review): text is only collected on pages that carry a TOC heading,
    # and only up to the element that matches it. Confirm this is intended.
    segments = []
    headings_per_page = defaultdict(list)

    # Convert toc_entries to a list of tuples with (heading, page_number, level)
    toc_entries = [(entry[1], int(entry[2]), int(entry[0])) for entry in toc]

    pages = xml.xpath('.//page')

    # Add a dummy heading with a maximum page number
    max_page_number = max(int(page.get("number")) for page in pages)
    toc_entries.append(("DUMMY_HEADING", max_page_number + 1, 0))

    current_section = {}
    toc_index = 0
    total_entries = len(toc_entries)

    for page in pages:
        page_number = int(page.get("number"))
        text_elements = page.xpath('.//text')

        # Group text elements by their top attribute
        combined_texts = combine_text_by_top(group_text_by_top(text_elements))

        # Collect the headings of this page. Entries are consumed strictly in
        # order, so an entry whose page number does not come up when it is
        # next in line also holds back every entry after it.
        while toc_index < total_entries and toc_entries[toc_index][1] == page_number:
            heading, _, level = toc_entries[toc_index]
            headings_per_page[page_number].append((heading, level))
            toc_index += 1

        for heading, level in headings_per_page[page_number]:
            found_match = False

            # First pass: single <text> elements.
            for txt in text_elements:
                text_content = concatenate_text_from_node(txt)
                if _text_matches_heading(txt.text, heading) or _text_matches_heading(text_content, heading):
                    found_match = True
                    break
                # Text in front of the heading belongs to the open section.
                if current_section:
                    current_section['content'].append(txt.text or text_content)

            # Second pass: whole lines (elements sharing a top value).
            if not found_match:
                for combined_text in combined_texts.values():
                    if _text_matches_heading(combined_text, heading):
                        found_match = True
                        break
                    if current_section:
                        current_section['content'].append(combined_text)

            if found_match:
                # Close the open section and start the one for this heading.
                segments.append(current_section)
                current_section = {'level':level,'heading': heading, 'content':[]}
            else:
                logging.warning(f'Unmatched heading: "{heading}" on page {page_number} in file {file_key}')
                print(f'Unmatched heading: "{heading}" on page {page_number} in file {file_key}')

    # Add the last current section
    segments.append(current_section)
    return segments

# Main function to orchestrate the process
def main():
    """Segment every ground-truth TOC document and store the result as JSON.

    For each parsed TOC whose XML file exists, ``segment_text`` runs under a
    15 minute alarm. Results with more than one segment are written to
    ``../data/gt_segments/segments_<name>.json``. Every other outcome (too few
    segments, timeout, any exception) appends the name to the failure log,
    whose entries are skipped on later runs.
    """
    log_path = "./log/gt_segmentation_log.txt"
    toc_path = "../data/GT_TOCs/"
    xml_base_path = "./xml/"

    def record_failure(name):
        with open(log_path, "a", encoding="utf-8-sig") as log:
            log.write(f"{name}\n")

    # Read failed files from log. Line endings are removed so the names can be
    # compared with file names, and a log that does not exist yet means that
    # nothing has failed so far.
    failed_files = []
    if os.path.exists(log_path):
        with open(log_path, "r", encoding="utf-8-sig") as log:
            failed_files = [line.rstrip("\n") for line in log if line.strip()]

    # Parse TOC files
    toc = parse_toc_files(toc_path, failed_files)

    # Iterate over each TOC entry and process corresponding XML file
    for k, v in tq.tqdm(toc.items()):
        print(v)
        xml_file_path = os.path.join(xml_base_path, f"{k}.xml")
        if not os.path.exists(xml_file_path) or k == "industrial-concentration-and-the-chicago-school-of-antitrust-analysis":
            print("Does not exist:")
            print(xml_file_path)
            continue

        enc2 = detect_encoding(xml_file_path)
        print(enc2)
        print(k)
        print("exists")
        try:
            # Parsing happens inside the try so that one malformed XML file is
            # logged as failed instead of ending the whole run.
            xml = load_xml(xml_file_path)
            signal.alarm(60 * 15)  # Set a timeout of 15 minutes
            try:
                segments = segment_text(xml, toc[k], k)
            finally:
                # Always cancel the alarm; a pending one would otherwise fire
                # later, outside any handler for TimeoutException.
                signal.alarm(0)

            if len(segments) > 1:
                with open(f"../data/gt_segments/segments_{k}.json", 'w', encoding="utf-8-sig") as s:
                    json.dump(segments, s)
            else:
                record_failure(k)
        except TimeoutException:
            record_failure(k)
        except Exception:
            print(traceback.format_exc())
            print("fail")
            print(k)
            record_failure(k)

if __name__ == "__main__":
    main()


## Segmenting the TOC-based PageParser data as well

In [None]:
from collections import defaultdict, OrderedDict
import logging
import os
import csv
import json
import string
import signal
import traceback
from lxml import etree
from fuzzywuzzy import fuzz
from wordsegment import load, segment
import tqdm.notebook as tq
import chardet
import re
import logging

# Configure logging
# force=True replaces handlers left over from an earlier basicConfig() call in
# the same interpreter. Without it a repeated call is ignored and warnings keep
# going to whichever log file was configured first.
logging.basicConfig(
    filename='./log/unmatched_headings_toc.log',
    level=logging.WARNING,
    format='%(asctime)s - %(message)s',
    force=True,
)

# Initialize wordsegment
load()

# Custom exception for timeout handling
class TimeoutException(Exception):
    """Raised by the SIGALRM handler to abort work that ran past its alarm."""

# Custom OrderedDict subclass with default list values
class OrderedDictWithDefaultList(OrderedDict):
    """Ordered dictionary that creates an empty list for any missing key."""

    def __missing__(self, key):
        """Store a new empty list under ``key`` and return it."""
        self[key] = []
        return self[key]

# Signal handler for timeout
def timeout_handler(signum, frame):
    """Abort the interrupted code by raising ``TimeoutException``.

    Both arguments are part of the signal-handler signature and are not used.
    """
    raise TimeoutException()


# Install the handler for SIGALRM.
signal.signal(signal.SIGALRM, timeout_handler)

# Function to detect encoding of a file
def detect_encoding(file_path):
    """Return the encoding chardet reports for the file at ``file_path``.

    The file is read in binary mode and fed to the detector line by line;
    reading stops as soon as the detector reports that it is done.
    """
    with open(file_path, 'rb') as stream:
        sniffer = chardet.universaldetector.UniversalDetector()
        for raw_line in stream:
            sniffer.feed(raw_line)
            if sniffer.done:
                break
        sniffer.close()
    report = sniffer.result
    return report['encoding']

# Function to parse TOC files and return a defaultdict of lists
def parse_toc_files(toc_path, failed_files):
    """Read the ``.toc`` files in ``toc_path`` that also exist as ground truth.

    Args:
        toc_path: Directory that holds the ``.toc`` CSV files.
        failed_files: Names (file name without the ``.toc`` suffix) that must
            be skipped. Entries may still carry a trailing line ending, as
            they do when the list comes straight from ``readlines()``.

    Returns:
        A ``defaultdict(list)`` mapping each file name without suffix to a
        list of ``[level, heading, page]`` rows. All three values are strings
        as read from the CSV; rows without exactly three fields are ignored.
        Only files with a same-named entry in ``../data/GT_TOCs/`` are read.
    """
    # Strip line endings so that names read with readlines() actually match.
    # Previously "name\n" never equalled "name" and nothing was ever skipped.
    failed = {name.rstrip("\r\n") for name in failed_files}

    # Listed once, on first use, instead of once per candidate file.
    gt_names = None

    toc = defaultdict(list)
    for f in sorted(os.listdir(toc_path)):
        key = f[:-4]
        if not f.endswith(".toc") or key in failed:
            continue
        if gt_names is None:
            gt_names = set(os.listdir("../data/GT_TOCs/"))
        if f not in gt_names:
            continue
        path = os.path.join(toc_path, f)
        enc = detect_encoding(path)
        with open(path, "r", encoding=enc) as r:
            for d in csv.reader(r, escapechar='\\'):
                if len(d) == 3:
                    level, heading, page = d
                    toc[key].append([level, heading, page])
    return toc

# Function to load XML content from a file path
def load_xml(xml_path):
    """Read ``xml_path`` as bytes and return what ``etree.fromstring`` parses from it."""
    with open(xml_path, 'rb') as handle:
        raw = handle.read()
    root = etree.fromstring(raw)
    return root

from collections import defaultdict
from lxml import etree
import re
from fuzzywuzzy import fuzz

# Function to group <text> elements by their top attribute
def group_text_by_top(text_elements):
    """Bucket the elements by the value of their ``top`` attribute.

    Returns a ``defaultdict(list)`` keyed by ``element.get("top")``. Buckets
    appear in first-seen order and keep the elements in input order.
    """
    by_top = defaultdict(list)
    for element in text_elements:
        by_top[element.get("top")].append(element)
    return by_top

# Function to combine text from nodes grouped by top attribute
def combine_text_by_top(lines):
    """Join the text of each ``top`` group into a single string.

    ``lines`` maps a ``top`` value to a list of nodes. Only ``etree._Element``
    instances contribute; their non-empty texts are joined with single spaces
    and the result is stripped.
    """
    def joined(nodes):
        pieces = [
            concatenate_text_from_node(node)
            for node in nodes
            if isinstance(node, etree._Element)
        ]
        return ' '.join(piece for piece in pieces if piece).strip()

    return {top: joined(nodes) for top, nodes in lines.items()}

# Function to concatenate text content from nodes, handling special formatting tags
def concatenate_text_from_node(node):
    """Return the text of ``node`` as one space-separated string.

    The node's own leading text comes first, followed by the ``text`` and
    ``tail`` of every formatting element (``b``, ``i``, ``u``, ``font``,
    ``a``) met while iterating over the node, in document order. Each piece
    is stripped and empty pieces are dropped.

    Earlier versions skipped the node's own text, so ``<text>Chapter
    <b>One</b></text>`` produced only "One". They also tested for
    ``etree._ElementUnicodeResult``, which ``iter()`` never yields.

    Args:
        node: An element offering ``iter()``, ``tag``, ``text`` and ``tail``.

    Returns:
        The concatenated text, or an empty string when there is none.
    """
    formatting_tags = ('b', 'i', 'u', 'font', 'a')  # Adjust as per your XML structure
    texts = []
    for child in node.iter():
        if child.tag in formatting_tags:
            if child.text:
                texts.append(child.text.strip())
            if child.tail:
                texts.append(child.tail.strip())
        elif child is node:
            # Text placed directly inside the node, before its first child.
            if child.text:
                texts.append(child.text.strip())
    return ' '.join(filter(None, texts)).strip()

# Function to check exact match between text and heading
def matches_heading_exact(text, heading):
    """Tell whether ``text`` equals ``heading`` once case and spaces are ignored.

    Returns ``False`` when either argument is ``None``.
    """
    if text is None or heading is None:
        return False

    def squash(value):
        return value.lower().strip().replace(" ", "")

    return squash(text) == squash(heading)

# Function to find substring match using regex between text and heading
def matches_heading(text, heading):
    """Tell whether ``heading`` occurs inside ``text`` after normalisation.

    Both strings are stripped, cleared of every character that is neither a
    word character nor whitespace, lower-cased and freed of spaces before the
    substring test.

    The function used to build both cleaned strings and then fall off the end,
    so it returned ``None`` for every input and this matching stage never fired.

    Args:
        text: Candidate text, or ``None``.
        heading: Heading to look for, or ``None``.

    Returns:
        ``True`` if the cleaned heading is non-empty and contained in the
        cleaned text, ``False`` otherwise (including when either is ``None``).
    """
    if text is None or heading is None:
        return False
    clean_text = re.sub(r'[^\w\s]', '', text.strip()).lower().replace(" ", "")
    clean_heading = re.sub(r'[^\w\s]', '', heading.strip()).lower().replace(" ", "")
    # An empty needle is a substring of everything; do not count it as a match.
    return bool(clean_heading) and clean_heading in clean_text
  
# Function to check fuzzy match between text and heading
def fuzzy_match_heading(text, heading):
    """Tell whether ``heading`` matches ``text`` via partial ratio or containment.

    Both strings lose every character that is neither a word character nor
    whitespace, then are stripped and lower-cased. The result is ``True`` when
    ``fuzz.partial_ratio`` of the two is 100 or when the cleaned heading is a
    substring of the cleaned text. ``None`` for either argument gives ``False``.
    """
    if text is None or heading is None:
        return False

    def normalise(value):
        return re.sub(r'[^\w\s]', '', value).strip().lower()

    haystack = normalise(text)
    needle = normalise(heading)
    if fuzz.partial_ratio(haystack, needle) == 100:
        return True
    return needle in haystack



def _text_matches_heading(text, heading):
    """Tell whether ``text`` matches ``heading`` under any of the three matchers."""
    return bool(
        matches_heading_exact(text, heading)
        or matches_heading(text, heading)
        or fuzzy_match_heading(text, heading)
    )


def segment_text(xml, toc, file_key):
    """Cut the document into sections at the TOC headings found on their pages.

    Args:
        xml: Parsed XML root. It is queried with ``xpath`` for ``page``
            elements (integer ``number`` attribute) and their ``text``
            descendants.
        toc: Iterable of ``[level, heading, page]`` rows; ``level`` and
            ``page`` must be convertible with ``int``.
        file_key: Name of the document, used only in log messages.

    Returns:
        A list of section dicts with the keys ``level``, ``heading`` and
        ``content`` (a list of strings). The first item is always the initial
        empty dict, which is why ``main`` tests ``len(segments) > 1``.

    Raises:
        ValueError: If a level or page cannot be converted to ``int`` or the
            XML has no ``page`` element.

    Change from the earlier version: an "Unmatched heading" message is written
    for exactly the headings that were not found, rather than for all headings
    of a page whenever its last heading was not found.
    """
    # NOTE(review): text is only collected on pages that carry a TOC heading,
    # and only up to the element that matches it. Confirm this is intended.
    segments = []
    headings_per_page = defaultdict(list)

    # Convert toc_entries to a list of tuples with (heading, page_number, level)
    toc_entries = [(entry[1], int(entry[2]), int(entry[0])) for entry in toc]

    pages = xml.xpath('.//page')

    # Add a dummy heading with a maximum page number
    max_page_number = max(int(page.get("number")) for page in pages)
    toc_entries.append(("DUMMY_HEADING", max_page_number + 1, 0))

    current_section = {}
    toc_index = 0
    total_entries = len(toc_entries)

    for page in pages:
        page_number = int(page.get("number"))
        text_elements = page.xpath('.//text')

        # Group text elements by their top attribute
        combined_texts = combine_text_by_top(group_text_by_top(text_elements))

        # Collect the headings of this page. Entries are consumed strictly in
        # order, so an entry whose page number does not come up when it is
        # next in line also holds back every entry after it.
        while toc_index < total_entries and toc_entries[toc_index][1] == page_number:
            heading, _, level = toc_entries[toc_index]
            headings_per_page[page_number].append((heading, level))
            toc_index += 1

        for heading, level in headings_per_page[page_number]:
            found_match = False

            # First pass: single <text> elements.
            for txt in text_elements:
                text_content = concatenate_text_from_node(txt)
                if _text_matches_heading(txt.text, heading) or _text_matches_heading(text_content, heading):
                    found_match = True
                    break
                # Text in front of the heading belongs to the open section.
                if current_section:
                    current_section['content'].append(txt.text or text_content)

            # Second pass: whole lines (elements sharing a top value).
            if not found_match:
                for combined_text in combined_texts.values():
                    if _text_matches_heading(combined_text, heading):
                        found_match = True
                        break
                    if current_section:
                        current_section['content'].append(combined_text)

            if found_match:
                # Close the open section and start the one for this heading.
                segments.append(current_section)
                current_section = {'level':level,'heading': heading, 'content':[]}
            else:
                logging.warning(f'Unmatched heading: "{heading}" on page {page_number} in file {file_key}')
                print(f'Unmatched heading: "{heading}" on page {page_number} in file {file_key}')

    # Add the last current section
    segments.append(current_section)
    return segments

# Main function to orchestrate the process
def main():
    """Segment every TOC-based parser document and store the result as JSON.

    For each parsed TOC whose XML file exists, ``segment_text`` runs under a
    15 minute alarm. Results with more than one segment are written to
    ``./segments/segments_<name>.json``. Every other outcome (too few
    segments, timeout, any exception) appends the name to the failure log,
    whose entries are skipped on later runs.
    """
    log_path = "./log/segmentation_log.txt"
    toc_path = "./TOCs/"
    xml_base_path = "./xml/"

    def record_failure(name):
        with open(log_path, "a", encoding="utf-8-sig") as log:
            log.write(f"{name}\n")

    # Read failed files from log. Line endings are removed so the names can be
    # compared with file names, and a log that does not exist yet means that
    # nothing has failed so far.
    failed_files = []
    if os.path.exists(log_path):
        with open(log_path, "r", encoding="utf-8-sig") as log:
            failed_files = [line.rstrip("\n") for line in log if line.strip()]

    # Parse TOC files
    toc = parse_toc_files(toc_path, failed_files)

    # Iterate over each TOC entry and process corresponding XML file
    for k, v in tq.tqdm(toc.items()):
        print(k)
        xml_file_path = os.path.join(xml_base_path, f"{k}.xml")
        if not os.path.exists(xml_file_path) or k == "industrial-concentration-and-the-chicago-school-of-antitrust-analysis":
            print("Does not exist:")
            print(xml_file_path)
            continue

        enc2 = detect_encoding(xml_file_path)
        print(enc2)
        print("exists")
        try:
            # Parsing happens inside the try so that one malformed XML file is
            # logged as failed instead of ending the whole run.
            xml = load_xml(xml_file_path)
            signal.alarm(60 * 15)  # Set a timeout of 15 minutes
            try:
                segments = segment_text(xml, toc[k], k)
            finally:
                # Always cancel the alarm; a pending one would otherwise fire
                # later, outside any handler for TimeoutException.
                signal.alarm(0)

            if len(segments) > 1:
                with open(f"./segments/segments_{k}.json", 'w', encoding="utf-8-sig") as s:
                    json.dump(segments, s)
            else:
                print("seglength")
                print(segments)
                # NOTE(review): this pauses the run until the user presses
                # Enter; kept as in the original, confirm it is still wanted.
                input()
                record_failure(k)
        except TimeoutException:
            record_failure(k)
        except Exception:
            print(traceback.format_exc())
            record_failure(k)


if __name__ == "__main__":
    main()


## Finally, segmenting the LLM-Refined PageParser data as well

In [9]:
from collections import defaultdict, OrderedDict
import logging
import os
import csv
import json
import string
import signal
import traceback
from lxml import etree
from fuzzywuzzy import fuzz
from wordsegment import load, segment
import tqdm.notebook as tq
import chardet
import re
import logging

# Configure logging
# force=True replaces handlers left over from an earlier basicConfig() call in
# the same interpreter. Without it a repeated call is ignored and warnings keep
# going to whichever log file was configured first.
logging.basicConfig(
    filename='./log/llm_unmatched_headings_toc.log',
    level=logging.WARNING,
    format='%(asctime)s - %(message)s',
    force=True,
)

# Initialize wordsegment
load()

# Custom exception for timeout handling
class TimeoutException(Exception):
    """Raised by the SIGALRM handler to abort work that ran past its alarm."""

# Custom OrderedDict subclass with default list values
class OrderedDictWithDefaultList(OrderedDict):
    """Ordered dictionary that creates an empty list for any missing key."""

    def __missing__(self, key):
        """Store a new empty list under ``key`` and return it."""
        self[key] = []
        return self[key]

# Signal handler for timeout
def timeout_handler(signum, frame):
    """Abort the interrupted code by raising ``TimeoutException``.

    Both arguments are part of the signal-handler signature and are not used.
    """
    raise TimeoutException()


# Install the handler for SIGALRM.
signal.signal(signal.SIGALRM, timeout_handler)

# Function to detect encoding of a file
def detect_encoding(file_path):
    """Return the encoding chardet reports for the file at ``file_path``.

    The file is read in binary mode and fed to the detector line by line;
    reading stops as soon as the detector reports that it is done.
    """
    with open(file_path, 'rb') as stream:
        sniffer = chardet.universaldetector.UniversalDetector()
        for raw_line in stream:
            sniffer.feed(raw_line)
            if sniffer.done:
                break
        sniffer.close()
    report = sniffer.result
    return report['encoding']

# Function to parse TOC files and return a defaultdict of lists
def parse_toc_files(toc_path, failed_files, gt_toc_path="../data/GT_TOCs/"):
    """Read the TOC CSV files in *toc_path* into a mapping of TOC rows.

    A CSV file is used only when all of the following hold:

    * its name ends in ``.csv``;
    * its name without the extension is not listed in *failed_files*;
    * *gt_toc_path* contains a file named like the CSV with the first seven
      characters and the extension removed, plus ``.toc``.

    Args:
        toc_path: Directory containing the TOC CSV files.
        failed_files: Iterable of file keys (CSV name without ``.csv``) to
            skip. Surrounding whitespace, such as the trailing newline left
            by ``readlines()``, is ignored.
        gt_toc_path: Directory holding the ground-truth ``.toc`` files.

    Returns:
        ``defaultdict(list)`` mapping each file key to a list of
        ``[level, heading, page]`` rows (all strings, as read from the CSV).
        Rows that do not have exactly three fields are dropped.
    """
    # Entries read with readlines() still end in "\n"; without stripping,
    # the membership test below can never match and nothing is skipped.
    failed = {str(name).strip() for name in failed_files}
    # List the ground-truth directory once instead of once per CSV file.
    gt_tocs = set(os.listdir(gt_toc_path))

    toc = defaultdict(list)
    for f in sorted(os.listdir(toc_path)):
        gt_name = f[7:-4] + ".toc"
        print(gt_name)
        if not f.endswith(".csv"):
            continue
        key = f[:-4]
        if key in failed or gt_name not in gt_tocs:
            continue
        csv_path = os.path.join(toc_path, f)
        enc = detect_encoding(csv_path)
        # newline="" lets the csv module handle line endings itself, as the
        # csv documentation recommends.
        with open(csv_path, "r", encoding=enc, newline="") as r:
            for row in csv.reader(r, escapechar='\\'):
                if len(row) == 3:
                    level, heading, page = row
                    toc[key].append([level, heading, page])
    return toc

# Function to load XML content from a file path
def load_xml(xml_path):
    """Parse the XML file at *xml_path* and return its root element.

    The file is read as raw bytes and handed to ``etree.fromstring``.
    """
    with open(xml_path, 'rb') as handle:
        raw_bytes = handle.read()
        return etree.fromstring(raw_bytes)

from collections import defaultdict
from lxml import etree
import re
from fuzzywuzzy import fuzz

# Function to group <text> elements by their top attribute
def group_text_by_top(text_elements):
    """Bucket elements by the value of their ``top`` attribute.

    Returns:
        ``defaultdict(list)`` mapping each ``top`` value (``None`` when the
        attribute is absent) to the elements carrying it, in input order.
    """
    grouped = defaultdict(list)
    for element in text_elements:
        grouped[element.get("top")].append(element)
    return grouped

# Function to combine text from nodes grouped by top attribute
def combine_text_by_top(lines):
    """Join the text of all elements that share a ``top`` value.

    Args:
        lines: Mapping of ``top`` value to a list of nodes.

    Returns:
        Dict mapping each ``top`` value to the space-joined, stripped text of
        its nodes. Nodes that are not ``etree._Element`` instances and nodes
        whose extracted text is empty are left out.
    """
    merged = {}
    for top, nodes in lines.items():
        pieces = []
        for node in nodes:
            if not isinstance(node, etree._Element):
                continue
            piece = concatenate_text_from_node(node)
            if piece:
                pieces.append(piece)
        merged[top] = ' '.join(pieces).strip()
    return merged

# Function to concatenate text content from nodes, handling special formatting tags
def concatenate_text_from_node(node):
    """Return the visible text of *node*, including inline formatting tags.

    The result is built from:

    * the node's own leading text (``node.text``);
    * the ``text`` and ``tail`` of every element in the subtree whose tag is
      one of ``b``, ``i``, ``u``, ``font`` or ``a`` (this includes *node*
      itself when it carries such a tag).

    Each piece is stripped, empty pieces are dropped, and the rest are joined
    with single spaces.

    Previously the node's own text was only picked up when the node was
    itself a formatting tag, so a plain ``<text>Heading</text>`` produced an
    empty string. The former ``_ElementUnicodeResult`` branch was removed
    because ``iter()`` yields elements, never string results.
    """
    formatting_tags = ('b', 'i', 'u', 'font', 'a')  # Adjust as per your XML structure
    texts = []
    # iter() walks the subtree in document order, starting with node itself.
    for position, child in enumerate(node.iter()):
        is_root = position == 0
        is_formatting = child.tag in formatting_tags
        if (is_root or is_formatting) and child.text:
            texts.append(child.text.strip())
        # A tail belongs to the parent's text flow, so the root's tail is
        # only taken when the root is itself a formatting tag (as before).
        if is_formatting and child.tail:
            texts.append(child.tail.strip())
    return ' '.join(filter(None, texts)).strip()

# Function to check exact match between text and heading
def matches_heading_exact(text, heading):
    """Tell whether *text* equals *heading*, ignoring case and spaces.

    Both strings are lower-cased, stripped, and have every space character
    removed before they are compared. ``None`` on either side gives ``False``.
    """
    if text is None or heading is None:
        return False

    def squash(value):
        return value.lower().strip().replace(" ", "")

    return squash(text) == squash(heading)

# Function to find substring match using regex between text and heading
def matches_heading(text, heading):
    """Tell whether the normalised *heading* occurs inside the normalised *text*.

    Normalisation strips the string, removes every character that is neither
    a word character nor whitespace, lower-cases it, and removes spaces.

    The original implementation computed both normalised strings but never
    returned a result, so it always evaluated to ``None``.

    Args:
        text: Candidate text, or ``None``.
        heading: Heading to look for, or ``None``.

    Returns:
        ``True`` when the normalised heading is non-empty and is a substring
        of the normalised text, otherwise ``False``.
    """
    if text is None or heading is None:
        return False
    clean_text = re.sub(r'[^\w\s]', '', text.strip()).lower().replace(" ", "")
    clean_heading = re.sub(r'[^\w\s]', '', heading.strip()).lower().replace(" ", "")
    # An empty needle is a substring of everything; a heading that consists
    # only of punctuation must not match arbitrary text.
    if not clean_heading:
        return False
    return clean_heading in clean_text
  
# Function to check fuzzy match between text and heading
def fuzzy_match_heading(text, heading):
    """Loosely compare *text* with *heading* after removing punctuation.

    Both strings have non-word, non-whitespace characters removed and are
    stripped and lower-cased. The result is ``True`` when
    ``fuzz.partial_ratio`` of the two equals 100, or when the cleaned heading
    is a substring of the cleaned text. ``None`` on either side gives
    ``False``.
    """
    if text is None or heading is None:
        return False

    def normalise(value):
        return re.sub(r'[^\w\s]', '', value).strip().lower()

    cleaned_text = normalise(text)
    cleaned_heading = normalise(heading)
    if fuzz.partial_ratio(cleaned_text, cleaned_heading) == 100:
        return True
    return cleaned_heading in cleaned_text



def segment_text(xml, toc, file_key):
    """Split the text of a page-structured XML document at its TOC headings.

    For every ``page`` element the TOC entries assigned to that page are
    looked up, and each heading is searched for first in the individual
    ``text`` elements and then in the text lines obtained by grouping
    ``text`` elements on their ``top`` attribute. A hit closes the section
    that is currently open and opens a new one.

    Args:
        xml: Parsed document supporting ``xpath``; its ``page`` elements must
            carry an integer-convertible ``number`` attribute.
        toc: Sequence of entries indexed as ``entry[0]`` = level,
            ``entry[1]`` = heading, ``entry[2]`` = page; level and page must
            be convertible with ``int``. A dummy entry is appended to the
            internal copy only, not to *toc* itself.
        file_key: Identifier used only in the "Unmatched heading" messages.

    Returns:
        List of section dicts with the keys ``'level'``, ``'heading'`` and
        ``'content'`` (a list of text pieces).

    NOTE(review): ``current_section`` starts as ``{}`` and every check before
    appending is ``is not None``, so the first element of the returned list
    is that empty dict.

    NOTE(review): text is appended only inside the per-heading loops, so a
    page without TOC headings adds nothing to any section -- confirm that
    this is intended.
    """
    segments = []
    headings_per_page = defaultdict(list)

    # Convert toc_entries to a list of tuples with (heading, page_number, level)
    toc_entries = [(entry[1], int(entry[2]), int(entry[0])) for entry in toc]

    # Add a dummy heading with a maximum page number
    # (one past the highest page number in the document, so no page equals it)
    max_page_number = max([int(page.get("number")) for page in xml.xpath('.//page')])
    dummy_heading = ("DUMMY_HEADING", max_page_number + 1, 0)
    toc_entries.append(dummy_heading)

    # Empty dict rather than None: falsy for the `if current_section:` content
    # checks, but it passes the `is not None` checks before segments.append.
    current_section = {}
    toc_index = 0
    total_entries = len(toc_entries)

    for page in xml.xpath('.//page'):
        page_number = int(page.get("number"))
        text_elements = page.xpath('.//text')

        # Group text elements by their top attribute
        lines = group_text_by_top(text_elements)
        combined_texts = combine_text_by_top(lines)

        found_match = False

        # Build a dictionary of headings per page
        # NOTE(review): toc_index only advances while the next entry's page
        # equals the current page, so this presumably relies on the TOC being
        # ordered by page and on every TOC page occurring in the XML --
        # otherwise the index stays on that entry. TODO confirm.
        while toc_index < total_entries and toc_entries[toc_index][1] == page_number:
            heading, _, level = toc_entries[toc_index]
            headings_per_page[page_number].append((heading, level))
            toc_index += 1
        # Indexing the defaultdict here creates an entry (possibly an empty
        # list) for every page, so the `page_number in headings_per_page`
        # tests further down are always true.
        for heading, level in headings_per_page[page_number]:
            found_match = False
        # Iterate over text elements to find matches
            for txt in text_elements:
                text_content = concatenate_text_from_node(txt)
                # Reset per element; it is only True when leaving via `break`.
                found_match = False
                # Check if we have an entry in toc_entries for this page_number
                if page_number in headings_per_page:

                    # Exact match
                    if matches_heading_exact(txt.text, heading) or matches_heading_exact(text_content, heading):
                        if current_section is not None:
                            segments.append(current_section)
                        current_section = {'level':level,'heading': heading, 'content':[]}
                        found_match = True
                        break

                    # Fuzzy match or regex match
                    if matches_heading(txt.text, heading) or matches_heading(text_content, heading):
                        if current_section is not None:
                            segments.append(current_section)
                        current_section = {'level':level,'heading': heading, 'content':[]}
                        found_match = True
                        break

                    if fuzzy_match_heading(txt.text, heading) or fuzzy_match_heading(text_content, heading):
                        if current_section is not None:
                            segments.append(current_section)
                        current_section = {'level':level,'heading': heading, 'content':[]}
                        found_match = True
                        break

                # Append text content to current section if no match yet
                # (elements after the matching one are not visited for this
                # heading because of the `break` above)
                if not found_match:
                    if current_section:
                        current_section['content'].append(txt.text or text_content)

            # Check for exact, regex, and fuzzy matches for combined text lines
            # (fallback used only when no single text element matched)
            if not found_match:
                for combined_text in combined_texts.values():
                    found_match = False
                    if page_number in headings_per_page:


                            if matches_heading_exact(combined_text, heading):
                                if current_section is not None:
                                    segments.append(current_section)
                                current_section = {'level':level,'heading': heading, 'content':[]}
                                found_match = True
                                break

                            if matches_heading(combined_text, heading):
                                if current_section is not None:
                                    segments.append(current_section)
                                current_section = {'level':level,'heading': heading, 'content':[]}
                                found_match = True
                                break

                            if fuzzy_match_heading(combined_text, heading):
                                if current_section is not None:
                                    segments.append(current_section)
                                current_section = {'level':level,'heading': heading, 'content':[]}
                                found_match = True
                                break

                    # Append combined text to current section if no match yet
                    if not found_match:
                        if current_section:
                            current_section['content'].append(combined_text)

        # Log unmatched headings for the current page
        # NOTE(review): found_match holds the outcome for the last heading
        # handled on this page only, yet every heading of the page is reported
        # when it is False.
        if not found_match and page_number in headings_per_page:
            for heading, level in headings_per_page[page_number]:
                logging.warning(f'Unmatched heading: "{heading}" on page {page_number} in file {file_key}')
                print(f'Unmatched heading: "{heading}" on page {page_number} in file {file_key}')
                #input()
    # Add the last current section if it exists
    if current_section is not None:
        segments.append(current_section)
    return segments

# Main function to orchestrate the process
def main():
    """Segment every document that has a TOC and write the result as JSON.

    Steps:

    1. Read the keys of previously failed documents from the segmentation log
       (a missing log is treated as "no failures yet").
    2. Parse the TOC CSV files, skipping the failed keys.
    3. For each TOC, load ``./xml/<name>.xml``, run ``segment_text`` under a
       15-minute SIGALRM timeout and write
       ``./llm_segments/segments_<name>.json`` when more than one segment was
       produced.
    4. Append the document key to the log on timeout, on any other error, or
       when at most one segment was produced.

    ``<name>`` is the TOC key with its first seven characters removed.
    """
    log_path = "./log/llm_segmentation_log.txt"
    toc_path = "../title_candidates_clean/"
    xml_base_path = "./xml/"
    # Documents that are never processed. The original test compared the
    # prefixed key with this unprefixed name, so it could never match; both
    # forms are checked below.
    excluded = {"industrial-concentration-and-the-chicago-school-of-antitrust-analysis"}

    def record_failure(key):
        """Append *key* to the segmentation log."""
        with open(log_path, "a", encoding="utf-8-sig") as log:
            log.write(f"{key}\n")

    # Read failed files from log. Lines are stripped because readlines()-style
    # entries end in "\n" and would never equal a file key otherwise.
    try:
        with open(log_path, "r", encoding="utf-8-sig") as log:
            failed_files = [line.strip() for line in log if line.strip()]
    except FileNotFoundError:
        failed_files = []

    # Parse TOC files
    toc = parse_toc_files(toc_path, failed_files)

    # Iterate over each TOC entry and process corresponding XML file
    for k, entries in tq.tqdm(toc.items()):
        doc_name = k[7:]
        print(k)
        xml_file_path = os.path.join(xml_base_path, f"{doc_name}.xml")
        print("xml:")
        print(xml_file_path)

        is_excluded = k in excluded or doc_name in excluded
        if not os.path.exists(xml_file_path) or is_excluded:
            print("Does not exist:")
            print(xml_file_path)
            continue

        enc2 = detect_encoding(xml_file_path)
        print(enc2)
        print("exists")
        try:
            # Parsing happens inside the try so that one malformed XML file is
            # logged as failed instead of aborting the whole run.
            xml = load_xml(xml_file_path)
            signal.alarm(60 * 15)  # Set a timeout of 15 minutes
            try:
                segments = segment_text(xml, entries, k)
            finally:
                # Always cancel the alarm; otherwise an error in segment_text
                # leaves it pending and it fires later, outside this try.
                signal.alarm(0)

            if len(segments) > 1:
                with open(f"./llm_segments/segments_{doc_name}.json", 'w', encoding="utf-8-sig") as s:
                    json.dump(segments, s)
            else:
                print("seglength")
                print(segments)
                # NOTE(review): pauses the run until Enter is pressed; looks
                # like a debugging aid -- consider removing for batch runs.
                input()
                record_failure(k)
        except TimeoutException:
            record_failure(k)
        except Exception:
            print(traceback.format_exc())
            record_failure(k)
            

# Entry point: call main() only when this code runs as the top-level program.
if __name__ == "__main__":
    main()


6-traditional-medicines-law-and-the-dis-ordering-of-temporalities.toc
a-bird-that-flies-with-two-wings-kastom-and-state-justice-systems-in-vanuatu.toc
a-kind-of-mending-restorative-justice-in-the-pacific-islands.toc
access-controlled-the-shaping-of-power-rights-and-rule-in-cyberspace.toc
access-to-justice-and-legal-empowerment-making-the-poor-central-in-legal-development-co-operati.toc
access-to-knowledge-in-the-age-of-intellectual-property.toc
accountability-and-the-law-rights-authority-and-transparency-of-public-power.toc
administrative-decision-making-in-australian-migration-l.toc
advancing-equality-how-constitutional-rights-can-make-a-difference-worldwide.toc
aegis-or-achilles-heel-the-dilemma-of-homology-in-biopatents-in-the-wake-of-novozymes.toc
agriculture-and-food-security-in-china-what-effect-wto-accession-and-regional-trade-arrangements.toc
al-haq-a-global-history-of-the-first-palestinian-human-rights-organizati.toc
amicus-curiae-before-international-courts-and-tribunals.toc


  0%|          | 0/47 [00:00<?, ?it/s]

titles_6-traditional-medicines-law-and-the-dis-ordering-of-temporalities
xml:
./xml/6-traditional-medicines-law-and-the-dis-ordering-of-temporalities.xml
utf-8
exists
titles_a-bird-that-flies-with-two-wings-kastom-and-state-justice-systems-in-vanuatu
xml:
./xml/a-bird-that-flies-with-two-wings-kastom-and-state-justice-systems-in-vanuatu.xml
utf-8
exists
titles_a-kind-of-mending-restorative-justice-in-the-pacific-islands
xml:
./xml/a-kind-of-mending-restorative-justice-in-the-pacific-islands.xml
utf-8
exists
Unmatched heading: "Introduction" on page 59 in file titles_a-kind-of-mending-restorative-justice-in-the-pacific-islands
Unmatched heading: "tribal warfare and transformative justice in the new guinea highlands" on page 59 in file titles_a-kind-of-mending-restorative-justice-in-the-pacific-islands
Unmatched heading: "Alan Rumsey" on page 59 in file titles_a-kind-of-mending-restorative-justice-in-the-pacific-islands
Unmatched heading: "Department of Anthropology" on page 59 in file t

titles_bioethics-and-the-patent-eligibility-of-human-embryonic-stem-cells-related-inventions-in-europe
xml:
./xml/bioethics-and-the-patent-eligibility-of-human-embryonic-stem-cells-related-inventions-in-europe.xml
utf-8
exists
Unmatched heading: "Harvard Onco-mouse" on page 44 in file titles_bioethics-and-the-patent-eligibility-of-human-embryonic-stem-cells-related-inventions-in-europe
Unmatched heading: "T 19/90" on page 44 in file titles_bioethics-and-the-patent-eligibility-of-human-embryonic-stem-cells-related-inventions-in-europe
Unmatched heading: "vis-u00e0-vis" on page 44 in file titles_bioethics-and-the-patent-eligibility-of-human-embryonic-stem-cells-related-inventions-in-europe
titles_boats-to-burn-bajo-fishing-activity-in-the-australian-fishing-zone
xml:
./xml/boats-to-burn-bajo-fishing-activity-in-the-australian-fishing-zone.xml
utf-8
exists
Unmatched heading: "Published by ANU E Press" on page 4 in file titles_boats-to-burn-bajo-fishing-activity-in-the-australian-fishing-z

titles_revisiting-chinas-competition-law-and-its-interaction-with-intellectual-property-rights
xml:
./xml/revisiting-chinas-competition-law-and-its-interaction-with-intellectual-property-rights.xml
utf-8
exists
titles_second-generation-patents-in-pharmaceutical-innovati
xml:
./xml/second-generation-patents-in-pharmaceutical-innovati.xml
utf-8
exists
titles_the-constitution-and-governance-in-camer
xml:
./xml/the-constitution-and-governance-in-camer.xml
utf-8
exists
titles_the-endangered-species-act-history-implementation-successes-and-controversies
xml:
./xml/the-endangered-species-act-history-implementation-successes-and-controversies.xml
utf-8
exists
Unmatched heading: "Roots of Endangered Species Conservation" on page 8 in file titles_the-endangered-species-act-history-implementation-successes-and-controversies
Unmatched heading: "Some Basic Concepts" on page 8 in file titles_the-endangered-species-act-history-implementation-successes-and-controversies
Unmatched heading: "The Endange