Set the input PDF file and the path to its ground-truth outline JSON

In [31]:
# Both paths are derived from a single document ID so the PDF and its
# ground-truth JSON cannot drift apart when switching to another sample.
DATASET_DIR = "../hackathon-task/sample-1a/Datasets"
DOC_ID = "E0H1CM114"

# Input PDF to analyse.
filename = f"{DATASET_DIR}/Pdfs/{DOC_ID}.pdf"
# JSON file that is later read for the expected 'title' and 'outline'.
output_filename = f"{DATASET_DIR}/Output.json/{DOC_ID}.json"

Extract text from all pages in the file

In [32]:
import extract

# Open the PDF through the project's extractor and collect the text snippets
# of every page.
extractor = extract.TextExtractor(filename)
texts = extractor.extract_text_from_all_pages_multiprocessing()

# Page geometry is queried once, for page index 0, and exposed as notebook-level
# constants (assumes all pages share that size — TODO confirm).
dims = extractor.get_page_dimensions(0)
PAGE_WIDTH, PAGE_HEIGHT = dims["width"], dims["height"]

print(f"Loaded {len(texts)} text snippets.")

Loaded 626 text snippets.


Sort text snippets in human-readable order

In [33]:
def _reading_order_key(snippet):
    """Sort key: page first, then vertical position, then left edge of the bbox."""
    return (snippet['page'], snippet['y_position'], snippet['bbox'][0])


sorted_snippets = sorted(texts, key=_reading_order_key)
print(f"Sorted {len(sorted_snippets)} text snippets by page, y_position, and bbox[0].")

Sorted 626 text snippets by page, y_position, and bbox[0].


Merge snippets into lines based on position

In [34]:
def group_snippets_into_lines(snippets, y_tolerance=2.0):
    """
    Cluster consecutive snippets into visual lines.

    The snippets are walked in the order given. A snippet joins the current
    line when it is on the same page as the snippet immediately before it and
    their 'y_position' values differ by less than `y_tolerance`; otherwise it
    opens a new line. The comparison is always against the direct predecessor,
    not against the first snippet of the line.

    Args:
        snippets: sequence of dicts with at least 'page' and 'y_position'.
        y_tolerance: maximum (exclusive) vertical distance for two neighbouring
            snippets to count as the same line.

    Returns:
        A list of lines, each line being a list of the original snippet dicts.
        An empty or falsy input yields an empty list.
    """
    if not snippets:
        return []

    lines = []
    for snippet in snippets:
        if lines:
            predecessor = lines[-1][-1]
            same_page = snippet['page'] == predecessor['page']
            if same_page and abs(snippet['y_position'] - predecessor['y_position']) < y_tolerance:
                # Still on the line that is currently being built.
                lines[-1].append(snippet)
                continue
        # First snippet overall, or the previous line has ended.
        lines.append([snippet])
    return lines

#--- Now, let's process the lines into a cleaner format ---

def process_lines(grouped_lines):
    """
    Collapse each group of snippets into a single line record.

    For every group the snippets are ordered left to right (by bbox[0]), their
    texts are concatenated without a separator, and table-of-contents dot
    leaders are removed:
      * "Title ...... 12" -> "Title"   (three or more dots followed by a number)
      * otherwise, a text ending in "..." that contains more than three dots
        has its trailing dots (and any whitespace left behind) removed.
    Groups whose text is empty, before or after this cleanup, are skipped.

    The input lists are not modified.

    Args:
        grouped_lines: iterable of lists of snippet dicts; each snippet needs
            'text', 'page', 'y_position', 'bbox' (x0, y0, x1, y1),
            'font_size' and 'font_name'.

    Returns:
        A list of dicts with keys 'text', 'page', 'avg_font_size',
        'y_position', 'bbox' and 'font_name'. 'page', 'y_position' and
        'font_name' are taken from the left-most snippet of the line.
    """
    # Local import keeps the function usable even if the notebook-level
    # `import re` cell has not been executed yet.
    import re

    # Compiled once: heading text, then >= 3 dots, optional spaces, page number.
    toc_pattern = re.compile(r'^(.*?)\.{3,}\s*\d+$')

    processed_lines = []
    for line_snippets in grouped_lines:
        # Sort a copy so the caller's grouping is left untouched.
        ordered = sorted(line_snippets, key=lambda s: s['bbox'][0])

        full_text = "".join(s['text'] for s in ordered).strip()
        if not full_text:
            continue

        match = toc_pattern.match(full_text)
        if match:
            # Keep only the heading text of a table-of-contents entry.
            full_text = match.group(1).strip()
        elif full_text.endswith('...') and full_text.count('.') > 3:
            # Other trailing dot runs; strip again so no trailing space remains.
            full_text = full_text.rstrip('.').strip()

        # A line made only of dot leaders carries no text after cleanup.
        if not full_text:
            continue

        # Union of the snippet bounding boxes gives the bbox of the whole line.
        x0 = min(s['bbox'][0] for s in ordered)
        y0 = min(s['bbox'][1] for s in ordered)
        x1 = max(s['bbox'][2] for s in ordered)
        y1 = max(s['bbox'][3] for s in ordered)

        # Unweighted mean over snippets (not weighted by text length).
        avg_font_size = sum(s['font_size'] for s in ordered) / len(ordered)

        first = ordered[0]
        processed_lines.append({
            "text": full_text,
            "page": first['page'],
            "avg_font_size": avg_font_size,
            "y_position": first['y_position'],
            "bbox": (x0, y0, x1, y1),
            # Representative font name: the left-most snippet's font.
            "font_name": first['font_name']
        })
    return processed_lines

# Run the functions
# Stage 1: cluster the position-sorted snippets into visual lines.
grouped_lines = group_snippets_into_lines(snippets=sorted_snippets)
# Stage 2: collapse every cluster into one cleaned line record.
final_lines = process_lines(grouped_lines=grouped_lines)

# To inspect a few of the resulting records, e.g.:
# for line in final_lines[:4]:
#     print(line)

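Feature Engineering
List of features added:
- is_bold: the font name contains "bold", "black" or "heavy"
- is_all_caps: the text is upper-case and longer than 3 characters
- text_len: number of characters in the line
- starts_with_numbering: the line starts with a numbering pattern such as "1.", "1.1", "A.", "(a)", "iv." or "Chapter 1"
- relative_font_size: the line's average font size divided by the document's most common font size
- norm_y_pos: vertical position divided by the page height (0.0 = top, 1.0 = bottom)
- is_centered: the line's horizontal centre lies within 10% of the page width from the page centre
- space_before / space_after: vertical gap to the previous / next line on the same page (-1 when there is none)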


In [35]:
from collections import Counter

def get_doc_stats(lines):
    """
    Compute document-level statistics used by the feature engineering step.

    Currently this is only the modal font size: the most frequent
    'avg_font_size' (rounded to two decimals) among lines with non-empty text.
    On a tie, the size that occurs first in `lines` is chosen. When no line has
    text, 10.0 is returned as a default.

    Returns:
        dict with the single key 'modal_font_size'.
    """
    size_counts = Counter()
    for entry in lines:
        if entry['text']:
            size_counts[round(entry['avg_font_size'], 2)] += 1

    if not size_counts:
        # No usable line: fall back to the default body size.
        return {'modal_font_size': 10.0}

    (most_frequent_size, _occurrences), = size_counts.most_common(1)
    return {'modal_font_size': most_frequent_size}


In [36]:
import re
import statistics

def engineer_features(lines, doc_stats, page_width=None, page_height=None):
    """
    Enriches each line with new features for ML classification.

    The line dicts are updated in place and the same list is returned.

    Args:
        lines: line records with 'text', 'page', 'avg_font_size',
            'y_position', 'bbox' (x0, y0, x1, y1) and 'font_name', ordered so
            that neighbouring entries are neighbouring lines on the page.
        doc_stats: dict with 'modal_font_size' (see get_doc_stats).
        page_width: page width used for the centering feature. When omitted,
            the notebook-level PAGE_WIDTH is used.
        page_height: page height used to normalise the vertical position. When
            omitted, the notebook-level PAGE_HEIGHT is used.

    Returns:
        `lines`, each entry extended with: 'is_bold', 'is_all_caps',
        'text_len', 'starts_with_numbering', 'relative_font_size',
        'norm_y_pos', 'is_centered', 'space_before', 'space_after'.
    """
    modal_font_size = doc_stats['modal_font_size']
    if len(lines) == 0:
        return lines

    # Explicit arguments win; otherwise fall back to the notebook globals so
    # existing two-argument calls keep working.
    if page_width is None:
        page_width = PAGE_WIDTH
    if page_height is None:
        page_height = PAGE_HEIGHT

    # Substrings of a font name that indicate a heavy weight.
    bold_markers = ('bold', 'black', 'heavy', 'semibold')

    # Compiled once for all lines.
    # Catches: 1., 1.1, 1.1.1, A., (a), i., Chapter 1, Section A
    numbering_pattern = re.compile(
        r'^\s*(?:(?:Chapter|Section)\s+[\w\d]+|'  # "Chapter 1", "Section A"
        r'\d{1,2}(?:\.\d{1,2})*\.?|'             # "1.", "1.1", "1.1.1."
        r'[A-Z]\.|'                             # "A."
        r'\([a-z]\)|'                           # "(a)"
        r'[ivx]+\.)'                            # "i.", "iv."
    )

    page_center = page_width / 2
    centering_tolerance = 0.1 * page_width
    last_index = len(lines) - 1

    for i, line in enumerate(lines):
        text = line['text']

        # 1. Font weight, inferred from the font name.
        font_name_lower = line['font_name'].lower()
        line['is_bold'] = any(marker in font_name_lower for marker in bold_markers)

        # 2. All caps. isupper() is False for empty or non-alphabetic strings;
        # the length check avoids flagging very short strings and acronyms.
        line['is_all_caps'] = text.isupper() and len(text) > 3

        # Underline is not derived: the line records carry no style flags.

        # 3. Text length in characters.
        line['text_len'] = len(text)

        # 4. Starts with a numbering pattern.
        line['starts_with_numbering'] = bool(numbering_pattern.match(text))

        # 5. Font size relative to the document's most common size.
        if modal_font_size > 0:
            line['relative_font_size'] = line['avg_font_size'] / modal_font_size
        else:
            line['relative_font_size'] = 1.0

        # 6. Normalised vertical position: 0.0 is top of page, 1.0 is bottom.
        line['norm_y_pos'] = line['y_position'] / page_height

        # 7. Centering: is the line's centre within 10% of the page width
        # from the page centre?
        line_center = (line['bbox'][0] + line['bbox'][2]) / 2
        line['is_centered'] = abs(line_center - page_center) < centering_tolerance

        # 8. Vertical spacing to the neighbours on the same page;
        # -1 marks "no neighbour on this page".
        space_before = -1
        space_after = -1

        if i > 0 and lines[i - 1]['page'] == line['page']:
            space_before = line['bbox'][1] - lines[i - 1]['bbox'][3]

        if i < last_index and lines[i + 1]['page'] == line['page']:
            space_after = lines[i + 1]['bbox'][1] - line['bbox'][3]

        line['space_before'] = space_before
        line['space_after'] = space_after

    return lines

In [37]:
import json

document_stats = get_doc_stats(final_lines)
print(f"Document Statistics: {document_stats}\n")

featured_lines = engineer_features(final_lines, document_stats)
print(f"Engineered features for {len(featured_lines)} lines.\n")

# Preview only the first few records: dumping every line floods the cell
# output and bloats the saved notebook.
print(json.dumps(featured_lines[:3], indent=1))

Document Statistics: {'modal_font_size': 11.04}

Engineered features for 488 lines.

[
 {
  "text": "Ontario\u2019s Libraries",
  "page": 1,
  "avg_font_size": 15.960000038146973,
  "y_position": 72.08401489257812,
  "bbox": [
   95.44059753417969,
   72.08401489257812,
   238.64328002929688,
   94.5716552734375
  ],
  "font_name": "Arial-Black",
  "is_bold": true,
  "is_all_caps": false,
  "text_len": 19,
  "starts_with_numbering": false,
  "relative_font_size": 1.4456521773683852,
  "norm_y_pos": 0.09101517031891178,
  "is_centered": false,
  "space_before": -1,
  "space_after": 0.063812255859375
 },
 {
  "text": "Working Together",
  "page": 1,
  "avg_font_size": 15.960000038146973,
  "y_position": 94.63546752929688,
  "bbox": [
   95.44059753417969,
   94.63546752929688,
   235.36988830566406,
   117.12310791015625
  ],
  "font_name": "Arial-Black",
  "is_bold": true,
  "is_all_caps": false,
  "text_len": 16,
  "starts_with_numbering": false,
  "relative_font_size": 1.4456521773683

Creating Labeled dataset

In [38]:
from thefuzz import fuzz
import pandas as pd



def normalize_text(text):
    """
    Canonicalise a string for fuzzy comparison.

    Lower-cases the text, trims surrounding whitespace, maps en/em dashes to a
    plain hyphen and collapses every whitespace run to a single space.
    Anything that is not a `str` yields an empty string.
    """
    if isinstance(text, str):
        cleaned = text.lower().strip()
        cleaned = re.sub(r'[\u2013\u2014]', '-', cleaned)  # en dash / em dash -> "-"
        return re.sub(r'\s+', ' ', cleaned)                # one space per whitespace run
    return ""

def create_labeled_dataset_final(featured_lines, ground_truth_headings, ground_truth_title, fuzz_threshold=85):
    """
    Label every line as 'Title', a heading level, or 'Body Text'.

    Args:
        featured_lines: line dicts carrying at least 'text', 'page',
            'avg_font_size' and 'relative_font_size'. Pages are expected to be
            1-based on entry (the title search looks at page == 1).
        ground_truth_headings: dicts with 'page', 'level' and 'text'. Their
            pages must use the same numbering as `featured_lines` on entry.
        ground_truth_title: expected document title.
        fuzz_threshold: a fuzzy score must be strictly greater than this value
            to count as a match.

    Returns:
        A new list of shallow-copied line dicts, each with a 'label' and with
        'page' reduced by one (0-based). The input dicts are left untouched, so
        calling the function again on the same input gives the same result.
        Lines that already carry a 'label' keep it.
    """
    # Work on copies: mutating the caller's dicts would shift 'page' a second
    # time and skip all labeling when the cell is re-run.
    labeled_lines = [dict(line) for line in featured_lines]

    # Ground-truth headings grouped by page for quick lookup.
    gt_headings_by_page = {}
    for h in ground_truth_headings:
        gt_headings_by_page.setdefault(h['page'], []).append({
            'level': h['level'],
            'norm_text': normalize_text(h['text'])
        })

    # --- Generic Title Labeling ---
    # The most prominent line (largest relative font size) on the first page is
    # the title candidate; the first such line wins on a tie.
    page_1_indices = [i for i, line in enumerate(labeled_lines) if line['page'] == 1]
    if page_1_indices:
        candidate_index = max(page_1_indices, key=lambda i: labeled_lines[i]['relative_font_size'])
        title_candidate = labeled_lines[candidate_index]
        title_text_to_check = normalize_text(title_candidate['text'])

        # Titles split over two lines: merge the following line when its font
        # size is within one unit of the candidate's.
        next_index = candidate_index + 1
        merged_next_line = False
        if next_index < len(labeled_lines):
            next_line = labeled_lines[next_index]
            if abs(next_line['avg_font_size'] - title_candidate['avg_font_size']) < 1:
                title_text_to_check += " " + normalize_text(next_line['text'])
                merged_next_line = True

        # Accept the candidate only if it matches the ground-truth title.
        norm_gt_title = normalize_text(ground_truth_title)
        if fuzz.partial_ratio(title_text_to_check, norm_gt_title) > fuzz_threshold:
            title_candidate['label'] = 'Title'
            # Label the next line only when it was actually merged; testing for
            # a space in the text would also fire for any multi-word title.
            if merged_next_line:
                labeled_lines[next_index]['label'] = 'Title'

    # --- Label headings ---
    for line in labeled_lines:
        # Look up ground truth with the page number as it was on entry, then
        # shift the stored page to 0-based for the output.
        page_num = line['page']
        line['page'] -= 1
        if 'label' in line:
            continue

        line['label'] = 'Body Text'

        candidates = gt_headings_by_page.get(page_num)
        if not candidates:
            continue

        norm_line_text = normalize_text(line['text'])
        # Score each heading once; max() keeps the first best-scoring heading.
        best_score, best_match = max(
            ((fuzz.ratio(norm_line_text, h['norm_text']), h) for h in candidates),
            key=lambda scored: scored[0]
        )
        if best_score > fuzz_threshold:
            line['label'] = best_match['level']

    return labeled_lines

# --- FINAL USAGE ---
# with open(output_filename, 'r', encoding='utf-8') as f:
#     out = json.load(f)

# corrected_outline = [
#     {**h, 'page': h['page'] + 1} for h in out['outline']
# ]

# labeled_lines = create_labeled_dataset_final(
#     featured_lines,
#     corrected_outline,
#     out['title']
# )

# df = pd.DataFrame(labeled_lines)
# print(df[df['label'] != 'Body Text'][['text', 'page', 'label']])

In [None]:
import json

# Read the reference JSON; its 'outline' and 'title' entries are used below.
with open(output_filename, 'r', encoding='utf-8') as f:
    out = json.load(f)

# Copy every outline entry with its page number raised by one
# (presumably converting 0-based pages to 1-based — confirm against the data).
corrected_outline = []
for heading in out['outline']:
    shifted = dict(heading)
    shifted['page'] = heading['page'] + 1
    corrected_outline.append(shifted)

labeled_lines = create_labeled_dataset_final(featured_lines, corrected_outline, out['title'])

# Persist the labeled lines as a flat CSV table.
df = pd.DataFrame(labeled_lines)
df.to_csv('o.csv', index=False)

--- Starting Match Debugger ---

Checking for: 'ontario’s digital library' on Page 2
  ✅ Best Match Found: 'rfp: to develop the ontario digital library business plan march 2003'
  ➡️ Fuzzy Score: 49

Checking for: 'a critical component for implementing ontario’s road map to prosperity strategy' on Page 2
  ✅ Best Match Found: 'the concept of an odl is a key element in the roadmap to prosperity, a strategic plan to'
  ➡️ Fuzzy Score: 62

Checking for: 'summary' on Page 2
  ✅ Best Match Found: 'background'
  ➡️ Fuzzy Score: 24

Checking for: 'timeline:' on Page 2
  ✅ Best Match Found: 'information or question responses.'
  ➡️ Fuzzy Score: 28

Checking for: 'background' on Page 3
  ✅ Best Match Found: 'shared funding:'
  ➡️ Fuzzy Score: 40

Checking for: 'equitable access for all ontarians:' on Page 4
  ✅ Best Match Found: 'of electronic content on a consortia basis for all member libraries'
  ➡️ Fuzzy Score: 43

Checking for: 'shared decision-making and accountability:' on Page 4
  ✅ Bes