Set the input PDF file and its ground-truth JSON file

In [9]:
# Path to the PDF whose text is extracted and turned into per-line features.
filename = "../hackathon-task/sample-1a/Datasets/Pdfs/E0CCG5S312.pdf"
# Path to the JSON file for the same document. Despite the name it is only read
# in this notebook: a later cell loads its 'title' and 'outline' entries and uses
# them as ground truth for labeling.
output_filename = "../hackathon-task/sample-1a/Datasets/Output.json/E0CCG5S312.json"

Extract text from all pages in the file

In [2]:
import extract

# Build an extractor for the configured PDF and collect its text snippets.
extractor = extract.TextExtractor(filename)
texts = extractor.extract_text_from_all_pages_multiprocessing()

# Page geometry is queried once, for index 0 only; the feature-engineering cell
# reuses these two values for every line regardless of its page.
# NOTE(review): presumably index 0 is the first page and all pages share its
# size — confirm against the extractor and the input PDFs.
dims = extractor.get_page_dimensions(0)
PAGE_WIDTH = dims["width"]
PAGE_HEIGHT = dims["height"]

print(f"Loaded {len(texts)} text snippets.")

Loaded 507 text snippets.


Sort text snippets in human-readable order

In [3]:
# Order snippets by page, then by vertical position, then by left edge (bbox[0]).
# The line-grouping step below compares each snippet only with the one right
# before it, so it relies on this ordering.
sorted_snippets = sorted(texts, key=lambda s: (s['page'], s['y_position'], s['bbox'][0]))
print(f"Sorted {len(sorted_snippets)} text snippets by page, y_position, and bbox[0].")

Sorted 507 text snippets by page, y_position, and bbox[0].


Merge snippets into lines based on position

In [4]:
def group_snippets_into_lines(snippets, y_tolerance=2.0):
    """
    Cluster consecutive snippets into visual lines.

    A snippet joins the line currently being built when it is on the same page
    as the snippet immediately before it and their 'y_position' values differ
    by less than ``y_tolerance``. Otherwise it opens a new line. Input order is
    kept as-is, so callers should pass snippets that are already sorted.

    Returns a list of lines, each line being a list of the original snippet
    dicts. An empty (or None) input yields an empty list.
    """
    if not snippets:
        return []

    lines = []
    previous = None

    for snippet in snippets:
        continues_line = (
            previous is not None
            and snippet['page'] == previous['page']
            and abs(snippet['y_position'] - previous['y_position']) < y_tolerance
        )
        if continues_line:
            # Same page and close enough vertically: extend the open line.
            lines[-1].append(snippet)
        else:
            # First snippet, page break, or vertical jump: open a new line.
            lines.append([snippet])
        previous = snippet

    return lines

#--- Now, let's process the lines into a cleaner format ---

def process_lines(grouped_lines):
    """
    Collapse each group of snippets into a single line record.

    For every group the snippets are ordered left to right (by bbox[0]), their
    texts are concatenated, and table-of-contents leaders are cleaned up:

    * "Title ...... 12"  -> "Title"   (3+ dots followed by a trailing number)
    * "Title ...."       -> "Title"   (text ending in '...' with more than
                                       three dots overall)

    Groups whose text is empty, either originally or after this clean-up, are
    skipped. The input lists are not modified.

    Args:
        grouped_lines: iterable of lists of snippet dicts. Each snippet must
            provide 'text', 'page', 'y_position', 'bbox' (x0, y0, x1, y1),
            'font_size' and 'font_name'.

    Returns:
        A list of dicts with keys 'text', 'page', 'avg_font_size',
        'y_position', 'bbox' and 'font_name'. 'page', 'y_position' and
        'font_name' are taken from the left-most snippet of the line.
    """
    # Imported here so this cell does not rely on a later cell's `import re`.
    import re

    # Compiled once instead of on every iteration.
    toc_pattern = re.compile(r'^(.*?)\.{3,}\s*\d+$')

    processed_lines = []
    for line_snippets in grouped_lines:
        # Work on a sorted copy so the caller's lists keep their order.
        ordered = sorted(line_snippets, key=lambda s: s['bbox'][0])

        full_text = "".join(s['text'] for s in ordered).strip()
        if not full_text:
            continue

        match = toc_pattern.match(full_text)
        if match:
            # TOC entry: keep only the heading text.
            full_text = match.group(1).strip()
        elif full_text.endswith('...') and full_text.count('.') > 3:
            # Leader dots without a page number; also drop the space that
            # usually separates the text from the dots.
            full_text = full_text.rstrip('.').rstrip()

        # The clean-up can consume the whole line (e.g. "...... 7").
        if not full_text:
            continue

        # Union of the snippet boxes gives the bounding box of the whole line.
        x0 = min(s['bbox'][0] for s in ordered)
        y0 = min(s['bbox'][1] for s in ordered)
        x1 = max(s['bbox'][2] for s in ordered)
        y1 = max(s['bbox'][3] for s in ordered)

        avg_font_size = sum(s['font_size'] for s in ordered) / len(ordered)

        leftmost = ordered[0]
        processed_lines.append({
            "text": full_text,
            "page": leftmost['page'],
            "avg_font_size": avg_font_size,
            "y_position": leftmost['y_position'],
            "bbox": (x0, y0, x1, y1),
            # Representative font name: that of the left-most snippet.
            "font_name": leftmost['font_name']
        })
    return processed_lines

# Run the functions: first group the sorted snippets into visual lines, then
# merge each group into one line record (text, bbox, average font size, ...).
grouped_lines = group_snippets_into_lines(sorted_snippets)
final_lines = process_lines(grouped_lines)

# Display the result (disabled). The slice below is empty unless the document
# yields more than 1000 lines; adjust it before re-enabling.
# for line in final_lines[1000:1004]:
#     print(line)

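Feature Engineering
List of features added to each line:
- is_bold: the font name contains a bold-weight indicator
- is_all_caps: the text is upper-case and longer than 3 characters
- text_len: number of characters in the text
- starts_with_numbering: the text starts with a numbering pattern such as "1.", "1.1", "A.", "(a)", "i." or "Chapter 1"
- relative_font_size: line font size divided by the document's most common font size
- norm_y_pos: vertical position divided by the page height
- is_centered: the line's horizontal center is within 10% of the page width from the page center
- space_before / space_after: vertical gap to the previous / next line on the same page (-1 if there is none)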


In [5]:
from collections import Counter

def get_doc_stats(lines):
    """
    Compute document-level statistics used by feature engineering.

    Currently only the modal font size is reported: the most frequent
    'avg_font_size' (rounded to two decimals) among lines with non-empty text.
    If no such line exists, 10.0 is used as a default.
    """
    size_counts = Counter(
        round(entry['avg_font_size'], 2) for entry in lines if entry['text']
    )
    if not size_counts:
        # No usable line: fall back to the default size.
        return {'modal_font_size': 10.0}

    (most_common_size, _occurrences), = size_counts.most_common(1)
    return {'modal_font_size': most_common_size}


In [6]:
import re
import statistics

def engineer_features(lines, doc_stats, page_width=None, page_height=None):
    """
    Enrich each line with features for ML classification.

    The line dicts are updated in place and the same list is returned.

    Features written to every line:
        is_bold               font name contains a bold-weight indicator
        is_all_caps           text is upper-case and longer than 3 characters
        text_len              number of characters in the text
        starts_with_numbering text starts with "1.", "1.1", "A.", "(a)", "i.",
                              "Chapter 1", "Section A", ...
        relative_font_size    avg_font_size / modal font size (1.0 when the
                              modal size is not positive)
        norm_y_pos            y_position / page height
        is_centered           line center within 10% of the page width from
                              the page center
        space_before          gap between the previous line's bottom and this
                              line's top; -1 if there is no previous line on
                              the same page
        space_after           gap between this line's bottom and the next
                              line's top; -1 if there is no next line on the
                              same page

    Args:
        lines: list of line dicts with 'text', 'page', 'avg_font_size',
            'y_position', 'bbox' (x0, y0, x1, y1) and 'font_name'.
        doc_stats: dict providing 'modal_font_size'.
        page_width: page width used for centering. Defaults to the
            notebook-level PAGE_WIDTH.
        page_height: page height used for normalizing the vertical position.
            Defaults to the notebook-level PAGE_HEIGHT.

    Returns:
        The same ``lines`` list, with the features added.
    """
    # Fall back to the notebook globals so existing two-argument calls work.
    if page_width is None:
        page_width = PAGE_WIDTH
    if page_height is None:
        page_height = PAGE_HEIGHT

    modal_font_size = doc_stats['modal_font_size']

    # 'semibold' is already covered by the 'bold' substring; it is listed for
    # readability only.
    bold_indicators = ('bold', 'black', 'heavy', 'semibold')

    # Loop-invariant values are computed once, not once per line.
    numbering_pattern = re.compile(
        r'^\s*(?:(?:Chapter|Section)\s+[\w\d]+|'  # "Chapter 1", "Section A"
        r'\d{1,2}(?:\.\d{1,2})*\.?|'             # "1.", "1.1", "1.1.1."
        r'[A-Z]\.|'                             # "A."
        r'\([a-z]\)|'                           # "(a)"
        r'[ivx]+\.)'                            # "i.", "iv."
    )
    page_center = page_width / 2
    centering_tolerance = 0.1 * page_width
    last_index = len(lines) - 1

    for i, line in enumerate(lines):
        text = line['text']
        bbox = line['bbox']

        # 1. Font weight, inferred from the font name.
        font_name_lower = line['font_name'].lower()
        line['is_bold'] = any(indicator in font_name_lower for indicator in bold_indicators)

        # 2. All caps. isupper() is False for empty or non-alphabetic strings;
        #    the length check avoids flagging single letters or short acronyms.
        line['is_all_caps'] = text.isupper() and len(text) > 3

        # 3. Text length.
        line['text_len'] = len(text)

        # 4. Starts with numbering.
        line['starts_with_numbering'] = bool(numbering_pattern.match(text))

        # 5. Font size relative to the document's most common size.
        if modal_font_size > 0:
            line['relative_font_size'] = line['avg_font_size'] / modal_font_size
        else:
            line['relative_font_size'] = 1.0

        # 6. Normalized vertical position.
        line['norm_y_pos'] = line['y_position'] / page_height

        # 7. Centering: is the line's center within 10% of the page center?
        line_center = (bbox[0] + bbox[2]) / 2
        line['is_centered'] = abs(line_center - page_center) < centering_tolerance

        # 8. Vertical spacing to the neighbours on the same page; -1 marks
        #    "no neighbour on this page".
        space_before = -1
        space_after = -1

        if i > 0 and lines[i - 1]['page'] == line['page']:
            space_before = bbox[1] - lines[i - 1]['bbox'][3]

        if i < last_index and lines[i + 1]['page'] == line['page']:
            space_after = lines[i + 1]['bbox'][1] - bbox[3]

        line['space_before'] = space_before
        line['space_after'] = space_after

    return lines

In [7]:
import json

document_stats = get_doc_stats(final_lines)
print(f"Document Statistics: {document_stats}\n")

featured_lines = engineer_features(final_lines, document_stats)
print(f"Engineered features for {len(featured_lines)} lines.\n")

# Preview only the first few records: dumping every line floods the cell output
# and bloats the saved notebook without adding information.
PREVIEW_LINE_COUNT = 3
print(json.dumps(featured_lines[:PREVIEW_LINE_COUNT], indent=1))

Document Statistics: {'modal_font_size': 9.96}

Engineered features for 283 lines.

[
 {
  "text": "Overview",
  "page": 1,
  "avg_font_size": 24.0,
  "y_position": 189.62002563476562,
  "bbox": [
   252.64999389648438,
   189.62002563476562,
   365.9539489746094,
   216.38002014160156
  ],
  "font_name": "Arial,Bold",
  "is_bold": true,
  "is_all_caps": false,
  "text_len": 8,
  "starts_with_numbering": false,
  "relative_font_size": 2.4096385542168672,
  "norm_y_pos": 0.23941922428632023,
  "is_centered": true,
  "space_before": -1,
  "space_after": 28.464004516601562
 },
 {
  "text": "Foundation Level Extensions",
  "page": 1,
  "avg_font_size": 24.0,
  "y_position": 244.84402465820312,
  "bbox": [
   139.3459930419922,
   244.84402465820312,
   479.3563537597656,
   271.6040344238281
  ],
  "font_name": "Arial,Bold",
  "is_bold": true,
  "is_all_caps": false,
  "text_len": 27,
  "starts_with_numbering": false,
  "relative_font_size": 2.4096385542168672,
  "norm_y_pos": 0.3091464957

Creating the labeled dataset

In [None]:
from thefuzz import fuzz
import pandas as pd

def create_labeled_dataset(featured_lines, ground_truth_headings, ground_truth_title, fuzz_threshold=70):
    """
    Assign a label to each line using fuzzy matching against ground truth.

    Labels: 'Title', or the 'level' of the best-matching ground-truth heading
    (e.g. 'H1', 'H2', 'H3'), or 'Body Text' when nothing matches.

    Every call relabels all lines from scratch, so re-running the cell or
    passing different ground truth never leaves stale labels behind.

    Args:
        featured_lines: list of line dicts with at least 'text', 'page' and
            'bbox'. The dicts are updated in place with 'label' and 'id'.
        ground_truth_headings: iterable of dicts with 'text', 'page' and
            'level'.
        ground_truth_title: the document title string.
        fuzz_threshold: a match counts only if its fuzz.ratio score is
            strictly greater than this value.

    Returns:
        The same ``featured_lines`` list, with 'label' and 'id' set on every
        line. 'id' is the string form of the line's bbox.
    """
    # --- 1. Find the Title ---
    # The title is the page-1 line whose text is most similar to the
    # ground-truth title (case-insensitive), provided the score clears the
    # threshold. On equal scores the earliest line wins.
    title_target = ground_truth_title.strip().lower()
    title_line = None
    best_title_match_score = 0

    for line in featured_lines:
        if line['page'] != 1:
            continue
        score = fuzz.ratio(line['text'].strip().lower(), title_target)
        if score > best_title_match_score:
            best_title_match_score = score
            title_line = line

    if best_title_match_score <= fuzz_threshold:
        # Best candidate is not convincing enough: no line gets 'Title'.
        title_line = None

    # --- 2. Label headings ---
    # Index ground-truth headings by page so each line is compared only with
    # headings on its own page.
    gt_headings_by_page = {}
    for heading in ground_truth_headings:
        gt_headings_by_page.setdefault(heading['page'], []).append(heading)

    for line in featured_lines:
        # bbox doubles as an identifier for the line.
        line['id'] = str(line['bbox'])

        # The title is recognised by object identity, not by a leftover
        # 'label' key, so labels from an earlier call are always overwritten.
        if line is title_line:
            line['label'] = 'Title'
            continue

        line['label'] = 'Body Text'

        best_match_score = 0
        best_match_level = ''
        for gt_heading in gt_headings_by_page.get(line['page'], ()):
            # Heading comparison is case-sensitive (unlike the title match).
            score = fuzz.ratio(line['text'].strip(), gt_heading['text'].strip())
            if score > best_match_score:
                best_match_score = score
                best_match_level = gt_heading['level']

        # Accept only a sufficiently strong match.
        if best_match_score > fuzz_threshold:
            line['label'] = best_match_level

    return featured_lines


In [20]:
import json

# Load the ground-truth JSON for this document; its 'outline' and 'title'
# entries are used below.
with open(output_filename, 'r', encoding='utf-8') as f:
    out = json.load(f)

# Label every feature line against the ground-truth outline and title.
labeled_lines = create_labeled_dataset(
    featured_lines, 
    out['outline'], 
    out['title']
)

# One row per line, one column per key; written to 'o.csv' relative to the
# current working directory, without the index column.
df = pd.DataFrame(labeled_lines)
df.to_csv('o.csv',index=False)