# PDF Text Extraction and Analysis - Batch Processing for Sample-1a

## Setting Input and Output Directories
This notebook processes multiple PDF documents from the sample-1a dataset to extract text and structure. The next cell sets up the input PDF directory path and the corresponding JSON output directory to build a larger dataset.

In [16]:
import os
import glob

# Define the input and output directories
pdf_dir = "../hackathon-task/sample-1a/Datasets/Pdfs/"
json_dir = "../hackathon-task/sample-1a/Datasets/Output.json/"


def json_path_for(pdf_path, json_dir):
    """Return the path inside json_dir of the JSON file named after the PDF.

    Only the final extension is removed, so ``report.v2.pdf`` maps to
    ``report.v2.json`` (splitting on the first dot would yield ``report.json``).
    """
    stem, _ext = os.path.splitext(os.path.basename(pdf_path))
    return os.path.join(json_dir, f"{stem}.json")


# Get list of all PDF files in the directory.
# glob returns matches in arbitrary order; sort so every run processes the
# files in the same sequence.
pdf_files = sorted(glob.glob(os.path.join(pdf_dir, "*.pdf")))

# Get list of all JSON files in the directory
json_files = sorted(glob.glob(os.path.join(json_dir, "*.json")))

# Map PDFs to their corresponding JSON files (if available)
pdf_to_json = {}
for pdf_path in pdf_files:
    json_path = json_path_for(pdf_path, json_dir)

    # Only include PDFs that have corresponding JSON files
    if os.path.exists(json_path):
        pdf_to_json[pdf_path] = json_path

print(f"Found {len(pdf_files)} PDF files in the directory.")
print(f"Found {len(json_files)} JSON files in the directory.")
print(f"Processing {len(pdf_to_json)} PDFs with matching JSON files.")

Found 7 PDF files in the directory.
Found 5 JSON files in the directory.
Processing 5 PDFs with matching JSON files.


# Text Extraction Using MultiProcessing
The batch function defined in the next section uses the custom `extract` module to extract text from each PDF file. It uses multiprocessing to efficiently extract text snippets from all pages and retrieves the page dimensions, which are needed for feature engineering later.

# Batch Processing Function
Let's create a function that processes a single PDF file and its corresponding JSON output file from the sample-1a dataset. This function will encapsulate all the processing steps from the original notebook so we can easily apply them to multiple files.

In [None]:
import extract
import pandas as pd
import json
import re
from collections import Counter
from thefuzz import fuzz
import os

def process_pdf_file(pdf_path, json_path):
    """Extract, featurize and label the text lines of a single PDF.

    The PDF's text snippets are grouped into lines, each line receives layout
    and typography features, and each line is labeled by fuzzy-matching its
    text against the ground-truth title and outline stored in the JSON file.

    Args:
        pdf_path: Path to the PDF file.
        json_path: Path to the corresponding JSON file with ground truth. The
            code reads its ``title`` key and its ``outline`` list, whose
            entries provide ``level``, ``text`` and ``page``.

    Returns:
        DataFrame containing the labeled text lines (one row per line).
    """
    print(f"Processing: {os.path.basename(pdf_path)}")

    # Patterns are compiled once per call rather than once per line.
    # Table-of-contents entry: text, a run of 3+ dots, then a page number.
    toc_pattern = re.compile(r'^(.*?)\.{3,}\s*\d+$')
    numbering_pattern = re.compile(
        r'^\s*(?:(?:Chapter|Section)\s+[\w\d]+|'
        r'\d{1,2}(?:\.\d{1,2})*\.?|'
        r'[A-Z]\.|'
        r'\([a-z]\)|'
        r'[ivx]+\.)'
    )

    # Initialize the extractor
    extractor = extract.TextExtractor(pdf_path)
    texts = extractor.extract_text_from_all_pages_multiprocessing()

    # Get page dimensions
    # NOTE(review): the size reported for index 0 is applied to every page;
    # confirm that all pages of the input PDFs share one size.
    dims = extractor.get_page_dimensions(0)
    page_width = dims["width"]
    page_height = dims["height"]

    # Sort text snippets into reading order: page, then vertical, then left edge
    sorted_snippets = sorted(texts, key=lambda s: (s['page'], s['y_position'], s['bbox'][0]))

    def group_snippets_into_lines(snippets, y_tolerance=2.0):
        """Split sorted snippets into runs on one page with nearly equal y."""
        if not snippets:
            return []

        lines = [[snippets[0]]]
        for prev_snippet, current_snippet in zip(snippets, snippets[1:]):
            same_line = (
                current_snippet['page'] == prev_snippet['page']
                and abs(current_snippet['y_position'] - prev_snippet['y_position']) < y_tolerance
            )
            if same_line:
                lines[-1].append(current_snippet)
            else:
                lines.append([current_snippet])
        return lines

    def process_lines(grouped_lines):
        """Merge each group of snippets into one line record."""
        processed_lines = []
        for line_snippets in grouped_lines:
            # Left-to-right order so the joined text reads correctly
            line_snippets.sort(key=lambda s: s['bbox'][0])

            full_text = "".join(s['text'] for s in line_snippets).strip()
            if not full_text:
                continue

            # Drop dot leaders and the trailing page number of TOC entries
            match = toc_pattern.match(full_text)
            if match:
                full_text = match.group(1).strip()
            elif full_text.endswith('...') and full_text.count('.') > 3:
                full_text = full_text.rstrip('.')

            # Bounding box enclosing every snippet of the line
            x0 = min(s['bbox'][0] for s in line_snippets)
            y0 = min(s['bbox'][1] for s in line_snippets)
            x1 = max(s['bbox'][2] for s in line_snippets)
            y1 = max(s['bbox'][3] for s in line_snippets)

            avg_font_size = sum(s['font_size'] for s in line_snippets) / len(line_snippets)

            processed_lines.append({
                "text": full_text,
                "page": line_snippets[0]['page'],
                "avg_font_size": avg_font_size,
                "y_position": line_snippets[0]['y_position'],
                "bbox": (x0, y0, x1, y1),
                "font_name": line_snippets[0]['font_name'],
                "source_pdf": os.path.basename(pdf_path)
            })
        return processed_lines

    grouped_lines = group_snippets_into_lines(sorted_snippets)
    final_lines = process_lines(grouped_lines)

    def get_doc_stats(lines):
        """Return the most common (rounded) font size among non-empty lines."""
        font_sizes = [round(l['avg_font_size'], 2) for l in lines if l['text']]
        if not font_sizes:
            return {'modal_font_size': 10.0}

        modal_font_size = Counter(font_sizes).most_common(1)[0][0]
        return {
            'modal_font_size': modal_font_size
        }

    document_stats = get_doc_stats(final_lines)

    def engineer_features(lines, doc_stats):
        """Add typography, numbering, position and spacing features in place."""
        modal_font_size = doc_stats['modal_font_size']

        for i, line in enumerate(lines):
            font_name_lower = line['font_name'].lower()
            line['is_bold'] = any(indicator in font_name_lower for indicator in ['bold', 'black', 'heavy', 'sembold'])

            line['is_all_caps'] = line['text'].isupper() and len(line['text']) > 3

            line['text_len'] = len(line['text'])

            line['starts_with_numbering'] = bool(numbering_pattern.match(line['text']))

            if modal_font_size > 0:
                line['relative_font_size'] = line['avg_font_size'] / modal_font_size
            else:
                line['relative_font_size'] = 1.0

            line['norm_y_pos'] = line['y_position'] / page_height

            # Centered when the line's midpoint is within 10% of the page
            # width from the page's midpoint
            line_center = (line['bbox'][0] + line['bbox'][2]) / 2
            page_center = page_width / 2
            line['is_centered'] = abs(line_center - page_center) < (0.1 * page_width)

            # -1 marks "no neighbouring line on the same page"
            space_before = -1
            space_after = -1

            if i > 0 and lines[i-1]['page'] == line['page']:
                space_before = line['bbox'][1] - lines[i-1]['bbox'][3]

            if i < len(lines) - 1 and lines[i+1]['page'] == line['page']:
                space_after = lines[i+1]['bbox'][1] - line['bbox'][3]

            line['space_before'] = space_before
            line['space_after'] = space_after

        return lines

    featured_lines = engineer_features(final_lines, document_stats)

    def normalize_text(text):
        """Lowercase, trim, unify en/em dashes to '-' and collapse whitespace."""
        if not isinstance(text, str):
            return ""
        text = text.lower()
        text = text.strip()
        text = re.sub(r'[\u2013\u2014]', '-', text)
        text = re.sub(r'\s+', ' ', text)
        return text

    def create_labeled_dataset_final(featured_lines, ground_truth_headings, ground_truth_title, fuzz_threshold=85):
        """Set ``label`` on every line: 'Title', a heading level, or 'Body Text'."""
        gt_headings_by_page = {}
        for h in ground_truth_headings:
            gt_headings_by_page.setdefault(h['page'], []).append({
                'level': h['level'],
                'norm_text': normalize_text(h['text'])
            })

        # Title candidate: the largest-font line on page 1. The index is kept
        # with the line so the candidate is located by position, not by
        # comparing dict contents.
        candidate_index, title_candidate = max(
            ((i, l) for i, l in enumerate(featured_lines) if l['page'] == 1),
            key=lambda pair: pair[1]['relative_font_size'],
            default=(None, None)
        )
        if title_candidate is not None:
            norm_gt_title = normalize_text(ground_truth_title)
            title_text_to_check = normalize_text(title_candidate['text'])

            # A following line on the same page with (almost) the same font
            # size is treated as the second line of the title. It is tracked
            # explicitly: testing for a space in the text would also fire for
            # any single-line, multi-word title.
            continuation = None
            if candidate_index + 1 < len(featured_lines):
                next_line = featured_lines[candidate_index + 1]
                if (next_line['page'] == title_candidate['page'] and
                        abs(next_line['avg_font_size'] - title_candidate['avg_font_size']) < 1):
                    continuation = next_line
                    title_text_to_check += " " + normalize_text(next_line['text'])

            if fuzz.partial_ratio(title_text_to_check, norm_gt_title) > fuzz_threshold:
                title_candidate['label'] = 'Title'
                if continuation is not None:
                    continuation['label'] = 'Title'

        for line in featured_lines:
            # Lines already labeled as title keep that label
            if 'label' in line:
                continue

            line['label'] = 'Body Text'
            candidates = gt_headings_by_page.get(line['page'])
            if not candidates:
                continue

            norm_line_text = normalize_text(line['text'])
            # Score each heading once; max() keeps the first of equal scores
            best_score, best_match = max(
                ((fuzz.ratio(norm_line_text, h['norm_text']), h) for h in candidates),
                key=lambda pair: pair[0]
            )
            if best_score > fuzz_threshold:
                line['label'] = best_match['level']

        return featured_lines

    # Load ground truth data
    with open(json_path, 'r', encoding='utf-8') as f:
        out = json.load(f)

    # Correct page numbering
    # NOTE(review): shifts outline pages by +1 to compare against line pages;
    # presumably the JSON is 0-based and the extractor 1-based — confirm.
    corrected_outline = [{**h, 'page': h['page'] + 1} for h in out['outline']]

    # Create labeled dataset
    labeled_lines = create_labeled_dataset_final(featured_lines, corrected_outline, out['title'])

    # Fix page numbers to be 0-indexed
    for line in labeled_lines:
        line['page'] -= 1

    # Convert to DataFrame
    df = pd.DataFrame(labeled_lines)

    return df

# Executing Batch Processing
Now that we've created our function to process individual PDF files, let's process all the PDF files that have a matching JSON file in batch and collect the resulting DataFrames in a list. A file that fails is reported and skipped so that one bad PDF does not stop the whole run.

In [18]:
# Run the per-file pipeline over every PDF that has ground truth, keeping
# one DataFrame per successfully processed file.
all_dataframes = []

for pdf_path, json_path in pdf_to_json.items():
    try:
        df = process_pdf_file(pdf_path, json_path)
    except Exception as e:
        # Report the failure and move on to the next file
        print(f"Error processing {pdf_path}: {e!s}")
    else:
        all_dataframes.append(df)

print(f"Successfully processed {len(all_dataframes)} PDF files.")

Error processing ../hackathon-task/sample-1a/Datasets/Pdfs/E0CCG5S239.pdf: process_pdf_file() missing 1 required positional argument: 'output_csv_dir'
Error processing ../hackathon-task/sample-1a/Datasets/Pdfs/E0H1CM114.pdf: process_pdf_file() missing 1 required positional argument: 'output_csv_dir'
Error processing ../hackathon-task/sample-1a/Datasets/Pdfs/E0CCG5S312.pdf: process_pdf_file() missing 1 required positional argument: 'output_csv_dir'
Error processing ../hackathon-task/sample-1a/Datasets/Pdfs/STEMPathwaysFlyer.pdf: process_pdf_file() missing 1 required positional argument: 'output_csv_dir'
Error processing ../hackathon-task/sample-1a/Datasets/Pdfs/TOPJUMP-PARTY-INVITATION-20161003-V01.pdf: process_pdf_file() missing 1 required positional argument: 'output_csv_dir'
Successfully processed 0 PDF files.


# Combining All Data into a Single Dataset
Let's combine all the individual labeled datasets into one large dataset, which will be useful for training machine learning models. In this step we concatenate all the DataFrames and print summary statistics; the combined result is saved to a single CSV file in the final step below.

In [19]:
# Merge the per-PDF DataFrames into one dataset and report its composition
if not all_dataframes:
    print("No data to combine. Please check that PDF processing was successful.")
else:
    # ignore_index gives the merged rows one continuous index
    combined_df = pd.concat(all_dataframes, ignore_index=True)
    print(f"Combined dataset created with {len(combined_df)} rows")

    # How many lines received each label
    label_counts = combined_df['label'].value_counts()
    print("\nLabel distribution in the combined dataset:")
    print(label_counts)

    # How many lines each source PDF contributed
    pdf_counts = combined_df['source_pdf'].value_counts()
    print("\nNumber of rows from each PDF:")
    print(pdf_counts)

No data to combine. Please check that PDF processing was successful.


# Finalizing the Dataset
Let's finalize our dataset by renaming the combined dataset for clarity, saving it to `final_dataset.csv`, and generating additional statistics.

In [20]:
# Save the combined dataset and print a summary; nothing is written when no
# PDF was processed successfully.
if not all_dataframes:
    print("No data to combine.")
else:
    # The combined DataFrame from the previous cell is the final dataset
    final_df = combined_df

    # Written to the current working directory under its own name
    final_csv_path = "final_dataset.csv"
    final_df.to_csv(final_csv_path, index=False)

    print(f"Created final dataset with {len(final_df)} rows from {len(all_dataframes)} PDFs")
    print(f"Saved to {final_csv_path}")

    # Overall statistics
    print("\nFinal Dataset Statistics:")
    print(f"Total rows: {len(final_df)}")
    print(f"Unique PDFs: {final_df['source_pdf'].nunique()}")
    print("\nLabel distribution:")
    print(final_df['label'].value_counts())

    # One row per PDF, one column per label; combinations that never occur
    # become 0
    per_pdf_label_counts = final_df.groupby('source_pdf')['label'].value_counts()
    pdf_summary = per_pdf_label_counts.unstack().fillna(0)
    print("\nRows by PDF and label type:")
    print(pdf_summary)

No data to combine.
