In [52]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig  
from IPython.display import display, Image
import pdfplumber
import pprint
from functools import partial
import re
import pandas as pd

# Data Cleaning

## Invisible Object Detection and Filtering

In [53]:
pdf_path = 'pdfs/example.pdf'
pdf = pdfplumber.open(pdf_path)
pd.set_option("display.max_rows", None)

In [54]:
def filterInvisibleObjects(object, page_width, page_height):
    # 0 was the color of visible characters
    # Check if 'non_stroking_color' exists and is equal to (1,)
    if object.get('non_stroking_color') == (1,):
        return False
    
  # Check for valid coordinates
    x0, x1, y0, y1 = object.get('x0'), object.get('x1'), object.get('y0'), object.get('y1')
    if x0 is not None and (x0 < 0 or x1 > page_width):
        return False
    if y0 is not None and (y0 < 0 or y1 > page_height):
        return False

    return True

In [55]:


def clean_pdf(pdf):
    pages = []

    for i,p_i in enumerate(pdf.pages):
        filter_function = partial(filterInvisibleObjects, page_width=p_i.width, page_height=p_i.height)
        p_i_filtered = p_i.filter(filter_function)
        # if i == 7:
        #     o_im = p_i.to_image().draw_rects(p_i.extract_words())
        #     f_im = p_i_filtered.to_image().draw_rects(p_i_filtered.extract_words())

        #     or_im = p_i.to_image().draw_rects(p_i.rects)
        #     fr_im = p_i_filtered.to_image().draw_rects(p_i_filtered.rects)
        #     print('Image with original hidden characters:')
        #     display(o_im)

        #     print('Image with invisible characters filtered out')
        #     display(f_im)

        #     print('Image with invisble rectangles')
        #     display(or_im)
        #     print('Image without invisible rectangles')
        #     display(fr_im)
        pages.append(p_i_filtered)

    return pages

pages = clean_pdf(pdf)


# Other Logic

In [56]:
def get_bounding_box(chars):
    alphanumeric = [char for char in chars if char['text'] != ' ']

    x0 = min([char['x0'] for char in alphanumeric if char['text'] != ' '])
    x1 = max([char['x1'] for char in alphanumeric if char['text'] != ' '])

    y0 = min([char['top'] for char in alphanumeric])
    y1 = max([char['bottom'] for char in alphanumeric])

    return(x0,x1,y0,y1)

def get_alignment_lines(page, center_buffer=5, show_lines=False):

    chars = page.chars
    xl,xr,y0,y1 = get_bounding_box(chars)

    x_span = xr - xl

    # now divide by 3 
    x_span_3 = x_span/3

    x1 = xl + x_span_3 - center_buffer
    x2 = xl + 2*x_span_3 + center_buffer

    if show_lines:
        im = page.to_image().draw_lines(
            [
                ((xl,y0), (xl,y1)),
                ((xr,y0), (xr,y1)),
                ((x1,y0),(x1,y1)),
                ((x2,y0),(x2,y1))
                
            ]
        )
        display(im)
    return xl,x1,x2,xr

In [57]:
def get_segment_chars(line, space_threshold=10):
    chars = line['chars']
    segments = []
    current_segment = []
    
    for i, char in enumerate(chars):
        if i == 0:
            current_segment.append(char)
        else:
            gap = char['x0'] - chars[i-1]['x1']
            # print(f' gap from {chars[i-1]['text']} to {char['text']} is {gap}')
            if gap > space_threshold:
                # print('gap is bigger than threshold')
                if current_segment:
                    # print('appending current segment to segments')
                    segments.append(current_segment)
                # print('resetting current_segment')
                current_segment = [char]
            else:
                current_segment.append(char)
    
    if current_segment:
        segments.append(current_segment)
    # print('identified number of segments: ', len(segments))
    return segments

def get_bounding_box(chars):
    if not chars:
        return None, None, None, None
    x0 = min(char['x0'] for char in chars)
    x1 = max(char['x1'] for char in chars)
    y0 = min(char['y0'] for char in chars)
    y1 = max(char['y1'] for char in chars)
    return x0, x1, y0, y1

def prepare_line_segments(line, space_threshold=10):

    
    # Get segments based on character positions
    char_segments = get_segment_chars(line, space_threshold)
    
    # Prepare segment information
    segments = []
    text_index = 0
    for char_segment in char_segments:
        seg_x0, seg_x1, seg_y0, seg_y1 = get_bounding_box(char_segment)
        
        # Extract text including spaces
        segment_text = ""
        char_index = 0
        while text_index < len(line['text']) and char_index < len(char_segment):
            if line['text'][text_index] == char_segment[char_index]['text']:
                segment_text += line['text'][text_index]
                char_index += 1
            else:
                segment_text += line['text'][text_index]
            text_index += 1
        
        segment_info = line.copy()
        del segment_info['chars']
        segment_info.update({
            'text': segment_text.strip(),
            'x0': seg_x0,
            'x1': seg_x1,
        })
        segments.append(segment_info)
    
    return segments

In [58]:
# pprint.pprint(prepare_line_segments(pages[7].extract_text_lines(layout=True, strip=False, x_tolerance=0.2)[0]))

In [59]:
# pprint.pprint(pages[7].extract_text_lines(layout=True, strip=False, x_tolerance=0.2)[0]['text'])
# df = pd.DataFrame(pages[7].extract_text_lines(layout=True, x_tolerance=0.2)[0]['chars'])
# df

In [60]:


def extract_aligned_segments(line, xl, x1, x2, xr, tolerance=0.04):
    segments = prepare_line_segments(line)
    result = {'left': None, 'center': None, 'right': None}
    page_width = xr - xl
    if len(segments) > 3:
        return result
    
    for segment in segments:
        seg_x0, seg_x1 = segment['x0'], segment['x1']

        # Left-aligned check
        if abs(seg_x0 - xl) <= tolerance * page_width and seg_x1 <= x1:
            if result['left'] is not None:
                return {'left': None, 'center': None, 'right': None}
            result['left'] = segment
            continue
        
        # Centered check
        sentence_center = (seg_x0 + seg_x1) / 2
        expected_center = (x1 + x2) / 2

        if x1 <= seg_x0 and seg_x1 <= x2 and abs(sentence_center - expected_center) <= tolerance * page_width:
            if result['center'] is not None:
                return {'left': None, 'center': None, 'right': None}
            result['center'] = segment
            continue
        
        # Right-aligned check
        if seg_x0 >= x2 and abs(seg_x1 - xr) <= tolerance * page_width:
            if result['right'] is not None:
                return {'left': None, 'center': None, 'right': None}
            result['right'] = segment
            continue
        
        return {'left': None, 'center': None, 'right': None}
    
    return result

def collect_headerfooter_lines(page, type='header'):
    xl,x1,x2,xr = get_alignment_lines(page)
    lines = page.extract_text_lines(layout=True, strip=False).copy()

    # If we are attempting to extract footer, we need to reverse list and start from bottom
    if type == 'footer':
        lines.reverse()

    aligned_lines = {
        'left': [],
        'center': [],
        'right': []
    }

    for line in lines:

        aligned_segments = extract_aligned_segments(line,xl,x1,x2,xr)

        # Check if the text in the line violated header/footer rules
        if all(segment is None for segment in aligned_segments.values()):
            break

        for alignment, segment in aligned_segments.items():
            if segment is not None:
                aligned_lines[alignment].append(segment)

    return aligned_lines

page = pages[7]
def extract_top_bottom_positions(aligned_lines):
    top = float('inf')
    bottom = float('-inf')
    
    for alignment in ['left', 'center', 'right']:
        lines = aligned_lines[alignment]
        if lines:
            top = min(top, lines[0]['top'])
            bottom = max(bottom, lines[-1]['bottom'])
    
    if top == float('inf') or bottom == float('-inf'):
        return None, None
    
    return top, bottom
headers = collect_headerfooter_lines(page)
# print(extract_top_bottom_positions(headers))
# pprint.pprint(headers)

In [61]:
# Dictionary of multiplier keywords and their corresponding values
multipliers = {
    'billion': 1000000000,
    '$b': 1000000000,
    'million': 1000000,
    '$m': 1000000,
    'thousand': 1000,
    '$k': 1000
}

def extract_multiplier_from_string(text):
    text = text.lower().replace(' ', '')
    
    for keyword, value in multipliers.items():
        if keyword in text:
            return value
    
    return 1  # Default multiplier if no match found

def check_header_for_multiplier(header_lines):
    for alignment in ['left', 'center', 'right']:
        for line in header_lines[alignment]:
            multiplier = extract_multiplier_from_string(line['text'])
            if multiplier != 1:
                return multiplier
    return 1

In [62]:
import re

def extract_numbers_from_text(text):
    numbers = []
    processed_indices = set()
    
    # Pattern to match numbers with optional magnitude indicators
    # pattern = r'\$?((?:\d{1,3}(?:,\d{3})*|\d+)(?:\.\d+)?)\s*([mMbBkK]|million|Million|billion|Billion|thousand|Thousand)?\b'
    # pattern = r'(?<!\w)\$?((?:\d{1,3}(?:,\d{3})*|\d+)(?:\.\d+)?)\s*([mMbBkK]|million|Million|billion|Billion|thousand|Thousand)\b'
    pattern = r'(?<![a-zA-Z0-9-])\$?((?:\d{1,3}(?:,\d{3})*|\d+)(?:\.\d+)?)\s*([mMbBkK]|million|Million|billion|Billion|thousand|Thousand)\b(?![-a-zA-Z0-9])'
    
    
    matches = re.finditer(pattern, text)
    
    for match in matches:
        number_str, magnitude = match.groups()
        try:
            # Remove commas and convert to float
            number = float(number_str.replace(',', ''))
            magnitude = magnitude.lower() if magnitude else ''
            
            if magnitude in ['m', 'million']:
                number *= 1_000_000
            elif magnitude in ['b', 'billion']:
                number *= 1_000_000_000
            elif magnitude in ['k', 'thousand']:
                number *= 1_000
            
            numbers.append(number)
            processed_indices.update(range(match.start(), match.end()))
        except ValueError:
            continue
    
    # Second pass: Match individual numbers embedded in text
    pattern2 = r'\d+(?:\.\d+)?'
    matches2 = re.finditer(pattern2, text)

    for match in matches2:
        if not any(i in processed_indices for i in range(match.start(), match.end())):
            try:
                number = float(match.group())
                numbers.append(number)
            except ValueError:
                continue

    return numbers

# Test the function
test_cases = [
    '$1200M',
    '-1b',
    '1b',
    '$1,200 million',
    '$1200 m',
    '1 me',
    '1me',
    '1 m',
    '1.5B dollars',
    '2.5 billion in revenue',
    '$500K in expenses',
    '1  1f 1,2 1,200  1200 1234.2 1,234.2',
    '$1230 $1,230 1230 1,230.12',
    'The revenue was $1,500M in Q1 and 2,750,000.50 in Q2',
    'No numbers here',
    'Mixed cases: 1234, $5,678.90M, and 91,011.12K'
]

# for case in test_cases:
#     print(f"Input: {case}")
#     print(f"Extracted numbers: {extract_numbers_from_text(case)}")
#     print()



In [63]:
def mark_processed(text, start, end, processed):
    """Mark a range of text as processed."""
    for i in range(start, end):
        processed[i] = True

def is_unprocessed(text, start, end, processed):
    """Check if a range of text is unprocessed."""
    return not any(processed.get(i, False) for i in range(start, end))

def extract_year_numbers(texts, processed):
    """
    Extracts year numbers from strings containing 'FY' or 'Fiscal Year'.
    Examples: 'FY2024', 'Fiscal Year 2022', ' FY 2023'
    """
    pattern = r'(?:FY|Fiscal Year)\s*(\d{4})'
    years = []
    for text in texts:
        for match in re.finditer(pattern, text, re.IGNORECASE):
            if is_unprocessed(text, match.start(), match.end(), processed):
                years.append(int(match.group(1)))
                mark_processed(text, match.start(), match.end(), processed)
                # print(f"Year extracted: {match.group(1)}")
    return years

def extract_formatted_floats(texts, processed):
    """
    Extracts formatted floats, including those with commas and optional dollar signs.
    Examples: 2,123.4, $2,123.4
    """
    pattern = r'\$?((?:\d{1,3}(?:,\d{3})*|\d+)(?:\.\d+))'
    floats = []
    for text in texts:
        for match in re.finditer(pattern, text):
            if is_unprocessed(text, match.start(), match.end(), processed):
                floats.append(float(match.group(1).replace(',', '')))
                mark_processed(text, match.start(), match.end(), processed)
                # print(f"Formatted float extracted: {match.group(1)}")
    return floats

def extract_integers(texts, processed):
    """
    Extracts integers, including those with commas but no decimal points.
    Examples: 50000, 1,234,567
    """
    pattern = r'\$?((?:\d{1,3}(?:,\d{3})*|\d+))\b(?!\.)'
    integers = []
    for text in texts:
        for match in re.finditer(pattern, text):
            if is_unprocessed(text, match.start(), match.end(), processed):
                integers.append(int(match.group(1).replace(',', '')))
                mark_processed(text, match.start(), match.end(), processed)
                # print(f"Integer extracted: {match.group(1)}")
    return integers

def extract_numbers_with_magnitude(texts, processed):
    """
    Extracts numbers with magnitude indicators (k, m, b, thousand, million, billion).
    Example: 1k, 2M, 3 billion, -4B
    """
    pattern = r'(-?\$?(?:\d{1,3}(?:,\d{3})*|\d+)(?:\.\d+)?)\s*([kmb]|thousand|million|billion)\b'
    numbers = []
    for text in texts:
        for match in re.finditer(pattern, text, re.IGNORECASE):
            if is_unprocessed(text, match.start(), match.end(), processed):
                number_str, magnitude = match.groups()
                number = float(number_str.replace(',', '').replace('$', ''))
                magnitude = magnitude.lower()
                
                if magnitude in ['k', 'thousand']:
                    number *= 1000
                elif magnitude in ['m', 'million']:
                    number *= 1000000
                elif magnitude in ['b', 'billion']:
                    number *= 1000000000
                
                numbers.append(number)
                mark_processed(text, match.start(), match.end(), processed)
                # print(f"Number with magnitude extracted: {match.group()}")
    return numbers

def extract_embedded_numbers(texts, processed):
    """
    Extracts all remaining numbers embedded in text.
    """
    pattern = r'-?\$?(?:\d{1,3}(?:,\d{3})*|\d+)(?:\.\d+)?'
    numbers = []
    for text in texts:
        for match in re.finditer(pattern, text):
            if is_unprocessed(text, match.start(), match.end(), processed):
                numbers.append(float(match.group().replace(',', '').replace('$', '')))
                mark_processed(text, match.start(), match.end(), processed)
                # print(f"Embedded number extracted: {match.group()}")
    return numbers

def extract_all_number_types(texts):
    """
    Extracts all types of numbers using the specialized functions above.
    """
    processed = {}
    results = {
        'years': extract_year_numbers(texts, processed),
        'formatted_floats': extract_formatted_floats(texts, processed),
        'integers': extract_integers(texts, processed),
        'magnitudes': extract_numbers_with_magnitude(texts, processed),
        'embedded': extract_embedded_numbers(texts, processed)
    }
    # print("\nExtraction results:")
    # for category, numbers in results.items():
    #     print(f"{category}: {numbers}")
    return results

test_cases = [
    '$1200M',
    '-1b',
    '1b',
    '$1,200 million',
    '$1200 m',
    '1 me',
    '1me',
    '1 m',
    '1.5B dollars',
    '2.5 billion in revenue',
    '$500K in expenses',
    '1  1f 1,2 1,200  1200 1234.2 1,234.2',
    '$1230 $1,230 1230 1,230.12',
    'The revenue was $1,500M in Q1 and 2,750,000.50 in Q2',
    'No numbers here',
    'Mixed cases: 1234, $5,678.90M, and 91,011.12K'
]
pprint.pprint(extract_all_number_types(test_cases))

{'embedded': [0.0, 1.0, 0.0],
 'formatted_floats': [1.5, 1234.2, 1234.2],
 'integers': [1, 2, 1200, 1200, 1, 50, 2],
 'magnitudes': [],
 'years': []}


In [64]:
import re

def extract_numbers(text):
    original_text = text
    results = {
        'integers': [],
        'formatted_floats': [],
        'magnitudes': [],
        'years': [],
        'embedded': []
    }
    
    processed_indices = set()

    # Strip dollar signs, but keep track of their positions
    
    text = text.replace('$', '').lower()
    
    def mark_processed(start, end):
        processed_indices.update(range(start, end))

    # Check for multipliers
    # multiplier_pattern = r'(-?\d{1,3}(,\d{3})*(\.\d+)?|-?\d+(\.\d+)?)\s*([kmbt]|thousand|million|billion|trillion)\b'
    # Check for multipliers
    multiplier_pattern = r'(?:^|(?<=\s))(\$?\d{1,3}(?:,\d{3})*(?:\.\d+)?|\d+(?:\.\d+)?)\s*([kmb]|thousand|million|billion|trillion)\b'
    for match in re.finditer(multiplier_pattern, text, re.IGNORECASE):
        if not any(i in processed_indices for i in range(match.start(), match.end())):
            number, multiplier = match.groups()
            value = float(number.replace(',', '').replace('$', ''))
            multiplier = multiplier.lower()
            if multiplier in ['k', 'thousand']:
                value *= 1000
            elif multiplier in ['m', 'million']:
                value *= 1000000
            elif multiplier in ['b', 'billion']:
                value *= 1000000000
            elif multiplier in ['t', 'trillion']:
                value *= 1000000000000
            results['magnitudes'].append(value)
            mark_processed(match.start(), match.end())
            # print(f"Multiplier: {match.group()} -> {value}")
    
    # Check for year numbers
    year_pattern = r'(fy|fiscal year)\s*-?\s*(\d{4})'
    for match in re.finditer(year_pattern, text, re.IGNORECASE):
        if not any(i in processed_indices for i in range(match.start(), match.end())):
            year = int(match.group(2))
            results['years'].append(year)
            mark_processed(match.start(), match.end())
            # print(f"Year: {match.group()} -> {year}")
    
    # Check for properly formatted numbers
    # number_pattern = r'(?:^|(?<=\s))(\$?\d{1,3}(?:,\d{3})*(?:\.\d+)?|\d+(?:\.\d+)?)\b'
    number_pattern = r'(?:^|(?<=\s))(\$?\d{1,3}(?:,\d{3})*|\d+)(?:\.(?:\d+(?:\s|$))?)?(?=\s|$)'

    number_pattern = r'(?:^|(?<=\s))(\d{1,3}(?:,\d{3})*|\d+)(?:\.(\d+))?(?=\s|$)'
    for match in re.finditer(number_pattern, text):
        if not any(i in processed_indices for i in range(match.start(), match.end())):
            value = float(match.group().replace(',', ''))
            if value.is_integer():
                results['integers'].append(int(value))
                # print(f"Integer: {match.group()} -> {int(value)}")
            else:
                results['formatted_floats'].append(value)
                # print(f"Float: {match.group()} -> {value}")
            mark_processed(match.start(), match.end())
    
    # Extract remaining embedded numbers
    # embedded_pattern = r'-?\d+(?:\.\d+)?'
    # embedded_pattern = r'(^|(?<=\s)|(?<=-))((?:\d{1,3}(,\d{3})*(\.\d+)?|\d+(\.\d+)?))'
    embedded_pattern = r'\d+'
    for match in re.finditer(embedded_pattern, text):
        if not any(i in processed_indices for i in range(match.start(), match.end())):
            value = int(match.group())
            results['embedded'].append(value)
            # print(f"Embedded: {match.group()} -> {value}")
            mark_processed(match.start(), match.end())
    
    
    # if not any(results.values()):
    #     print(f"No numbers found in: {original_text}")
    
    return results

def process_text_list(text_list):
    final_results = {
        'integers': [],
        'formatted_floats': [],
        'magnitudes': [],
        'years': [],
        'embedded': []
    }
    
    for text in text_list:
        # print(f'Currently working on: {text}')
        extracted = extract_numbers(text)
        for key, value in extracted.items():
            final_results[key].extend(value)
    
    return final_results

# Test the function
test_cases = [
    '1.2.3.4,5',
    '$1200M',
    '-1b',
    'Fiscal Year 2024 Q23 1,200',
    'FY 2021',
    '1b',
    '$1,200 million',
    '$1200 m',
    '1 me',
    '1me',
    '1 m',
    '1.5B dollars',
    '2.5 billion in revenue',
    '$500K in expenses',
    '1  1f 1,2 1,200  1200 1234.2 1,234.2',
    '$1230 $1,230 1230 1,230.12',
    'The revenue was $1,500M in Q1 and 2,750,000.50 in Q2',
    'No numbers here',
    'Mixed cases: 1234, $5,678.90M, and 91,011.12K'
]


print("\nExtracting numbers...")
results = process_text_list(test_cases)

print("\nExtraction results:")
for category, numbers in results.items():
    print(f"{category}: {numbers}")


Extracting numbers...

Extraction results:
integers: [1200, 1, 1, 1200, 1200, 1230, 1230, 1230]
formatted_floats: [1234.2, 1234.2, 1230.12, 2750000.5]
magnitudes: [1200000000.0, 1000000000.0, 1200000000.0, 1200000000.0, 1000000.0, 1500000000.0, 2500000000.0, 500000.0, 1500000000.0, 5678900000.0, 91011120.0]
years: [2024, 2021]
embedded: [1, 2, 3, 4, 5, 1, 23, 1, 1, 1, 2, 1, 2, 1234]


In [65]:
def group_lines_into_objects(lines):
    chunks = []
    current_chunk = None

    def initialize_chunk(line, line_segments, obj_type):
        return {
            'x0': line['x0'],
            'x1': line['x1'],
            'top': line['top'],
            'bottom': line['bottom'],
            'content': [line_segments],
            'type': obj_type
        }

    def update_chunk(obj, line, line_segments):
        return {
            'x0': min(obj['x0'], line['x0']),
            'x1': max(obj['x1'], line['x1']),
            'top': min(obj['top'], line['top']),
            'bottom': max(obj['bottom'], line['bottom']),
            'content': obj['content'] + [line_segments],
            'type': obj['type']
        }

    for line in lines:
        line_segments = prepare_line_segments(line)
        is_table_line = len(line_segments) > 1 or (current_chunk and current_chunk['type'] == 'table' and len(line_segments[0]['text']) < 30)

        if current_chunk is None:
            current_chunk = initialize_chunk(line, line_segments, 'table' if is_table_line else 'text')
        elif is_table_line != (current_chunk['type'] == 'table'):
            # Transition between text and table
            chunks.append(current_chunk)
            current_chunk = initialize_chunk(line, line_segments, 'table' if is_table_line else 'text')
        else:
            # Continuing the current chunk
            current_chunk = update_chunk(current_chunk, line, line_segments)

    if current_chunk:
        chunks.append(current_chunk)

    return chunks

In [66]:
from PyPDF2 import PdfWriter
def process_page(page):

    numbers = []

    # Collect headers and footers
    headers = collect_headerfooter_lines(page)
    footers = collect_headerfooter_lines(page, type='footer')


    # Crop Page
    header_top, header_bottom = extract_top_bottom_positions(headers)
    footer_top, footer_bottom = extract_top_bottom_positions(footers)

    for alignment in ['left', 'center', 'right']:
        h = [segment['text'] for segment in headers[alignment]]
        f = [segment['text'] for segment in footers[alignment]]
        header_numbers = extract_numbers_from_text(' '.join(h))
        footer_numbers = extract_numbers_from_text(' '.join(f))
        # print('header numbers: ', header_numbers)
        # print('footer agreements: ', footer_numbers)
        numbers.extend(header_numbers)
        numbers.extend(footer_numbers)

    # Handle None values for header_bottom and footer_top
    header_bottom = header_bottom if header_bottom is not None else 0
    footer_top = footer_top if footer_top is not None else page.height


    cropped_page = page.within_bbox((0,header_bottom, page.width, footer_top), relative=True, strict=True)
    

    # display(cropped_page.to_image())

    # Extract multiplier from header
    table_multiplier = check_header_for_multiplier(headers)
    # print('Table Multiplier: ', table_multiplier)

    # Extract 
    images = cropped_page.images
    # print('Images: ')
    # pprint.pprint(images)

    # Get Remaining lines on page
    lines = cropped_page.extract_text_lines(layout=True, return_chars = True)

    chunks = group_lines_into_objects(lines)

    for i,chunk in enumerate(chunks):
        top = chunk['top'] - 4
        bottom = chunk['bottom'] + 4
        page_chunk = page.crop((0,top,page.width,bottom))
        im = page_chunk.to_image()
        # rects = page_chunk.rects

        # table_settings = {
        #     'vertical_strategy': 'text',
        #     'horizontal_strategy': 'text'
        # }

        # for rect in rects:
        #     print('RECTANGLE HEIGHT: ', rect['y1']-rect['y0'])
        text_lines = []
        for line in chunk['content']:
            text_line = ' '.join([segment['text'] for segment in line])
            text_lines.append(text_line)
        all_text = '\n'.join(text_lines) 


        if chunk['type'] == 'table':
            cells = []
            for line in chunk['content']:
                line_cells = [segment['text'] for segment in line]
                cells.extend(line_cells)
            multiplier = extract_multiplier_from_string(all_text)
            if i==0:
                multiplier = max(table_multiplier, multiplier)
            table_nums = process_text_list(cells)
            nums = table_nums['integers'] + table_nums['years'] + table_nums['embedded'] + table_nums['magnitudes']
            nums.extend([num*multiplier for num in table_nums['formatted_floats'] ])
            # display(im)
            
        else:
            
            nums = extract_numbers_from_text(all_text)
            # display(im)

        if nums:
            numbers.append(max(nums))
            # print('Max found in chunk above: ', max(nums))
        # else:
        #     print('No numbers found')
        

    return numbers


page = pages[0]
chunks = process_page(page)


In [67]:
def find_max_in_pdf(pdf_path):
    pdf = pdfplumber.open(pdf_path)
    pages = clean_pdf(pdf)

    max_numbers = {}
    for i,page in enumerate(pages):
        nums = process_page(page)
        if nums:
            max_numbers[i] = max(nums)

    # Finding the page with the overall maximum number
    max_page = max(max_numbers, key=max_numbers.get) if max_numbers else None

    return max_numbers, max_page

# Final 

In [68]:
pdf_path = 'pdfs/full.pdf'

In [69]:
max_per_page, page_with_max = find_max_in_pdf(pdf_path)

if page_with_max is not None:
    print(f'Overall max is on page {page_with_max}: {max_per_page[page_with_max]}')
else:
    print('No numbers found in the document.')

Overall max is on page 12: 30704100000.0
