In [9]:
import PyPDF2
import ollama
import re

## Approach 1: Using RegEx To Find Highest Numerical Value (No Scale)
Most reliable, though it does not take scale context (e.g. "in millions") into account.

In [10]:
def extract_text_from_pdf(file_path):
    """Extract the text of every page of a PDF.

    Args:
        file_path: Passed straight to ``PyPDF2.PdfReader``.

    Returns:
        A list with one entry per page, in page order, holding whatever
        ``page.extract_text()`` returned for that page.
    """
    pdf = PyPDF2.PdfReader(file_path)
    return [pdf_page.extract_text() for pdf_page in pdf.pages]

def parse_numbers(text, scale_factors):
    """Return the largest (scaled) number found across all pages.

    Args:
        text: Sequence of page strings.
        scale_factors: Sequence of multipliers, indexed like ``text``; every
            number found on page ``i`` is multiplied by ``scale_factors[i]``.

    Returns:
        The largest scaled value as a float, or ``0`` when no page contains a
        number (or every value is not greater than zero).
    """
    # Either a comma-grouped number (1,234,567) or a plain run of digits
    # (1234567), optionally prefixed by '$' and followed by a decimal part.
    # The plain-digit alternative matters: without it "1000000" is matched
    # piecewise as "100", "000", "0" and the true value is never seen.
    number_pattern = re.compile(r'\$?(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?')

    maximum = 0
    for i, page in enumerate(text):
        numbers = number_pattern.findall(page)
        if not numbers:
            continue
        factor = scale_factors[i]
        parsed_numbers = (
            float(num.replace(',', '').replace('$', '')) * factor
            for num in numbers
        )
        maximum = max(maximum, max(parsed_numbers))

    return maximum

def determine_scale_factors(text, scale=False):
    """Pick one multiplier per page based on scale keywords in the page text.

    Args:
        text: Sequence of page strings.
        scale: When falsy, every page gets a multiplier of 1.

    Returns:
        A list the same length as ``text``. With scaling enabled a page gets
        1_000_000 if it mentions "million" (case-insensitive), otherwise
        1_000_000_000 if it mentions "billion", otherwise 1.
    """
    if scale:
        # Order matters: "million" is tested first and wins when both appear.
        keyword_multipliers = (
            ("million", 1_000_000),
            ("billion", 1_000_000_000),
        )

        def factor_for(page):
            lowered = page.lower()
            return next(
                (multiplier for keyword, multiplier in keyword_multipliers if keyword in lowered),
                1,
            )

        return [factor_for(page) for page in text]

    return [1] * len(text)

def find_largest_number(file_path, scale=False):
    """Find the largest number in a PDF, optionally applying per-page scale.

    Args:
        file_path: PDF location, forwarded to ``extract_text_from_pdf``.
        scale: Forwarded to ``determine_scale_factors``.

    Returns:
        The value returned by ``parse_numbers``, or ``None`` when that value
        is falsy (for example 0).
    """
    pages = extract_text_from_pdf(file_path)
    multipliers = determine_scale_factors(pages, scale=scale)
    return parse_numbers(pages, multipliers) or None

# Approach 1: run the regex pipeline with scaling left at its default (scale=False).
largest_number = find_largest_number('air_force_data.pdf')
# Bare final expression so the notebook displays the result.
largest_number


6000000.0

## Approach 2: Finding Highest Numerical Value Including Scale Factor
Not entirely accurate: it definitely misses some numbers, but it does account for scale (e.g. $1 million = 1,000,000).

In [11]:
# Approach 2: same pipeline, but with per-page scale factors enabled (scale=True).
largest_number = find_largest_number('air_force_data.pdf',scale=True)
# Bare final expression so the notebook displays the result.
largest_number


65000000000.0

## Approach 3: Using Llama 3.2 (Lightweight Open-Source Model) To Parse Numerical Data
Least reliable. The response changes on every run, and it can also take a while to run (it averages ~30 seconds for me on an M2 MacBook Air).

In [12]:
def extract_text_from_pdf_for_llama(file_path):
    """Build a compact, numbers-only summary of each "million" page of a PDF.

    Each page's text is lower-cased and split on single spaces. Only tokens
    that contain a digit or the substring "million" are kept, and a page is
    included in the result only if at least one of its tokens contains
    "million".

    Args:
        file_path: Path of the PDF file; opened in binary mode.

    Returns:
        A list of strings, one per retained page, each being the kept tokens
        joined by single spaces.
    """
    digits = '0123456789'
    content = []

    with open(file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for page in pdf_reader.pages:
            filtered_text = []
            million_in_text = False

            for word in page.extract_text().lower().split(' '):
                has_digit = any(digit in word for digit in digits)
                has_million = 'million' in word

                # Check "million" independently of the digit test. Because the
                # split is on spaces only, a token can span a line break and
                # hold both (e.g. "millions)\n2024"); such a token must still
                # mark the page as a "million" page.
                if has_million:
                    million_in_text = True
                if has_digit or has_million:
                    filtered_text.append(word)

            if million_in_text:
                content.append(' '.join(filtered_text))

    return content

def find_largest_using_llama(content):
    """Ask a local Llama model for the largest number across page summaries.

    The model is queried once per entry of ``content`` for that page's largest
    value, then once more with all per-page answers (newline-joined) to pick
    the overall largest.

    Args:
        content: List of page strings to send to the model.

    Returns:
        The text of the model's final reply, or ``None`` when ``content`` is
        empty (no model call is made in that case).
    """
    # Nothing to compare: avoid asking the model about an empty prompt.
    if not content:
        return None

    model_name = 'llama3.2:1b'

    def ask(prompt):
        # Send a single user message and return only the reply text.
        reply = ollama.chat(model=model_name, messages=[
            {
                'role': 'user',
                'content': prompt,
            },
        ])
        return reply['message']['content']

    largest_page_values = []
    for page in content:
        # The prompt text (including its indentation) is sent to the model as
        # written, so it is kept verbatim.
        largest_page_values.append(ask(f'''What is the largest numerical value here.

        Convert values such as 1,000,000, $1000000, $1 million, (milions of dollars)... 1.000 all to a value like 1,000,000. Use these to determine maximum:

        Do not reply with anything else. Just the numerical value, no explanation.

        Content:
        {page}'''))

    # Join outside the f-string: a backslash and reused quotes inside an
    # f-string expression are a SyntaxError before Python 3.12.
    joined_values = '\n'.join(largest_page_values)
    return ask(f'What is the largest numerical value here. Do not reply with anything else. Just the numerical value, no explanation.: {joined_values}')

def find_largest_number_using_llama(file_path):
    """Run the Llama-based pipeline on a PDF and return the model's answer.

    Args:
        file_path: PDF location, forwarded to
            ``extract_text_from_pdf_for_llama``.

    Returns:
        Whatever ``find_largest_using_llama`` returns, or ``None`` when that
        value is falsy (for example an empty string).
    """
    page_summaries = extract_text_from_pdf_for_llama(file_path)
    return find_largest_using_llama(page_summaries) or None

# Approach 3: run the Llama-based pipeline; the result is the model's reply text.
largest = find_largest_number_using_llama('air_force_data.pdf')
# Bare final expression so the notebook displays the result.
largest

'1,011,580'