### intro

The notebook tries to load pdf tableof content first using fitz. If fitz cannot identify the page of content, it will use PyPDFLoader to get the first few pages of the 10K/10Q and send to LLM. Based on response from LLM, the result will be processed to generate page of content.

In [1]:
from langchain_anthropic import ChatAnthropic
from langchain_core.output_parsers import StrOutputParser
from langchain.document_loaders import PyPDFLoader
from langchain_core.prompts import ChatPromptTemplate
import fitz
import os

from typing import Any

from pydantic import BaseModel
from unstructured.partition.pdf import partition_pdf

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file


In [2]:
def load_content_using_fitz_ftrst(file_loc, pages_pull, buffer):
    toc = fitz.open(file_loc).get_toc()
    
    if len(toc)==0:
        print('TOC load failed, use LLM')
        #### cannot find table of content, use llm
        llm_response = load_content_using_llm(file_path, pages_pull)
    
        page_range, section_name = fetch_content_page(llm_response, buffer)
        
        return page_range, section_name, llm_response
        
    else:
        print('use TOC load')
        ##### use toc loaded content table to generate page_range, section_name
        ##### seems always called part and item
        which_part = ''
        which_item = ''

        section_name = []
        starting_page = []

        for line in toc:
            if len(line)<3:
                continue
            ##### each of them is a list
            if line[1].lower().startswith('part '):
                which_part = line[1].lower().strip()

            elif line[1].lower().startswith('item '):
                which_item = line[1].lower().strip()

            if len(which_part)>0 or len(which_item)>0:
                c_page_num = line[-1]

                section_name.append(which_part+'_'+which_item+'_'+line[1])
                starting_page.append(c_page_num)

        starting_page.append(1000) #### add 1000 to the end
        #### provide the range 

        page_range = [(startp-buffer, endp+buffer) for startp, endp in zip(starting_page[:-1],starting_page[1:])]
        
        return page_range, section_name, toc
    
    
def fetch_content_page(content, buffer = 3):    
    #### since not all 10-k starts couting pages on the first one.

    ##### seems always called part and item
    which_part = ''
    which_item = ''

    section_name = []
    starting_page = []

    for line in content.split(':')[1].split('\n'):
        if 'part ' in line.lower():
            which_part = line.lower().strip()
        elif 'item ' in line.lower():
            which_item = line.lower().split('-')[0].strip()
            
        if len(which_part)>0 and len(which_item)>0 and len(line)>0:
            #### extract the page number
            try:
                c_page_num = int(line.lower().split()[-1].strip())

                section_name.append(which_part+'_'+which_item+'_'+' '.join(line.lower().split()[:-1]))
                starting_page.append(c_page_num)
            except:
                continue
    
    starting_page.append(1000) #### add 1000 to the end
    #### provide the range 

    page_range = [(startp-buffer, endp+buffer) for startp, endp in zip(starting_page[:-1],starting_page[1:])]
    
    return page_range, section_name

def load_content_using_llm(file_loc, pages_pull = 5):
    '''
    use PyPDFLoader to get the first pages_pull number of pages and send to 
    '''
    loader = PyPDFLoader(file_path=file_loc)
    docs = loader.load()
    
    new_text = '/n/n'.join([doc.page_content for doc in docs[:pages_pull]])
    
    model = ChatAnthropic(
    model="claude-3-sonnet-20240229",
    temperature=0,
    max_tokens=1024,
    timeout=None,
    max_retries=2,
    api_key = os.environ.get("anthropic_API_KEY")
    )

    prompt_text = """Given the text. Could you help identify the table of content? \
    Please list the name of each section and the correspodning page number. \
    Below are the text: {text} """
    prompt = ChatPromptTemplate.from_template(prompt_text)

    # Summary chain
    summarize_chain = {"text": lambda x: x} | prompt | model | StrOutputParser()
    response = summarize_chain.invoke(new_text)
    
    return response


def load_unstructured_pdf_and_add_section(file_path,page_range, section_name, **kwargs):
    #raw_pdf_elements = partition_pdf(
    #filename=file_path,
    ## Unstructured first finds embedded image blocks
    #extract_images_in_pdf=False,
    ## Use layout model (YOLOX) to get bounding boxes (for tables) and find titles
    ## Titles are any sub-section of the document
    #infer_table_structure=True,
    ## Post processing to aggregate text once we have the title
    #chunking_strategy="by_title",
    ## Chunking params to aggregate text blocks
    ## Attempt to create a new chunk 3800 chars
    ## Attempt to keep chunks > 2000 chars
    #max_characters=4000,
    #new_after_n_chars=3800,
    #combine_text_under_n_chars=2000,
    #image_output_dir_path=path,
    #kwargs
    #)
    
    raw_pdf_elements = partition_pdf(
    filename=file_path,
    # Unstructured first finds embedded image blocks
    extract_images_in_pdf=False,
    # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles
    # Titles are any sub-section of the document
    infer_table_structure=True,
    # Post processing to aggregate text once we have the title
    chunking_strategy="by_title",
    # Chunking params to aggregate text blocks
    # Attempt to create a new chunk 3800 chars
    # Attempt to keep chunks > 2000 chars
    max_characters=4000,
    new_after_n_chars=3800,
    combine_text_under_n_chars=2000,
    
    )
    
    
    processed_docs = [None]*len(raw_pdf_elements)
    
    for idx, doc in enumerate(raw_pdf_elements):
        templist = [] 
        doc_page = doc.metadata.page_number
        for (range_s, range_e) in page_range:
            if doc_page>=range_s and doc_page<=range_e:
                ##### add section to it
                templist.append(section_name[idx])
                
            elif doc_page>range_e:
                break
        
        ##### finally, covert section_name to string
        if len(templist)>0:
            doc.metadata.section_name = '.'.join(templist)
        else:
            doc.metadata.section_name = ''
                
        processed_docs[idx] = doc
    
        return processed_docs 

def pdf_extract_worrkflow(file_path, pages_pull = 5, buffer = 3, verbose = False ):
    '''
    file_path: location of the pdf file
    pages_pull: initial paages to review to check the table of content
    buffer: buffer to identify the section (since not all page count starts from the beginning)
    '''
    
    page_range, section_name,response = load_content_using_fitz_ftrst(file_path, pages_pull, buffer)
    
    print('Start Loading the page number')
    
    new_docs = load_unstructured_pdf_and_add_section(file_path,page_range, section_name)
    
    if verbose:
        return new_docs, page_range, section_name,response
    else:
        return new_docs
    
    
    

#### example 1, BOA 10K, toc available, use toc

In [3]:
file_path = r"/Users/Russell/Library/CloudStorage/Dropbox/MFE_Courses/langchain_deeplearning/10_KQ/bank_of_america_10Q_2023_Q1.pdf"
pages_pull = 5 #### pull first 5 pages to find content

buffer = 3
verbose = True

# docs,page_range, section_name, llm_response =  pdf_extract_worrkflow(file_path,pages_pull, buffer,verbose)

In [4]:
fitz.open(file_path).get_toc()



[[1,
  'UNITED STATESSECURITIES AND EXCHANGE COMMISSIONWashington, D.C. 20549FORM 10-Q',
  1],
 [1,
  'Bank of America Corporation and SubsidiariesMarch 31, 2023 Form 10-Q',
  3],
 [2, 'Part I. Financial Information', 3],
 [2, 'Part II. Other Information', 4],
 [2,
  'Item 2. Management’s Discussion and Analysis of Financial Condition and Results of Operations',
  4],
 [3, 'Executive Summary', 5],
 [4, 'Business Overview', 5],
 [3, 'Recent Developments', 5],
 [4, 'Capital Management', 5],
 [4, 'Financial Market Events', 5],
 [4, 'U.S. Government Debt Ceiling', 6],
 [4, 'LIBOR and Other Benchmark Rates', 6],
 [3, 'Financial Highlights', 6],
 [4, 'Net Interest Income', 7],
 [4, 'Noninterest Income', 7],
 [4, 'Provision for Credit Losses', 7],
 [4, 'Noninterest Expense', 7],
 [4, 'Income Tax Expense', 7],
 [3, 'Supplemental Financial Data', 8],
 [4, 'Non-GAAP Financial Measures', 8],
 [4, 'Key Performance Indicators', 8],
 [3, 'Business Segment Operations', 11],
 [4, 'Segment Description 

In [5]:
load_content_using_fitz_ftrst(file_path,pages_pull, buffer )

use TOC load


([(0, 7),
  (1, 7),
  (1, 8),
  (2, 8),
  (2, 8),
  (2, 8),
  (2, 8),
  (2, 9),
  (3, 9),
  (3, 9),
  (3, 10),
  (4, 10),
  (4, 10),
  (4, 10),
  (4, 10),
  (4, 11),
  (5, 11),
  (5, 11),
  (5, 14),
  (8, 14),
  (8, 14),
  (8, 14),
  (8, 14),
  (8, 15),
  (9, 15),
  (9, 16),
  (10, 17),
  (11, 18),
  (12, 18),
  (12, 19),
  (13, 20),
  (14, 21),
  (15, 21),
  (15, 22),
  (16, 22),
  (16, 22),
  (16, 22),
  (16, 22),
  (16, 23),
  (17, 25),
  (19, 26),
  (20, 26),
  (20, 26),
  (20, 29),
  (23, 29),
  (23, 29),
  (23, 30),
  (24, 30),
  (24, 30),
  (24, 34),
  (28, 40),
  (34, 41),
  (35, 43),
  (37, 43),
  (37, 44),
  (38, 44),
  (38, 45),
  (39, 45),
  (39, 46),
  (40, 46),
  (40, 46),
  (40, 47),
  (41, 48),
  (42, 49),
  (43, 49),
  (43, 53),
  (47, 54),
  (48, 55),
  (49, 55),
  (49, 62),
  (56, 65),
  (59, 72),
  (66, 72),
  (66, 80),
  (74, 80),
  (74, 80),
  (74, 80),
  (74, 81),
  (75, 83),
  (77, 87),
  (81, 102),
  (96, 103),
  (97, 104),
  (98, 105),
  (99, 106),
  (100, 100

In [6]:
%%time
docs, page_range, section_name, response = pdf_extract_worrkflow(file_path, pages_pull, buffer, verbose )

use TOC load
Start Loading the page number


This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name
Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSe

CPU times: user 25min 44s, sys: 4min 22s, total: 30min 7s
Wall time: 11min 21s


In [15]:
docs[0].metadata.section_name

'part i. financial information__Part I. Financial Information.part i. financial information__Part I. Financial Information.part i. financial information__Part I. Financial Information'

In [9]:
page_range

[(0, 7),
 (1, 7),
 (1, 8),
 (2, 8),
 (2, 8),
 (2, 8),
 (2, 8),
 (2, 9),
 (3, 9),
 (3, 9),
 (3, 10),
 (4, 10),
 (4, 10),
 (4, 10),
 (4, 10),
 (4, 11),
 (5, 11),
 (5, 11),
 (5, 14),
 (8, 14),
 (8, 14),
 (8, 14),
 (8, 14),
 (8, 15),
 (9, 15),
 (9, 16),
 (10, 17),
 (11, 18),
 (12, 18),
 (12, 19),
 (13, 20),
 (14, 21),
 (15, 21),
 (15, 22),
 (16, 22),
 (16, 22),
 (16, 22),
 (16, 22),
 (16, 23),
 (17, 25),
 (19, 26),
 (20, 26),
 (20, 26),
 (20, 29),
 (23, 29),
 (23, 29),
 (23, 30),
 (24, 30),
 (24, 30),
 (24, 34),
 (28, 40),
 (34, 41),
 (35, 43),
 (37, 43),
 (37, 44),
 (38, 44),
 (38, 45),
 (39, 45),
 (39, 46),
 (40, 46),
 (40, 46),
 (40, 47),
 (41, 48),
 (42, 49),
 (43, 49),
 (43, 53),
 (47, 54),
 (48, 55),
 (49, 55),
 (49, 62),
 (56, 65),
 (59, 72),
 (66, 72),
 (66, 80),
 (74, 80),
 (74, 80),
 (74, 80),
 (74, 81),
 (75, 83),
 (77, 87),
 (81, 102),
 (96, 103),
 (97, 104),
 (98, 105),
 (99, 106),
 (100, 1003)]

In [10]:
section_name

['part i. financial information__Part I. Financial Information',
 'part ii. other information__Part II. Other Information',
 'part ii. other information_item 2. management’s discussion and analysis of financial condition and results of operations_Item 2. Management’s Discussion and Analysis of Financial Condition and Results of Operations',
 'part ii. other information_item 2. management’s discussion and analysis of financial condition and results of operations_Executive Summary',
 'part ii. other information_item 2. management’s discussion and analysis of financial condition and results of operations_Business Overview',
 'part ii. other information_item 2. management’s discussion and analysis of financial condition and results of operations_Recent Developments',
 'part ii. other information_item 2. management’s discussion and analysis of financial condition and results of operations_Capital Management',
 'part ii. other information_item 2. management’s discussion and analysis of finan