##### Code to preprocess an input PDF file so that tabular data is identified and its structured information is extracted without the loss we observe when a traditional RAG pipeline splits the data

In [None]:
#.....Importing necessary libraries....#
import pdfplumber
from pdf2image import convert_from_path
import os
import fitz
from openai import OpenAI
import base64

In [None]:
'''.....Add-on ideas/trials that you can try
1. I've tested with just GPT-4o; try other multimodal models, including Gemini.
2. I've passed the entire context (text + all extracted images). You can consider splitting the text data into retrievable chunks
    - instead of passing the entire context - and see whether it impacts accuracy. You can try the feasibility of this combination.
3. In future, we can try the linkage between the text and the connected images. There may be a mention of table information somewhere in the text,
    and the current approach of (text + images) might be losing this information. We can explore a simple way of retaining this linkage between a text block and the connected images.
'''

In [None]:
#.....Configuring model credentials....#
# I've used my personal credentials for testing. You can change the client to use Azure endpoints (as done in most of our projects).
# Other multimodal models (like Gemini 1.5 Pro) or vision models (like LLaVA) are also worth trying.
# NOTE(review): both values below are placeholders that must be replaced before running;
# consider loading the real key from an environment variable instead of committing it to the notebook.

# Name of the chat model that extract_specs() passes to the API.
gpt_model = 'YOUR_GPT_MODEL' # previously tried: 'gpt-4o-mini-2024-07-18', 'gpt-3.5-turbo-1106', 'gpt-3.5-turbo-0613'
# Shared API client used by extract_specs().
client = OpenAI(api_key = "YOUR_API_KEY")


In [None]:
# Function to extract text and find pages with tables
def extract_text_and_find_table_pages(pdf_path):
    """Collect the text of every page and note which pages hold a table.

    Args:
        pdf_path: Path of the PDF file opened with pdfplumber.

    Returns:
        A ``(text_data, table_pages)`` tuple. ``text_data`` maps each
        1-based page number to its extracted text (``""`` when nothing was
        extracted); ``table_pages`` lists the 1-based numbers of the pages
        on which at least one extracted table has a non-empty cell.
    """
    page_text_by_number = {}
    pages_with_tables = []

    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            extracted_text = page.extract_text()
            found_tables = page.extract_tables()

            # Tables made up solely of empty cells do not count as tables.
            has_filled_cell = bool(found_tables) and any(
                any(row) for table in found_tables for row in table
            )
            if has_filled_cell:
                pages_with_tables.append(page_number)

            page_text_by_number[page_number] = extracted_text or ""

    return page_text_by_number, pages_with_tables

#....Function to convert pages (that contain tables) to images....#
def convert_table_pages_to_images(pdf_path, table_pages, output_folder, dpi=300):
    """Render the given PDF pages to PNG files inside ``output_folder``.

    Args:
        pdf_path: Path of the PDF file to render from.
        table_pages: Iterable of 1-based page numbers to render.
        output_folder: Directory that receives the images; it is created
            when it does not exist yet.
        dpi: Rendering resolution. Defaults to 300 for legible table text.

    Returns:
        ``output_folder``, unchanged, for convenience.
    """
    os.makedirs(output_folder, exist_ok=True)

    # The context manager closes the document even if rendering a page fails,
    # so the PDF file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        for page_num in table_pages:
            page = doc[page_num - 1]  # fitz pages are 0-based
            pix = page.get_pixmap(dpi=dpi)
            img_path = os.path.join(output_folder, f"page_{page_num}.png")
            pix.save(img_path)
            print(f"Saved table page {page_num} as image.")

    return output_folder

In [None]:
# Path to the input PDF file (placeholder - replace before running).
uploaded_file = r"PATH_TO_INPUT_FILE"
# NOTE: I've built the application considering inputs and outputs on the local machine only. For deployments, consider using saved file URLs.

# Path to the folder in which the PDF pages that contain tables are saved as images (placeholder - replace before running).
output_folder = r"LOCAL_FOLDER_PATH_TO_STORE_CONVERTED_IMAGES"

# Extract the page numbers of the PDF pages that contain tables, along with the per-page text.
# 'text_data' is not used further in this cell; it is included just for reference.
text_data, table_pages = extract_text_and_find_table_pages(uploaded_file)

# Only render images when at least one page with a table was found.
if table_pages:
    convert_table_pages_to_images(uploaded_file, table_pages, output_folder)

Saved table page 1 as image.
Saved table page 2 as image.
Saved table page 3 as image.
Saved table page 5 as image.
Saved table page 6 as image.
Saved table page 7 as image.
Saved table page 8 as image.
Saved table page 9 as image.
Saved table page 10 as image.
Saved table page 11 as image.
Saved table page 12 as image.
Saved table page 13 as image.


In [None]:
#....Defining the prompt; customize this based on your application.....#
# The prompt asks the model for a flat (non-nested) JSON object whose values are lists of strings,
# starting with the 'company' and 'product/Model Number' entries.
input_prompt = """Identify the specifications from the provided document context and set of images. Extract the values for the identified specifications from the context and all the input images. 
                      Provide the extracted specifications from the provided context in a structured JSON format. Each specification should have an value entry. 
                      The values should also include the associated units if available. If a specification is missing, include 'N/A' for that entry.
                      The first two entires of the specification should be 'company' and 'product/Model Number'. Extract as many specifications as possible from the provided contexts.
                      If the fetched values has any special characters like backslash etc., convert the values such that it will not create any issues while parsing the JSON. 
                      If the extracted value for any specification is in nested format, convert them into a list of key value pair and stick to the required format of JSON. Don't output in nested json format. Don't use same keys again and again, but instead merge similar key information into a list of strings.
                      Format your response as: 
                      {
                        "company": ["Value from context1"],
                        "product/Model Number": ["Value from context1"],
                        "Specification 3": ["Value from context1"], 
                        "Specification 4": ["Value from context1"], 
                        ... 
                      }
                      
                      """ 

# I was trying to extract specifications from datasheets, and the above prompt worked well for me.

In [None]:
# Function to convert an image file to base64. GPT needs the image to be converted to a base64-encoded string
def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as base64 text.

    The file is read in binary mode and the base64 bytes are decoded to a
    UTF-8 ``str`` so the result can be embedded in a data URL.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")

# Function to extract specs from the input datasheet text
def extract_specs(pdf_text: str, input_prompt: str, image_files: list, max_tokens: int = 1200) -> str:
    """Ask the chat model to extract specifications from text plus page images.

    Args:
        pdf_text: Extracted text of the PDF pages that don't contain tables.
            A list of per-page strings is also accepted and is joined with
            blank lines, so the model never sees a Python list repr.
        input_prompt: Instructions for the model, sent as the system message.
        image_files: Paths of the images (pages with tables) to attach.
        max_tokens: Upper bound on the length of the model's reply.
            Defaults to 1200.

    Returns:
        The text content of the model's first choice.
    """
    import mimetypes  # local import: only needed to label the data URLs

    if not isinstance(pdf_text, str):
        pdf_text = "\n\n".join(str(page_text) for page_text in pdf_text)

    # One user message carries the text context followed by every image.
    user_content = [{"type": "text", "text": f"Context: {pdf_text}"}]
    for image_path in image_files:
        # Label each image with its real MIME type (.jpg/.jpeg are not PNG);
        # fall back to PNG when the extension is unknown.
        mime_type = mimetypes.guess_type(image_path)[0] or "image/png"
        base64_image = encode_image(image_path)
        user_content.append({
            "type": "image_url",
            "image_url": {"url": f"data:{mime_type};base64,{base64_image}"},
        })

    messages = [
        # Instructions belong in the system role; an "assistant" message would
        # be read by the model as something it had said earlier.
        {"role": "system", "content": input_prompt},
        {"role": "user", "content": user_content},
    ]

    # .....Generating the response using the extracted pdf text and the images that contain tables......#
    response = client.chat.completions.create(
        model=gpt_model,
        messages=messages,
        max_tokens=max_tokens,
        temperature=0.1,
    )
    return response.choices[0].message.content


In [None]:
# Function to extract text from the pages of the pdf that doesn't contain tables
def extract_text_from_pdf(file, skip_pages=None) -> list:
    """Extract block-structured text from the pages that hold no tables.

    Args:
        file: Path of the PDF file to read.
        skip_pages: Iterable of 1-based page numbers to leave out. When
            omitted, the module-level ``table_pages`` is used, which is the
            behaviour existing callers rely on.

    Returns:
        A list with one string per retained page, in page order. Within a
        page, text blocks are separated by newlines; blocks containing ":"
        keep their internal line breaks, while all other blocks have their
        line breaks replaced by spaces.
    """
    if skip_pages is None:
        skip_pages = table_pages
    pages_to_skip = set(skip_pages)

    page_texts = []
    # The context manager closes the document even if extraction fails.
    with fitz.open(file) as pdf_reader:
        for page_num in range(pdf_reader.page_count):
            if (page_num + 1) in pages_to_skip:  # page numbers are 1-based
                continue

            # "blocks" keeps the text grouped per layout block; index 4 of
            # each block tuple is read as that block's text.
            blocks = pdf_reader[page_num].get_text("blocks")

            processed_blocks = []
            for b in blocks:
                block_text = b[4].strip()
                if not block_text:
                    continue
                if ":" in block_text:
                    # Presumably key/value content: keep its line structure.
                    processed_blocks.append(block_text)
                else:
                    processed_blocks.append(block_text.replace("\n", " "))

            page_texts.append("\n".join(processed_blocks))

    return page_texts

In [None]:
# Creating a list of image files, ordered by page number.
# A plain sorted() is lexicographic and would place "page_10.png" before "page_2.png",
# so the names are sorted with their digit runs compared as integers instead.
import re


def _natural_sort_key(name):
    """Split ``name`` into text and integer parts so digit runs sort numerically."""
    # re.split with a capturing group alternates text / digits / text ...,
    # so every odd position holds a digit run.
    parts = re.split(r"([0-9]+)", name)
    return [int(part) if index % 2 else part for index, part in enumerate(parts)]


image_names = [img for img in os.listdir(output_folder) if img.endswith((".png", ".jpg", ".jpeg"))]
image_names.sort(key=_natural_sort_key)
image_files = [os.path.join(output_folder, img) for img in image_names]

# Extracting the text of the input pdf for the pages that don't contain tables
pdf_text = extract_text_from_pdf(uploaded_file)

# Generating the response by calling the LLM
specifications = extract_specs(pdf_text, input_prompt, image_files)

# Printing the extracted information
print(specifications)

```json
{
  "company": ["ABB"],
  "product/Model Number": ["ITABB-RR3595"],
  "Document No": ["ITABB-RR3595"],
  "Doc. version": ["Original - Approved - Lean"],
  "Risk review number": ["ITABB-RR3595"],
  "External Doc Number (ABB SalesForce)": ["0063Y00001szH6uQAE"],
  "Opportunity Id": ["OPP-23-6138578"],
  "Project name": ["AEN Compensatori Sincroni Terna 2023 Lotto 3"],
  "Customer is ABB unit?": ["No"],
  "Customer": ["G08036496, Ansaldo Energia S.p.A., Genova, IT"],
  "Customer country CPI": ["IT/Lower"],
  "Customer at Submission": ["G08036496, Ansaldo Energia S.p.A., Genova, IT"],
  "Customer at Submission country CPI": ["IT/Lower"],
  "Customer region": ["Europe"],
  "End user": ["G00121821, Terna S.p.A., Roma, IT"],
  "End user country CPI": ["IT/Lower"],
  "End user at Submission": ["G00121821, Terna S.p.A., Roma, IT"],
  "End user at Submission country CPI": ["IT/Lower"],
  "End user region": ["Europe"],
  "Site country, exact geographic location": ["IT"],
  "Site country C