# 1. 📄 PDF Information Extraction
### VQA with the Qwen2.5-VL vision-language model for scanned PDF documents

In [1]:
import os
import pandas as pd

import pdfplumber
import fitz 
import io
from PIL import Image

In [2]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# !ls ./../../../../data/sample_resume

In [4]:
def read_data(data_path='./../../../../data/sample_resume'):
    """
    Recursively collect the PDF files stored under a directory.

    A file counts as a PDF when its name ends with '.pdf', compared
    case-insensitively. Results follow the traversal order of os.walk.

    Parameters:
        data_path (str): Directory to search, including its sub-directories.

    Returns:
        Tuple[List[str], List[str]]:
            - Paths of the PDF files (data_path joined with the location
              of each file below it).
            - The matching file names with the extension stripped.
    """
    # Build (path, stem) pairs in one pass so both lists stay aligned.
    matches = [
        (os.path.join(folder, entry), os.path.splitext(entry)[0])
        for folder, _subfolders, entries in os.walk(data_path)
        for entry in entries
        if entry.lower().endswith('.pdf')
    ]

    pdf_path_files = [path for path, _stem in matches]
    pdf_names = [stem for _path, stem in matches]
    return pdf_path_files, pdf_names

# Discover every PDF below the default data directory.
pdf_path_files, pdf_names = read_data()
print(f'len(pdf_path_files): {len(pdf_path_files)}')

# Spot-check the third entry: its name first, then its path.
print(pdf_names[2], pdf_path_files[2], sep='\n')


len(pdf_path_files): 37
Ziad Abdeltawab
./../../../../data/sample_resume/Ziad Abdeltawab.pdf


In [None]:
class PDFDataExtractionViaVLM:
    """
    Run a Qwen2.5-VL model over PDF pages or image files and return its text.

    Creating an instance loads the model and its processor and creates the
    ./temp/ working directory. Calling the instance with a file path and a
    prompt returns the text generated by the model: PDFs are rendered one
    page at a time to ./temp/temp_image.png and each page is sent to the
    model; any other file is passed to the model directly as an image.
    """

    # Hugging Face repository of the model and processor.
    MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"
    # Local directory the model weights are loaded from / saved to.
    MODEL_PATH = "./../../../../hf_models/qwen_vlm_model"
    # Download cache used when the model has to be fetched.
    CACHE_DIR = "./../../../../hf_models/cache"
    # Scratch file every rendered PDF page is written to before inference.
    TEMP_IMAGE_PATH = './temp/temp_image.png'

    def __init__(self):
        print('VQA using QWEN-VLM model for scanned PDF docs ...')
        # load the QWEN VLM model
        self.load_model()

        # load the QWEN VLM processor
        self.load_model_processor()

        # create ./temp directory
        os.makedirs('./temp/', exist_ok=True)

    @staticmethod
    def pdf2image(pdf_path, page_number=1):
        """
        Render one page of a PDF to a PNG image and save it locally.

        Args:
            pdf_path (str): The path to the PDF file.
            page_number (int, optional): Zero-based index of the page to
                render. Defaults to 1, i.e. the second page.

        Returns:
            Image: The rendered page as a PIL image. The same image is also
            written to './temp/temp_image.png'.
        """
        # The context manager closes the document even if rendering fails.
        with fitz.open(pdf_path) as pdf_document:
            page = pdf_document.load_page(page_number)
            img_data = page.get_pixmap().tobytes("png")

        image = Image.open(io.BytesIO(img_data))

        # This is a static method, so it may run before any instance has
        # created the scratch directory.
        image_path = PDFDataExtractionViaVLM.TEMP_IMAGE_PATH
        os.makedirs(os.path.dirname(image_path), exist_ok=True)
        image.save(image_path)

        return image

    @staticmethod
    def display_pdf_doc(pdf_path):
        """
        Render every page of a PDF to a PNG image and display it.

        Relies on the notebook-provided `display` function, so it is only
        usable inside an IPython/Jupyter-style environment.

        Args:
            pdf_path (str): The path to the PDF file.

        Returns:
            None
        """
        with fitz.open(pdf_path) as pdf_document:
            # Iterate the pages directly instead of probing page numbers
            # until load_page raises; rendering errors are no longer
            # mistaken for "end of document".
            for page in pdf_document:
                img_data = page.get_pixmap().tobytes("png")
                image = Image.open(io.BytesIO(img_data))
                display(image)  # Use display() for Databricks notebooks

    def load_model(self):
        """
        Load the Qwen2.5-VL model and store it on `self.model`.

        The function performs the following steps:

        1. Creates the local model and cache directories if needed.
        2. Attempts to load the model from the local model directory only.
        3. If that fails, downloads the model from Hugging Face using the
           cache directory and saves it to the local model directory.

        Both load paths request `device_map="cuda"`.

        Args:
            None

        Returns:
            None. The loaded model is assigned to `self.model`.
        """
        model_path = self.MODEL_PATH
        cache_dir = self.CACHE_DIR

        # Create directories if they don't exist
        os.makedirs(model_path, exist_ok=True)
        os.makedirs(cache_dir, exist_ok=True)

        try:
            # Try to load from local model_path first
            model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
                model_path,
                torch_dtype="auto",
                device_map="cuda",
                local_files_only=True,
            )
        except (OSError, ValueError) as err:
            # Only treat load failures as "not available locally"; a bare
            # except would also swallow KeyboardInterrupt and start a
            # multi-gigabyte download.
            print("Model not found locally. Downloading from Hugging Face...")
            print(f"Local load failed with: {err}")

            # Download from Hugging Face with custom cache directory
            model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
                self.MODEL_ID,
                torch_dtype="auto",
                device_map="cuda",
                cache_dir=cache_dir,
            )

            # Save the model to model_path for future use
            model.save_pretrained(model_path)
            print(f"Model downloaded, cached to {cache_dir}, and saved to {model_path}")
        else:
            print("Model loaded from local directory.")

        self.model = model

    def load_model_processor(self):
        """
        Load the processor for the model and store it on `self.processor`.

        The processor is always resolved from the Hugging Face model id,
        not from the local model directory.
        """
        self.processor = AutoProcessor.from_pretrained(self.MODEL_ID)

        print('processor loaded')

    def inference(self, prompt, image_local_path="./temp/temp_image.png", sys_prompt="You are a helpful assistant.", max_new_tokens=4096, return_input=False):
        """
        Ask the model a question about a single image.

        Args:
            prompt (str): User instruction sent together with the image.
            image_local_path (str, optional): Path of the image file.
            sys_prompt (str, optional): System message of the conversation.
            max_new_tokens (int, optional): Generation budget.
            return_input (bool, optional): Also return the processed inputs.

        Returns:
            str, or (str, inputs) when `return_input` is true: the decoded
            text generated for the image.
        """
        messages = [
            {"role": "system", "content": sys_prompt},
            {"role": "user", "content": [
                    {"type": "text", "text": prompt},
                    {"image": image_local_path},
                ]
            },
        ]
        text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        print("text:", text)

        # The processor turns the picture into tensors, so the file handle
        # can be released as soon as it returns.
        with Image.open(image_local_path) as image:
            inputs = self.processor(text=[text], images=[image], padding=True, return_tensors="pt")
        # Follow the device the model was actually placed on.
        inputs = inputs.to(self.model.device)

        output_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
        # generate() returns prompt + completion; keep only the completion.
        generated_ids = [
            sequence[len(prompt_ids):]
            for prompt_ids, sequence in zip(inputs.input_ids, output_ids)
        ]
        output_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        if return_input:
            return output_text[0], inputs
        return output_text[0]

    @staticmethod
    def get_no_pages_in_pdf(pdf_path):
        """Return the number of pages in the PDF at `pdf_path`."""
        # Close the file once the pages have been counted.
        with pdfplumber.open(pdf_path) as pdf:
            return len(pdf.pages)

    def __call__(self, file_path, prompt, verbose=False):
        """
        Run the model over a PDF (page by page) or over a single image.

        Args:
            file_path (str): Path ending in '.pdf' (case-insensitive) for a
                PDF; any other path is treated as an image file.
            prompt (str): Instruction passed to the model for every page.
            verbose (bool, optional): For PDFs, print the page count and
                display every page before running the model.

        Returns:
            str: For an image, the model output. For a PDF, the outputs of
            all pages concatenated, each preceded by a
            '\\n\\npage_number: <zero-based index>\\n' header.
        """
        # for images
        if not file_path.lower().endswith('.pdf'):
            return self.inference(prompt, image_local_path=file_path)

        # for .pdf file: get the number of pages in the pdf
        pdf_path = file_path
        no_pages_in_pdf = self.get_no_pages_in_pdf(pdf_path)
        if verbose:
            print(f'no_pages_in_pdf: {no_pages_in_pdf}')
            self.display_pdf_doc(pdf_path)

        pdf_content = []
        for page_number in range(no_pages_in_pdf):
            pdf_content.append(f'\n\npage_number: {page_number}\n')

            # convert the page to an image and save that image locally
            self.pdf2image(pdf_path, page_number)

            # call inference method to ocr the image
            output_text = self.inference(prompt, image_local_path=self.TEMP_IMAGE_PATH)
            pdf_content.append(output_text)

        return ''.join(pdf_content)

      

# Build the extractor once: the constructor loads the model and the
# processor and creates the ./temp/ directory.
pdf_data_extraction_via_vlm = PDFDataExtractionViaVLM()

VQA using QWEN-VLM model for scanned PDF docs ...


Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.74it/s]


Model loaded from local directory.


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


processor loaded


: 

Usage example


In [6]:
# Use the third discovered PDF as the sample document for the cells below.
pdf_path = pdf_path_files[2]

In [7]:
# Count the pages of the sample PDF (get_no_pages_in_pdf reads it with pdfplumber).
no_pages_in_pdf = pdf_data_extraction_via_vlm.get_no_pages_in_pdf(pdf_path)
print(no_pages_in_pdf)

2


In [8]:
# qwen 2.5
# Candidate prompts for the model; only one is active, the rest are kept for reference.
# prompt = "Read all the text in the image."
# prompt = "Please output only the text content from the image without any additional descriptions or formatting."
# Active prompt, tailored to resume pages.
# NOTE: inside this double-quoted string '\n\n' is an escape sequence, so the
# model receives two real line breaks between the quotes (visible in the
# "text:" output printed by inference), not the characters backslash-n.
prompt = "Please output only the text content from the image without any additional descriptions or formatting. the image is a page of a professional resume. there are different sections in each image such as experience or projects. put together all the information about a section or topic together. separate each section with a '\n\n'."
# prompt = "Please output only the text content from the image without any additional descriptions or formatting. ouptut selected and not selected checkmarks"
# prompt = "Please output only the text content from the image without any additional descriptions or formatting. if checkmarks exists, include selected checkmarks (☒) and not selected checkmarks (☐)"

In [9]:
# pdf_content = pdf_data_extraction_via_vlm(pdf_path, prompt)

In [10]:
# pdf_path

In [11]:
# print(pdf_content)

In [12]:
# Table that will hold the OCR result of every PDF: one row per file,
# with 'pdf_content' starting out as an empty string.
pdf_files_contents = pd.DataFrame(
    dict(
        pdf_path=pdf_path_files,
        pdf_name=pdf_names,
        pdf_content=['' for _ in pdf_path_files],
    )
)

# display(pdf_files_contents.head(2))


In [None]:
# The instruction is the same for every document, so it is defined once
# ahead of the loop.
# NOTE: '\n\n' in this double-quoted string is an escape sequence, so the
# model receives two real line breaks between the quotes.
prompt = "Please output only the text content from the image without any additional descriptions or formatting. the image is a page of a professional resume. there are different sections in each image such as experience or projects. put together all the information about a section or topic together. separate each section with a '\n\n'."

# Run the extractor over every row and store its output next to the path.
for idx in pdf_files_contents.index:
    pdf_path = pdf_files_contents.at[idx, 'pdf_path']
    pdf_content = pdf_data_extraction_via_vlm(pdf_path, prompt)
    pdf_files_contents.at[idx, 'pdf_content'] = pdf_content

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


text: <|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
Please output only the text content from the image without any additional descriptions or formatting. the image is a page of a professional resume. there are different sections in each image such as experience or projects. put together all the information about a section or topic together. separate each section with a '

'.<|vision_start|><|image_pad|><|vision_end|><|im_end|>
<|im_start|>assistant

text: <|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
Please output only the text content from the image without any additional descriptions or formatting. the image is a page of a professional resume. there are different sections in each image such as experience or projects. put together all the information about a section or topic together. separate each section with a '

'.<|vision_start|><|image_pad|><|vision_end|><|im_end|>
<|im_start|>assistant



In [None]:
# pdf_files_contents_path = './../../data/pdf_files_contents.csv'
# # Save to CSV
# pdf_files_contents.to_csv(pdf_files_contents_path, index=False)