In [2]:
# !pip3 uninstall -q ultralyticsplus==0.0.28 ultralytics==8.0.43 -y
!pip install -q PyMuPDF pillow transformers

[0m

In [3]:
!pip install -q timm


[0m

In [4]:
from transformers import AutoImageProcessor, TableTransformerForObjectDetection
import torch
from PIL import Image

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
image_processor = AutoImageProcessor.from_pretrained("microsoft/table-transformer-detection")
model = TableTransformerForObjectDetection.from_pretrained("microsoft/table-transformer-detection")

The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.
Some weights of the model checkpoint at microsoft/table-transformer-detection were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mo

In [8]:
model.save_pretrained("./model/table-extraction/microsoft--table-transformer-detection")
image_processor.save_pretrained("./model/table-extraction/microsoft--table-transformer-detection")

['./model/table-extraction/microsoft--table-transformer-detection/preprocessor_config.json']

In [4]:
import io

import fitz  # PyMuPDF
import pandas as pd
import 
from PIL import Image
import os
from glob import glob
import numpy as np
from tqdm import tqdm

import pandas as pd

In [5]:
from huggingface_hub import hf_hub_download
from transformers import AutoImageProcessor, TableTransformerForObjectDetection
import torch
from PIL import Image

In [53]:
file_path = "./data/input/google-sec-1.png"
image = Image.open(file_path).convert("RGB")
imgs = [image, Image.open("./data/input/multi-table.png").convert("RGB")]

In [57]:
results

[{'scores': tensor([], device='cuda:0', grad_fn=<IndexBackward0>),
  'labels': tensor([], device='cuda:0', dtype=torch.int64),
  'boxes': tensor([], device='cuda:0', size=(0, 4), grad_fn=<IndexBackward0>)},
 {'scores': tensor([0.9964, 0.9986, 0.9975], device='cuda:0', grad_fn=<IndexBackward0>),
  'labels': tensor([0, 0, 0], device='cuda:0'),
  'boxes': tensor([[ 14.6857, 212.3847, 294.7186, 281.9496],
          [ 14.8059, 419.1996, 296.0931, 488.2941],
          [ 13.0513,   6.0678, 299.0805,  76.1381]], device='cuda:0',
         grad_fn=<IndexBackward0>)}]

In [61]:
result['boxes']

tensor([[ 14.6857, 212.3847, 294.7186, 281.9496],
        [ 14.8059, 419.1996, 296.0931, 488.2941],
        [ 13.0513,   6.0678, 299.0805,  76.1381]], device='cuda:0',
       grad_fn=<IndexBackward0>)

In [68]:
inputs = image_processor(images=imgs, return_tensors="pt").to("cuda")
outputs = model(**inputs)

# convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
target_sizes = torch.tensor([i.size[::-1] for i in imgs])
results = image_processor.post_process_object_detection(outputs, threshold=0.6, target_sizes=target_sizes)
data = []
for i, result in enumerate(results):
    print(i)
    for score, label, box in zip(result["scores"], result["labels"], result["boxes"]):
        box = [round(i, 2) for i in box.tolist()]
        print(box)
        data.append({
            "index":i,
            "xmin": box[0],
            "ymin": box[1],
            "xmax": box[2],
            "ymax": box[3],
            'confidence': float(score)
        })   
df = pd.DataFrame(data)

0
1
[14.69, 212.38, 294.72, 281.95]
[14.81, 419.2, 296.09, 488.29]
[13.05, 6.07, 299.08, 76.14]


In [89]:


class TableIdentifier:

    def __init__(self, model_dir, model=None, model_name="best.pt", confidence_threshold=0.5, batch_size=1, device='cuda'):
        self.confidence_threshold = confidence_threshold
        self.batch_size = batch_size
        self.device = device
        if model: 
            self.model = model.to(device)
        else:
            model_path = os.path.join(model_dir, model_name)
            self.model = torch.hub.load('ultralytics/yolov5', 'custom', path=model_path)
            self.model.eval()
        self.model = self.model.to(self.device)
            

    def gen_pages_with_tables(self, doc_file: fitz.Document, filename: str = None):
        for page in tqdm(doc_file):  # iterate the document pages
            tabs = page.find_tables(horizontal_strategy="text", snap_y_tolerance=3, join_tolerance=20)


            if tabs.tables:
                yield page, tabs, filename

    def yolo_predict(self, images):
        # Perform inference
        with torch.no_grad():
            preds = self.model(images)
            preds = preds.pandas().xyxy
            for pred in preds:
                pred = pred[pred['confidence'] > self.confidence_threshold]
                pred = pred.reset_index(drop=True)

        return preds
    
    def transformer_predict(self, images):
        
        inputs = image_processor(images=images, return_tensors="pt").to(self.device)
        outputs = model(**inputs)
        target_sizes = torch.tensor([i.size[::-1] for i in images])
        results = image_processor.post_process_object_detection(outputs, threshold=0.6, target_sizes=target_sizes)
        data = []
        for i, result in enumerate(results):
            for score, label, box in zip(result["scores"], result["labels"], result["boxes"]):
                box = [round(i, 2) for i in box.tolist()]
                data.append({
                    "index":i,
                    "xmin": box[0],
                    "ymin": box[1],
                    "xmax": box[2],
                    "ymax": box[3],
                    'confidence': round(float(score), 2)

                })      
        preds = pd.DataFrame(data)
        preds = preds[preds['confidence'] > self.confidence_threshold]
        return preds

    def extract_tables_imgs(self, images: list) -> pd.DataFrame:
        image_batch = []
        predictions = []
        for img in images:
            image_batch.append(Image.open(io.BytesIO(img)))
            if len(image_batch) == self.batch_size:
                results = self.yolo_predict(image_batch)
                predictions.append(results)
                image_batch = list()

        if image_batch:
            results = self.yolo_predict(image_batch)
            predictions.extend(results)
        if predictions:
            return pd.concat(predictions, ignore_index=True)
        return pd.DataFrame()

    def extract_tables_pdf(self, filename: str = None, contents: bytes = None):
        if filename:
            pdf_file = fitz.open(filename)
        elif contents:
            pdf_file = fitz.open("pdf", contents)
        else:
            raise ValueError("Either filename or contents should be provided")

        data = []
        num_to_page = {}
        imgs = []
        results = []
        for page, tabs, filename in self.gen_pages_with_tables(pdf_file):
            width, height = page.rect.width, page.rect.height
            if tabs:
                # get the page number
                page_num = page.number
                num_to_page[page_num] = page
                # get the page image
                pix = page.get_pixmap()
                # convert to PIL image
                # img = Image.open(io.BytesIO(pix.get)
                img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                img_w, img_h = img.size
                imgs.append(img)
                result = {}
                result['page_no'] = page_num
                result['width'] = width
                result['height'] = height
                result['img_width'] = img_w
                result['img_height'] = img_h
                
                results.append(result)
        self.imgs = imgs
        preds = self.transformer_predict(imgs)
        results = pd.DataFrame(results) 
            
        return preds.merge(results.reset_index(), how='left', on="index")

In [90]:
ti = TableIdentifier("./model/table-extraction/", model=model)

In [91]:
DATA_DIR = "./data/input/pdfs"

In [None]:
preds_list = []
all_results = []
for filename in glob(DATA_DIR + "/*.pdf")[1:]:
    results = ti.extract_tables_pdf(filename)
    results['filename'] = filename
    print(results)
    all_results.append(results)

In [93]:
df = pd.concat(all_results, ignore_index=True)

In [100]:
df[df.page_no == 133]

Unnamed: 0,index,xmin,ymin,xmax,ymax,confidence,page_no,width,height,img_width,img_height,filename
26,2,141.5,700.53,516.55,752.8,0.76,133,612.0,792.0,612,792,./data/input/pdfs/01-Polaris-Credit-Agreement-...


In [94]:
from pathlib import Path

In [95]:
def hires_img_crop(page: fitz.Page, bbox: dict) -> Image.Image:
    """
    Generate a high resolution image for the table
    """
    hires_pix = page.get_pixmap(dpi=120)
    wf, hf = hires_pix.width / page.rect.width, hires_pix.height / page.rect.height
    hires_img = Image.frombytes("RGB", (hires_pix.width, hires_pix.height), hires_pix.samples)
    hires_bbox = (bbox['xmin'] * wf, bbox['ymin'] * hf, bbox['xmax'] * wf, bbox['ymax'] * hf)
    return hires_img.crop(hires_bbox)

In [110]:
df.sort_values(by='confidence')

Unnamed: 0,index,xmin,ymin,xmax,ymax,confidence,page_no,width,height,img_width,img_height,filename
25,0,88.38,695.91,518.49,756.36,0.61,84,612.0,792.0,612,792,./data/input/pdfs/01-Polaris-Credit-Agreement-...
41,17,199.14,270.99,434.12,321.13,0.61,343,612.0,792.0,612,792,./data/input/pdfs/01-Polaris-Credit-Agreement-...
26,2,141.5,700.53,516.55,752.8,0.76,133,612.0,792.0,612,792,./data/input/pdfs/01-Polaris-Credit-Agreement-...
28,6,118.29,700.35,514.97,758.84,0.78,233,612.0,792.0,612,792,./data/input/pdfs/01-Polaris-Credit-Agreement-...
22,19,81.88,149.8,517.24,689.3,0.79,295,612.0,792.0,612,792,./data/input/pdfs/EdgeCore-Mesa-1-2-Loan-Agree...
33,12,83.94,76.08,528.36,147.54,0.8,301,612.0,792.0,612,792,./data/input/pdfs/01-Polaris-Credit-Agreement-...
27,5,109.62,715.79,518.85,765.02,0.81,232,612.0,792.0,612,792,./data/input/pdfs/01-Polaris-Credit-Agreement-...
47,23,86.21,707.36,520.05,755.43,0.9,551,612.0,792.0,612,792,./data/input/pdfs/01-Polaris-Credit-Agreement-...
46,22,86.0,696.05,521.06,753.07,0.92,547,612.0,792.0,612,792,./data/input/pdfs/01-Polaris-Credit-Agreement-...
40,16,93.95,154.94,520.36,180.29,0.98,338,612.0,792.0,612,792,./data/input/pdfs/01-Polaris-Credit-Agreement-...


In [117]:
# pix = pdf_file[547].get_pixmap()
# img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
# img

In [97]:
for row in df.groupby('filename').agg(list).reset_index().to_dict(orient='records'): 
    pdf_file = fitz.open(filename)
    output_dir = os.path.join(Path(filename).parent.parent, 'table_image',  Path(filename).stem)
    os.makedirs(output_dir, exist_ok=True)
    for xmin, ymin, xmax, ymax, page_no in tqdm(zip(row['xmin'], row['ymin'], row['xmax'], row['ymax'], row['page_no'])): 
        page = pdf_file[page_no]
        width, height = page.rect.width, page.rect.height
        # get the page image
        pix = page.get_pixmap(dpi=120)
        # convert to PIL image
        # img = Image.open(io.BytesIO(pix.get)
        img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
        image_path = os.path.join(output_dir, f"{page_no}_full.png")
        img.save(image_path)
        img = hires_img_crop(page, {'xmin': xmin, 'xmax': xmax, 'ymin': ymin, 'ymax': ymax})
        image_path = os.path.join(output_dir, f"{page_no}.png")
        img.save(image_path)
        
    

23it [00:03,  6.35it/s]
25it [00:04,  5.31it/s]
