In [1]:
from fitz.fitz import Rect  # Rect coordinates
from tabula import read_pdf  # to extract table dataframe from pdf
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
file_path = "missouri1.pdf"

In [3]:
class TextRect:
    """
    Class TextRect aim to store an object with a text and its coordinates in a Rect object.
    Arguments:
        - top (float) : top coordinates of the text
        - left (float) : left coordinates of the text
        - width (float) : width coordinates of the text
        - height (float) : height coordinates of the text
        - text (string) : text of the PDF we are dealing with
    """

    def __init__(self, top, left, width, height, text):
        self._rect = Rect(left + width, top - height, left, top)
        self._text = text

    def get_rect(self):
        """
        Function to return the Rect object
        """
        return self._rect

    def get_text(self):
        """
        Function to return the string text
        """
        return self._text


def transform(x):
    """
    Function to transform dictionary into TextRect object
    Argument:
        - x (dict) : dictionary with key {'top', 'left', 'width', 'height', 'text'}
    Return:
        - TextRect object with coordinates of text, text and page of text.
    """
    return TextRect(**x)


def transform_text(textRect):
    """
    Function to get the text of a TextRect object
    Argument:
        - textRect (TextRect) : TextRect object
    Return:
        - text (string)
    """
    return textRect.get_text()


def transform_rect(x):
    """
    Function to get the Rect coordinates of a TextRect object
    Argument:
        - textRect (TextRect) : TextRect object
    Return:
        - rectangle coordinates (Rect)
    """
    return x.get_rect()


def get_text_df(json_df):
    """
    Function to get a dataframe with text only
    Arguments :
        - json_df (pandas.DataFrame(TextRect)) : df is a dataframe from raw json table
    Return : pandas.DataFrame(String)
    """
    # Vectorisation of transform() function for more speed
    vtransform = np.vectorize(transform)
    vtransform_text = np.vectorize(transform_text)

    # Apply on all value transform() and transform_text() to get from json
    # dataframe the TextRect object then the string text
    return json_df.apply(vtransform).apply(vtransform_text)


def get_rect_df(json_df):
    """
    Function to get a dataframe with Rect coordinates only
    Arguments :
        - json_df (pandas.DataFrame(TextRect)) : df is a dataframe from raw json table
    Return : pandas.DataFrame(Rect)
    """
    # Vectorisation of transform() function for more speed
    vtransform = np.vectorize(transform)

    # Apply on all value transform() and transform_text() to get from json
    # dataframe the TextRect object then the Rect coordinates
    return json_df.apply(vtransform).applymap(transform_rect)

def extract_df_list(file_path):
    """
    Method to extract 2 lists, one with dataframes containing text table,
    one with dataframes containing coordinates tables.
    Argument:
        - file_path (string) : String path of the PDF file
    Returns :
        - document_txt (list of pandas.DataFrame) : List of DataFrame. Each dataframe correspond
        to text found in all table.
        - document_rect (list of pandas.DataFrame) : List of DataFrame. Each dataframe correspond
        to text coordinates found in all table.
    """
    json_pages = []

    # For all page, we extract all table as a json and put them into a list
    json_pages.append(read_pdf(file_path, pages='all', output_format='json'))

    # Extract txt and coordinates from json
    list_df_txt = []
    list_df_rect = []

    # For all page, we iterate over each json table found to extract text and coordinates
    for json_df in json_pages:
        print("Jsondata",json_df[0]['data'])
        
        list_df_txt.append(get_text_df(pd.DataFrame(json_df[0]['data'])))
#         list_df_rect.append(get_rect_df(pd.DataFrame(json_df[0]['data'])))

    return list_df_txt, list_df_rect

# Get your 2 dataframes
list_df_1_txt, list_df_1_rect = extract_df_list(file_path)

Jsondata []


In [4]:
list_df_1_txt

[Empty DataFrame
 Columns: []
 Index: []]

In [5]:
list_df_1_rect

[]