In [1]:
import json
import pandas as pd

# Load the JSON file containing the Textract output.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# locale default (which is not UTF-8 on every system).
with open('textract_output.json', 'r', encoding='utf-8') as json_file:
    textract_response = json.load(json_file)

def extract_tables_from_json(response):
    """Collect the cell text of every TABLE block in a parsed Textract response.

    Args:
        response: Dict with a 'Blocks' list. Every block is read for its 'Id'
            and 'BlockType'; CELL blocks are also read for 'RowIndex' and
            'ColumnIndex', and WORD blocks for 'Text'.

    Returns:
        A list with one entry per TABLE block, in the order the blocks appear.
        Each entry is a nested dict ``{RowIndex: {ColumnIndex: text}}`` where
        ``text`` is the cell's WORD texts joined by single spaces, with
        surrounding whitespace stripped ('' for a cell without WORD children).
    """
    blocks = response['Blocks']
    # Index blocks by Id so relationship Ids resolve without rescanning the list.
    blocks_by_id = {blk['Id']: blk for blk in blocks}

    def child_blocks(parent, wanted_type):
        """Yield the CHILD-related blocks of `parent` whose BlockType matches."""
        for link in parent.get('Relationships', []):
            if link['Type'] != 'CHILD':
                continue
            for child_id in link['Ids']:
                child = blocks_by_id.get(child_id)
                # Ids that do not resolve, or resolve to another block type, are skipped.
                if child and child['BlockType'] == wanted_type:
                    yield child

    extracted = []
    for blk in blocks:
        if blk['BlockType'] != 'TABLE':
            continue

        grid = {}
        for cell in child_blocks(blk, 'CELL'):
            row_idx = cell['RowIndex']
            col_idx = cell['ColumnIndex']
            words = [word['Text'] for word in child_blocks(cell, 'WORD')]
            # A later cell with the same (row, column) overwrites an earlier one.
            grid.setdefault(row_idx, {})[col_idx] = ' '.join(words).strip()
        extracted.append(grid)

    return extracted

In [4]:
# Call the function to extract tables
tables = extract_tables_from_json(textract_response)

# Convert every extracted table into its own DataFrame (one row per RowIndex).
# Keeping them all in a list means no table is lost; rebinding a single name
# inside a loop would leave only the last table reachable.
table_frames = [pd.DataFrame.from_dict(table, orient='index') for table in tables]

# `df` stays bound to the last table, as before. When the response has no
# TABLE blocks it becomes an empty frame, so the display below cannot raise
# NameError or silently show a stale `df` left over in the kernel.
df = table_frames[-1] if table_frames else pd.DataFrame()

df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
1,Claim Number,Policy Insured Code,Dealer / Lessee Name,VIN,Subline,State / Province,Cause of Loss,Loss Date,Claim Close Date,Units,Indemnity Payments,Indemnity Expenses,Net Insurance Recoveries,Indemnity Payments Net
2,150123008611,EKA119,ANDERSON FORD OF ST JOSEP,2LMDU88C77BJ14427,FCNAUS,MO,Collision,01/23/2015,02/19/2021,0,$0.00,$0.00,$0.00,$0.00
3,150123008611,,,,,,,,,0,$0.00,$0.00,$0.00,$0.00
4,161216008812,EKAA4G,ANDERSON FORD OF GRAND ISLAND,2LMHJ5AT5CBL55030,FCNAUS,NE,Collision,12/16/2016,,0,$0.00,$607.50,"$2,002.46",$607.50
5,161216008812,,,,,,,,,0,$0.00,$607.50,"$2,002.46",$607.50
6,190806002768,EKAA4G,ANDERSON FORD OF GRAND ISLAND,MULTI,FCNAUS,NE,Wind,08/06/2019,12/23/2020,8,$0.00,$0.00,"$1,677.00",$0.00
7,190806002768,,,,,,,,,8,$0.00,$0.00,"$1,677.00",$0.00
8,190927007769,EKAA4G,ANDERSON FORD OF GRAND ISLAND,2FMPK4J91KBB61371,FCNAUS,NE,Theft,09/27/2019,,1,$0.00,$586.07,"$35,200.00",$586.07
9,190927007769,,,,,,,,,1,$0.00,$586.07,"$35,200.00",$586.07
10,200118002308,EKA119,ANDERSON FORD OF ST JOSEP,1FT8W3DT3HEC08556,FCNAUS,MO,Theft,02/18/2017,09/17/2020,1,$0.00,$0.00,"$27,750.00",$0.00
