In [55]:
import os
import time
import boto3
import json
import pandas as pd
from dotenv import load_dotenv

### Functions to parse AWS Textract document analysis results

In [65]:
# Read the local .env file into the process environment before anything looks up settings
load_dotenv()

# Textract client pinned to us-east-1; the functions below all go through this object
textract = boto3.client(service_name='textract', region_name='us-east-1')

# Name of the S3 bucket holding the documents (None when BUCKET is not set)
bucket_name = os.environ.get('BUCKET')

def startDocumentAnalysis(file_name: str) -> str:
    """Start an asynchronous Textract table analysis for one S3 object.

    Args:
        file_name: Object key of the document inside the bucket named by the
            module-level ``bucket_name``.

    Returns:
        The ``JobId`` reported by Textract for the newly started job.
    """
    s3_location = {'S3Object': {'Bucket': bucket_name, 'Name': file_name}}
    started_job = textract.start_document_analysis(
        DocumentLocation=s3_location,
        FeatureTypes=['TABLES'],  # only table extraction is requested
    )
    return started_job['JobId']

def extract_tables_from_json(job_id, debug=True):
    """Download every result page of a Textract job and return its tables.

    A response page is only a slice of the job's block list, so a TABLE block,
    its CELL children and the WORD blocks carrying the text can arrive in
    different pages. All pages are therefore fetched first and relationships
    are resolved against a single lookup covering the whole job; resolving
    them page by page drops cells or leaves their text blank.

    Args:
        job_id: Id of the Textract document-analysis job to read.
        debug: When True (the default, matching the previous behaviour) each
            raw response page is written to ``textract_output_<n>.json`` in the
            working directory and progress is printed. Pass False for a quiet
            run that writes no files.

    Returns:
        A list with one ``pandas.DataFrame`` per TABLE block, in the order the
        blocks were returned. Rows are labelled by the cells' ``RowIndex`` and
        columns by ``ColumnIndex``; a table with no resolvable cells yields an
        empty frame.
    """
    blocks = _fetch_all_blocks(job_id, debug)
    tables = _tables_from_blocks(blocks)
    if debug:
        print(len(tables))
    return [pd.DataFrame.from_dict(table, orient='index') for table in tables]


def _fetch_all_blocks(job_id, debug):
    """Follow ``NextToken`` until the results are exhausted and return all blocks."""
    blocks = []
    page_no = 0
    response = textract.get_document_analysis(JobId=job_id)
    while True:
        page_blocks = response.get('Blocks', [])
        if debug:
            print(f"Processing page {page_no}, Total Blocks: {len(page_blocks)}")
            with open(f'textract_output_{page_no}.json', 'w') as json_file:
                json.dump(response, json_file, indent=4)  # indented for readability
        blocks.extend(page_blocks)
        page_no += 1

        next_token = response.get('NextToken')
        if debug:
            print(next_token)
        if not next_token:
            return blocks
        response = textract.get_document_analysis(JobId=job_id, NextToken=next_token)


def _child_ids(block):
    """Return the ids listed in a block's CHILD relationships, in order."""
    return [
        child_id
        for relationship in block.get('Relationships', [])
        if relationship['Type'] == 'CHILD'
        for child_id in relationship.get('Ids', [])
    ]


def _cell_text(cell_block, block_map):
    """Join the text of the WORD blocks under a CELL with single spaces."""
    words = []
    for word_id in _child_ids(cell_block):
        word_block = block_map.get(word_id)
        # Children that are not WORD blocks, or ids that cannot be resolved, are skipped
        if word_block and word_block['BlockType'] == 'WORD':
            words.append(word_block['Text'])
    return ' '.join(words).strip()


def _tables_from_blocks(blocks):
    """Build one ``{row: {col: text}}`` dict per TABLE block found in ``blocks``."""
    # One lookup for the whole job so cross-page relationships resolve
    block_map = {block['Id']: block for block in blocks}
    tables = []
    for block in blocks:
        if block['BlockType'] != 'TABLE':
            continue
        table = {}
        for cell_id in _child_ids(block):
            cell_block = block_map.get(cell_id)
            if cell_block and cell_block['BlockType'] == 'CELL':
                row = table.setdefault(cell_block['RowIndex'], {})
                row[cell_block['ColumnIndex']] = _cell_text(cell_block, block_map)
        tables.append(table)
    return tables

def getDocumentAnalysis(job_id: str, poll_interval: float = 5, timeout=None):
    """Wait for a Textract analysis job to finish and return its tables.

    The job is polled until it leaves ``IN_PROGRESS``. Besides ``SUCCEEDED``
    and ``FAILED``, Textract can report ``PARTIAL_SUCCESS``; waiting only for
    the first two would poll forever in that case, so any status other than
    ``IN_PROGRESS`` ends the wait.

    Args:
        job_id: Id of the Textract document-analysis job.
        poll_interval: Seconds to sleep between status checks.
        timeout: Optional upper bound, in seconds, on the total wait. ``None``
            (the default) waits indefinitely.

    Returns:
        The list of DataFrames produced by ``extract_tables_from_json`` when
        the job succeeded fully or partially, otherwise an empty list.

    Raises:
        TimeoutError: If ``timeout`` is given and the job is still in progress
            once it has elapsed.
    """
    deadline = None if timeout is None else time.monotonic() + timeout

    while True:
        response = textract.get_document_analysis(JobId=job_id)
        status = response['JobStatus']
        if status != 'IN_PROGRESS':
            break
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Textract job {job_id} still in progress after {timeout} seconds"
            )
        time.sleep(poll_interval)  # wait before checking again

    if status in ('SUCCEEDED', 'PARTIAL_SUCCESS'):
        if status == 'PARTIAL_SUCCESS':
            print("Job partially succeeded; some results may be missing.")
        return extract_tables_from_json(job_id)

    print("Job failed.")
    reason = response.get('StatusMessage')
    if reason:
        print(reason)
    return []

In [23]:
# Start table analysis for the sample document through the shared helper
# rather than repeating the start_document_analysis call inline.
job_id = startDocumentAnalysis('AndersonTest.pdf')

In [24]:
# Display the Textract job id obtained when the analysis was started.
job_id

'64660a22d00a412e4066f90dd0b3a6f596a33c04fb8fffbc75c43c3c18c9443a'

In [66]:
# Poll the job until it finishes, then parse its tables into DataFrames
# (an empty list comes back when the job failed).
table_dfs = getDocumentAnalysis(job_id)
# Show the second extracted table; this raises IndexError if fewer than two tables were returned.
table_dfs[1]

Processing page 0, Total Blocks: 1000
8uHBS5Iss32p1NF6Jx/Uonxv4RSBI8z2PRRW/1QIJFz3UC5vKTlRR7jjIGX9EDTVQs7lpK+CksSJSV2DVyMm/nkXgI/MbhBIzPjzt6D4T7yt8XQzD1G3H0hTU0NogJny/PtDn+0=
Processing page 1, Total Blocks: 249
None
2


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
1,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,
4,,,,,,,(Wholesale Onl,,,,,,,
5,200401008217.0,,,,,,,,,1.0,$0.00,$0.00,$0.00,$0.00
6,200401024760.0,EKAA4G,ANDERSON FORD OF GRAND ISLAND,ZACCJBBB8JPJ32663,FCNAUS,NE,Hail,04/01/2020,01/28/2021,1.0,"$8,747.38",$0.00,$0.00,"$8,747.38"
7,200401024760.0,,,,,,,,,1.0,"$8,747.38",$0.00,$0.00,"$8,747.38"
8,200412008346.0,EKA894,ANDERSON FORD OF LINCOLN / ANDERSON LINCOLN OF...,MULTI,FCNAUS,NE,Hail,04/12/2020,04/14/2020,0.0,$0.00,$0.00,$0.00,$0.00
9,200412008346.0,,,,,,,,,0.0,$0.00,$0.00,$0.00,$0.00
10,200412008349.0,EKA538,ANDERSON AUTO GROUP OF LINCLON - SOUTH,MULTI,FCNAUS,NE,Hail,04/12/2020,11/13/2021,284.0,$0.00,"$7,448.00","$72,950.00","$7,448.00"
