In [None]:
import boto3
from IPython.display import Image, display
from trp import Document
from PIL import Image as PImage, ImageDraw
import time
from IPython.display import IFrame

In [None]:
# Current AWS Region. Use this to choose the corresponding S3 bucket with sample content.

mySession = boto3.session.Session()
awsRegion = mySession.region_name
# Bare tuple as the cell's last expression so the notebook displays both values.
mySession, awsRegion

In [None]:
# S3 bucket that holds the sample document; it is passed to the presigned-URL
# call and to startJob in later cells.
# NOTE(review): hard-coded bucket name -- replace with a bucket you can access.
s3BucketName = "jiaxistempbucket"

In [None]:
# Amazon S3 client (used to generate the presigned URL for the document preview)
s3 = boto3.client('s3')

# Amazon Textract client (module-level; used by startJob, isJobComplete and getJobResults)
textract = boto3.client('textract')

In [None]:
# Document: S3 object key of the PDF to preview and analyze
documentName = "sample_datasets/textract_immersion_day/bank_statement.pdf"

In [None]:
# Preview the document inline: build a presigned GET URL for the S3 object and
# embed it in an IFrame (width 900, height 400).
IFrame(s3.generate_presigned_url('get_object', Params={'Bucket': s3BucketName, 'Key': documentName}), 900, 400)

In [None]:
def startJob(s3BucketName, objectName, featureTypes=None):
    """Start an asynchronous Textract document-analysis job for an S3 object.

    Args:
        s3BucketName: Name of the S3 bucket containing the document.
        objectName: Key of the document inside the bucket.
        featureTypes: Optional iterable of Textract feature types to request.
            Defaults to ['TABLES', 'FORMS'] when omitted.

    Returns:
        The "JobId" value from the start_document_analysis response.
    """
    if featureTypes is None:
        featureTypes = ['TABLES', 'FORMS']

    response = textract.start_document_analysis(
        DocumentLocation={
            'S3Object': {
                'Bucket': s3BucketName,
                'Name': objectName
            }
        },
        FeatureTypes=list(featureTypes)
    )

    return response["JobId"]

def isJobComplete(jobId, pollIntervalSeconds=5):
    """Poll a Textract analysis job until it leaves the IN_PROGRESS state.

    The status is printed after every poll.

    Args:
        jobId: Identifier returned when the job was started.
        pollIntervalSeconds: Seconds to sleep between polls (default 5).

    Returns:
        The final "JobStatus" string reported by get_document_analysis.
        Note that any non-empty string is truthy, so callers must compare the
        returned value explicitly (e.g. against "SUCCEEDED") rather than use it
        as a boolean.
    """
    while True:
        response = textract.get_document_analysis(JobId=jobId)
        status = response["JobStatus"]
        print("Job status: {}".format(status))

        if status != "IN_PROGRESS":
            return status

        # Still running: wait before asking again.
        time.sleep(pollIntervalSeconds)

def getJobResults(jobId):
    """Collect every result page of a Textract analysis job.

    Calls get_document_analysis repeatedly, following the 'NextToken' value of
    each response until a response carries no (or an empty) token.

    Args:
        jobId: Identifier of the analysis job.

    Returns:
        A list with one get_document_analysis response per result page, in the
        order they were fetched.
    """
    pages = []
    requestArgs = {"JobId": jobId}

    while True:
        response = textract.get_document_analysis(**requestArgs)
        pages.append(response)
        print("Resultset page recieved: {}".format(len(pages)))

        nextToken = response.get('NextToken')
        if not nextToken:
            break
        # Ask for the following page on the next iteration.
        requestArgs["NextToken"] = nextToken

    return pages

In [None]:
# Start the analysis job and wait for it to finish.
jobId = startJob(s3BucketName, documentName)
print("Started job with id: {}".format(jobId))

# isJobComplete returns the final status *string*, which is truthy even for
# "FAILED"; compare it explicitly so a failed job stops here with a clear error
# instead of surfacing later as a confusing KeyError.
jobStatus = isJobComplete(jobId)
if jobStatus not in ("SUCCEEDED", "PARTIAL_SUCCESS"):
    raise RuntimeError(
        "Textract job {} finished with status {}".format(jobId, jobStatus)
    )

# List of result pages; later cells read this as `response`.
response = getJobResults(jobId)

# Print detected text
for resultPage in response:
    for item in resultPage.get("Blocks", []):
        if item["BlockType"] == "LINE":
            print ('\033[94m' +  item["Text"] + '\033[0m')

In [None]:
import pandas as pd
from collections import defaultdict

In [None]:
# Wrap the list of raw Textract result pages in a trp Document so later cells
# can iterate over its pages, tables and form fields.
doc = Document(response)

In [None]:
# Column name -> list of cell texts, one entry per extracted data row.
statements = defaultdict(list)

# Columns expected in a bank-statement table.
column_names = ['Date', 'Details', 'Money out $', 'Money in $', 'Balance $']

# Number of rows (header row included) in the tables accepted as statement tables.
cnt = 0
for page in doc.pages:
    for table in page.tables:
        rows = list(table.rows)
        if not rows:
            continue

        # A table is accepted only when every header cell is a known column name.
        header = [cell.text.strip() for cell in rows[0].cells]
        if any(name not in column_names for name in header):
            continue
        cnt += len(rows)

        for r, row in enumerate(rows):
            if r == 0:
                continue  # header row carries no data

            # Key each cell by the table's own header text rather than by its
            # position in column_names, so reordered or subset headers are
            # labelled correctly and surplus cells cannot raise IndexError.
            row_values = {}
            for c, cell in enumerate(row.cells):
                print("Table[{}][{}] = {}".format(r, c, cell.text))
                if c < len(header):
                    row_values[header[c]] = cell.text

            # Always append one value per column (empty string when missing) so
            # every list keeps the same length and pd.DataFrame(statements) works.
            for name in column_names:
                statements[name].append(row_values.get(name, ""))

In [None]:
# Display the row counter accumulated by the table-extraction cell.
cnt

In [None]:
# Render the extracted column lists as a DataFrame (one column per key of `statements`).
pd.DataFrame(statements)

In [None]:
# List the key/value form fields found on each page of the parsed document.
for page in doc.pages:
    # Print fields
    print("Fields:")
    for field in page.form.fields:
        # One line per field: its key and its value.
        print("Key: {}, Value: {}".format(field.key, field.value))