<a href="https://colab.research.google.com/github/arnabd64/Amazon-Textract-Guide/blob/main/AsyncTextract.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Amazon Textract: Asynchronous Multipage PDF extraction

In [None]:
# Install the AWS SDK packages (boto3 and botocore) with pip's progress bar
# disabled; pip's standard output is redirected into install.txt.
# NOTE(review): consider `%pip install` with pinned versions so the install
# targets the running kernel's environment -- confirm before changing.
! pip install --progress-bar=off \
    boto3 \
    botocore \
  > install.txt

In [None]:
import logging

# Create a logger
log = logging.getLogger('log')
log.propagate = False
log.setLevel(logging.INFO)

# logging.getLogger returns the same logger object for the same name, so
# re-running this cell would otherwise stack a second console/file handler on
# top of the first and print every message twice. Drop and close any handlers
# left over from a previous run before attaching fresh ones.
for stale_handler in list(log.handlers):
    log.removeHandler(stale_handler)
    stale_handler.close()

# Create a custom logging format
formatter = logging.Formatter('%(levelname)s - %(message)s')

# Create a console handler and a file handler that share the same format
console = logging.StreamHandler()
console.setFormatter(formatter)
logfile = logging.FileHandler('log.txt')
logfile.setFormatter(formatter)

# Add the handlers to the logger
log.addHandler(console)
log.addHandler(logfile)

In [None]:
# @title AWS Credentials
# Do not save or commit this notebook with real credentials filled in below.
AWS_ACCESS_KEY_ID = "" # @param {"type":"string"}
AWS_SECRET_ACCESS_KEY = "" # @param {"type":"string"}
AWS_REGION_NAME = "" # @param {"type":"string"}

# Create Session
import boto3
import botocore

# A blank form field is passed as None rather than "" so that boto3 treats it
# as "not provided" and falls back to its default resolution (environment
# variables, shared config) instead of using an empty region string.
AWS_SESSION = boto3.Session(
    aws_access_key_id=AWS_ACCESS_KEY_ID or None,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY or None,
    region_name=AWS_REGION_NAME or None
)
log.info("AWS Session Created")

# Service clients used by the rest of the notebook
s3 = AWS_SESSION.client('s3')
textract = AWS_SESSION.client('textract')

INFO - AWS Session Created


## List Buckets

In [None]:
# Show only the bucket names. The raw list_buckets() response also carries
# request IDs and the account owner ID, which should not end up in the
# notebook's saved output.
[bucket['Name'] for bucket in s3.list_buckets().get('Buckets', [])]

{'ResponseMetadata': {'RequestId': '5G3KMK2VXP42ADTB',
  'HostId': 'SXzJgIpqpwej1Z6+C4rrplr2+jrcEGdRX/3Q5RwoYRxx/YV9vfPK4cJHYmyL3Rid9F7YcIzkxmzNCorLWPjvmg==',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'SXzJgIpqpwej1Z6+C4rrplr2+jrcEGdRX/3Q5RwoYRxx/YV9vfPK4cJHYmyL3Rid9F7YcIzkxmzNCorLWPjvmg==',
   'x-amz-request-id': '5G3KMK2VXP42ADTB',
   'date': 'Tue, 04 Feb 2025 11:24:18 GMT',
   'content-type': 'application/xml',
   'transfer-encoding': 'chunked',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'Buckets': [{'Name': 'textract-cache-store',
   'CreationDate': datetime.datetime(2025, 1, 20, 7, 46, 23, tzinfo=tzlocal())}],
 'Owner': {'ID': '173f9568245bd4d315dcdb1f79aff2fe640487d178e0f3b5252f341860a6e0de'}}

## Upload Document

In [None]:
from pathlib import Path
from google.colab import files

# Initiate Upload
uploaded_files = files.upload()

# The upload result is indexed by filename below; if nothing was selected
# (presumably an empty mapping -- confirm against Colab's behaviour), fail with
# a clear message instead of a bare IndexError.
if not uploaded_files:
  log.error("No file was uploaded")
  raise ValueError("No file was uploaded; re-run this cell and select a document")

# Retrieve the Filepath of the first uploaded file
FILEPATH = Path(next(iter(uploaded_files)))
log.info(f"FilePath: {FILEPATH.absolute()}")

INFO - FilePath: /content/DriverLicense.pdf


Saving DriverLicense.pdf to DriverLicense.pdf


## Upload Document to S3 Bucket

In [None]:
import uuid

# create bucket textract-cache-store if not exists
BUCKET_NAME = 'textract-cache-store'
try:
  s3.head_bucket(Bucket=BUCKET_NAME)
  log.info(f"Bucket {BUCKET_NAME} Found")

except botocore.exceptions.ClientError as e:
  if e.response['Error']['Code'] == '404':
    # Outside us-east-1, S3 rejects a create request that does not name the
    # target region via LocationConstraint; us-east-1 must NOT send one.
    bucket_region = s3.meta.region_name
    if bucket_region and bucket_region != 'us-east-1':
      s3.create_bucket(
          Bucket=BUCKET_NAME,
          CreateBucketConfiguration={'LocationConstraint': bucket_region}
      )
    else:
      s3.create_bucket(Bucket=BUCKET_NAME)
    log.info(f"Bucket {BUCKET_NAME} Created")
  else:
    log.error(str(e))
    # Bare raise keeps the original traceback intact
    raise

# upload document to S3 under a random key so repeated runs never collide
DOCUMENT_KEY = f"{uuid.uuid4()}.pdf"
try:
  s3.upload_file(
      # Pass a plain string: older boto3 releases reject path-like objects
      Filename=str(FILEPATH.absolute()),
      Bucket=BUCKET_NAME,
      Key=DOCUMENT_KEY
  )

# upload_file reports transfer failures as S3UploadFailedError, so catch it
# alongside ClientError; otherwise the failure is never logged.
except (botocore.exceptions.ClientError, boto3.exceptions.S3UploadFailedError) as e:
  log.error(str(e))
  raise

log.info(f"Document {DOCUMENT_KEY} Uploaded to Bucket {BUCKET_NAME}")

INFO - Bucket textract-cache-store Found
INFO - Document 43c5aca6-e8e2-48f2-8fff-25b054ab2e0a.pdf Uploaded to Bucket textract-cache-store


# Amazon Textract

## Create Client

In [None]:
# Textract client built from the shared AWS session. The credentials cell
# creates a client under this same name; this line rebinds it.
textract = AWS_SESSION.client('textract')

## Define Queries

In [None]:
# Questions to ask about the document; ALIAS[i] is the short key under which
# the answer to QUERIES[i] is reported.
QUERIES = ["What is the Title of the Document", "What is the Name of Employer", "What is the Date?"]
ALIAS = ["Title", "Employer", "Date"]

# Queries and aliases are paired positionally, so the two lists must be the
# same length.
if len(QUERIES) != len(ALIAS):
  log.error("Mismatch in Number of Queries & Aliases")
  raise ValueError("Number of Queries and Aliases must be equal")

log.info(f"Total Queries: {len(QUERIES)}")

INFO - Total Queries: 3


## Start Job

In [None]:
# Pair every question with its alias in the shape the QueriesConfig argument takes
query_specs = [
    dict(Text=question, Alias=short_name)
    for question, short_name in zip(QUERIES, ALIAS)
]

# Point Textract at the document uploaded to S3
document_location = {'S3Object': {'Bucket': BUCKET_NAME, 'Name': DOCUMENT_KEY}}

# Kick off the asynchronous analysis; the returned JobId is used for polling
response = textract.start_document_analysis(
    DocumentLocation=document_location,
    FeatureTypes=['QUERIES'],
    QueriesConfig={'Queries': query_specs}
)

jobid = response['JobId']
log.info(f"JobId: {jobid}")

INFO - JobId: 9197be8c3cc8a0faa5a46a00214e12525ec2aaceefb1b9bf6925545a2a51e823


## Check if Completed

In [None]:
import time

# Seconds to wait between status checks while the job is still running
POLL_INTERVAL_SECONDS = 2

# Poll until the job reaches a terminal state.
while True:
  response = textract.get_document_analysis(JobId=jobid)
  status = response['JobStatus']
  log.info(f"JobStatus: {status}")

  if status == 'IN_PROGRESS':
    time.sleep(POLL_INTERVAL_SECONDS)
    continue

  if status == 'SUCCEEDED':
    break

  if status == 'PARTIAL_SUCCESS':
    # Terminal state: some results are available, so stop polling and use them
    log.warning(response.get('StatusMessage', 'Job finished with partial results'))
    break

  # FAILED (or any unexpected status) is terminal: stop instead of re-polling
  # forever in a tight loop.
  failure_message = response.get('StatusMessage', 'no status message returned')
  log.error(failure_message)
  raise RuntimeError(f"Textract job {jobid} ended with status {status}: {failure_message}")

# Results are paginated: follow NextToken and merge every page's blocks into
# `response` so later cells see the whole document, not just the first page.
next_token = response.pop('NextToken', None)
while next_token:
  page = textract.get_document_analysis(JobId=jobid, NextToken=next_token)
  response.setdefault('Blocks', []).extend(page.get('Blocks', []))
  next_token = page.get('NextToken')

INFO - JobStatus: IN_PROGRESS
INFO - JobStatus: IN_PROGRESS
INFO - JobStatus: SUCCEEDED


# Filter Results

In [None]:
def retrieve_answer_block_id(block: dict):
    """Return the first id listed under the block's first "ANSWER" relationship.

    Args:
        block: A block dictionary, optionally carrying a "Relationships" list
            whose entries have "Type" and "Ids" keys.

    Returns:
        The first element of "Ids" from the first relationship whose "Type" is
        "ANSWER", or None when the block has no "Relationships" key or no such
        relationship.
    """
    answer_ids = (
        link["Ids"][0]
        for link in block.get("Relationships", ())
        if link["Type"] == "ANSWER"
    )
    return next(answer_ids, None)


def retrieve_answer_text(blocks: list[dict], answer_block_id: str):
    """Look up the "Text" of the first block whose "Id" equals the given id.

    Args:
        blocks: Block dictionaries, each with an "Id" key.
        answer_block_id: The id to search for.

    Returns:
        The "Text" value of the first matching block, or None when no block
        has that id.
    """
    matching_texts = (
        entry["Text"] for entry in blocks if entry["Id"] == answer_block_id
    )
    return next(matching_texts, None)


def postprocess(textract_response: dict) -> dict[str, str]:
    """Reduce a Textract analysis response to a flat label -> answer mapping.

    Every block whose "BlockType" is "QUERY" contributes one entry. The key is
    the query's "Alias" or, when no alias was supplied, the query "Text". The
    value is the text of the block referenced by the query's "ANSWER"
    relationship.

    Args:
        textract_response: Response dictionary containing a "Blocks" list.

    Returns:
        Dictionary mapping each query label to its answer text. The value is
        None when the query block has no answer relationship or the referenced
        block is not present in "Blocks".

    Raises:
        KeyError: If the response has no "Blocks" key.
    """
    blocks = textract_response["Blocks"]
    answers = dict()
    for block in blocks:
        if block["BlockType"] != "QUERY":
            continue
        query = block["Query"]
        # Alias is optional in a query definition; fall back to the question
        # text so a missing alias does not raise KeyError.
        label = query.get("Alias") or query["Text"]
        answer_id = retrieve_answer_block_id(block)
        # No answer relationship means there is nothing to look up
        answers[label] = (
            retrieve_answer_text(blocks, answer_id) if answer_id is not None else None
        )
    log.info("Filtered the Response")
    return answers

filtered_response = postprocess(response)

INFO - Filtered the Response


In [None]:
# Display the mapping returned by postprocess()
filtered_response

{'Title': 'PERMIS DE CONDUIRE', 'Employer': 'RENCK', 'Date': '02/21/01'}

## Delete File from S3

In [None]:
def delete_file(key: str, bucket: str):
  """Delete a single object from an S3 bucket and log the outcome.

  Args:
    key: Key of the object to delete.
    bucket: Name of the bucket holding the object.

  Raises:
    botocore.exceptions.ClientError: Re-raised after being logged when the
      delete request fails.
  """
  try:
    s3.delete_object(
        Bucket=bucket,
        Key=key
    )
    log.info(f"Deleted {key} from Bucket {bucket}")

  except botocore.exceptions.ClientError as e:
    log.error(str(e))
    # Bare raise keeps the original traceback intact
    raise


# Use the configured bucket name so the delete targets the bucket the document
# was uploaded to, even if BUCKET_NAME is changed.
delete_file(DOCUMENT_KEY, BUCKET_NAME)

INFO - Deleted 43c5aca6-e8e2-48f2-8fff-25b054ab2e0a.pdf from Bucket textract-cache-store


# List files in Textract Bucket

In [None]:
# Collect every object key in the bucket. A single list_objects_v2 call returns
# at most one page of keys, so walk all pages with a paginator.
# NOTE: `files` rebinds the name imported from google.colab in the upload cell;
# that cell re-imports it when it is run again.
files = []
for page in s3.get_paginator('list_objects_v2').paginate(Bucket=BUCKET_NAME):
  # Pages of an empty bucket carry no 'Contents' key
  files.extend(obj['Key'] for obj in page.get('Contents', []))

if not files:
  # An empty bucket is a normal outcome, not an error; `files` stays an empty
  # list so it can still be iterated safely.
  log.info("No Files Found")
else:
  print(files)

ERROR - No Files Found


In [None]:
# Remove every listed object. `files` can be None or empty when the bucket has
# no objects, so fall back to an empty list instead of raising TypeError.
for file in files or []:
  delete_file(file, BUCKET_NAME)

TypeError: 'NoneType' object is not iterable