In [None]:
!pip3 install textract-trp
!pip3 install simplejson
!pip install pythena

In [None]:
import boto3
from IPython.display import Image, display
from trp import Document
from PIL import Image as PImage, ImageDraw
import time
from IPython.display import IFrame
import pandas as pd
import random
import uuid
import numpy as np
import simplejson
import os
import datetime
import pythena

In [None]:
with open('/opt/ml/metadata/resource-metadata.json') as fh:
    metadata = simplejson.loads(fh.read())
accountid = metadata['ResourceArn'].split(':')[4]
%set_env accountid={accountid}
%set_env bucket_name=lab-{accountid}
bucket_name = os.getenv('bucket_name')

x = datetime.datetime.now()
etl_date = x.strftime("%Y%m%d_%H%M%S")
print(etl_date) 
%set_env etl_date={etl_date}

In [None]:
# Current AWS Region. Use this to choose the corresponding S3 bucket with sample content.

mySession = boto3.session.Session()
awsRegion = mySession.region_name

# region_name is None when no region is configured for the session. Fail here
# with a clear message instead of with a TypeError when the sample bucket name
# is concatenated further down.
if awsRegion is None:
    raise RuntimeError(
        "No AWS region is configured for this session; set AWS_DEFAULT_REGION "
        "or configure a default region before running this notebook.")

In [None]:
# S3 bucket that contains sample documents

# We are providing sample documents in this bucket so
# you do not have to manually download/upload test documents.
# The bucket name is the fixed prefix "aws-workshops-" followed by the
# region of the current session (awsRegion).

s3BucketName = "aws-workshops-" + awsRegion

In [None]:
# Client for Amazon S3 (used below to presign URLs for the sample document)
s3 = boto3.client(service_name='s3')

# Client for Amazon Textract (used below for analyze_document calls)
textract = boto3.client(service_name='textract')

# Forms: Key/Values

https://docs.aws.amazon.com/textract/latest/dg/API_AnalyzeDocument.html

In [None]:
# Document: S3 object key (inside s3BucketName) of the sample image that the
# following cells display and send to Textract.
documentName = "textract-samples/employmentapp.png"

In [None]:
# Show the sample document inline by pointing an image at a presigned
# S3 GET URL for it.
display(
    Image(
        url=s3.generate_presigned_url(
            ClientMethod='get_object',
            Params={'Bucket': s3BucketName, 'Key': documentName},
        )
    )
)

In [None]:
# Call Amazon Textract with the FORMS feature to get key/value pairs
# from the sample document stored in S3.
response = textract.analyze_document(
    Document={
        'S3Object': {
            'Bucket': s3BucketName,
            'Name': documentName
        }
    },
    FeatureTypes=["FORMS"])

doc = Document(response)

# Random id that ties this applicant row to the employment-history rows
# extracted later in the notebook.
application_id = uuid.uuid4().hex

# Collect every form field into one record and build the DataFrame once,
# instead of inserting columns one at a time. Fields are kept in the order
# they are encountered, across all pages.
applicant_row = {'application_id': application_id}
for page in doc.pages:
    for field in page.form.fields:
        # Normalise the label into a column name: lower-case, no colons,
        # underscores instead of spaces.
        key = str(field.key).lower().replace(':', '').replace(' ', '_')

        # Two fields can normalise to the same name. Duplicate column names
        # cannot be written to parquet later on, so later occurrences get a
        # numeric suffix (_2, _3, ...) instead of producing a duplicate column.
        column_name = key
        suffix = 2
        while column_name in applicant_row:
            column_name = "{}_{}".format(key, suffix)
            suffix += 1

        applicant_row[column_name] = str(field.value)

# Single-row frame: application_id first, then one column per form field.
applicant_df = pd.DataFrame([applicant_row])


In [None]:
applicant_df

# Tables

In [None]:
# Document: S3 object key (inside s3BucketName) of the sample image used for
# the table extraction below; same value as in the Forms section.
documentName = "textract-samples/employmentapp.png"

In [None]:
# Show the sample document inline by pointing an image at a presigned
# S3 GET URL for it.
display(
    Image(
        url=s3.generate_presigned_url(
            ClientMethod='get_object',
            Params={'Bucket': s3BucketName, 'Key': documentName},
        )
    )
)

In [None]:
# Call Amazon Textract with the TABLES feature on the sample document in S3.
response = textract.analyze_document(
    Document={
        'S3Object': {
            'Bucket': s3BucketName,
            'Name': documentName
        }
    },
    FeatureTypes=["TABLES"])

doc = Document(response)

# One DataFrame per detected table; combined after the loop.
table_frames = []

for page in doc.pages:
    for table in page.tables:
        # The first row of each table is discarded and the row after it is
        # used as the header (presumably the first row is a title banner --
        # confirm against the sample document). Slice instead of pop() so the
        # parsed document is not mutated and the cell can be re-run safely.
        rows = table.rows[1:]
        if not rows:
            # Nothing left to use as a header.
            continue

        # Header cells become column names: lower-case, trimmed,
        # underscores instead of spaces.
        columns = [
            str(cell.text).lower().strip().replace(' ', '_')
            for cell in rows[0].cells
        ]

        # Keep only data rows whose cell count matches this table's header;
        # the expected width is tracked per table, not across tables.
        lines = []
        for row in rows[1:]:
            line = [str(cell.text.strip()) for cell in row.cells]
            if len(line) == len(columns):
                lines.append(line)

        table_frames.append(pd.DataFrame(lines, columns=columns))

# Combine all tables (a single table yields exactly that table's rows);
# fall back to an empty frame when Textract found no usable table.
if table_frames:
    employment_history_df = pd.concat(table_frames, ignore_index=True)
else:
    employment_history_df = pd.DataFrame()

# Tag every row with the id generated in the Forms section so the two
# datasets can be joined later.
employment_history_df['application_id'] = application_id

employment_history_df

In [None]:
# Write both extracted datasets to the lab bucket as parquet files, one
# folder per dataset, with the run timestamp in the file name.
for dataset, frame in (
        ("applicant", applicant_df),
        ("employment_history", employment_history_df)):
    frame.to_parquet(
        "s3://" +
        os.getenv('bucket_name') +
        "/data/analytics/" + dataset + "/" + dataset + "_" +
        etl_date +
        ".parquet.snappy")

In [None]:
!pip install pythena

In [None]:
import pythena

In [None]:
client = boto3.client('athena', region_name="us-east-2")


def _run_athena_statement(athena, statement, output_location, poll_seconds=1):
    """Submit a statement to Athena and block until it reaches a final state.

    start_query_execution only queues the statement, so this polls
    get_query_execution until the state is no longer QUEUED or RUNNING.

    Args:
        athena: boto3 Athena client used to submit and poll the statement.
        statement: SQL/DDL text to execute.
        output_location: S3 URI where Athena writes the query results.
        poll_seconds: Seconds to sleep between status checks.

    Returns:
        The QueryExecutionId of the completed statement.

    Raises:
        RuntimeError: If the statement ends in any state other than SUCCEEDED.
    """
    query_id = athena.start_query_execution(
        QueryString=statement,
        ResultConfiguration={'OutputLocation': output_location})['QueryExecutionId']

    while True:
        status = athena.get_query_execution(
            QueryExecutionId=query_id)['QueryExecution']['Status']
        if status['State'] not in ('QUEUED', 'RUNNING'):
            break
        time.sleep(poll_seconds)

    if status['State'] != 'SUCCEEDED':
        raise RuntimeError("Athena statement {!r} ended in state {}: {}".format(
            statement, status['State'], status.get('StateChangeReason')))
    return query_id


# Drop both tables so the CREATE statements in the next cells start clean.
# "if exists" keeps the first run (no tables yet) from failing, and waiting
# for completion ensures the drops have finished before the tables are
# re-created.
for table_name in ('applicant', 'employment_history'):
    _run_athena_statement(
        client,
        'drop table if exists default.' + table_name,
        's3://' + bucket_name + '/output/')

In [None]:
# Read the applicant DDL and substitute the lab bucket name into its
# format placeholder.
with open('create_applicant.sql') as ddl:
    create_applicant_sql = ddl.read().format(bucket_name)

create_applicant_id = client.start_query_execution(
    QueryString=create_applicant_sql,
    ResultConfiguration={'OutputLocation': 's3://' + bucket_name + '/output/'})['QueryExecutionId']

# start_query_execution only queues the statement. Wait until it reaches a
# final state so later cells do not query a table that does not exist yet.
while True:
    create_applicant_status = client.get_query_execution(
        QueryExecutionId=create_applicant_id)['QueryExecution']['Status']
    if create_applicant_status['State'] not in ('QUEUED', 'RUNNING'):
        break
    time.sleep(1)

# Surface a failed DDL here instead of as a confusing error in a later cell.
if create_applicant_status['State'] != 'SUCCEEDED':
    raise RuntimeError("create_applicant.sql ended in state {}: {}".format(
        create_applicant_status['State'],
        create_applicant_status.get('StateChangeReason')))

In [None]:
# Read the employment-history DDL and substitute the lab bucket name into its
# format placeholder.
with open('create_employment_history.sql') as ddl:
    create_employment_history_sql = ddl.read().format(bucket_name)

create_employment_history_id = client.start_query_execution(
    QueryString=create_employment_history_sql,
    ResultConfiguration={'OutputLocation': 's3://' + bucket_name + '/output/'})['QueryExecutionId']

# start_query_execution only queues the statement. Wait until it reaches a
# final state so later cells do not query a table that does not exist yet.
while True:
    create_employment_history_status = client.get_query_execution(
        QueryExecutionId=create_employment_history_id)['QueryExecution']['Status']
    if create_employment_history_status['State'] not in ('QUEUED', 'RUNNING'):
        break
    time.sleep(1)

# Surface a failed DDL here instead of as a confusing error in a later cell.
if create_employment_history_status['State'] != 'SUCCEEDED':
    raise RuntimeError("create_employment_history.sql ended in state {}: {}".format(
        create_employment_history_status['State'],
        create_employment_history_status.get('StateChangeReason')))

In [None]:
# pythena helper bound to the "default" database in us-east-2, the same
# region the boto3 Athena client in this notebook is created with.
athena_client = pythena.Athena(database="default", region="us-east-2") 

# Join the applicant row with its employment-history rows on the shared
# application_id column.
sql = """
select * 
from applicant a join employment_history e 
on a.application_id = e.application_id
"""

print(sql)

# execute() is unpacked into two values: the query result (presumably a
# pandas DataFrame -- confirm against the pythena docs) and an execution id.
df_join, exec_id = athena_client.execute(sql)

df_join