See more at: 
- https://github.com/aws-samples/amazon-textract-textractor
- https://aws-samples.github.io/amazon-textract-textractor/

In [None]:
import os
import pandas as pd
from textractor import Textractor
from textractor.data.constants import TextractFeatures
from IPython.display import Markdown

In [None]:
# Create the Textractor client (the wrapper used here to call Amazon Textract),
# configured with region "us-west-2" and the "pp-genai" profile.
extractor = Textractor(region_name="us-west-2", profile_name="pp-genai")

## IMPORTANTE: 
> Deshabilita (save_image=False) si tu laptop tiene problemas con "save_image"

In [None]:
# Analyze one document stored in S3 with the tables, layout and forms features.
# Other sample inputs: pdf_samples/p7-9-Doc_2018_UserManual_Q9FNSeries.pdf,
# images/recibo_aeropuerto.jpg
file = "images/recibo_aeropuerto.jpg"

# Full S3 URI of the object to analyze.
source_uri = f"s3://genai-carlos-contreras-bucket-data-quarks-labs-oregon-01/data/{file}"

# Textract feature types requested for this analysis.
requested_features = [
    TextractFeatures.TABLES,
    TextractFeatures.LAYOUT,
    TextractFeatures.FORMS,
]

document = extractor.start_document_analysis(
    file_source=source_uri,
    features=requested_features,
    save_image=True,  # switch to False if "save_image" causes problems on your machine
)

In [None]:
# Display the document's Markdown export as this cell's output.
Markdown(document.to_markdown())

In [None]:
# Visualize the first page only (pages[0]); other pages are not shown here.
document.pages[0].visualize()

#### Begin: Procesado de Recibos

In [None]:
# Inspect the key-value pairs found on the first page (pages[0]).
document.pages[0].key_values

In [None]:
# Flatten every key-value pair of the document into a plain dict whose
# 'key' and 'value' entries hold the string form of each side of the pair.
receipt_kv = [
    dict(key=str(pair.key), value=str(pair.value))
    for pair in document.key_values
]

In [None]:
# Show the list of extracted {'key', 'value'} dictionaries.
receipt_kv

In [None]:
# Print the value of the first extracted pair.
# NOTE(review): raises IndexError when the list of pairs is empty.
print(receipt_kv[0]['value'])

In [None]:
# Export the key-value pairs (checkboxes excluded) as CSV.
# NOTE: the destination is a path on the local filesystem, not Amazon S3.
receipt_csv_path = "data/receipts/receipt_1.csv"

# Make sure the destination folder exists before the export writes into it.
os.makedirs(os.path.dirname(receipt_csv_path), exist_ok=True)

document.export_kv_to_csv(
    include_kv=True,
    include_checkboxes=False,
    filepath=receipt_csv_path,
)

In [None]:
# Read CSV into a DF
df_key_values = pd.read_csv("data/receipts/receipt_1.csv", sep=";")

In [None]:
# Rows to Columns
df_raw = pd.pivot_table(df_key_values, values='Value', columns='Key', aggfunc="sum")

In [None]:
# Print a summary of the pivoted frame.
df_raw.info()

In [None]:
# Preview the 'IMPORTE' column of the pivoted frame.
# NOTE(review): assumes the receipt yielded a key named exactly 'IMPORTE' — confirm.
df_raw['IMPORTE'].head()

#### End: Procesado de Recibos

---
### Example of ETL on PDF documents, before ingesting them into the knowledge base (KB)

In [None]:
# Gather the Markdown of every table, title and text layout, tagged with the
# number of the page it belongs to; any other layout type is only reported.
SUPPORTED_LAYOUT_TYPES = ("LAYOUT_TABLE", "LAYOUT_TITLE", "LAYOUT_TEXT")

layouts = []

for page in document.pages:
    for layout in page.layouts:
        kind = layout.layout_type
        # Guard clause: report and skip layout types that are not collected.
        if kind not in SUPPORTED_LAYOUT_TYPES:
            print(f"Unsupported layout type: {kind}")
            continue
        print(f"Layout found: {kind}")
        layouts.append({
            "page_num": page.page_num,
            "text": layout.to_markdown()
        })

In [None]:
layouts