In [13]:
import os
import time
import boto3
import json
import pandas as pd
from dotenv import load_dotenv

ModuleNotFoundError: No module named 'dotenv'

### Functions to Start and Parse an AWS Textract Document Analysis

In [2]:
# Read settings from the .env file
load_dotenv()

# Textract client, pinned to the us-east-1 region
textract = boto3.client(service_name='textract', region_name='us-east-1')

# S3 bucket name comes from the BUCKET environment variable (None when unset)
bucket_name = os.environ.get('BUCKET')

def startDocumentAnalysis(file_name: str) -> str:
    """Kick off a Textract table analysis for an object in the configured bucket.

    Args:
        file_name: Key of the document inside the bucket named by ``bucket_name``.

    Returns:
        The ``JobId`` string from the ``start_document_analysis`` response.
    """
    s3_object = {'Bucket': bucket_name, 'Name': file_name}
    started = textract.start_document_analysis(
        DocumentLocation={'S3Object': s3_object},
        FeatureTypes=['TABLES'],
    )
    return started['JobId']

def _cell_text(cell_block, block_map):
    """Return the space-joined text of the WORD blocks that are children of a CELL block."""
    words = []
    for rel in cell_block.get('Relationships', []):
        if rel['Type'] != 'CHILD':
            continue
        for word_id in rel.get('Ids', []):
            word_block = block_map.get(word_id)
            if word_block and word_block['BlockType'] == 'WORD':
                words.append(word_block['Text'])
    return ' '.join(words).strip()


def _tables_from_blocks(blocks):
    """Build one ``{row: {col: text}}`` dict per TABLE block, in the order the blocks appear."""
    # Map Block Ids to Blocks for fast lookup of CHILD relationships
    block_map = {block['Id']: block for block in blocks}

    tables = []
    for block in blocks:
        if block['BlockType'] != 'TABLE':
            continue
        curr_table = {}
        for relationship in block.get('Relationships', []):
            if relationship['Type'] != 'CHILD':
                continue
            # Each CHILD id of a TABLE is expected to be a table cell
            for cell_id in relationship.get('Ids', []):
                cell_block = block_map.get(cell_id)
                if cell_block and cell_block['BlockType'] == 'CELL':
                    row = cell_block['RowIndex']
                    col = cell_block['ColumnIndex']
                    curr_table.setdefault(row, {})[col] = _cell_text(cell_block, block_map)
        tables.append(curr_table)
    return tables


def extract_tables_from_json(job_id):
    """Download every result page of a Textract analysis job and return its tables.

    Each result page is also written to ``textract_output_<n>.json`` in the
    current working directory (n starts at 0), and the pagination token and
    final table count are printed.

    Blocks from all pages are gathered before the tables are assembled, so a
    table whose cells or words arrive on a later result page than the TABLE
    block itself is still reconstructed completely.

    Args:
        job_id: JobId of the Textract document analysis job.

    Returns:
        A list with one ``pandas.DataFrame`` per TABLE block; rows are indexed
        by the cell ``RowIndex`` and columns by ``ColumnIndex``.
    """
    blocks = []
    page_index = 0  # counter for the per-page JSON dump file names
    next_token = None

    while True:  # Continue until there are no more pages
        if next_token:
            response = textract.get_document_analysis(JobId=job_id, NextToken=next_token)
        else:
            response = textract.get_document_analysis(JobId=job_id)

        with open(f'textract_output_{page_index}.json', 'w') as json_file:
            json.dump(response, json_file, indent=4)  # Save with indentation for readability
        page_index += 1

        blocks.extend(response.get('Blocks', []))

        # Check for the next token
        next_token = response.get('NextToken')
        print(next_token)
        if not next_token:
            break  # Exit the loop if there are no more pages

    tables = _tables_from_blocks(blocks)
    print(len(tables))

    # Convert each extracted table into a DataFrame
    return [pd.DataFrame.from_dict(table, orient='index') for table in tables]

def getDocumentAnalysis(job_id: str, poll_interval: float = 5):
    """Poll a Textract analysis job until it finishes, then return its tables.

    Polling stops as soon as the job leaves ``IN_PROGRESS``. Stopping only on
    ``SUCCEEDED``/``FAILED`` would spin forever on any other terminal status
    such as ``PARTIAL_SUCCESS``.

    Args:
        job_id: JobId of the Textract document analysis job.
        poll_interval: Seconds to sleep between status checks.

    Returns:
        The list produced by ``extract_tables_from_json`` when the job status is
        ``SUCCEEDED`` or ``PARTIAL_SUCCESS``; an empty list otherwise.
    """
    # Check job status
    while True:
        response = textract.get_document_analysis(JobId=job_id)
        status = response['JobStatus']

        if status != 'IN_PROGRESS':  # Any other status is final, escape loop
            break
        time.sleep(poll_interval)  # Wait before checking again

    if status in ('SUCCEEDED', 'PARTIAL_SUCCESS'):
        if status == 'PARTIAL_SUCCESS':
            print("Job only partially succeeded; returned tables may be incomplete.")
        return extract_tables_from_json(job_id)

    print("Job failed.")
    return []

In [90]:
# Start a Textract table analysis for the sample PDF through the shared helper
# instead of repeating the start_document_analysis call inline.
job_id = startDocumentAnalysis('LenStolerPage3.pdf')

In [136]:
# Hard-coded Textract JobId: overrides any job_id assigned by an earlier cell,
# so the analysis below reads this specific job.
job_id = 'b1a8ede14bb1550344aaf3862a72f6d50e94bb3c1badc6fc01ceb2e50b9359c4'

In [147]:
# Wait for the job identified by job_id and collect its tables (empty list if the job failed)
table_dfs = getDocumentAnalysis(job_id)
# Show how many tables were returned
len(table_dfs)

Processing page 0, Total Blocks: 1000
qo+wUWWB68RHs5kgpvznzPtbg5FLfUv9g3MV81d214mrqD2YRDgRB8I2lsAubjHyW/ZDE7h2jUbEGx080pVZHIdR74Z78k+ul4PnnxjO5xzOjunLszQH5oNx6trF5IqW8aRqpvg=
Processing page 1, Total Blocks: 782
None
3


3

## Function to Load File Depending on Extension

In [112]:
# Function to load file based on its extension
def load_file(file_path):
    """Load a CSV or XLSX file and keep the columns up to and including 'description'.

    The extension check is case-insensitive, so ``report.CSV`` and
    ``report.XLSX`` are accepted as well.

    Args:
        file_path: Path of the file to load.

    Returns:
        A DataFrame sliced from the first column through the 'description'
        column (inclusive).

    Raises:
        ValueError: If the extension is neither ``.csv`` nor ``.xlsx``.
        KeyError: If the loaded data has no 'description' column.
    """
    # Normalise the extension so upper-case suffixes are handled too
    file_extension = os.path.splitext(file_path)[1].lower()

    if file_extension == '.csv':
        df = pd.read_csv(file_path)
    elif file_extension == '.xlsx':
        df = pd.read_excel(file_path)
    else:
        raise ValueError("Unsupported file format. Please provide a CSV or XLSX file.")

    # Label-based slice: every column up to 'description' (inclusive)
    return df.loc[:, :'description']

## Prepare Correct Output Data for Labeling

In [126]:
frames = []

# Convert all output files into a single dataframe.
# Entries are visited in sorted order so the merged row order is reproducible,
# and anything that is not a CSV/XLSX file (sub-directories, stray system
# files) is skipped instead of aborting the whole cell.
for filename in sorted(os.listdir('loss_runs/output')):
    f = os.path.join('loss_runs/output', filename)
    if not os.path.isfile(f):
        continue
    if os.path.splitext(filename)[1].lower() not in ('.csv', '.xlsx'):
        continue
    frames.append(load_file(f))

if not frames:
    raise ValueError("No CSV or XLSX files found in 'loss_runs/output'.")

# Merge all output dataframes; ignore_index avoids duplicate row labels
merged_output_frames = pd.concat(frames, ignore_index=True)
merged_output_frames.to_csv('merged_loss_runs.csv', index=False)

## Label Input Data if Claim Number Appears in Output

In [145]:
# Build the labelled frame from a copy so table_dfs[2] is left untouched;
# assigning columns on table_dfs[2] directly would make re-running this cell
# promote (and drop) a further row each time.
raw_table = table_dfs[2]
df1 = raw_table.iloc[1:].copy()  # every row after the header row
df1.columns = raw_table.iloc[0]  # Use the first row as header

# Compare claim numbers as text. The Textract cells are strings, and a string
# never matches a numeric value in isin().
# NOTE(review): assumes 'claim_number' may have been parsed as int/float by
# pandas; the trailing ".0" of float-parsed values is removed — confirm
# against the output files.
known_claims = set(
    merged_output_frames['claim_number']
    .dropna()
    .astype(str)
    .str.strip()
    .str.replace(r'\.0$', '', regex=True)
)
df1['relevant'] = df1['Claim Number'].astype(str).str.strip().isin(known_claims)
df1

1,Coverage,Sub Coverage,Val ID Description,Claim Number,Date of Loss,Status,Claimant Name,Accident Narrative,Paid Indemnity,Paid Expense,Reserves Total,Claim Recovery,Net Incurred,relevant
2,AUTOMOBILE,AUTO PHYSICAL DAMAGE,SERVICE VEHICLE COLLISION,1510189190,20220818.0,Closed,Kingsley Co.,OV was in the R through lane attempting to cha...,$509.73,$138.85,$0.00,$0.00,$648.58,False
3,AUTOMOBILE,GARAGEKEEP ERS,GARAGE KEEPERS LIAB- COLL,1510188434,20220616.0,Closed,Luckey John,Tech was driving customers vehicle out of serv...,"$6,415.24",$143.67,$0.00,$0.00,"$6,558.91",False
4,AUTOMOBILE,GARAGEKEEP ERS,GARAGE KEEPERS LIAB- COMP,1510180666,20220216.0,Closed,Henderson Joseph,vehicle caught on fire,"$3,397.39",$113.80,$0.00,$0.00,"$3,511.19",False
5,GENERAL LIABILITY,GARAGE LIABILITY,GARAGE LIABILITY - BI,4620227025,20220818.0,Closed,Gibb-Martin Shannon F,OV was in the R through lane attempting to cha...,$0.00,$41.60,$0.00,$0.00,$41.60,False
6,GENERAL LIABILITY,GARAGE LIABILITY,GARAGE LIABILITY - PD,4620227025,20220818.0,Closed,Gibb-Martin Shannon F,OV was in the R through lane attempting to cha...,$0.00,$0.00,$0.00,$0.00,$0.00,False
7,GENERAL LIABILITY,GARAGE LIABILITY,GARAGE LIAB MEDICAL PYMTS,4620221432,20220211.0,Closed,Gensor Janet,Letter of representation received for Janet Ge...,"$5,000.00",$0.00,$0.00,$0.00,"$5,000.00",False
8,GENERAL LIABILITY,GARAGE LIABILITY,GARAGE PREMISES BI,4620221432,20220211.0,Closed,Gensor Janet,Letter of representation received for Janet Ge...,$0.00,$0.00,$0.00,$0.00,$0.00,False
9,,Claim Count:,,5,,,,Sum:,"$15,322.36",$437.92,$0.00,$0.00,"$15,760.28",False


In [3]:
!pip install gmft

Collecting gmft
  Downloading gmft-0.4.0-py3-none-any.whl.metadata (10 kB)
Collecting pypdfium2>=4 (from gmft)
  Downloading pypdfium2-4.30.0-py3-none-macosx_11_0_arm64.whl.metadata (48 kB)
Collecting matplotlib (from gmft)
  Using cached matplotlib-3.9.2-cp312-cp312-macosx_11_0_arm64.whl.metadata (11 kB)
Collecting tabulate (from gmft)
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Collecting contourpy>=1.0.1 (from matplotlib->gmft)
  Using cached contourpy-1.3.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (5.4 kB)
Collecting cycler>=0.10 (from matplotlib->gmft)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib->gmft)
  Using cached fonttools-4.54.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (163 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib->gmft)
  Using cached kiwisolver-1.4.7-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.3 kB)
Collecting pyparsing>=2.3.1 (from matplotlib->gmft)
  Using cached pyparsing-

In [53]:
from gmft.auto import CroppedTable, AutoTableDetector, AutoFormatConfig, AutoTableFormatter
from gmft.pdf_bindings import PyPDFium2Document
from gmft.detectors.common import CroppedTable

# gmft objects created once at module level and shared by the cells below.
detector = AutoTableDetector()  # used by ingest_pdf to extract tables from each page
# NOTE(review): the meaning of verbosity=3 is defined by gmft — confirm this level is intended
config = AutoFormatConfig(verbosity=3)
formatter = AutoTableFormatter(config=config)  # configured formatter; not referenced in the visible cells

def ingest_pdf(pdf_path):
    """Open a PDF and run the module-level ``detector`` on every page.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A ``(tables, doc)`` tuple: the list of everything ``detector.extract``
        returned across all pages, and the still-open document. The caller is
        responsible for calling ``doc.close()`` when finished.

    Raises:
        Whatever ``detector.extract`` or page iteration raises; the document is
        closed before the exception propagates so it does not leak.
    """
    doc = PyPDFium2Document(pdf_path)
    tables = []
    try:
        for page in doc:
            tables.extend(detector.extract(page))
    except Exception:
        # The caller never receives `doc` on failure, so close it here
        doc.close()
        raise
    return tables, doc

# Run table detection on the test PDF; ingest_pdf returns the tables and the open document
tables, doc = ingest_pdf("loss_runs/input/LenStolerTest.pdf")
# NOTE(review): the document is closed immediately — confirm the returned tables
# stay usable after close before formatting them in later cells
doc.close() # once you're done with the document

In [48]:
# Number of entries currently held in `tables`
print(len(tables))

3


In [58]:
!pip show pypdfium2

Name: pypdfium2
Version: 4.30.0
Summary: Python bindings to PDFium
Home-page: https://github.com/pypdfium2-team/pypdfium2
Author: pypdfium2-team
Author-email: geisserml@gmail.com
License: (Apache-2.0 OR BSD-3-Clause) AND LicenseRef-PdfiumThirdParty
Location: /Users/benjaminmiller/UW-Madison_Undergraduate/Understory/venv/lib/python3.12/site-packages
Requires: 
Required-by: gmft


## Upload PDF File to S3

In [10]:
import requests
import base64

# The API endpoint URL for your Lambda function.
# upload_pdf_to_lambda POSTs its JSON payload to this URL.
API_URL = 'https://s5yeiaxtg6.execute-api.us-east-1.amazonaws.com/upload'

# Function to call the Lambda API
def upload_pdf_to_lambda(pdf_path, file_name, timeout=60):
    """POST a local PDF, base64-encoded, to the endpoint in ``API_URL``.

    Args:
        pdf_path: Path of the local PDF file to read.
        file_name: Value sent as ``file_name`` in the JSON payload.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout, ``requests`` can block indefinitely on an
            unresponsive endpoint.

    Returns:
        True when the endpoint answers with HTTP 200, False on any other
        status code or when the request itself fails.
    """
    # Read and encode the PDF file as base64
    with open(pdf_path, "rb") as pdf_file:
        pdf_data = base64.b64encode(pdf_file.read()).decode('utf-8')

    # Prepare the request payload
    payload = {
        "pdf_data": pdf_data,
        "file_name": file_name
    }

    # Send POST request to the API Gateway endpoint
    try:
        response = requests.post(API_URL, json=payload, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print("Error during request:", e)
        return False

    # Check if the request was successful
    if response.status_code == 200:
        print("File uploaded successfully:", response.text)
        return True

    print("Failed to upload file:", response.text)
    return False

# Usage example: upload a local loss-run PDF; "sample2.pdf" is sent as the payload's file_name
upload_pdf_to_lambda("loss_runs/input/Loss_Run___len stoler 8-24_page_5.pdf", "sample2.pdf")

File uploaded successfully: File uploaded successfully!
