# Test File to Explore AWS Textract Parsing Behavior

### Include necessary imports in the cell below:

In [1]:
import os
import time
import json
import boto3
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAIError, RateLimitError
from langchain import HuggingFaceHub, PromptTemplate
import tiktoken
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

ModuleNotFoundError: No module named 'boto3'

### Function Call to Retrieve Entire Document Analysis
- Requires a job ID from previous call

In [22]:

# Initialize Textract client
textract = boto3.client('textract', region_name='us-east-1')

def get_all_document_analysis(job_id):
    results = []
    next_token = None

    while True:
        # Prepare parameters for the API call
        params = {
            'JobId': job_id
        }
        
        if next_token:
            params['NextToken'] = next_token  # Include NextToken if available

        # Call the Textract API
        print(params)
        response = textract.get_document_analysis(**params)

        # Append the blocks to results
        results.extend(response.get('Blocks', []))

        # Check for NextToken in the response
        next_token = response.get('NextToken')

        # Break if there's no more pages
        if not next_token:
            break

    return results

# Usage
job_id = 'a60cf021227e3eb17e2288f6c104f782d0a36cb199f99f46cfefc813224640e3'
all_results = get_all_document_analysis(job_id)

with open('textract_output_next_page.json', 'w') as json_file:
    json.dump(all_results, json_file, indent=4)

{'JobId': 'a60cf021227e3eb17e2288f6c104f782d0a36cb199f99f46cfefc813224640e3'}
{'JobId': 'a60cf021227e3eb17e2288f6c104f782d0a36cb199f99f46cfefc813224640e3', 'NextToken': 'VDZS/xOfdp0zpWvvVE4/0e6UIUK7SSlFUKY6eu2GHYGA4X8D9jHW0tzK6okH4PYcKiWliGAJtET/fEHnjdPnQGpwniOJY+2GD0i/w6Gp/Or9LFY3byZdaYk+BgiAynDB4lAGx3s='}


In [6]:
# Load the JSON file containing the Textract output
with open('textract_output.json', 'r') as json_file:
    textract_response = json.load(json_file)

def extract_tables_from_json(response):
    # Create a dictionary to map Block Ids to Blocks for faster lookup
    block_map = {block['Id']: block for block in response['Blocks']}
    
    tables = []
    current_table = {}
    current_page = 1
    
    for block in response['Blocks']:
        if block['BlockType'] == 'TABLE':
            table = {}
            for relationship in block.get('Relationships', []):
                if relationship['Type'] == 'CHILD':
                    cell_ids = relationship['Ids']
                    for cell_id in cell_ids:
                        cell_block = block_map.get(cell_id)
                        if cell_block and cell_block['BlockType'] == 'CELL':
                            row = cell_block['RowIndex']
                            col = cell_block['ColumnIndex']
                            cell_text = ''
                            
                            # Get the text from the WORD blocks inside the CELL block
                            for rel in cell_block.get('Relationships', []):
                                if rel['Type'] == 'CHILD':
                                    for word_id in rel['Ids']:
                                        word_block = block_map.get(word_id)
                                        if word_block and word_block['BlockType'] == 'WORD':
                                            cell_text += word_block['Text'] + ' '
                            
                            # Add the cell text to the table dictionary
                            if row not in table:
                                table[row] = {}
                            table[row][col] = cell_text.strip()
            
            # Check if the table belongs to the current page
            if block.get('Page') == current_page:
                # Merge current table with the new one if it's on the same page
                current_table = merge_tables(current_table, table)
            else:
                # If page number changes, push the current table to the list and reset
                if current_table:
                    tables.append(current_table)
                current_table = table
                current_page = block.get('Page', current_page)

    # Append the last table after iteration
    if current_table:
        tables.append(current_table)
        
    return tables

def merge_tables(existing_table, new_table):
    """Merges two tables by appending rows from the new table to the existing table."""
    merged_table = existing_table.copy()
    
    for row, cols in new_table.items():
        if row in merged_table:
            # If row exists, merge column values
            merged_table[row].update(cols)
        else:
            # Add the new row if not present
            merged_table[row] = cols
            
    return merged_table


In [7]:
# Load the JSON file containing the Textract output
with open('textract_output.json', 'r') as json_file:
    textract_response = json.load(json_file)

def extract_tables_from_json(response):
    # Create a dictionary to map Block Ids to Blocks for faster lookup
    block_map = {block['Id']: block for block in response['Blocks']}
    
    tables = []
    for block in response['Blocks']:
        if block['BlockType'] == 'TABLE':
            table = {}
            for relationship in block.get('Relationships', []):
                if relationship['Type'] == 'CHILD':
                    cell_ids = relationship['Ids']
                    for cell_id in cell_ids:
                        cell_block = block_map.get(cell_id)
                        if cell_block and cell_block['BlockType'] == 'CELL':
                            row = cell_block['RowIndex']
                            col = cell_block['ColumnIndex']
                            cell_text = ''
                            
                            # Get the text from the WORD blocks inside the CELL block
                            for rel in cell_block.get('Relationships', []):
                                if rel['Type'] == 'CHILD':
                                    for word_id in rel['Ids']:
                                        word_block = block_map.get(word_id)
                                        if word_block and word_block['BlockType'] == 'WORD':
                                            cell_text += word_block['Text'] + ' '
                            
                            # Add the cell text to the table dictionary
                            if row not in table:
                                table[row] = {}
                            table[row][col] = cell_text.strip()
            tables.append(table)
    return tables

In [48]:
# Call the function to extract tables
tables = extract_tables_from_json(textract_response)

# Display the extracted table data as DataFrames for better readability
for table in tables:
    # Convert the extracted table into a DataFrame
    df = pd.DataFrame.from_dict(table, orient='index')
    #print(df)

# Set the first row as the column headers
df.columns = df.iloc[0]  # Use the first row as header
df = df.drop(df.index[0])  # Drop the first row from the DataFrame

# Remove duplicate claims
df = df.drop_duplicates(subset='Claim Number')

In [49]:
df

1,Claim Number,Policy Insured Code,Dealer / Lessee Name,VIN,Subline,State / Province,Cause of Loss,Loss Date,Claim Close Date,Units,Indemnity Payments,Indemnity Expenses,Net Insurance Recoveries,Indemnity Payments Net
2,150123008611,EKA119,ANDERSON FORD OF ST JOSEP,2LMDU88C77BJ14427,FCNAUS,MO,Collision,01/23/2015,02/19/2021,0,$0.00,$0.00,$0.00,$0.00
4,161216008812,EKAA4G,ANDERSON FORD OF GRAND ISLAND,2LMHJ5AT5CBL55030,FCNAUS,NE,Collision,12/16/2016,,0,$0.00,$607.50,"$2,002.46",$607.50
6,190806002768,EKAA4G,ANDERSON FORD OF GRAND ISLAND,MULTI,FCNAUS,NE,Wind,08/06/2019,12/23/2020,8,$0.00,$0.00,"$1,677.00",$0.00
8,190927007769,EKAA4G,ANDERSON FORD OF GRAND ISLAND,2FMPK4J91KBB61371,FCNAUS,NE,Theft,09/27/2019,,1,$0.00,$586.07,"$35,200.00",$586.07
10,200118002308,EKA119,ANDERSON FORD OF ST JOSEP,1FT8W3DT3HEC08556,FCNAUS,MO,Theft,02/18/2017,09/17/2020,1,$0.00,$0.00,"$27,750.00",$0.00
12,200206005764,EKA894,ANDERSON FORD OF LINCOLN / ANDERSON LINCOLN OF...,JN8AT2MV6HW016228,FCNAUS,NE,Collision,02/06/2020,02/11/2020,1,$0.00,$0.00,$0.00,$0.00
14,200218006998,EKA965,ANDERSON KIA,5XYPHDA52LG647197,FCNAUS,MO,All Other (non WX),02/18/2020,04/07/2020,1,$0.00,$0.00,$0.00,$0.00


In [50]:
df['text'] = df.apply(lambda x: ' '.join(x.astype(str)), axis=1)
df.iloc[1]['text']

'161216008812 EKAA4G ANDERSON FORD OF GRAND ISLAND 2LMHJ5AT5CBL55030 FCNAUS NE Collision 12/16/2016  0 $0.00 $607.50 $2,002.46 $607.50'

In [51]:
import re

# Regex pattern to match prices
pattern = r'\$([0-9]{1,3}(?:,[0-9]{3})*|[0-9]+)(\.[0-9]{2})?'

# Removing prices from the text column
df['text'] = df['text'].apply(lambda x: re.sub(pattern, '', x))
df.iloc[1]['text']

'161216008812 EKAA4G ANDERSON FORD OF GRAND ISLAND 2LMHJ5AT5CBL55030 FCNAUS NE Collision 12/16/2016  0    '

In [52]:
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)  # 2 labels: relevant or not

# Define the prompt (your task description and data)
prompt_template = """
Given the following claims data, remove all claims not relevant to open dealer lot coverage or that involve third-party coverage.
Return the Claim Number of the remaining claims along with a short description of why each was included.
CLAIM DATA GOES HERE
"""

# Function to count tokens using Hugging Face tokenizer
def count_tokens(text):
    tokens = tokenizer.encode(text, return_tensors="pt")
    return tokens.size(1)  # Return the token count

# Function to batch the DataFrame
def batch_dataframe(df, max_tokens, prompt_tokens):
    batches = []
    current_batch = []
    current_tokens = 0

    for _, row in df.iterrows():
        row_str = row.to_string(index=False, header=False)  # Convert row to string
        row_tokens = count_tokens(row_str)

        # Check if adding this row exceeds the limit
        if current_tokens + row_tokens + prompt_tokens > max_tokens:
            if current_batch:  # If there's already data in the current batch
                batches.append(pd.DataFrame(current_batch))
                current_batch = []  # Reset current batch
                current_tokens = 0

        current_batch.append(row)
        current_tokens += row_tokens

    if current_batch:  # Add any remaining data
        batches.append(pd.DataFrame(current_batch))

    return batches

# Generate responses for each batch
def process_batches(batches, prompt_template, model, tokenizer, max_length=200):
    responses = []
    for i, batch in enumerate(batches):
        data_string = batch.to_string(index=False, header=True)  # Create the data string from the batch
        
        # Create the full prompt with data included
        full_prompt = prompt_template.replace("CLAIM DATA GOES HERE", data_string)  # Replace placeholder with data
        
        try:
            response = generate_response(full_prompt, model, tokenizer, max_length)
            responses.append((i + 1, response))  # Append the batch number and response
        except Exception as e:
            print(f"Error processing batch {i + 1}: {e}")
    
    return responses

# Function to generate a response from the model
def generate_response(prompt, model, tokenizer, max_length=200):
    # Tokenize the input prompt
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)

    # Generate the output
    outputs = model.generate(inputs["input_ids"], max_length=max_length)

    # Decode the generated tokens to get the output text
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Calculate prompt tokens and convert the entire DataFrame into batches
prompt_tokens = count_tokens(prompt_template)
max_tokens = 512  # Adjust based on model limit
df_batches = batch_dataframe(df, max_tokens, prompt_tokens)

# Process the batches and get responses
responses = process_batches(df_batches, prompt_template, model, tokenizer)

# Print or handle the responses
for batch_num, response in responses:
    print(f"Response for batch {batch_num}:")
    print(response)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Token indices sequence length is longer than the specified maximum sequence length for this model (629 > 512). Running this sequence through the model will result in indexing errors


Response for batch 1:
['Relevant']
Response for batch 2:
['Relevant']
Response for batch 3:
['Relevant']
Response for batch 4:
['Relevant']
Response for batch 5:
['Relevant']
Response for batch 6:
['Relevant']
Response for batch 7:
['Relevant']


In [36]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("llmware/industry-bert-insurance-v0.1")
model = AutoModelForSequenceClassification.from_pretrained("llmware/industry-bert-insurance-v0.1")

def create_prompt(claim):
    return f"Evaluate the following claim for relevance to open dealer lot coverage excluding third parties:{claim}\nIs this claim relevant? (yes/no)"

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at llmware/industry-bert-insurance-v0.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [46]:
import torch.nn.functional as F
# Classify claims
def classify_claims(df):
    relevant_claims = []

    for index, row in df.iterrows():
        prompt = create_prompt(row)
        inputs = tokenizer(prompt, return_tensors="pt")

        # Perform inference
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            probabilities = F.softmax(logits, dim=1)  # Apply softmax to get probabilities
            
            # Print logits and probabilities for debugging
            print(f"Claim {row['Claim Number']} Logits: {logits}, Probabilities: {probabilities}")

            predicted_class = logits.argmax().item()  # Get the predicted class index

        # Assuming class index 1 is "relevant"
        if predicted_class == 1:  # Adjust based on your model's output class index
            relevant_claims.append(row)

    return pd.DataFrame(relevant_claims)

# Get relevant claims
relevant_claims_df = classify_claims(df)

# Print the filtered claims
print("Filtered Claims:")
print(relevant_claims_df)

Claim 150123008611 Logits: tensor([[ 0.1962, -0.2636]]), Probabilities: tensor([[0.6130, 0.3870]])
Claim 161216008812 Logits: tensor([[ 0.1819, -0.2930]]), Probabilities: tensor([[0.6165, 0.3835]])
Claim 190806002768 Logits: tensor([[ 0.1818, -0.2586]]), Probabilities: tensor([[0.6084, 0.3916]])
Claim 190927007769 Logits: tensor([[ 0.1822, -0.2629]]), Probabilities: tensor([[0.6095, 0.3905]])
Claim 200118002308 Logits: tensor([[ 0.2032, -0.2967]]), Probabilities: tensor([[0.6224, 0.3776]])
Claim 200206005764 Logits: tensor([[ 0.1700, -0.3021]]), Probabilities: tensor([[0.6159, 0.3841]])
Claim 200218006998 Logits: tensor([[ 0.1874, -0.2653]]), Probabilities: tensor([[0.6113, 0.3887]])
Filtered Claims:
Empty DataFrame
Columns: []
Index: []


In [79]:


from google.cloud import documentai_v1 as documentai
from google.oauth2 import service_account
import base64
import json

# Prepare variables
project_id = 'verdant-cargo-443521-j3'
location = 'us'
processor_id = 'f2db9bcc34ed8bb5'

file_path = '/path/to/local/file/.pdf'
mime_type = 'application/pdf'

# Load the service account key
credentials = service_account.Credentials.from_service_account_file(
    "../secret/verdant-cargo-443521-j3-8d56893a2e2e.json"
)

opts = {
    "api_endpoint" : f"{location}-documentai.googleapis.com"
}

# Configure the processor client (i.e. prepare the endpoint)
client = documentai.DocumentProcessorServiceClient(client_options=opts, credentials=credentials)

name = client.processor_path(project_id, location, processor_id)

#Open File
with open('../classification_model/loss_runs/input/Copy of Stoler of Queens, Inc dba Silver Star Motors - Loss Run - 942024.pdf', 'rb') as pdf_file:
    pdf_data = pdf_file.read()

# Construct the request
raw_document = documentai.RawDocument(content=pdf_data, mime_type=mime_type)

request = documentai.ProcessRequest(name=name, raw_document=raw_document)

# Analyze output
result = client.process_document(request=request)

document = result.document

In [80]:
print(document)
for page in document.pages:
    for table in page.tables:
        #print(table)
        for row in table.header_rows:
            row_text = ""
            for cell in row.cells:
                cell_text = ""
                for text_segment in cell.layout.text_anchor.text_segments:
                    cell_text += document.text[text_segment.start_index:text_segment.end_index]
                row_text += cell_text.strip() + ' '
                
            #print(f'{row_text}\n')

uri: ""
mime_type: "application/pdf"
text: "RISK POINT\nRisk Point Program\nLoss History\nEXCELLENCE INTEGRITY PROFESSIONALISM\nAccount:\nStoler of Queens, Inc dba Silver Star Motors\nAccount #:\n100046290\nCity, State Zip:\nCoverage Period:\nLong Island City, New York 11101\n3/28/2024 to 3/28/2025\n9/4/2024\nValued as of:\nCause of\nDate of\nClaim Number\nInsured Name\nDescription of Accident\nIndemnity\nExpense\nRecovery\nStatus\nLoss\nLoss\n907885-CB\nStoler of Queens, Inc dba\nCollision\n17,143.90\n240.00\n0.00\nClosed\nSilver Star Motors\n7/25/2024 IVD trying to park vehicle\nin stacker lift that they\npark the cars in, as he\nwas trying to back the car\nin, his foot got stuck\nwhile reversing and hit\nthe stackers.\n907814-CB\nStoler of Queens, Inc dba\nCollision\n6/21/2024 Rental was involved in\n0.00\n400.00\n0.00\nClosed\nSilver Star Motors\ncollision while being used\nas a service loaner.\n907812-CB\nStoler of Queens, Inc dba\nCollision\n1,300.00\n0.00\nOpen\n4/23/2024 Custom

In [75]:
from google.cloud import documentai_v1 as docai
from google.oauth2 import service_account
import os
from typing import Iterator, MutableSequence, Optional, Sequence, Tuple, List
from tabulate import tabulate
import pandas as pd

def table_extraction(
    document: docai.Document
) -> List[pd.DataFrame]:
    '''
    Function to extract tables from document AI output
    '''
    # Extract document text
    full_text = document.text
    tables_as_dataframes = []

    # Iterate over available pages and handle each available table
    for page_index, page in enumerate(document.pages):
        print(f"\n--- Page {page_index + 1} ---\n")

        for table_index, table in enumerate(page.tables):
            print(f"Table {table_index + 1}:")
            # Extract and print header and body rows
            headers = extract_table_rows(table.header_rows, full_text)
            body = extract_table_rows(table.body_rows, full_text)

            # Use the first header row (if available) as column names
            if headers:
                df = pd.DataFrame(body, columns=headers[0])
            else:
                # If no headers, create generic column names
                num_columns = max(len(row) for row in body) if body else 0
                df = pd.DataFrame(body, columns=[f"Column {i+1}" for i in range(num_columns)])
            
            # Append the DataFrame for this table
            tables_as_dataframes.append(df)
    
    return tables_as_dataframes

def extract_table_rows(
    table_rows: Sequence[docai.Document.Page.Table.TableRow], text: str
) -> List[List[str]]:
    '''
    Extracts rows of text from Document AI table rows
    '''
    rows = []
    for table_row in table_rows:
        row_data = []
        for cell in table_row.cells:
            cell_text = layout_to_text(cell.layout, text).strip()
            row_data.append(cell_text)
        rows.append(row_data)
    return rows

def layout_to_text(layout: docai.Document.Page.Layout, text: str) -> str:
    """
    Document AI identifies text in different parts of the document by their
    offsets in the entirety of the document"s text. This function converts
    offsets to a string.
    """
    # If a text segment spans several lines, it will
    # be stored in different text segments.
    return "".join(
        text[int(segment.start_index) : int(segment.end_index)]
        for segment in layout.text_anchor.text_segments
    )



In [81]:
dfs = table_extraction(document)


--- Page 1 ---

Table 1:
Table 2:


In [83]:
dfs[1]

Unnamed: 0,Claim Number,Insured Name,Cause of\nLoss,Date of\nLoss,Description of Accident,Indemnity,Expense,Recovery,Status
0,907885-CB,"Stoler of Queens, Inc dba\nSilver Star Motors",Collision,7/25/2024,IVD trying to park vehicle\nin stacker lift th...,17143.9,240.0,0.0,Closed
1,907814-CB,"Stoler of Queens, Inc dba\nSilver Star Motors",Collision,6/21/2024,Rental was involved in\ncollision while being ...,0.0,400.0,0.0,Closed
2,907812-CB,"Stoler of Queens, Inc dba\nSilver Star Motors",Collision,4/23/2024,Customer was involved in a collision while in ...,6262.12,1300.0,0.0,Open
