In [1]:
import json
import pandas as pd

# Load the JSON file containing the Textract output
with open('textract_output.json', 'r') as json_file:
    textract_response = json.load(json_file)

def extract_tables_from_json(response):
    # Create a dictionary to map Block Ids to Blocks for faster lookup
    block_map = {block['Id']: block for block in response['Blocks']}
    
    tables = []
    for block in response['Blocks']:
        if block['BlockType'] == 'TABLE':
            table = {}
            for relationship in block.get('Relationships', []):
                if relationship['Type'] == 'CHILD':
                    cell_ids = relationship['Ids']
                    for cell_id in cell_ids:
                        cell_block = block_map.get(cell_id)
                        if cell_block and cell_block['BlockType'] == 'CELL':
                            row = cell_block['RowIndex']
                            col = cell_block['ColumnIndex']
                            cell_text = ''
                            
                            # Get the text from the WORD blocks inside the CELL block
                            for rel in cell_block.get('Relationships', []):
                                if rel['Type'] == 'CHILD':
                                    for word_id in rel['Ids']:
                                        word_block = block_map.get(word_id)
                                        if word_block and word_block['BlockType'] == 'WORD':
                                            cell_text += word_block['Text'] + ' '
                            
                            # Add the cell text to the table dictionary
                            if row not in table:
                                table[row] = {}
                            table[row][col] = cell_text.strip()
            tables.append(table)
    return tables

In [2]:
# Call the function to extract tables
tables = extract_tables_from_json(textract_response)

# Display the extracted table data as DataFrames for better readability
for table in tables:
    # Convert the extracted table into a DataFrame
    df = pd.DataFrame.from_dict(table, orient='index')
    #print(df)

# Set the first row as the column headers
df.columns = df.iloc[0]  # Use the first row as header
df = df.drop(df.index[0])  # Drop the first row from the DataFrame

# Remove duplicate claims
df = df.drop_duplicates(subset='Claim Number')

In [None]:
df

In [3]:
import os
import time
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAIError, RateLimitError
from langchain import HuggingFaceHub, PromptTemplate
import tiktoken  # Make sure to install this library

In [None]:
# Load the environment variables from the .env file
load_dotenv()

# Get the OpenAI API key from Environment Variable (if still using)
huggingfacehub_api_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')

# Initialize LLM with Hugging Face Model
llm = HuggingFaceHub(repo_id="flan-t5", model_kwargs={"temperature": 0.7}, huggingfacehub_api_token=huggingfacehub_api_token)

# Define prompt to instruct the LLM
prompt = PromptTemplate(
    input_variables=["data"],
    template="""
    You are a domain expert in insurance claims. 
    Given the following claims data, please remove all claims not relevant to open dealer lots and involve third-party coverage.
    Return the Claim Number of the remaining claims along with a short description of why the claim was included. 
    Do not include any of the input text or the prompt in your response..
    {data}
    """
)

# Calculate prompt tokens
prompt_tokens = count_tokens(prompt.template)
    
# Function to count tokens
def count_tokens(text):
    # Use the appropriate encoding for the model
    enc = tiktoken.encoding_for_model("flan-t5")  # Change as needed
    return len(enc.encode(text))

# Function to batch the DataFrame
def batch_dataframe(df, max_tokens):
    batches = []
    current_batch = []
    current_tokens = 0

    for index, row in df.iterrows():
        row_str = row.to_string(index=False, header=False)  # Convert row to string
        row_tokens = count_tokens(row_str)

        # Check if adding this row exceeds the limit
        if current_tokens + row_tokens + prompt_tokens > max_tokens:
            if current_batch:  # If there's already data in the current batch
                batches.append(pd.DataFrame(current_batch))
                current_batch = []  # Reset current batch
                current_tokens = 0

        current_batch.append(row)
        current_tokens += row_tokens

    if current_batch:  # Add any remaining data
        batches.append(pd.DataFrame(current_batch))

    return batches

# Convert the entire DataFrame into batches
max_tokens = 1024  # Adjust based on your model's limit
df_batches = batch_dataframe(df, max_tokens)

# Process each batch
for i, batch in enumerate(df_batches):
    data_string = batch.to_string(index=False, header=True)
    
    max_retries = 5
    for attempt in range(max_retries):
        try:
            # Format and send the prompt to the model
            response = llm(prompt.format(data=data_string))
            #print(f"Response received successfully for batch {i + 1}:")
            print(response)  # Print the LLM's response
            break  # Exit loop if the request is successful
        except RateLimitError:
            wait_time = 2 ** attempt  # Exponential backoff
            print(f"Rate limit exceeded. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
        except OpenAIError as e:
            print(f"An error occurred: {e}")
            break  # Exit the loop on other OpenAI errors
    else:
        print("Max retries reached. Could not complete the request.")

In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the tokenizer and model for flan-t5
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")


model.safetensors:  20%|##        | 62.9M/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

NameError: name 'df_batches' is not defined

In [15]:
import pandas as pd
import tiktoken
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the tokenizer and model for flan-t5
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

# Define the prompt (your task description and data)
prompt = """
    You are a domain expert in insurance claims. 
    Given the following claims data, remove all claims either not relevant to open dealer lots or involve third-party coverage or both.
    Return the Claim Number of the remaining claims along with a short description of why the claim was included.
    CLAIM DATA GOES HERE
    """

# Function to count tokens
def count_tokens(text):
    enc = tiktoken.get_encoding("flan-t5")  # Use the appropriate encoding for the model
    return len(enc.encode(text))

# Function to batch the DataFrame
def batch_dataframe(df, max_tokens, prompt_tokens):
    batches = []
    current_batch = []
    current_tokens = 0

    for index, row in df.iterrows():
        row_str = row.to_string(index=False, header=False)  # Convert row to string
        row_tokens = count_tokens(row_str)

        # Check if adding this row exceeds the limit
        if current_tokens + row_tokens + prompt_tokens > max_tokens:
            if current_batch:  # If there's already data in the current batch
                batches.append(pd.DataFrame(current_batch))
                current_batch = []  # Reset current batch
                current_tokens = 0

        current_batch.append(row)
        current_tokens += row_tokens

    if current_batch:  # Add any remaining data
        batches.append(pd.DataFrame(current_batch))

    return batches

# Generate responses for each batch
def process_batches(batches, prompt, model, tokenizer, max_length=200):
    responses = []
    for i, batch in enumerate(batches):
        data_string = batch.to_string(index=False, header=True)  # Create the data string from the batch
        
        # Create the full prompt with data included
        full_prompt = prompt.replace("CLAIM DATA GOES HERE", data_string)  # Replace placeholder with data
        
        try:
            response = generate_response(full_prompt, model, tokenizer, max_length)
            responses.append((i + 1, response))  # Append the batch number and response
        except Exception as e:
            print(f"Error processing batch {i + 1}: {e}")
    
    return responses

def generate_response(prompt, model, tokenizer, max_length=200):
    # Tokenize the input prompt
    inputs = tokenizer(prompt, return_tensors="pt")

    # Generate the output
    outputs = model.generate(inputs["input_ids"], max_length=max_length)

    # Decode the generated tokens to get the output text
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Calculate prompt tokens and convert the entire DataFrame into batches
prompt_tokens = count_tokens(prompt)
max_tokens = 512  # Adjust based on your model's limit
df_batches = batch_dataframe(df, max_tokens, prompt_tokens)

# Process the batches and get responses
responses = process_batches(df_batches, prompt, model, tokenizer)

# Print or handle the responses
for batch_num, response in responses:
    print(f"Response for batch {batch_num}:")
    print(response)


ValueError: Unknown encoding flan-t5.
Plugins found: ['tiktoken_ext.openai_public']
tiktoken version: 0.8.0 (are you on latest?)

In [18]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel

# Load the tokenizer and model for FLAN-T5
tokenizer = AutoTokenizer.from_pretrained("llmware/industry-bert-insurance-v0.1")
model = AutoModel.from_pretrained("llmware/industry-bert-insurance-v0.1")

# Define the prompt (your task description and data)
prompt_template = """
You are a domain expert in insurance claims. 
Given the following claims data, remove all claims either not relevant to open dealer lots or involve third-party coverage or both.
Return the Claim Number of the remaining claims along with a short description of why the claim was included.
CLAIM DATA GOES HERE
"""

# Function to count tokens using Hugging Face tokenizer
def count_tokens(text):
    tokens = tokenizer.encode(text, return_tensors="pt")
    return tokens.size(1)  # Return the token count

# Function to batch the DataFrame
def batch_dataframe(df, max_tokens, prompt_tokens):
    batches = []
    current_batch = []
    current_tokens = 0

    for _, row in df.iterrows():
        row_str = row.to_string(index=False, header=False)  # Convert row to string
        row_tokens = count_tokens(row_str)

        # Check if adding this row exceeds the limit
        if current_tokens + row_tokens + prompt_tokens > max_tokens:
            if current_batch:  # If there's already data in the current batch
                batches.append(pd.DataFrame(current_batch))
                current_batch = []  # Reset current batch
                current_tokens = 0

        current_batch.append(row)
        current_tokens += row_tokens

    if current_batch:  # Add any remaining data
        batches.append(pd.DataFrame(current_batch))

    return batches

# Generate responses for each batch
def process_batches(batches, prompt_template, model, tokenizer, max_length=200):
    responses = []
    for i, batch in enumerate(batches):
        data_string = batch.to_string(index=False, header=True)  # Create the data string from the batch
        
        # Create the full prompt with data included
        full_prompt = prompt_template.replace("CLAIM DATA GOES HERE", data_string)  # Replace placeholder with data
        
        try:
            response = generate_response(full_prompt, model, tokenizer, max_length)
            responses.append((i + 1, response))  # Append the batch number and response
        except Exception as e:
            print(f"Error processing batch {i + 1}: {e}")
    
    return responses

# Function to generate a response from the model
def generate_response(prompt, model, tokenizer, max_length=200):
    # Tokenize the input prompt
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)

    # Generate the output
    outputs = model.generate(inputs["input_ids"], max_length=max_length)

    # Decode the generated tokens to get the output text
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Example DataFrame (replace this with your actual claims data)
data = {'Claim Number': [101, 102, 103],
        'Claim Description': ["Open lot damage due to rain", 
                              "Third-party liability claim for auto accident", 
                              "Open lot damage from hail"]}

df = pd.DataFrame(data)

# Calculate prompt tokens and convert the entire DataFrame into batches
prompt_tokens = count_tokens(prompt_template)
max_tokens = 512  # Adjust based on your model's limit
df_batches = batch_dataframe(df, max_tokens, prompt_tokens)

# Process the batches and get responses
responses = process_batches(df_batches, prompt_template, model, tokenizer)

# Print or handle the responses
for batch_num, response in responses:
    print(f"Response for batch {batch_num}:")
    print(response)


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Error processing batch 1: The current model class (BertModel) is not compatible with `.generate()`, as it doesn't have a language model head. Classes that support generation often end in one of these names: ['ForCausalLM', 'ForConditionalGeneration', 'ForSpeechSeq2Seq', 'ForVision2Seq'].
