In [1]:
!pip3 install -r requirements.txt




In [4]:
!pip3 install python-dotenv



In [1]:
import torch
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print (x)
else:
    print ("MPS device not found.")

tensor([1.], device='mps:0')


In [3]:
from dotenv import load_dotenv
from pytesseract import image_to_string
from PIL import Image
from io import BytesIO
import pypdfium2 as pdfium
import streamlit as st
import multiprocessing
from tempfile import NamedTemporaryFile
import pandas as pd
import json
import requests
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import os
from tempfile import NamedTemporaryFile
from jsonformer.format import highlight_values
from jsonformer.main import Jsonformer
import time


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from huggingface_hub.inference_api import InferenceApi
import hashlib
import os
import sys
from datetime import datetime
from pathlib import Path

import requests
from dotenv import load_dotenv
from langchain.prompts import SystemMessagePromptTemplate, ChatPromptTemplate, \
    HumanMessagePromptTemplate
from langchain_openai import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from unstract.llmwhisperer.client import LLMWhispererClient

In [5]:
!pip3 install huggingface_hub



In [6]:
class CustomerAddress(BaseModel):
    zip_code: str = Field(description="Should contain the zip code alone")
    city: str = Field(description="Should hold the city name from the address")
    full_address: str = Field(
        description="Should hold the full address of the customer")


class PaymentInfo(BaseModel):
    due_date: datetime = Field(description="The due date of the credit card statement. Also known as the payment due "
                                           "date")
    minimum_payment: float = Field(
        description="the minimum amount that is due")
    new_balance: float = Field(
        description="the total new balance amount that can be paid")


class SpendLineItem(BaseModel):
    spend_date: datetime = Field(description="The date of the transaction. If the year part isn't mentioned in the "
                                             "line item explicitly, pick up the year from the statement date and use "
                                             "it instead.")
    spend_description: str = Field(description="The description of the spend")
    amount: float = Field(description="The amount of the transaction")


class ParsedCreditCardStatement(BaseModel):
    issuer_name: str = Field(description="What is the name of the issuer or the bank who has issued this credit card? "
                                         "I am not interested in the legal entity, but the primary brand name of the "
                                         "credit card.")
    customer_name: str = Field(description="What is the name of the customer to whom this credit card statement "
                                           "belongs to? Format the name of the customer well with the first letter of "
                                           "each name capitalized.")
    customer_address: CustomerAddress = Field(description="Since there might be multiple addresses in the context "
                                                          "provided to you, first gather all addresses. Try to "
                                                          "understand whom this credit card statement is being "
                                                          "addressed to or in other words, the name of the customer. "
                                                          "Find the address that matches that person's. Be sure to "
                                                          "return the customer's address, for whom this credit card "
                                                          "statement is for. Do not respond with any other address.")
    payment_info: PaymentInfo = Field(description="Payment information is important part of any credit card statement "
                                                  "and it consists of the new balance or the full amount due for the "
                                                  "current statement, the minimum payment due and the payment due "
                                                  "date.")
    spend_line_items: list[SpendLineItem] = Field(description="This credit card statement contains spending details "
                                                              "line items. Spend details can be split across the "
                                                              "provided context. Respond with details of all the "
                                                              "spend items by looking at the whole context always.")

In [7]:
def make_llm_whisperer_call(file_path):
    print(f"Processing file:{file_path}...")
    # LLMWhisperer API key is picked up from the environment variable
    client = LLMWhispererClient()
    result = client.whisper(file_path=file_path,
                            processing_mode="ocr", output_mode="line-printer")
    return result["extracted_text"]

In [8]:
def generate_cache_file_name(file_path):
    # For our use case, PDFs won't be less than 4096, practically speaking.
    if os.path.getsize(file_path) < 4096:
        error_exit("File too small to process.")
    with open(file_path, "rb") as f:
        first_block = f.read(4096)
        # seek to the last block
        f.seek(-4096, os.SEEK_END)
        f.read(4096)
        last_block = f.read(4096)

    first_md5_hash = hashlib.md5(first_block).hexdigest()
    last_md5_hash = hashlib.md5(last_block).hexdigest()
    return f"/tmp/{first_md5_hash}_{last_md5_hash}.txt"

In [9]:
def is_file_cached(file_path):
    cache_file_name = generate_cache_file_name(file_path)
    cache_file = Path(cache_file_name)
    if cache_file.is_file():
        return True
    else:
        return False

In [10]:
def extract_text(file_path):
    if is_file_cached(file_path):
        print(f"Info: File {file_path} is already cached.")
        cache_file_name = generate_cache_file_name(file_path)
        with open(cache_file_name, "r") as f:
            return f.read()
    else:
        data = make_llm_whisperer_call(file_path)
        cache_file_name = generate_cache_file_name(file_path)
        with open(cache_file_name, "w") as f:
            f.write(data)
        return data

In [11]:
def error_exit(error_message):
    print(error_message)
    sys.exit(1)

In [12]:
def show_usage_and_exit():
    error_exit("Please pass name of directory or file to process.")

In [13]:
def enumerate_pdf_files(file_path):
    files_to_process = []
    # Users can pass a directory or a file name
    if os.path.isfile(file_path):
        if os.path.splitext(file_path)[1][1:].strip().lower() == 'pdf':
            files_to_process.append(file_path)
    elif os.path.isdir(file_path):
        print("Under the is directory else case")
        files = os.listdir(file_path)
        for file_name in files:
            print("This is one of the fileName ", file_name)
            full_file_path = os.path.join(file_path, file_name)
            print("This is the full_file_path ", full_file_path)

            if os.path.isfile(full_file_path):
                print("Yes it is the file")
                if os.path.splitext(file_name)[1][1:].strip().lower() == 'pdf' or os.path.splitext(file_name)[1][1:].strip().lower() == 'png':
                    files_to_process.append(full_file_path)
    else:
        error_exit(f"Error. {file_path} should be a file or a directory.")

    return files_to_process

In [14]:
def extract_values_from_file(raw_file_data):
    preamble = ("\n"
                "Your ability to extract and summarize this information accurately is essential for effective "
                "credit card statement analysis. Pay close attention to the credit card statement's language, "
                "structure, and any cross-references to ensure a comprehensive and precise extraction of "
                "information. Do not use prior knowledge or information from outside the context to answer the "
                "questions. Only use the information provided in the context to answer the questions.\n")
    postamble = "Do not include any explanation in the reply. Only include the extracted information in the reply."
    system_template = "{preamble}"
    system_message_prompt = SystemMessagePromptTemplate.from_template(
        system_template)
    human_template = "{format_instructions}\n{raw_file_data}\n{postamble}"
    human_message_prompt = HumanMessagePromptTemplate.from_template(
        human_template)

    parser = PydanticOutputParser(pydantic_object=ParsedCreditCardStatement)
    print("HAHAH", parser.get_format_instructions())

    # compile chat template
    chat_prompt = ChatPromptTemplate.from_messages(
        [system_message_prompt, human_message_prompt])
    request = chat_prompt.format_prompt(preamble=preamble,
                                        format_instructions=parser.get_format_instructions(),
                                        raw_file_data=raw_file_data,
                                        postamble=postamble).to_messages()
    print("This is the request ", request)
    model = ChatOpenAI()
    print("Querying model...")
    result = model(request, temperature=0)
    print("Response from model:")
    print(result.content)
    return result.content

In [15]:
def process_pdf_files(file_list):
    for file_path in file_list:
        raw_file_data = extract_text(file_path)
        print(f"Extracted text for file {file_path}:\n{raw_file_data}")
        extracted_json = extract_values_from_file(raw_file_data)
        json_file_path = f"{file_path}.json"
        with open(json_file_path, "w") as f:
            f.write(extracted_json)

In [16]:
def main():
    load_dotenv()
    if len(sys.argv) < 2:
        show_usage_and_exit()

    print(f"Processing path {sys.argv[1]}...")
    file_list = enumerate_pdf_files(sys.argv[1])
    print(f"Processing {len(file_list)} files...")
    print(f"Processing first file: {file_list[0]}...")
    process_pdf_files(file_list)

In [17]:
load_dotenv()

True

In [18]:
print(enumerate_pdf_files('/Users/ajshinde/Work/structured-extraction-main/assets/imgs'))

Under the is directory else case
This is one of the fileName  chase_cc.png
This is the full_file_path  /Users/ajshinde/Work/structured-extraction-main/assets/imgs/chase_cc.png
Yes it is the file
['/Users/ajshinde/Work/structured-extraction-main/assets/imgs/chase_cc.png']


In [19]:
txt = extract_text(
    '/Users/ajshinde/Work/structured-extraction-main/assets/imgs/chase_cc.png')

Info: File /Users/ajshinde/Work/structured-extraction-main/assets/imgs/chase_cc.png is already cached.


In [20]:
txt



In [21]:
# 3. Extract structured info from text via LLM


class HuggingFaceLLM:
    def __init__(self, temperature=0, top_k=50, model_name="databricks/dolly-v2-12b"):
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name, use_cache=True, device_map="auto",offload_folder="offload")
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name, use_fast=True, use_cache=True,offload_folder="offload")
        self.top_k = top_k

    def generate(self, prompt, max_length=1024):
        json = {
            "type": "object",
            "properties": {
                "items": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "description": {"type": "string"},
                            "price": {"type": "number"}
                        }
                    }
                },
                "Company_name": {"type": "string"},
                "invoice_date": {"type": "string"},
            }
        }

        builder = Jsonformer(
            model=self.model,
            tokenizer=self.tokenizer,
            json_schema=json,
            prompt=prompt,
            max_string_token_length=20
        )

        print("Generating...")
        output = builder()
        # highlight_values(output)
        # print(output)
        return output

In [22]:
def extract_structured_data(content: str, data_points):
    # Choose the desired Hugging Face model
    llm = HuggingFaceLLM(temperature=0)

    template = """
    You are an expert admin people who will extract core information from documents

    {content}

    Above is the content; please try to extract all data points from the content above:
    {data_points}
    """

    # Fill in the placeholders in the template
    formatted_template = template.format(
        content=content, data_points=data_points)
    # print(formatted_template)

    # Generate text using the formatted template
    print("Before the llm.genetate")
    results = llm.generate(formatted_template)
    print("After the llm.genetate")

    return results

In [23]:
default_data_points = """{
        "item": [{
            "description": "description or name of the item that has been bougth",
            "price": "how much does the item cost"
        }],
        "Company_name": "company that issued the invoice",
        "invoice_date": "when was the invoice issued",
    }"""

In [24]:
!pip install 'accelerate>=0.26.0'



In [25]:
extract_structured_data(txt,default_data_points)

Some parameters are on the meta device because they were offloaded to the disk.


Before the llm.genetate


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Generating...


RuntimeError: MPS backend out of memory (MPS allocated: 35.81 GB, other allocations: 318.89 MB, max allowed: 36.27 GB). Tried to allocate 315.09 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [20]:
!pip install -U "huggingface_hub[cli]"

zsh:1: command not found: pip


In [21]:
!huggingface-cli whoami

ajinkyashinde
