# Invoice Data Extraction using IBM Granite LLM from watsonx
Author: [@Surya Deep Singh](https://www.linkedin.com/in/surya-deep-singh-b9b94813a/)


## Description

This Jupyter Notebook processes invoices stored as PDFs and extracts their key details. It uses IBM watsonx's granite-3-8b-instruct LLM to interpret the invoice data and extract specific fields such as the invoice number, total amounts, and customer details. The extracted data is validated and saved into a structured DataFrame for further analysis.

## Step 1: Setup and Installation

We start by installing the required dependencies. This includes libraries for document conversion, IBM watsonx LLM integration, and data handling.

In [None]:
!pip install -q git+https://github.com/ibm-granite-community/utils \
    docling==2.14.0 \
    langchain==0.2.12 \
    langchain-ibm==0.1.11 \
    langchain-community==0.2.11 \
    langchain-core==0.2.28 \
    ibm-watsonx-ai==1.1.2 \
    transformers==4.47.1


## Step 2: Import Libraries and Load Environment Variables

Import necessary libraries and load API credentials and configurations using the dotenv library.

In [None]:
from docling.document_converter import DocumentConverter
from langchain_ibm import WatsonxLLM
from langchain_core.prompts import PromptTemplate
import os
import json
import pandas as pd
import re
import os
import requests
from dotenv import load_dotenv, dotenv_values 
load_dotenv()
from ibm_granite_community.notebook_utils import get_env_var

## Step 3: Define the InvoiceProcessor Class

The InvoiceProcessor class handles the end-to-end processing of invoices: it converts each PDF and extracts its text with IBM Docling, then uses the Granite LLM to pull out the required fields.

In [None]:
# Define the InvoiceProcessor class
class InvoiceProcessor:
    """Extract key invoice fields from PDF files with Docling and a Granite LLM.

    Each PDF is converted to Markdown by a Docling ``DocumentConverter``. The
    Markdown is embedded in a prompt and sent to the
    ``ibm/granite-3-8b-instruct`` model on watsonx; the JSON object in the
    model's answer is parsed, its amounts are checked for arithmetic
    consistency, and the results are collected in a pandas DataFrame.
    """

    # Keys of the requested JSON object that hold monetary amounts, in the
    # order (net, tax, total) used by the consistency check.
    _AMOUNT_KEYS = ('net_amount', 'vat_or_tax_or_gst_amount', 'total_amount')

    def __init__(self, ibm_cloud_api_key, project_id, watson_url):
        """Create the watsonx LLM client and the Docling converter.

        Args:
            ibm_cloud_api_key: API key passed to ``WatsonxLLM`` as ``apikey``.
            project_id: watsonx project identifier.
            watson_url: Endpoint URL of the watsonx service.
        """
        self.llm = WatsonxLLM(
            model_id='ibm/granite-3-8b-instruct',
            apikey=ibm_cloud_api_key,
            project_id=project_id,
            params={
                "decoding_method": "greedy",
                "max_new_tokens": 8000,
                "min_new_tokens": 1,
                "repetition_penalty": 1.01
            },
            url=watson_url
        )
        self.converter = DocumentConverter()

    @staticmethod
    def _to_amount(value):
        """Convert an amount from the model's answer to a float rounded to 2 places.

        Strings may contain thousands separators (``,``) and ``$`` signs, which
        are removed before conversion. Plain ``int``/``float`` values are
        accepted as well, because the model may emit JSON numbers instead of
        strings.

        Raises:
            ValueError: If ``value`` is not a number or a numeric string.
        """
        if isinstance(value, bool) or not isinstance(value, (str, int, float)):
            raise ValueError(f"Unsupported amount value: {value!r}")
        if isinstance(value, str):
            value = value.replace(",", "").replace("$", "").strip()
        return round(float(value), 2)

    @staticmethod
    def _parse_json_answer(answer):
        """Return the first JSON object found in the model's answer.

        The answer may wrap the JSON in prose or code fences, so every ``{``
        is tried as a potential start of the object until one decodes.

        Raises:
            ValueError: If the answer contains no decodable JSON object.
        """
        # Newlines are dropped (as the original parsing did) so that raw line
        # breaks emitted inside string values do not invalidate the JSON.
        text = answer.replace('\n', '')
        decoder = json.JSONDecoder()
        start = text.find('{')
        while start != -1:
            try:
                parsed, _ = decoder.raw_decode(text[start:])
            except json.JSONDecodeError:
                parsed = None
            if isinstance(parsed, dict):
                return parsed
            start = text.find('{', start + 1)
        raise ValueError("No JSON object found in the model answer")

    def extract_invoice_data(self, source):
        """Extract the invoice fields from a single document.

        Args:
            source: Path (or other source accepted by the Docling converter)
                of the invoice to process.

        Returns:
            dict: The JSON object produced by the model, plus a
            ``'Validation'`` entry that is ``'correct'`` when net + tax equals
            the total (to 2 decimal places) and ``'check'`` otherwise or when
            an amount is missing or not numeric.

        Raises:
            ValueError: If the model answer contains no JSON object.
        """
        result = self.converter.convert(source)
        markdown_output = result.document.export_to_markdown()

        prompt_template = PromptTemplate(
            input_variables=["DOCUMENT"],
            template='''
            <|start_of_role|>System<|end_of_role|> You are an AI assistant for processing invoices. Based on the provided invoice data, extract the 'Invoice Number', 'Total Net Amount', 'Total VAT or TAX or GST Amount', 'Total Amount' , 'Invoice Date', 'Purchase Order Number' and 'Customer number', without the currency values.

            |Instructions|
            Identify and extract the following information:
            - **Invoice Number**: The unique identifier for the invoice.
            - **Net Amount**: The Total Net Amount indicated on the invoice.
            - **VAT or TAX or GST Amount**: The Total VAT or TAX or GST Amount indicated on the invoice.
            - **Total Amount**: The Total Cost indicated on the invoice.
            - **Invoice Date**: The date the invoice was issued.
            - **Purchase Order Number**: The unique identifier for the purchase order.
            - **Customer Number**: The unique identifier for the customer.

            Invoice Data:
            {DOCUMENT}


            Strictly provide the extracted information in the following JSON format:

            ```json
            {{
              "invoice_number": "extracted_invoice_number",
              "net_amount": "extracted_net_amount",
              "vat_or_tax_or_gst_amount" : "extracted_vat_or_tax_or_gst_amount",
              "total_amount": "extracted_total_amount",
              "invoice_date": "extracted_invoice_date",
              "purchase_order_number": "extracted_purchase_order_number",
              "customer_number": "extracted_customer_number"
            }}
            ```

            <|end_of_text|>

            <|start_of_role|>assistant<|end_of_role|>
        ''')

        prompt = prompt_template.format(DOCUMENT=str(markdown_output).strip())
        answer = self.llm.invoke(prompt)
        data = self._parse_json_answer(answer)

        try:
            net_amount, tax_amount, total_amount = (
                self._to_amount(data[key]) for key in self._AMOUNT_KEYS
            )
        except (ValueError, KeyError, OverflowError):
            # A missing or non-numeric amount cannot be verified automatically.
            data['Validation'] = 'check'
        else:
            # Round the sum again so float noise (e.g. 0.1 + 0.2) does not
            # produce a false mismatch.
            totals_agree = round(net_amount + tax_amount, 2) == total_amount
            data['Validation'] = 'correct' if totals_agree else 'check'
            print("Processed -- ", source)

        return data

    def process_invoices(self, folder_path):
        """Process every PDF in ``folder_path`` and tabulate the results.

        Files are handled in sorted name order and the ``.pdf`` extension is
        matched case-insensitively. An invoice whose extraction fails is
        reported on stdout and skipped, so one bad file does not abort the
        batch. Fields the model left out are stored as missing values rather
        than discarding the whole row.

        Args:
            folder_path: Directory containing the invoice PDFs.

        Returns:
            pandas.DataFrame: One row per successfully processed invoice, with
            the columns ``File_Name``, ``Invoice_Number``, ``Net_Amount``,
            ``TAX_Amount``, ``Total_Amount``, ``Validation``, ``Invoice_Date``,
            ``Purchase_Order_Number`` and ``Customer_Number``.
        """
        columns = ['File_Name', 'Invoice_Number', 'Net_Amount', 'TAX_Amount', 'Total_Amount', 'Validation', 'Invoice_Date', 'Purchase_Order_Number', 'Customer_Number']
        rows = []

        for filename in sorted(os.listdir(folder_path)):
            if not filename.lower().endswith('.pdf'):
                continue

            pdf_path = os.path.join(folder_path, filename)
            try:
                data = self.extract_invoice_data(pdf_path)
            except Exception as exc:
                # Best effort: report the failure instead of hiding it, then
                # continue with the remaining invoices.
                print(f"Failed to process {filename}: {exc}")
                continue

            rows.append({
                'File_Name': filename,
                'Invoice_Number': data.get('invoice_number'),
                'Net_Amount': data.get('net_amount'),
                'TAX_Amount': data.get('vat_or_tax_or_gst_amount'),
                'Total_Amount': data.get('total_amount'),
                'Validation': data.get('Validation'),
                'Invoice_Date': data.get('invoice_date'),
                'Purchase_Order_Number': data.get('purchase_order_number'),
                'Customer_Number': data.get('customer_number'),
            })

        # Build the frame once; concatenating inside the loop copies all
        # previously collected rows on every iteration.
        return pd.DataFrame(rows, columns=columns)


In [None]:

def setup_directory(directory):
    """
    Make sure ``directory`` exists, creating it (and any missing parents)
    when necessary, then report that it is ready.
    """
    os.makedirs(name=directory, exist_ok=True)
    ready_message = f"Directory '{directory}' is ready."
    print(ready_message)

def download_files(file_list, base_url, directory, timeout=30):
    """
    Download files from a given base URL into a specified directory.

    Each file is fetched from ``base_url + file_name`` and written to
    ``directory`` under the same name. Failures (non-200 responses, network
    errors, write errors) are reported on stdout and do not stop the
    remaining downloads.

    Args:
        file_list: Iterable of file names to download.
        base_url: URL prefix to which each file name is appended.
        directory: Existing directory that receives the files.
        timeout: Seconds to wait for the server before giving up on a file.

    Returns:
        list: Names of the files that were downloaded and saved successfully.
    """
    downloaded = []
    for file_name in file_list:
        file_url = base_url + file_name
        local_file_path = os.path.join(directory, file_name)
        try:
            # Without a timeout, requests waits indefinitely on a stalled
            # connection and the notebook cell would never finish.
            response = requests.get(file_url, timeout=timeout)
            if response.status_code == 200:
                # Save to the specified directory
                with open(local_file_path, "wb") as file:
                    file.write(response.content)
                print(f"Downloaded: {file_name}")
                downloaded.append(file_name)
            else:
                print(f"Failed to download {file_name}. Status code: {response.status_code}")
        except Exception as e:
            print(f"Error downloading {file_name}: {e}")
    return downloaded

def delete_files(directory):
    """
    Remove every regular file directly inside ``directory``.

    Sub-directories are left alone. Any error is reported on stdout and
    stops the remaining deletions.
    """
    try:
        entries = os.listdir(directory)
        for entry in entries:
            target = os.path.join(directory, entry)
            if not os.path.isfile(target):
                # Only plain files are deleted; skip directories and the like.
                continue
            os.remove(target)
            print(f"Deleted: {entry}")
    except Exception as err:
        print(f"Error deleting files: {err}")

def cleanup_directory(directory):
    """
    Delete ``directory`` when it is empty; report the outcome on stdout.

    A directory that is missing or still has content is left as is and the
    resulting OS error is printed instead of raised.
    """
    try:
        os.rmdir(directory)
    except OSError as err:
        print(f"Error removing directory '{directory}': {err}")
    else:
        print(f"Directory '{directory}' removed successfully.")

def download_invoice(data_dir="data", base_url=None, files=None):
    """
    Download the sample invoice PDFs into a local directory.

    The target directory is created when missing and each file is fetched
    from ``base_url``. This function only downloads; processing the invoices
    and deleting the files afterwards are left to the caller.

    Args:
        data_dir: Directory that receives the PDFs. Defaults to ``"data"``.
        base_url: URL prefix of the raw files. Defaults to the invoice folder
            of the granite-snack-cookbook recipe on GitHub.
        files: Iterable of file names to download. Defaults to the five
            sample invoices shipped with the recipe.
    """
    if files is None:
        files = [
            "6900026063.pdf",
            "6900026069.pdf",
            "6905212892.pdf",
            "904000640.pdf",
            "PL_IERPIC_MISSING.pdf"
        ]

    if base_url is None:
        # Base URL for the raw files
        base_url = "https://raw.githubusercontent.com/SinghSuryaDeep/granite-snack-cookbook/refs/heads/main/recipes/Invoice-Extraction/Invoices/"

    setup_directory(data_dir)

    print("Downloading files...")
    download_files(files, base_url, data_dir)




## Step 4: Initialize and Process Invoices

Set up the necessary credentials for IBM watsonx, process invoices using the InvoiceProcessor class, and store the results in a pandas DataFrame.

In [None]:
if __name__ == "__main__":
    # Fetch the sample invoices into the local "data" directory.
    download_invoice()
    try:
        # watsonx credentials and endpoint, resolved by the
        # ibm_granite_community helper.
        ibm_cloud_api_key = get_env_var('WATSONX_APIKEY')
        project_id = get_env_var('WATSONX_PROJECT_ID')
        watson_url = get_env_var('WATSONX_URL')
        # NOTE(review): read from the environment but not used below; the
        # invoices are always processed from the "data" directory.
        folder_path = os.getenv('folder_path')
        invoice_processor = InvoiceProcessor(ibm_cloud_api_key, project_id, watson_url)
        df_invoice = invoice_processor.process_invoices("data")
    finally:
        # Remove the downloaded PDFs even when credential lookup, model
        # setup or processing fails, so no stale files are left behind.
        delete_files('data')
# Last expression of the cell: displays the extracted invoice table.
df_invoice