## Install necessary packages

In [1]:
!pip install openai
!pip install llama-index
!pip install PyPDF2
!pip install guardrails-ai

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai
  Downloading openai-0.27.3-py3-none-any.whl (70 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.2/70.2 KB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
Collecting async-timeout<5.0,>=4.0.0a3
  Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)
Collecting aiosignal>=1.1.2
  Downloading aiosignal-1.3.1-py3-none-any.whl (7.6 kB)
Collecting yarl<2.0,>=1.0
  Downloading yarl-1.8.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (264 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m264.6/264.6 KB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multidict<7.0,>=4.5
  Downloading 

In [2]:
from llama_index import GPTSimpleVectorIndex, SimpleDirectoryReader
from llama_index.output_parsers import GuardrailsOutputParser
from llama_index.llm_predictor import StructuredLLMPredictor
from llama_index.prompts.prompts import QuestionAnswerPrompt, RefinePrompt
from llama_index.prompts.default_prompts import DEFAULT_TEXT_QA_PROMPT_TMPL, DEFAULT_REFINE_PROMPT_TMPL

import pandas as pd
import json

In [16]:
!unzip data.zip

Archive:  data.zip
   creating: data/
  inflating: data/.DS_Store          
replace __MACOSX/data/._.DS_Store? [y]es, [n]o, [A]ll, [N]one, [r]ename: yes
  inflating: __MACOSX/data/._.DS_Store  
   creating: data/swiggy2/
   creating: data/swiggy3/
   creating: data/zomato1/
   creating: data/swiggy1/
   creating: data/zomato2/
   creating: data/zomato3/
  inflating: data/swiggy2/swiggy2.pdf  
  inflating: __MACOSX/data/swiggy2/._swiggy2.pdf  
  inflating: data/swiggy3/swiggy3.pdf  
  inflating: __MACOSX/data/swiggy3/._swiggy3.pdf  
  inflating: data/zomato1/zomato.pdf  
  inflating: __MACOSX/data/zomato1/._zomato.pdf  
  inflating: data/swiggy1/swiggy.pdf  
  inflating: __MACOSX/data/swiggy1/._swiggy.pdf  
  inflating: data/zomato2/zomato2.pdf  
  inflating: __MACOSX/data/zomato2/._zomato2.pdf  
  inflating: data/zomato3/zomato3.pdf  
  inflating: __MACOSX/data/zomato3/._zomato3.pdf  


## Using Guardrails for output 
Look at image below. Somehow on github jupyter notebook is hiding the xml type contents from below code cell (rail_spec="""...."""). Although when you clone and use this notebook it is visible.

![image](./images/rail_spec.png)


In [None]:
rail_spec = """
<rail version="0.1">

<output>
    <object name="bill_info">
        <string name="invoice_number" description="invoice or order" />
        <string name="restaurant_name" description="name of restaurant" />
        <float name="total_bill" description="total net payable amount" />
        <date name="invoice_date" description="invoice date" date-format="%Y-%m-%d" />
        <list
            name="food_items"
            description="Food items which was ordered. Each food item should be classified into a separate item in the list.">
            <object>
                <string 
                    name="food_item" 
                    description="food, desrciption or particulars which has been purchased"
                />
                <float
                    name="cost_price"
                    description="What was the total cost of item"
                    on-fail-valid-choices="reask"
                />
            </object>
        </list>
    </object>
</output>

<prompt>

Query string here.

@xml_prefix_prompt

{output_schema}

@json_suffix_prompt_v2_wo_none
</prompt>
</rail>
"""

In [18]:
import openai
import os

# Set your OpenAI API key
os.environ["OPENAI_API_KEY"] = "..." # place your key here

## Eample with one pdf file

In [20]:
# load documents, build index
llm_predictor = StructuredLLMPredictor()
documents = SimpleDirectoryReader('./data/swiggy1').load_data()
index = GPTSimpleVectorIndex.from_documents(documents)

In [21]:
# define output parser
output_parser = GuardrailsOutputParser.from_rail_string(rail_spec, llm=llm_predictor.llm)

# format each prompt with output parser instructions
fmt_qa_tmpl = output_parser.format(DEFAULT_TEXT_QA_PROMPT_TMPL)
fmt_refine_tmpl = output_parser.format(DEFAULT_REFINE_PROMPT_TMPL)

qa_prompt = QuestionAnswerPrompt(fmt_qa_tmpl, output_parser=output_parser)
refine_prompt = RefinePrompt(fmt_refine_tmpl, output_parser=output_parser)

In [22]:
# obtain a structured response
response = index.query(
    "What are the food items purchased and its cost?", 
    text_qa_template=qa_prompt, 
    refine_template=refine_prompt, 
    #llm_predictor=llm_predictor
)
print(response)


{
    "bill_info": {
        "invoice_number": "0217048033000150",
        "restaurant_name": "Chinese BAE",
        "total_bill": 203.70,
        "invoice_date": "2023-03-30",
        "food_items": [
            {
                "food_item": "Veg Schezwan Fried Rice",
                "cost_price": 179.00
            },
            {
                "food_item": "Order Packing Charges",
                "cost_price": 15.00
            }
        ]
    }
}


## To get source info

In [23]:
print(response.get_formatted_sources())
print("-"*20)
print(response.source_nodes)

> Source (Doc id: 47ca3892-f698-4c8f-a82b-dba1d6133427): Taxes Rate
IGST 0% 0.00
CGST 2.5% 4.85
SGST/UTGST 2.5% 4.85
Total taxes 9.70
Invoice Total 203.70...
--------------------
[NodeWithScore(node=Node(text="Taxes Rate\nIGST 0% 0.00\nCGST 2.5% 4.85\nSGST/UTGST 2.5% 4.85\nTotal taxes 9.70\nInvoice Total 203.70\nTAX INVOICE\nInvoice To: Vikash Invoice issued by Bundl Technologies Private \nLimited on behalf of:\nGSTIN: Unregistered Restaurant Name: Chinese BAE\nCustomer Address: 4A, 4th floor, #1325, 32F cross road, 4th T Block \nEast, Pattabhirama Nagar, Jayanagar, Bengaluru, \nKarnataka, IndiaRestaurant GSTIN: 29CITPS2827J3ZB\nOrder ID: 163271281959 Address: NO.805/A, 1ST FLOOR, \n7TH CROSS, BTM \nLAYOUT 2ND STAGE, \nMICO LAYOUT, \nBANGALORE., B.B.M.P \nSouth (Karnataka) - 560076\nCity: Bangalore\nState: Karnataka\nDocument: INV Place of Supply: Karnataka\nInvoice No: 0217048033000150 Service Description: Restaurant Service\nDate of Invoice: 30-03-2023 Category: B2C\nHSN Code: 996331

In [24]:
# convert dictionary string to dictionary
res = json.loads(str(response))

## Loop over all folders and return a DataFrame, 
Later this can be save as CSV file, pushed to db etc

In [39]:
# Function to convert dict to dataframe
def dictToDataFrame(res) :
  list_rec = []

  for i in range(len(res['bill_info']['food_items'])):
    x = [res['bill_info']['invoice_number'],
         res['bill_info']['invoice_date'],
         res['bill_info']['restaurant_name'], 
         res['bill_info']['total_bill'], 
         res['bill_info']['food_items'][i]['food_item'], 
         res['bill_info']['food_items'][i]['cost_price']]
    list_rec.append(x)
    
  return pd.DataFrame(list_rec, columns=['invoice_number', 'invoice_date', 'restaurant_name', 'total_bill', 'food_item', 'cost_price'])

In [27]:
# define output parser
output_parser = GuardrailsOutputParser.from_rail_string(rail_spec, llm=llm_predictor.llm)

# format each prompt with output parser instructions
fmt_qa_tmpl = output_parser.format(DEFAULT_TEXT_QA_PROMPT_TMPL)
fmt_refine_tmpl = output_parser.format(DEFAULT_REFINE_PROMPT_TMPL)

qa_prompt = QuestionAnswerPrompt(fmt_qa_tmpl, output_parser=output_parser)
refine_prompt = RefinePrompt(fmt_refine_tmpl, output_parser=output_parser)

In [32]:
os.listdir('./data')

['zomato1', 'swiggy1', 'zomato2', 'swiggy2', 'zomato3', 'swiggy3']

## Looping and appending records

In [36]:
# Looping over folders
folders = os.listdir('./data')
#['swiggy1', 'swiggy2', 'swiggy3', 'zomato1', 'zomato2', 'zomato3']

df = pd.DataFrame()

for f in folders :
  documents = SimpleDirectoryReader('./data/'+f).load_data()
  index = GPTSimpleVectorIndex.from_documents(documents)

  # obtain a structured response
  response = index.query(
      "What are the food items purchased and its cost?", 
      text_qa_template=qa_prompt, 
      refine_template=refine_prompt, 
      #llm_predictor=llm_predictor
  )
  #print(response)

  # convert dictionary string to dictionary
  res = json.loads(str(response))

  # get transformed df data
  df_temp = dictToDataFrame(res)

  # append to final df
  df = pd.concat([df, df_temp], axis=0, ignore_index=True)


# Yay!! 💞

In [40]:
df.shape

(14, 6)

In [41]:
df

Unnamed: 0,invoice_number,invoice_date,restaurant_name,total_bill,food_item,cost_price
0,23DZFN3Z00000062,2023-04-01,Kolkata Kathi Rolls,128.64,1 x Chicken Dum Biryani,106.58
1,23DZFN3Z00000062,2023-04-01,Kolkata Kathi Rolls,128.64,1 x Plain Paratha,22.06
2,0217048033000150,2023-03-30,Chinese BAE,203.7,Veg Schezwan Fried Rice,179.0
3,0217048033000150,2023-03-30,Chinese BAE,203.7,Order Packing Charges,15.0
4,4684812609,2023-02-18,Natural Ice Cream,407.04,Tender Coconut Ice Cream,344.92
5,4684812609,2023-02-18,Natural Ice Cream,407.04,Empty Waffle cone,37.32
6,162542398027,2023-03-22,ROTTI MANE UTTARA KARNATAKA STORE,330.75,Dal Holige (Obbattu),157.5
7,162542398027,2023-03-22,ROTTI MANE UTTARA KARNATAKA STORE,330.75,Coconut Holige (Obbattu),157.5
8,4475983444,2022-11-14,Truffles,955.02,Devil's Chicken Sub,157.15
9,4475983444,2022-11-14,Truffles,955.02,Spaghetti Carbonara Chicken,247.62
