## Import libraries

In [1]:
import openai
import os
import requests
import numpy as np
import pandas as pd
from typing import Iterator
import tiktoken
import textract
from numpy import array, average

from database import get_redis_connection

# Set our default models and chunking size
from config import COMPLETIONS_MODEL, EMBEDDINGS_MODEL, CHAT_MODEL, TEXT_EMBEDDING_CHUNK_SIZE, VECTOR_FIELD_NAME

# Setup the API key.
# The key is read from the OPENAI_API_KEY environment variable so that no
# credential is ever typed into (or committed with) the notebook itself.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    # Fail here with an actionable message rather than at the first API call.
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before starting the notebook kernel."
    )
# Ignore unclosed SSL socket warnings - optional in case you get these errors
import warnings

warnings.filterwarnings(action="ignore", message="unclosed", category=ImportWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [2]:
# Column-width display option, same value (0) as before; presumably chosen so
# long text cells are not cut short when result frames are shown - confirm.
pd.options.display.max_colwidth = 0

In [3]:
# Folder holding the source documents, relative to the current working directory.
data_dir = os.path.join(os.curdir,'data_tax')
# Keep only regular files with a .pdf extension (matched case-insensitively) so
# that stray entries such as .DS_Store, Thumbs.db or sub-directories are never
# handed to the PDF text extractor in the ingestion step.
pdf_files = sorted(
    name for name in os.listdir(data_dir)
    if name.lower().endswith('.pdf') and os.path.isfile(os.path.join(data_dir, name))
)
pdf_files

['Arun_Duggal_Faridabad_vs_Dcit_Central_Circle_1_on_4_January_2022.PDF',
 'Assistant_Commissioner_Of_Income_vs_M_S_Hari_Narain_Parwal_Huf_on_24_February_2022.PDF',
 'Astt_Commissioner_Of_Income_Tax_vs_M_S_U_P_Awas_Evam_Vikas_Parishad_on_8_June_2022.PDF',
 'Eshita_Dye_Chem_Pvt_Ltd_Mumbai_vs_Assessee_on_25_January_1006.PDF',
 'Fidelity_Mangaemtn_Research_Co_vs_Department_Of_Income_Tax_on_2_September_6648.PDF',
 'Income_Tax_Officer_Jaipur_vs_Motisons_Jewellers_Ltdl_Jaipur_on_29_September_2022-1.PDF',
 'Income_Tax_Officer_Jaipur_vs_Motisons_Jewellers_Ltdl_Jaipur_on_29_September_2022.PDF',
 'Shri_Ramesh_Chand_Rai_Indore_vs_The_Cit_A_3_Bhopal_Bhopal_on_18_April_2022.PDF',
 'Torrent_Pharmaceuticals_Ltd_vs_The_Deputy_Commissioner_Of_Income_on_22_February_2022.PDF',
 'Young_Indian_New_Delhi_vs_Acit_E_New_Delhi_on_31_March_2022.PDF']

## Setup Redis Vector Database

In [3]:
# Setup Redis
from redis import Redis
from redis.commands.search.query import Query
from redis.commands.search.field import (
    TextField,
    VectorField,
    NumericField
)
from redis.commands.search.indexDefinition import (
    IndexDefinition,
    IndexType
)

# Client object used by every later Redis call in this notebook; the connection
# settings presumably live in the project's `database` module - TODO confirm.
redis_client = get_redis_connection()

## Create an index in Redis

In [4]:
# --- Vector index settings ---
INDEX_NAME = "tax-index-v1"            # name of the search index
PREFIX = "taxdoc_v1"                   # prefix for the document keys
DISTANCE_METRIC = "COSINE"             # metric handed to the index-creation helper
# Rebinds the VECTOR_FIELD_NAME that the first cell imported from config.
VECTOR_FIELD_NAME = "content_vector"
VECTOR_DIM = 1536                      # length of the vectors

In [5]:
# Connectivity check before any index work; the recorded run displayed True.
redis_client.ping()

True

In [9]:
from database import create_hnsw_index 

In [None]:
# Create Index
# Running this cell against a Redis instance that already holds the index makes
# RediSearch reject the create call. Treat exactly that case as success so the
# cell is safe under "Restart & Run All"; any other failure still surfaces.
from redis.exceptions import ResponseError

try:
    create_hnsw_index(redis_client,VECTOR_FIELD_NAME, VECTOR_DIM, DISTANCE_METRIC)
except ResponseError as exc:
    if "index already exists" not in str(exc).lower():
        raise
    print("Index already exists")

In [None]:
# Optional step to drop the index if it already exists
##redis_client.ft(INDEX_NAME).dropindex()

# Reference only: a manual "check, then create" flow, kept commented out.
# NOTE(review): `fields` is not defined in any visible cell, so this snippet
# would need a field list before it could be re-enabled - confirm before use.
# Check if index exists
# try:
#     redis_client.ft(INDEX_NAME).info()
#     print("Index already exists")
# except Exception as e:
#     print(e)
#     # Create RediSearch Index
#     print('Not there yet. Creating')
#     redis_client.ft(INDEX_NAME).create_index(
#         fields = fields,
#         definition = IndexDefinition(prefix=[PREFIX], index_type=IndexType.HASH)
#     )

### Ingestion

We'll load up our PDFs and do the following:

    Initiate our tokenizer
    Run a processing pipeline to:
        Extract the text from each PDF
        Split the text into chunks and embed them
        Store the embedded chunks in Redis

In [13]:
from transformers import handle_file_string

In [18]:
%%time
# This step takes about 5 minutes

# Initialise tokenizer
tokenizer = tiktoken.get_encoding("cl100k_base")

# Process each PDF file and prepare for embedding
for pdf_file in pdf_files:
    
    pdf_path = os.path.join(data_dir,pdf_file)
    print(pdf_path)
    
    # Extract the raw text from each PDF using textract
    text = textract.process(pdf_path, method='pdftotext')
    
    # Chunk each document, embed the contents and load to Redis
    handle_file_string((pdf_file,text.decode("utf-8")),tokenizer,redis_client,VECTOR_FIELD_NAME,INDEX_NAME)

.\data_tax\Arun_Duggal_Faridabad_vs_Dcit_Central_Circle_1_on_4_January_2022.PDF
.\data_tax\Assistant_Commissioner_Of_Income_vs_M_S_Hari_Narain_Parwal_Huf_on_24_February_2022.PDF
.\data_tax\Astt_Commissioner_Of_Income_Tax_vs_M_S_U_P_Awas_Evam_Vikas_Parishad_on_8_June_2022.PDF
.\data_tax\Eshita_Dye_Chem_Pvt_Ltd_Mumbai_vs_Assessee_on_25_January_1006.PDF
.\data_tax\Fidelity_Mangaemtn_Research_Co_vs_Department_Of_Income_Tax_on_2_September_6648.PDF
.\data_tax\Income_Tax_Officer_Jaipur_vs_Motisons_Jewellers_Ltdl_Jaipur_on_29_September_2022-1.PDF
.\data_tax\Income_Tax_Officer_Jaipur_vs_Motisons_Jewellers_Ltdl_Jaipur_on_29_September_2022.PDF
.\data_tax\Shri_Ramesh_Chand_Rai_Indore_vs_The_Cit_A_3_Bhopal_Bhopal_on_18_April_2022.PDF
.\data_tax\Torrent_Pharmaceuticals_Ltd_vs_The_Deputy_Commissioner_Of_Income_on_22_February_2022.PDF
.\data_tax\Young_Indian_New_Delhi_vs_Acit_E_New_Delhi_on_31_March_2022.PDF
CPU times: total: 34.9 s
Wall time: 1min 26s


## The below steps are only for testing purposes.

In [39]:
# Check that our docs have been inserted
# NOTE(review): the index name here is the literal "idx", not INDEX_NAME
# ("tax-index-v1") - confirm which index the ingestion step populates.
redis_client.ft("idx").info()['num_docs']

'2749'

### Search in the Database

In [6]:
from database import get_redis_results

In [7]:
%%time

# Free-text question sent to the search helper; the first two returned rows are displayed.
# NOTE(review): the index is addressed by the literal "idx" rather than
# INDEX_NAME - confirm this is the index the documents were loaded into.
prompt = "Grounds of appeal raised by the Revenue for AY 2010-11 in IT(SS)A No.49/Ind/2021"
result_df = get_redis_results(redis_client,prompt,index_name="idx")
result_df.head(2)

CPU times: total: 31.2 ms
Wall time: 754 ms


Unnamed: 0,id,result,certainty,filename
0,0,"\r 167 At the outset, we note that the issue raised by the Revenue in its ground of appeal for the AY 2011-12 is identical to the issue raised by the assessee vide ground no. 1 in ITA No. 1285/AHD/2017 for the assessment year 2009- 10 and ground no. 2 in ITA No. 1286/Ahd/2017 for A.Y. 2010-11 Therefore, the findings given in ITA No. 1285 and 1286/AHD/2017 shall also be applicable for the year under consideration i.e. AY 2011-12. The appeal of the Assessee for the assessment 2009-10 and 2010-11 has been decided by us vide paragraph Nos. 12 and 84 of this order and has been decided in favour of the assessee. The learned AR and the DR also agreed that whatever will be the findings for the assessment year 2009-10 and 2010-11 shall also be applied for the year under consideration i.e. AY 2011-12. Hence, the grounds of appeal filed by the Revenue is hereby dismissed.\r 168. In the result appeal of the Revenue is dismissed.\r Coming to ITA No. 1397/AHD/2018, an appeal by the assessee for the AY 2012-13 ITA.Nos.1285/Ahd/2017 & 7 others A.Y.2009-10\r 169. The assessee has raised following grounds of appeal:\r 1. On the facts and in the circumstances of the case, the learned CIT(Appeals) erred in confirming disallowance of Rs.7,26,245 made by the Assessing Officer in respect of Employees Provident Fund and ESI contributions on the ground that these payments were made by the appellant company beyond the time limit prescribed under the relevant provisions of PF and ESIC Acts.",0.0786235928535,Torrent_Pharmaceuticals_Ltd_vs_The_Deputy_Commissioner_Of_Income_on_22_February_2022.PDF
1,1,"\r \r 123. At the outset, we note that the issue raised by the Revenue in its ground of appeal for the AY 2010-11 is identical to the issue raised by the Revenue vide ground no. 8 in ITA No. 1327/AHD/2017 for the assessment year 2009-\r \r Indian Kanoon - http://indiankanoon.org/doc/167968689/\r \r 66\r \r Torrent Pharmaceuticals Ltd.,, ... vs The Deputy Commissioner Of Income ... on 22 February, 2022\r 10. Therefore, the findings given in ITA No. 1327/AHD/2017 shall also be applicable for the year under consideration i.e. AY 2010-11. The ground of appeal of the Revenue for the assessment 2009-10 has been decided by us vide paragraph Nos. 72 of this order against the Revenue. The learned AR and the DR also agreed that whatever will be the findings for the assessment year 2009-10 shall also be applied for the year under consideration i.e. AY 2010-\r \r 11. Hence, the ground of appeal filed by the Revenue is hereby dismissed.\r \r ITA.Nos.1285/Ahd/2017 & 7 others A.Y.2009-10\r \r 124. In the result appeal of the Revenue is dismissed.\r \r Coming to ITA No. 1396/AHD/2018, an appeal by the Assessee for the AY 2011-12\r \r 125. The assessee has raised following grounds of appeal\r \r 1. On the facts and in the circumstances of the case, the learned CIT(Appeals) erred in confirming the addition of Rs.",0.0796328783035,Torrent_Pharmaceuticals_Ltd_vs_The_Deputy_Commissioner_Of_Income_on_22_February_2022.PDF


### Use a GPT completion model to summarise the top search result

In [48]:
# Prompt template; the two *_HERE markers are substituted just below.
summary_prompt = '''Summarise this result in a bulleted list to answer the search query a customer has sent.
    Search query: SEARCH_QUERY_HERE
    Search result: SEARCH_RESULT_HERE
    Summary:
    '''
# Nothing to summarise if the search came back empty - say so explicitly
# instead of failing with an opaque lookup error.
if result_df.empty:
    raise ValueError("The search returned no rows, so there is nothing to summarise.")
# Positional lookup: take the first returned row regardless of the frame's index labels.
top_result = result_df['result'].iloc[0]
summary_prepped = summary_prompt.replace('SEARCH_QUERY_HERE',prompt).replace('SEARCH_RESULT_HERE',top_result)
# Single completion request, capped at 500 generated tokens.
summary = openai.Completion.create(engine=COMPLETIONS_MODEL,prompt=summary_prepped,max_tokens=500)
# Display the generated text of the first (only requested) choice.
summary['choices'][0]['text']

' -Issue of ground of appeal raised by Revenue for AY 2010-11 is same as ITA No. 1285/Ahd/2017 & 1286/Ahd/2017 for AY 2009-10 and 2010-11.\n     -Ground of appeal by Revenue dismissed as the findings in ITA No. 1285&1286/Ahd/2017 would apply to AY 2011-12.\n     -Appeal by Assessee for AY 2012-13 raised ground of disallowance for Rs.7,26,245 made by Assessing Officer for Employees Provident Fund and ESI contributions made beyond the time limit prescribed.'