
## This project uses RAG (Retrieval-Augmented Generation) to ensure our question-answering assistant has high accuracy.

## Expert Knowledge Worker on Court Case Files

### A question answering agent that is an expert knowledge worker

In [35]:
# imports
import os
import glob
from dotenv import load_dotenv
import gradio as gr
import pandas as pd
import pyarrow.parquet as pq

In [36]:
# imports for langchain and Chroma

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings

In [37]:
# Preview the raw dataset.
# - expanduser resolves "~" to the current user's home directory, so the path is
#   no longer tied to one specific account name.
# - head(10) bounds the rendered output no matter how many cases the file holds.
pd.read_parquet(os.path.expanduser("~/Downloads/0000.parquet")).head(10)

Unnamed: 0,judgement,summary_a1,summary_a2
0,this appeal is preferred against the judgment ...,FACTS\nthis appeal is preferred against the ju...,"FACTS\non 18.11.1994, at about 8.00 a.m. in th..."
1,this appeal is by special leave against the ju...,FACTS\nthis appeal is by special leave against...,FACTS\nthis appeal is by special leave against...
2,interpretation and or application of the provi...,FACTS\nthe government of gujarat in exercise o...,FACTS\nplot nos.17/7 and 17/8 were owned by re...
3,twelve accused persons were tried for offences...,FACTS\ntwelve accused persons were tried for o...,FACTS\ntwelve accused persons were tried for o...
4,it was just the other day that our brothers ra...,FACTS\nthe price which the manufacturer of bar...,FACTS\nthe price which the manufacturer of bar...
5,challenge in this appeal is to the order of a ...,"FACTS\nthe deceased is one mayurani,a sri lank...","FACTS\nthe deceased is one mayurani,a sri lank..."
6,the state of manipur is in appeal before us qu...,FACTS\none shri a.j.tayeng was the revenue com...,FACTS\none shri a.j.tayeng was the revenue com...
7,this is an appeal on a certificate article 133...,FACTS\none dwarka nath was the owner of consid...,FACTS\none dwarka nath was the owner of consid...
8,this appeal is directed against the judgment a...,FACTS\nthe respondent joined service as techni...,FACTS\nthe respondent joined service as techni...
9,these appeals arising out of a judgment and or...,FACTS\nthe appellant nos.1 and 2 are degree ho...,FACTS\nthe appellant nos.1 and 2 are degree ho...


In [38]:
# Cost matters for this project, so the chat model is a low cost one.
# db_name is the directory the vector store is persisted under.

db_name = "vector_dds"
MODEL = "gpt-4o-mini"

In [39]:
# Load environment variables from a file called .env

load_dotenv()

# Keep the best-effort placeholder fallback so the variable is always defined,
# but report it: with the placeholder in place, requests to OpenAI are expected
# to fail authentication, and a silent fallback hides the real cause.
_PLACEHOLDER_API_KEY = 'your-key-if-not-using-env'
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', _PLACEHOLDER_API_KEY)
if os.environ['OPENAI_API_KEY'] == _PLACEHOLDER_API_KEY:
    print(
        "WARNING: OPENAI_API_KEY is not set; falling back to a placeholder value. "
        "Add OPENAI_API_KEY to your .env file before calling OpenAI."
    )

In [40]:


def convert_parquet_to_documents(parquet_file_path):
    """
    Load a parquet file and render each row as one formatted text document.

    Every document consists of a "CASE #<n>" header (n counts rows from 1, in
    file order), the row's ``summary_a1`` text under "SUMMARY:" and the row's
    ``judgement`` text under "JUDGEMENT:", framed by separator rules.

    Args:
        parquet_file_path: Path to the parquet file. The file must provide the
            ``summary_a1`` and ``judgement`` columns.

    Returns:
        documents: List of formatted document strings, one per row.

    Raises:
        KeyError: If a required column is missing from the file.
    """
    # Load the parquet file
    df = pq.read_table(parquet_file_path).to_pandas()

    # Fail early with a clear message instead of part-way through the rows
    required_columns = ("summary_a1", "judgement")
    missing_columns = [name for name in required_columns if name not in df.columns]
    if missing_columns:
        raise KeyError(f"Parquet file is missing required column(s): {missing_columns}")

    heavy_rule = "=" * 80
    light_rule = "-" * 80

    # Number cases by position rather than by index label: the frame's index is
    # not guaranteed to be 0..n-1 (or even numeric), so `label + 1` could give
    # wrong case numbers or raise a TypeError.
    documents = [
        (
            f"CASE #{case_number}\n"
            f"{heavy_rule}\n\n"
            f"SUMMARY:\n"
            f"{summary}\n\n"
            f"JUDGEMENT:\n"
            f"{judgement}\n\n"
            f"{light_rule}\n\n"
        )
        for case_number, (summary, judgement) in enumerate(
            zip(df["summary_a1"], df["judgement"]), start=1
        )
    ]

    print(f"Converted {len(documents)} cases to document format")
    return documents

# Example usage
# expanduser resolves "~" to the current user's home directory instead of
# hard-coding one account's absolute path.
parquet_path = os.path.expanduser("~/Downloads/0000.parquet")
documents = convert_parquet_to_documents(parquet_path)

# Show only the beginning of the first document: a whole case is too long to
# read in cell output, and an empty dataset must not raise IndexError here.
PREVIEW_CHARS = 1000
if documents:
    print(f"First document sample:\n{documents[0][:PREVIEW_CHARS]}...")
else:
    print("No documents were produced from the parquet file")

Converted 50 cases to document format
First document sample:
CASE #1

SUMMARY:
FACTS
this appeal is preferred against the judgment passed by the high court of punjab and haryana in criminal appeal no.181 sb of 2000, whereby the high court partly allowed the appeal filed by the appellants thereby confirming the conviction of the appellants with certain modifications.
case of the prosecution is that on the fateful day i.e. 18.11.1994, at about 8.00 a.m. in the morning the complainant jagdish (pw-5) along with his two sons namely sukhbir and mange ram (pw-6) were busy in cutting pullas (reeds) from the dola of their field.
at that time, jage ram (a-1) and his sons rajbir singh. raju (a-2), rakesh (a-3) and madan (a-4) armed with jaily, pharsi and lathis respectively, entered the land where the complainant was working with his sons and asked them not to cut the pullas as it was jointly held by both the parties.
wordy altercations ensued between the parties and jage ram insisted that he wou

In [42]:
# Wrap each formatted case string in a LangChain Document, then split the
# Documents into chunks ready for embedding

from langchain_text_splitters import CharacterTextSplitter
from langchain_core.documents import Document

# One Document per case: doc_id is the case's position in `documents`,
# doc_type is a fixed tag (customize this metadata as needed)
langchain_documents = [
    Document(
        page_content=case_text,
        metadata={"doc_id": position, "doc_type": "legal_case"},
    )
    for position, case_text in enumerate(documents)
]

# Split the documents into chunks
# NOTE(review): with chunk_size=100000 the recorded run produced 51 chunks from
# 50 cases, i.e. nearly every case stays a single chunk - confirm this is the
# intended granularity for retrieval.
text_splitter = CharacterTextSplitter(chunk_overlap=20000, chunk_size=100000)
chunks = text_splitter.split_documents(langchain_documents)

doc_types = {document.metadata['doc_type'] for document in langchain_documents}
print(f"Total number of chunks: {len(chunks)}")
print(f"Document types found: {doc_types}")


Total number of chunks: 51
Document types found: {'legal_case'}


In [43]:
# Store the chunks in a Vector Store that associates a Vector Embedding with each chunk
# Chroma is a popular open source Vector Database based on SQLite

embeddings = OpenAIEmbeddings()

# To use the free Vector Embeddings from HuggingFace sentence-transformers instead,
# replace embeddings = OpenAIEmbeddings() with:
# from langchain.embeddings import HuggingFaceEmbeddings
# embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Start clean: if the persist directory already exists, drop its collection first

if os.path.exists(db_name):
    stale_store = Chroma(persist_directory=db_name, embedding_function=embeddings)
    stale_store.delete_collection()

# Build the vector store from the chunks and persist it under db_name

vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=db_name,
)
stored_chunk_count = vectorstore._collection.count()
print(f"Vectorstore created with {stored_chunk_count} documents")

Vectorstore created with 51 documents


In [44]:
# Inspect the stored vectors: how many are there, and how long is each one?

collection = vectorstore._collection
count = collection.count()

# Fetch a single record together with its embedding to measure the vector length
sample_record = collection.get(limit=1, include=["embeddings"])
sample_embedding = sample_record["embeddings"][0]
dimensions = len(sample_embedding)

print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")

There are 51 vectors with 1,536 dimensions in the vector store


In [45]:
# Chat model that generates the answers (the model name comes from MODEL)
llm = ChatOpenAI(model_name=MODEL, temperature=0.7)

# Alternative - if you'd like to use Ollama locally, uncomment this line instead
# llm = ChatOpenAI(temperature=0.7, model_name='llama3.2', base_url='http://localhost:11434/v1', api_key='ollama')

# Conversation memory for the chat, stored under the 'chat_history' key
memory = ConversationBufferMemory(return_messages=True, memory_key='chat_history')

# The retriever is an abstraction over the VectorStore that will be used during RAG
retriever = vectorstore.as_retriever()

# Putting it together: the conversation chain combines the LLM, the retriever and the memory
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
)

## Now we will bring this up in Gradio using the Chat interface -

A quick and easy way to prototype a chat with an LLM

In [46]:
# Chat callback: one question in, one answer out

def chat(question, history):
    """Send `question` through the conversation chain and return its answer text.

    `history` is accepted as the second argument but is not read here; only the
    question is passed to `conversation_chain`.
    """
    return conversation_chain.invoke({"question": question})["answer"]

In [47]:
# Serve the chat function through Gradio's chat interface and open it in the browser

chat_interface = gr.ChatInterface(chat, type="messages")
view = chat_interface.launch(inbrowser=True)

* Running on local URL:  http://127.0.0.1:7863

To create a public link, set `share=True` in `launch()`.
