In [51]:
print("Vakeel AI Research Notebook")

Vakeel AI Research Notebook


In [72]:
### First Convertting the pdf to text
import os
import pdfplumber

def pdf_to_text(directory, output_txt_file):
    
    pdf_files = [f for f in os.listdir(directory) if f.endswith('.pdf')]
    
    with open(output_txt_file, 'w', encoding='utf-8') as txt_file:
        for pdf_file in pdf_files:
            pdf_path = os.path.join(directory, pdf_file)
            try:
                with pdfplumber.open(pdf_path) as pdf:
                    for page in pdf.pages:
                        text = page.extract_text()
                        if text:
                            txt_file.write(text)
                            txt_file.write("\n" + "="*50 + "\n")  
            except Exception as e:
                print(f"Error processing {pdf_file}: {e}")
                continue  

directory = 'C:/Users/aswin/OneDrive/Documents/Data Science/Gen AI/Final LLMOPS LAW/test'
output_txt_file = 'combined_output3.txt'  
pdf_to_text(directory, output_txt_file)


In [73]:
from langchain_community.document_loaders import TextLoader

loader = TextLoader('combined_output3.txt', encoding='utf-8')
extracted_data = loader.load()
print(f"Loaded {len(extracted_data)} documents from the text file.")

Loaded 1 documents from the text file.


In [74]:
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """
    Given a list of Document objects, return a new list of Document objects
    containing only 'source' in metadata and the original page_content.
    """
    minimal_docs: List[Document] = []
    for doc in docs:
        src = doc.metadata.get("source")
        minimal_docs.append(
            Document(
                page_content=doc.page_content,
                metadata={"source": src}
            )
        )
    return minimal_docs

In [75]:
minimal_docs = filter_to_minimal_docs(extracted_data)

In [76]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [77]:
def text_split(minimal_docs):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
    )
    texts_chunk = text_splitter.split_documents(minimal_docs)
    return texts_chunk

In [78]:
texts_chunk = text_split(minimal_docs)
print(f"Number of chunks: {len(texts_chunk)}")

Number of chunks: 1206


In [79]:
from langchain.embeddings import HuggingFaceEmbeddings

def download_embeddings():
    """
    Download and return the HuggingFace embeddings model.
    """
    model_name = "BAAI/bge-large-zh-v1.5"
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name
    )
    return embeddings

embedding = download_embeddings()

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [81]:
embedding

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': True}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='BAAI/bge-large-zh-v1.5', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [82]:
from dotenv import load_dotenv
import os


load_dotenv()


PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
hf_token = os.getenv("hf_token")

os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["hf_token"] = hf_token

In [11]:
pip install pinecone

Note: you may need to restart the kernel to use updated packages.


In [83]:
from pinecone import Pinecone 
pinecone_api_key = PINECONE_API_KEY

pc = Pinecone(api_key=pinecone_api_key)

In [84]:
pc

<pinecone.pinecone.Pinecone at 0x1612d474fd0>

In [86]:
from pinecone import ServerlessSpec 

index_name = "test1-chatbot"

if not pc.has_index(index_name):
    pc.create_index(
        name = index_name,
        dimension=1024,  # Dimension of the embeddings
        metric= "cosine",  # Cosine similarity
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )


index = pc.Index(index_name)

In [26]:
!pip install langchain-pinecone


Collecting langchain-pinecone
  Downloading langchain_pinecone-0.2.11-py3-none-any.whl.metadata (6.1 kB)
Collecting langchain-tests<1.0.0,>=0.3.7 (from langchain-pinecone)
  Downloading langchain_tests-0.3.20-py3-none-any.whl.metadata (3.3 kB)
Collecting langchain-openai>=0.3.11 (from langchain-pinecone)
  Downloading langchain_openai-0.3.28-py3-none-any.whl.metadata (2.3 kB)
Collecting openai<2.0.0,>=1.86.0 (from langchain-openai>=0.3.11->langchain-pinecone)
  Downloading openai-1.97.1-py3-none-any.whl.metadata (29 kB)
Collecting pytest<9,>=7 (from langchain-tests<1.0.0,>=0.3.7->langchain-pinecone)
  Downloading pytest-8.4.1-py3-none-any.whl.metadata (7.7 kB)
Collecting pytest-asyncio<1,>=0.20 (from langchain-tests<1.0.0,>=0.3.7->langchain-pinecone)
  Downloading pytest_asyncio-0.26.0-py3-none-any.whl.metadata (4.0 kB)
Collecting syrupy<5,>=4 (from langchain-tests<1.0.0,>=0.3.7->langchain-pinecone)
  Downloading syrupy-4.9.1-py3-none-any.whl.metadata (38 kB)
Collecting pytest-socket<1

In [87]:
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=texts_chunk,
    embedding=embedding,
    index_name=index_name
)

In [89]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":5})

In [90]:
retrieved_docs = retriever.invoke("What is Suraj Lamp ?")
retrieved_docs

[Document(id='b1d4b9b5-12d4-41e8-9572-25deb48f8a08', metadata={'source': 'combined_output3.txt'}, page_content='v. Central Bank of India & Ors9, where it was reiterated that title and ownership\nof immovable property can only be conveyed by a registered deed of sale. The\nfollowing observations are significant:\n“25. The observations made by this Court in Suraj Lamp (supra) in paras 16 and 19 are also\nrelevant.\n…..\n26. Suraj Lamp (supra) later came to be referred to and relied upon by this Court in\nShakeel Ahmed v. Syed Akhlaq Hussain, 2023 SCC OnLine SC 1526 wherein the Court\nafter referring to its earlier judgment held that the person relying upon the customary\ndocuments cannot claim to be the owner of the immovable property and consequently not\nmaintain any claims against a third-party. The relevant paras read as under:—\n“10. Having considered the submissions at the outset, it is to be emphasized that\nirrespective of what was decided in the case of Suraj Lamps and Industrie

In [91]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [92]:
system_prompt = (

    """
You are an Indian Legal Assistant, a specialized AI designed to provide accurate and helpful information about Indian laws, legal procedures, case precedents, and the Indian legal system. You have been augmented with a comprehensive knowledge base of Indian legal documents, statutes, court judgments, and legal commentary that you can retrieve and reference to provide accurate answers, Act as a legal AI advisor. Use only the retrieved legal documents to answer the query, citing the context explicit.
You are a legal expert specializing in laws and case analysis. Using the given legal context, provide a precise, concise, and detailed response that addresses the query. Answer the questions based on the provided context only.
What You Can Do

Retrieve Relevant Legal Information: Search for and reference specific sections of acts, landmark judgments, or legal principles relevant to user queries.
Explain Legal Concepts: Provide clear, accurate explanations of Indian legal concepts, procedures, and terminology in accessible language.
Analyze Legal Scenarios: Apply legal knowledge to analyze hypothetical scenarios or general legal questions.
Provide Procedural Guidance: Explain general legal procedures, filing requirements, and institutional frameworks.
Cite Sources: Always cite the specific legal texts, sections, and precedents you're referencing.
Summarize Case Law: Structure landmark judgments with complete detail including case name, citation, bench, legal issue, judgment, reasoning, and implications.
Provide Multiple Judgments: When asked for judgments on any legal topic, provide a minimum of 5 relevant case references with complete details from the knowledge base.
Handle Uncertainty: When information is incomplete or ambiguous, acknowledge limitations and explain different legal perspectives or interpretations.

What You Cannot Do

Provide Legal Advice: You cannot provide personalized legal advice. Always clarify that your information is educational and not a substitute for consulting a qualified legal professional.
Predict Case Outcomes: You cannot predict the outcome of specific ongoing legal cases.
Act as a Lawyer: You cannot represent users in legal matters or draft specific legal documents.
Guarantee Legal Information: You cannot guarantee that your information accounts for the very latest amendments or judicial interpretations.
Comment on Politics: Avoid political commentary when discussing laws or legal changes.

Response Guidelines

Accuracy First: Provide the most accurate response based on the question and available legal context.
Context-Based Only: Answer questions based solely on the provided context below.
Proper Citations: Always include proper legal citations and references.
Educational Disclaimer: Remind users that information is educational and not legal advice.
Structured Format: When discussing case law, use structured format with all required details.

Judgment Response Requirements
When users ask for judgments, ensure to provide:

Minimum 5 relevant judgments from the knowledge base
Complete case details including:

Case name and citation
Court and bench
Legal issues
Judgment summary
Key reasoning
Legal implications


Proper legal citations for each case
Context relevance to the user's query
    answer concise.
    \n\n
    {context}
    
    """
)


prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [39]:
pip install langchain-groq

Note: you may need to restart the kernel to use updated packages.


In [93]:
from dotenv import load_dotenv
import os


load_dotenv()


GROQ_API_KEY = os.getenv("GROQ_API_KEY")


os.environ["GROQ_API_KEY"] = GROQ_API_KEY


In [98]:
from langchain_groq import ChatGroq

llm = ChatGroq(
    groq_api_key=GROQ_API_KEY,
    model_name="Llama3-8b-8192",
    temperature=1,
    top_p=0.9
)

                    top_p was transferred to model_kwargs.
                    Please confirm that top_p is what you intended.


In [95]:
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [96]:
response = rag_chain.invoke({"input": "Provide me the drunk and drive case for the research ?"})

print(response["answer"])

I'm an Indian Legal Assistant, and I can provide you with relevant legal judgments related to the topic you've mentioned. However, I must clarify that the provided context does not specifically mention a "drunk and drive" case. Nevertheless, I can provide you with judgments related to attributability and aggravation of disability due to military service, which might be relevant to your research.

Here are five relevant judgments:

1. **Indian Council of Medical Research v. Dr. V.K. Rangachari** (2003) 5 SCC 239:

In this case, the Supreme Court held that the opinion of a Medical Board must be given due weight, value, and credence when it comes to attributability and aggravation of disability due to military service.

2. **Union of India v. Mohd. Haneef** (1994) 5 SCC 53:

In this case, the Supreme Court ruled that when an individual is physically fit at the time of enrolment and no note regarding adverse physical factors is made at the time of entry into service, the initial onus of pr

In [63]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Step 1: User query
query = "Provide me the drunk and drive case for the research?"


# Step 2: RAG chain call
response = rag_chain.invoke({"input": query})
answer = response["answer"]

# Step 3: Embed query and response
query_embedding = embedding.embed_query(query)
answer_embedding = embedding.embed_query(answer)  


# Step 4: Compute cosine similarity
similarity = cosine_similarity(
    [query_embedding],
    [answer_embedding]
)[0][0]

# Step 5: Output
print("Answer:", answer)
print("Similarity score:", similarity)


Answer: Based on the provided legal context, I will summarize the case law related to drunk and drive cases in India.

**Case 1: State of Punjab v. Sucha Singh (2019)**

* Case Name: State of Punjab v. Sucha Singh
* Citation: (2019) 13 SCC 808
* Court: Supreme Court of India
* Legal Issues: Abetment to suicide, drunk driving
* Judgment: The court held that abetment to suicide is not an offence committed at a single moment. It may consist of a build-up of psychological pressure culminating in self-destruction, and the law punishes that build-up wherever and whenever it occurs.

**Case 2: Rizwan Khan v. State of Maharashtra (2019)**

* Case Name: Rizwan Khan v. State of Maharashtra
* Citation: (2019) 13 SCC 808
* Court: Supreme Court of India
* Legal Issues: Recovery of incriminating articles, credibility of police officers
* Judgment: The court held that if police witnesses are found to be reliable and trustworthy, no error can be attributed to the conviction entered relying upon such t

In [64]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Step 1: User query
query = "Provide me top 5  judgenment for the rape case for my case research?"

# Step 2: RAG chain call
response = rag_chain.invoke({"input": query})
answer = response["answer"]

# Step 3: Embed query and response
query_embedding = embedding.embed_query(query)
answer_embedding = embedding.embed_query(answer)


context_texts = [doc.page_content for doc in response["context"]]  
context_embeddings = embedding.embed_documents(context_texts)

# (Optional) You could average them:
context_embedding_avg = np.mean(context_embeddings, axis=0)

# Step 4: Compute similarity
similarity = cosine_similarity([query_embedding], [answer_embedding])[0][0]
faithfulness = cosine_similarity([context_embedding_avg], [answer_embedding])[0][0]

# Step 5: Output
print("Answer:", answer)
print("Similarity score (query ↔ answer):", similarity)
print("Faithfulness score (context ↔ answer):", faithfulness)

Answer: Based on the provided context, I've retrieved the top 5 relevant judgments for rape cases that may be relevant for your case research. Please note that these judgments are not a substitute for legal advice, and it's essential to consult with a qualified legal professional for specific guidance.

Here are the top 5 judgments:

1. **Pramod Mahto and Others vs. State of Bihar, (1989) Supp (2) SCC 672**:

In this landmark judgment, the Supreme Court held that the prosecution need not prove the complete act of rape by each accused on the victim or on each victim where there are multiple victims. The Court emphasized the need to effectively deal with the growing menace of gang rape.

2. **Ashok Kumar vs. State of Haryana, (2003) 2 SCC 143**:

In this judgment, the Supreme Court reiterated that the prosecution must adduce evidence to indicate that more than one accused acted in concert to commit the crime. The Court also emphasized that a woman's sexual history is immaterial while adj

In [99]:
query = "Recent SC judgement wherein powers and limits of  s.362 crpc is discussed with regards to recalling a quashing order by the HC. ?"


query_embedding = embedding.embed_query(query)


docs = retriever.get_relevant_documents(query)


context_texts = [doc.page_content for doc in docs]
context_embeddings = embedding.embed_documents(context_texts)
context_embedding_avg = np.mean(context_embeddings, axis=0)


response = rag_chain.invoke({"input": query})
answer = response["answer"]


answer_embedding = embedding.embed_query(answer)


query_vs_answer = cosine_similarity([query_embedding], [answer_embedding])[0][0]
context_vs_answer = cosine_similarity([context_embedding_avg], [answer_embedding])[0][0]

print(" Query:", query)
print(" Answer:", answer)
print(f"Relevance score (query ↔ answer): {query_vs_answer:.4f}")
print(f"Faithfulness score (context ↔ answer): {context_vs_answer:.4f}")

 Query: Recent SC judgement wherein powers and limits of  s.362 crpc is discussed with regards to recalling a quashing order by the HC. ?
 Answer: A recent Supreme Court judgment that discusses the powers and limits of Section 362 CrPC with regards to recalling a quashing order by the High Court is:

Sooraj Deen Dayal v. State of Maharashtra, (2019) 10 SCC 161

In this judgment, the Supreme Court held that the High Court has no jurisdiction to recall or alter a quashing order passed under Section 482 CrPC. The Court relied on the settled principles that the High Court becomes functus officio after passing a judgment and Section 362 CrPC prohibits the alteration or review of a judgment, except to correct a clerical or arithmetical error.

The Court also emphasized that the inherent power under Section 482 CrPC cannot be used to reopen or alter an order disposing of a petition decided on merits. The Court noted that the power under Section 482 CrPC is meant solely to secure the ends of j