## Legal document analysis and Q&A using the RAG framework

In [1]:
# Imports (compatible with the installed package versions)

In [12]:
import os
import torch
import numpy as np

from pypdf import PdfReader

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_community.llms import HuggingFacePipeline
from langchain.embeddings.base import Embeddings

from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM, pipeline


In [3]:
# Load Legal PDF Documents

In [13]:
DATA_DIR = "legal_docs"  # Folder containing your PDFs

# One entry per PDF: {"text": <all extracted page text>, "source": <file name>}.
documents = []

# os.listdir returns entries in arbitrary order; sorting keeps the document
# order (and therefore the chunk order in later cells) reproducible across runs.
for file in sorted(os.listdir(DATA_DIR)):
    if not file.lower().endswith(".pdf"):
        continue

    reader = PdfReader(os.path.join(DATA_DIR, file))

    # Collect the text of each page, skipping pages that yield no text.
    page_texts = []
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            page_texts.append(page_text)

    documents.append({
        # Join pages with a newline so the last word of one page is not fused
        # with the first word of the next page.
        "text": "\n".join(page_texts),
        "source": file,
    })

print(f"Loaded {len(documents)} legal documents")


Loaded 4 legal documents


In [5]:
# Text Chunking (Legal-Friendly)

In [14]:
# Split each document into overlapping chunks, trying the separators in order:
# paragraph breaks, line breaks, sentence ends, spaces, then single characters.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", " ", ""],
    chunk_size=800,
    chunk_overlap=150,
)

# Parallel lists: texts[i] is a chunk and metadatas[i] records the file it came from.
texts, metadatas = [], []

for document in documents:
    pieces = text_splitter.split_text(document["text"])
    texts.extend(pieces)
    # A separate metadata dict per chunk, all pointing at the same source file.
    metadatas.extend({"source": document["source"]} for _ in pieces)

print(f"Total text chunks created: {len(texts)}")


Total text chunks created: 4


In [7]:
# Embedding Model (Sentence-Transformers)

In [15]:
class HFTransformerEmbeddings(Embeddings):
    """LangChain ``Embeddings`` implementation backed by a Hugging Face encoder.

    Each text is tokenized, passed through the transformer, and reduced to a
    single vector by averaging the last hidden state over its real
    (non-padding) tokens.

    Args:
        model_name: Hugging Face model id used for both tokenizer and encoder.
        batch_size: Number of texts encoded per forward pass in
            ``embed_documents``.
        max_length: Token limit; longer inputs are truncated.
    """

    def __init__(
        self,
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        batch_size=32,
        max_length=512,
    ):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        # Inference only: make sure dropout and similar layers are disabled.
        self.model.eval()
        self.batch_size = batch_size
        self.max_length = max_length

    def _encode(self, texts):
        """Embed a list of strings and return one list of floats per string."""
        inputs = self.tokenizer(
            texts,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=self.max_length,
        )
        with torch.no_grad():
            hidden = self.model(**inputs).last_hidden_state

        # Mask-aware mean pooling: padding positions contribute nothing, so a
        # text pools to the same vector (up to floating-point noise) whether it
        # is encoded alone or inside a padded batch.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        summed = (hidden * mask).sum(dim=1)
        token_counts = mask.sum(dim=1).clamp(min=1e-9)  # guard against divide-by-zero
        return (summed / token_counts).tolist()

    def embed_documents(self, texts):
        """Embed many texts.

        Args:
            texts: Iterable of strings.

        Returns:
            A list with one embedding (list of floats) per input text, in the
            same order. Empty input yields an empty list.
        """
        texts = list(texts)
        embeddings = []
        # Encode in batches rather than one forward pass per text.
        for start in range(0, len(texts), self.batch_size):
            embeddings.extend(self._encode(texts[start:start + self.batch_size]))
        return embeddings

    def embed_query(self, text):
        """Embed a single query string and return it as a list of floats."""
        return self._encode([text])[0]


In [17]:
# Initialize Embedding Model

In [18]:
# Shared embedder used both to build the index and to embed queries at search time.
embedding_model = HFTransformerEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)


In [19]:
# Create FAISS Vector Store

In [23]:
# Embed every chunk and index the vectors with FAISS, keeping the per-chunk
# source metadata alongside each entry.
vectorstore = FAISS.from_texts(texts, embedding_model, metadatas=metadatas)

# Persist the index to disk so it can be reloaded without re-embedding.
vectorstore.save_local("faiss_legal_index")

print("FAISS index created and saved")


FAISS index created and saved


In [24]:
# Load FAISS Index (Reusable)

In [26]:
# Reload the index saved to disk; the same embedding model must be supplied so
# that queries are embedded the same way as the indexed chunks.
# NOTE(review): newer langchain_community releases reportedly require
# allow_dangerous_deserialization=True here — confirm against the installed version.
vectorstore = FAISS.load_local("faiss_legal_index", embedding_model)

# Retriever that returns the 4 most similar chunks for each query.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=4),
)

print("FAISS index loaded successfully")


FAISS index loaded successfully


In [27]:
# Load Open-Source LLM (Local)

In [28]:
model_name = "google/flan-t5-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Deterministic generation: request greedy decoding explicitly with
# do_sample=False. temperature=0.0 is not a valid sampling temperature and is
# ignored (with a warning) when sampling is off, so it did not express the
# intent and has been removed.
pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512,
    do_sample=False,
)

# Wrap the transformers pipeline so LangChain chains can call it as an LLM.
llm = HuggingFacePipeline(pipeline=pipe)




In [29]:
# Build RAG QA Chain

In [30]:
# Retrieval-augmented QA: the retrieved chunks are "stuffed" into a single
# prompt for the LLM, and the chunks used are returned alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)


In [31]:
# Ask Legal Questions

In [32]:
query = "What are the termination conditions mentioned in the contract?"

# Use invoke() rather than calling the chain object directly: Chain.__call__
# is deprecated in LangChain releases that ship langchain_community.
# RetrievalQA takes its question under the "query" key.
response = qa_chain.invoke({"query": query})

print("Answer:\n")
print(response["result"])

# List the file each retrieved chunk came from (one line per retrieved chunk).
print("\nSource Documents:\n")
for doc in response["source_documents"]:
    print("-", doc.metadata["source"])




Answer:

30 days written notice

Source Documents:

- Employment_Contract.pdf
- Service_Agreement.pdf
- Non_Disclosure_Agreement.pdf
- Privacy_Policy.pdf



📘 Legal Document Q&A Chat (RAG System)
Type your legal question and press Enter
Type 'exit' or 'quit' to end the session



Legal Question >  What are the termination conditions?





Answer:
30 days written notice

Source Documents:
- Non_Disclosure_Agreement.pdf
- Employment_Contract.pdf
- Privacy_Policy.pdf
- Service_Agreement.pdf

------------------------------------------------------------



Legal Question >  what is the limitation of liability?





Answer:
Liability shall not exceed the total contract value.

Source Documents:
- Non_Disclosure_Agreement.pdf
- Employment_Contract.pdf
- Privacy_Policy.pdf
- Service_Agreement.pdf

------------------------------------------------------------



Legal Question >  exit



Session ended. Thank you.
