## ***Installing Dependencies***

In [None]:
! pip install -U langchain-community

In [None]:
! pip install llama-cpp-python langchain PyPDF2 pypdf chromadb sentence-transformers ctransformers langchain_groq


## ***Importing Dependencies***

In [4]:
import os
import warnings
warnings.filterwarnings("ignore")
from pypdf import PdfReader as PyPdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.schema import SystemMessage, HumanMessage
from langchain.chains import RetrievalQA
from langchain_groq import ChatGroq

## ***API Configuration***

In [5]:
os.environ["GROQ_API_KEY"] = "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"

## ***Initializing GROQ API Model - Mixtral-8x7b-32768***

In [6]:
llm_groq = ChatGroq(model_name="mixtral-8x7b-32768", api_key=os.getenv("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"))
#llm_groq = ChatGroq(model_name="gemma2-9b-it", api_key=os.getenv("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"))
#llm_groq = ChatGroq(model_name="Gemma 7B", api_key=os.getenv("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"))
#llm_groq = ChatGroq(model_name="Falcon 7B", api_key=os.getenv("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"))


## ***Preprocessing Layer***

#### ***Document Reading***

In [7]:
def extract_text_from_pdf(pdf_path):
    text = ""
    try:
        reader = PyPdfReader(pdf_path)
        text = "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
    return text

pdf_files = {
    "doc1": "Project_input_1.pdf",
    "doc2": "Project_input_2.pdf"
}
documents = {name: extract_text_from_pdf(path) for name, path in pdf_files.items()}
print(f"{len(documents)} Document Loaded")



2 Document Loaded


## ***Building a RAG Pipeline (Summarization  - Legal Chat Bot  -FAQs)***

### ***Semantic Layer***

#### ***Step 1: Chunking the text in the Documents***

In [8]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.create_documents(list(documents.values()))
print("Chunking Completed Successfully")


Chunking Completed Successfully


#### ***Step 2: Creating Vector Embeddings***

In [10]:
#embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
embedding_model = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")

documents_text = [chunk.page_content for chunk in chunks]
document_embeddings = embedding_model.embed_documents(documents_text)


  embedding_model = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# import shutil
# shutil.rmtree("./chroma_db", ignore_errors=True)

#### ***Step 3: Storing Embeddings to Chroma DB***

In [11]:
#vector_db = FAISS.from_texts(documents_text, embedding_model)
vector_db = Chroma.from_texts(documents_text, embedding_model, persist_directory="./chroma_db")
print("Documents Embedded and Stored Successfully")

Documents Embedded and Stored Successfully


### ***Functional Layer***

#### ***Step 1: Summarization of Legal Preceddings***

In [12]:
def summarize_legal_document(doc_text, chunk_size=2000):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=200)
    chunks = text_splitter.split_text(doc_text)

    summaries = []
    for i, chunk in enumerate(chunks):
        print(f"Summarizing chunk {i+1}/{len(chunks)}...")

        prompt = (
            "You are an AI legal expert specializing in legal compliance and case law analysis. "
            "Summarize the legal documents in a crisp and structured case history . "
            " The Case story should highlighting key legal arguments, precedents and final judgments in a bulleted format. "
            "Use a professional and legal tone when necessary, focusing on clarity and brevity.\n\n"
            f"### Legal Document Chunk {i+1}:\n{chunk}\n\n"
            " Provide a structured legal summary below:"
        )

        try:
            response = llm_groq.invoke([HumanMessage(content=prompt)])
            summaries.append(response.content.strip())

        except Exception as e:
            print(f"Error summarizing chunk {i+1}: {e}")
            #summaries.append("Summary unavailable for this section.")

    return "\n\n".join(summaries)

case_histories = {name: summarize_legal_document(text) for name, text in documents.items()}


# # Older Prompt
# def summarize_legal_document(doc_text, chunk_size=2000):
#     # Step 1: Chunk the text to prevent exceeding Groq API token limits
#     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=200)
#     chunks = text_splitter.split_text(doc_text)

#     summaries = []

#     # Step 2: Process each chunk separately
#     for i, chunk in enumerate(chunks):
#         print(f"🔹 Summarizing chunk {i+1}/{len(chunks)}...")  # Progress tracking

#         prompt = (
#             "You are an AI legal expert. Summarize the following legal document into a structured case history:\n\n"
#             f"{chunk}\n\n"
#             "Provide key details such as case background, legal arguments, precedents, and final judgments."
#         )

#         try:
#             response = llm_groq.invoke([HumanMessage(content=prompt)])
#             summaries.append(response.content.strip())

#         except Exception as e:
#             print(f"Error summarizing chunk {i+1}: {e}")
#             summaries.append("Summary unavailable for this section.")

#     return "\n\n".join(summaries)  # Combine all chunk summaries

# # Step 3: Summarizing each document in the dataset
# case_histories = {name: summarize_legal_document(text) for name, text in documents.items()}

# # Step 4: Print summarized case histories
# print("📌 Summarized Case Histories:")
# for name, summary in case_histories.items():
#     print(f"\n🔹 {name}:\n{summary}")

Summarizing chunk 1/7...
Error summarizing chunk 1: Error code: 400 - {'error': {'message': 'The model `mixtral-8x7b-32768` has been decommissioned and is no longer supported. Please refer to https://console.groq.com/docs/deprecations for a recommendation on which model to use instead.', 'type': 'invalid_request_error', 'code': 'model_decommissioned'}}
Summarizing chunk 2/7...
Error summarizing chunk 2: Error code: 400 - {'error': {'message': 'The model `mixtral-8x7b-32768` has been decommissioned and is no longer supported. Please refer to https://console.groq.com/docs/deprecations for a recommendation on which model to use instead.', 'type': 'invalid_request_error', 'code': 'model_decommissioned'}}
Summarizing chunk 3/7...
Error summarizing chunk 3: Error code: 400 - {'error': {'message': 'The model `mixtral-8x7b-32768` has been decommissioned and is no longer supported. Please refer to https://console.groq.com/docs/deprecations for a recommendation on which model to use instead.', '

In [19]:
def dict_to_bullets(case_histories):
    #return "\n".join(f"• {key}: {value}" for key, value in case_histories.items())
    return "\n".join(f"• {value}" for value in case_histories.values())
print(dict_to_bullets(case_histories))

• ## Case History: Galvez v. The State (A24A1657)

**Parties:**

* Appellant: Billy Milton Galvez
* Appellee: The State of Georgia

**Charges:**

* Malice murder (later reduced to voluntary manslaughter)
* Aggravated assault (one charge challenged)
* Possession of a firearm during the commission of a felony

**Facts:**

* On July 5, 2020, Alejandro Ramirez was shot and killed at Fusion Event Hall.
* Appellant Billy Milton Galvez was present at the scene.
* Evidence shows Galvez, his girlfriend Jennifer Cardona-Rodriguez, and friend Jorge Avila consumed alcohol and fired Galvez's handgun before arriving at Fusion.
* An altercation occurred between Cardona-Rodriguez and Ramirez at Fusion.

**Legal Arguments:**

* **Appellant's Argument 1:** The trial court erred by overruling Galvez's demurrer challenging the sufficiency of the indictment as to one of the aggravated assault charges.
* **Appellant's Argument 2:** The trial court erred in its jury instructions regarding the definition of a

#### ***Step 2: Legal Intelligent (AI) Chat Bot - RAG Based***


#### ***Training Trough System Prompt***

In [81]:
#llm_groq = ChatGroq(model_name="llama 3B", api_key=os.getenv("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"))
llm_groq = ChatGroq(model_name="mixtral-8x7b-32768", api_key=os.getenv("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"))


In [87]:
system_prompt = SystemMessage(
    content=(
       """You are a legal assistant AI specialized in providing clear, concise, and structured legal responses. When answering legal questions, follow these guidelines:
          Provide a Direct Answer: Start with a clear and factual response.
          Don't directly start answers. Start off with the Questions context
          Offer Context if Needed: Briefly explain relevant legal concepts or context.
          Structure Your Response: Use bullet points, numbered lists, or headings to ensure clarity.
          Reference Legal Outcomes: If applicable, mention verdicts, judgments, or rulings.
          Stay Objective and Professional: Maintain a neutral tone and avoid unnecessary opinions.
          Example:
          User: What was the final verdict given by the Gwinnett County jury?
          Answer: The final verdict given by the Gwinnett County jury was that
committing a violent injury upon him or by committing an act which placed the victim
in reasonable apprehension of immediately receiving a violent injury. In a consolidated
order on all pre-trial motions, the trial court denied Galvez’s demurrer.
"""
    )
)

#### ***Retrieving Embeddings from Chroma DB for AI Chat Bot***

In [91]:
retriever = vector_db.as_retriever()
qa_chain = RetrievalQA.from_chain_type(llm=llm_groq, retriever=retriever, chain_type="stuff")

#### ***Step 3: FAQs Storage***

In [92]:
#### _Step 3 is embedded part of Step 2_
faq_store = {}

def get_faq_answer(query):
    return faq_store.get(query.lower())

def store_faq(query, answer):
    if len(faq_store) >= 10:
        faq_store.pop(next(iter(faq_store)))
    faq_store[query.lower()] = answer

In [93]:
###Process User Queries (Legal AI Chatbot)
def ask_legal_question(query):
    """Handles user queries for legal information."""

    # Check if the question exists in FAQs
    faq_answer = get_faq_answer(query)
    if faq_answer:
        return f"FAQ Answer: {faq_answer}"

    # Search in vector database (Retrieve case laws)
    response = retriever.get_relevant_documents(query)
    if response:
        return f"Analysing 📄:\n{response[0].page_content}"

    # If no relevant document is found, use LLM for general legal knowledge
    print("No relevant document found, using 🔍 ")
    response_text = llm_groq.invoke([system_prompt, HumanMessage(content=query)])

    # Store the query-answer pair in FAQ
    store_faq(query, response_text.content)

    return response_text.content


In [94]:
def chatbot():
    """Runs the AI-powered legal chatbot."""
    print("🔹 Legal AI Chatbot (Type 'exit' to stop) 🔹")
    user_query = input("\nUser: ")
    print(user_query)
    if user_query.lower() != "exit":
        response = ask_legal_question(user_query)
        print(f"Legal Bot: {response}")

# Run Chatbot
chatbot()

🔹 Legal AI Chatbot (Type 'exit' to stop) 🔹

User: What was the final verdict given by the Gwinnett County jury?
What was the final verdict given by the Gwinnett County jury?
Legal Bot: Analysing 📄:
committing a violent injury upon him or by committing an act which placed the victim
in reasonable apprehension of immediately receiving a violent injury. In a consolidated
order on all pre-trial motions, the trial court denied Galvez’s demurrer. 
After a trial, Gwinnett County jury found Galvez guilty on Counts 3 and 6. With
respect to Count 2, the jury found Galvez not guilty of felony murder, but guilty of the
lesser-included offense of voluntary manslaughter. Galvez was acquitted on the
remaining counts. Galvez filed a motion for new trial, as amended. The trial court
denied the motion for new trial after a hearing, and this appeal followed. 
1. Galvez first argues that the trial court erred by overruling his demurrer to the
indictment. Specifically, he points out that in Count 3 of the 