In [23]:
# ipython-input-2-fbc641dca10d
import pandas as pd

# Read the IPC sections CSV and, in the same expression, discard every row
# that is missing a Description, Offense, Punishment or Section value.
ipc_df = pd.read_csv("/content/ipc_sections.csv").dropna(
    subset=['Description', 'Offense', 'Punishment', 'Section']
)

# ipython-input-5-fbc641dca10d
!pip install -U langchain-community # Install the necessary package
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter # Import RecursiveCharacterTextSplitter

# Splitter configured with chunk_size=1000 and chunk_overlap=20
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)

# One Document per description string taken from the 'Description' column
documents = [
    Document(page_content=description)
    for description in ipc_df['Description'].tolist()
]

# Break the Documents into chunks that are embedded individually later on
all_splits = text_splitter.split_documents(documents)

# Generate embeddings using a transformer model
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cuda"}
# Make sure to install and import necessary libraries
!pip install sentence_transformers chromadb
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

# Store the embeddings in a vector store (Chroma)
vectordb = Chroma.from_documents(documents=all_splits, embedding=embeddings, persist_directory="chroma_db")

# ipython-input-7-fbc641dca10d
!pip install transformers
from transformers import pipeline

# Define and initialize the query_pipeline
query_pipeline = pipeline(
    "text-generation",
    model="google/flan-t5-base",  # Replace with your desired model
    tokenizer="google/flan-t5-base",  # Replace with your desired tokenizer
    max_new_tokens=100  # Adjust as needed
)

from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA

# Retriever view over the Chroma vector store
retriever = vectordb.as_retriever()

# Wrap the transformers pipeline so LangChain can call it as an LLM
llm = HuggingFacePipeline(pipeline=query_pipeline)

# RetrievalQA chain of type "stuff" combining the LLM and the retriever;
# verbose=True makes the chain log when it is entered and finished
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, verbose=True)

# ipython-input-8-fbc641dca10d
import time  # Import the time module

def test_rag(qa, query):
    print(f"Query: {query}\n")
    time_1 = time.time()  # Use time.time() to get the current time
    result = qa.run(query)
    time_2 = time.time()  # Use time.time() to get the current time
    print(f"Inference time: {round(time_2-time_1, 3)} sec.")
    print("\nResult: ", result)

# Example queries related to IPC sections, run one after another through the
# chain; `query` is left holding the last question once the loop ends.
for query in (
    "What is the punishment for impersonating a soldier according to IPC?",
    "What are the key offenses in Section 140 of the IPC?",
):
    test_rag(qa, query)







config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
The model 'T5ForConditionalGeneration' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausal

Query: What is the punishment for impersonating a soldier according to IPC?



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
Inference time: 4.323 sec.

Result:  Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

Description of IPC Section 140
According to section 140 of Indian penal code, Whoever, not being a soldier, sailor or airman in the Military, Naval or Air service of the Government of India, wears any garb or carries any token resembling any garb or token used by such a soldier, sailor or airman with the intention that it may be believed that he is such a soldier, sailor or airman, shall be punished with imprisonment of either description for a term which may extend to three months, or with fine which may extend to five hundred rupees, or with both.


IPC 140 in Simple Words
If someone who is not a military member wears a uniform or carries 

In [24]:
from transformers import pipeline

# Load a sentiment-analysis pipeline.
# The checkpoint and revision are pinned explicitly (they are the ones the
# library reported falling back to) so results stay reproducible and the
# "No model was supplied" warning is not emitted.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    revision="af0f99b",
)

# Analyze sentiment for the first remaining row of the dataset.
# `.iloc[0]` selects by position: after dropna() the index label 0 may no
# longer exist, in which case `ipc_df['Description'][0]` raises KeyError.
# truncation=True keeps long descriptions within the model's input limit.
sentiment_result = sentiment_pipeline(ipc_df['Description'].iloc[0], truncation=True)
print(f"Sentiment of IPC Section 1: {sentiment_result}")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Sentiment of IPC Section 1: [{'label': 'NEGATIVE', 'score': 0.9968149065971375}]


In [25]:
def interactive_test(qa):
    """Prompt for queries in a loop and run each one through ``test_rag``.

    Args:
        qa: Passed unchanged to ``test_rag`` for every query.

    Returns:
        None. The loop ends when the user enters 'exit' (any letter case,
        surrounding whitespace ignored) or when the prompt is closed or
        interrupted. Blank entries are skipped rather than sent to the chain.
    """
    while True:
        try:
            query = input("Enter a query related to IPC or 'exit' to quit: ")
        except (EOFError, KeyboardInterrupt):
            # No further input is coming (closed stdin or interrupted prompt):
            # leave the loop instead of surfacing a traceback.
            break
        query = query.strip()
        if not query:
            # Nothing was typed; ask again instead of running an empty query.
            continue
        if query.lower() == 'exit':
            break
        test_rag(qa, query)

# Start the interactive prompt loop with the RetrievalQA chain `qa` built
# earlier in the notebook; the cell keeps running until 'exit' is entered.
interactive_test(qa)

Enter a query related to IPC or 'exit' to quit: Someone broke into Jennie's house
Query: Someone broke into Jennie's house



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
Inference time: 3.742 sec.

Result:  Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

Provided that such conduct shall not amount to stalking if the man who pursued it proves that—
it was pursued for the purpose of preventing or detecting crime and the man accused of stalking had been entrusted with the responsibility of prevention and detection of crime by the State; or
it was pursued under any law or to comply with any condition or requirement imposed by any person under any law; or
in the particular circumstances such conduct was reasonable and justified.
(2) Whoever commits the offence of stalking shall be punished on first conviction with imprisonment of either description f