In [3]:
import os
import fitz
from tqdm.auto import tqdm

pdf_path = "U:\\LEGALSENSE\\script\\Pakistan Penal Code.pdf"


def text_formatter(text: str) -> str:
    cleaned_text = text.replace("\n", "").strip()
    return cleaned_text


def open_and_read_pdf(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)
    pages_and_text = []
    for page_number, page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formatter(text=text)
        pages_and_text.append({"page_number": page_number - 4,
                               "page_char_count": len(text),
                               "page_word_count": len(text.split(" ")),
                               "page_sentence_count": len(text.split(". ")),
                               "page_token_count": len(text) / 4,
                               "text": text})
    return pages_and_text


pages_and_text = open_and_read_pdf(pdf_path=pdf_path)
pages_and_text[:2]

0it [00:00, ?it/s]

[{'page_number': -4,
  'page_char_count': 1651,
  'page_word_count': 283,
  'page_sentence_count': 14,
  'page_token_count': 412.75,
  'text': 'Pakistan Penal Code (Act XLV of 1860) Act XLV of 1860 October 6th, 1860 Amended by: Protection of Women (Criminal Laws Amendment) Act, 2006,Criminal Laws (Amendment) Act, 2004 (I of 2005),Criminal Law (Amendment) Ordinance (LXXXV of 2002),Criminal Laws (Reforms) Ordinance (LXXXVI of 2002),etc.  Whereas it is expedient to provide a general Penal Code for Pakistan:  It is enacted as follows:-   CHAPTER I INTRODUCTION 1. Title and extent of operation of the Code. This Act shall be called the Pakistan Penal Code, and shall take effect throughout Pakistan.   2. Punishment of offences committed within Pakistan. Every person shall be liable to punishment under this Code and not otherwise for every act or omission contrary to the provisions thereof, of which he shall be guilty within Pakistan.   3. Punishment of offences committed beyond, but which by 

In [4]:
import nltk
nltk.download('punkt')

from nltk.tokenize import sent_tokenize

text = "My name is Huzaifa. I am a CS student at PU"
sentences = sent_tokenize(text)

print(sentences)



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Huzaifa\AppData\Roaming\nltk_data...


['My name is Huzaifa.', 'I am a CS student at PU']


[nltk_data]   Package punkt is already up-to-date!


In [5]:
for item in tqdm(pages_and_text):
    item["sentences"] = list(sent_tokenize(item["text"]))
    item["page_sentence_count"] = len(item["sentences"])
    

  0%|          | 0/164 [00:00<?, ?it/s]

In [6]:
import random

random.sample(pages_and_text, k=1)

[{'page_number': 124,
  'page_char_count': 2874,
  'page_word_count': 500,
  'page_sentence_count': 12,
  'page_token_count': 718.5,
  'text': '430. Mischief by injury to works of irrigation or by wrongfully diverting water: Whoever commits mischief by doing any act which causes, or which he knows to be likely to cause, a diminution of the supply of water for agricultural purposes, or for food or drink for human beings or for animals which are property, or for cleanliness or for carrying on any manufacture, shall be punished with imprisonment of either description for a term which may extend to five years, or with fine, or with both.   431. Mischief by injury to public road, bridge, river or channel: Whoever commits mischief by doing any act which renders or which he knows to be likely to render any public road, bridge, navigable river or navigable channel, natural or artificial, impassable or less safe for travelling or conveying property, shall be punished with imprisonment of either

In [7]:
chunk_size_num = 10

def split_list(input_list : list[str],
               slice_size : int = chunk_size_num) -> list[list[str]]:
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

test_split = list(range(39))
split_list(test_split)

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
 [30, 31, 32, 33, 34, 35, 36, 37, 38]]

In [8]:
for item in tqdm(pages_and_text):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                         slice_size=chunk_size_num)
    item['num_chunks'] = len(item["sentence_chunks"])

  0%|          | 0/164 [00:00<?, ?it/s]

In [9]:
random.sample(pages_and_text, k=1)

[{'page_number': 118,
  'page_char_count': 2498,
  'page_word_count': 441,
  'page_sentence_count': 18,
  'page_token_count': 624.5,
  'text': "and buys shares in the Bank of Bengal for Z, instead of buying Company's paper, here, though Z should suffer loss, and should be entitled to bring a civil action against A, on account of that loss, yet A, not having acted dishonestly, has not committed criminal breach of trust.    (e) A, a revenue-officer, is entrusted with public money and is either directed by law, or bound by a contract, express or implied, with the Government, to pay into a certain treasury all the public money which he holds. A dishonestly appropriates the money. A has committed criminal breach of trust.    (f) A, a carrier, is entrusted by Z with property to be carried by land or by water. A dishonestly misappropriates the property. A has committed criminal breach of trust.       406. Punishment for criminal breach of trust: Whoever, commits criminal breach of trust snail

In [10]:
import pandas as pd

df = pd.DataFrame(pages_and_text)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count,num_chunks
count,164.0,164.0,164.0,164.0,164.0,164.0
mean,77.5,2462.57,447.39,14.12,615.64,1.9
std,47.49,449.37,75.68,4.24,112.34,0.5
min,-4.0,1387.0,268.0,6.0,346.75,1.0
25%,36.75,2198.0,402.75,11.0,549.5,2.0
50%,77.5,2548.5,455.5,14.0,637.12,2.0
75%,118.25,2783.5,504.25,17.0,695.88,2.0
max,159.0,3370.0,605.0,27.0,842.5,3.0


In [11]:
import re

pages_and_chunks = []

for item in tqdm(pages_and_text):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]

        joined_sentence_chunk = "".join(sentence_chunk).replace(" "," ").strip()

        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4

        pages_and_chunks.append(chunk_dict)

len(pages_and_chunks)

  0%|          | 0/164 [00:00<?, ?it/s]

312

In [12]:
random.sample(pages_and_chunks, k=1)

[{'page_number': 49,
  'sentence_chunk': '(e) A, an interpreter or translator, gives or certifies, as a true interpretation or translation of a statement, which he is bound by oath to interpret or translate truly, that which is not and which he does not believe to be a true interpretation or translation.A has given false evidence.192.Fabricating false evidence: Whoever causes any circumstance to exist or makes any false entry in any book or record, or makes any document containing a false statement, intending that such circumstance, false entry or false statement may appear in evidence in a judicial proceeding, or in a proceeding taken by law before a public servant as such, or before an arbitrator, and that such circumstance, false entry or false statement, so appearing in evidence, may cause any person who in such proceeding is to form an opinion upon the evidence, to entertain an erroneous opinion touching any point material to the result of such proceeding, is said to fabricate fal

In [18]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer(r'U:\LEGALSENSE\models\all-mpnet-base-v2', device='cpu')



model.to('cpu')

for item in tqdm(pages_and_chunks):
    item["embedding"] = model.encode(item["sentence_chunk"])



No sentence-transformers model found with name U:\LEGALSENSE\models\all-mpnet-base-v2. Creating a new one with mean pooling.


  0%|          | 0/312 [00:00<?, ?it/s]

In [19]:
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks]
text_chunks[220]

'394.Voluntarily causing hurt in committing robbery: If any person, in committing or in attempting to commit robbery, voluntarily causes hurt, such person, and any other person jointly concerned in committing or attempting to commit such robbery, shall be punished with imprisonment for life, or with rigorous imprisonment for a term  157[which shall not be less than four years nor more than] 157 ten years, and shall also be liable to fine.395.Punishment for dacoity: Whoever commits dacoity shall be punished with imprisonment for life, or with rigorous imprisonment for a term which shall not be less than four years nor more than ten years and shall also be liable to fine.396.Dacoity with murder: If any one of five or more persons, who are conjointly committing dacoity, commits murder in so committing dacoity, everyone of those persons shall be punished with death, or imprisonment for life, or rigorous imprisonment for a term which  158[shall not be less than four years nor more than] 158

In [20]:
len(text_chunks)

312

In [None]:
pages_and_chunks[0]

{'page_number': -4,
 'sentence_chunk': 'Pakistan Penal Code (Act XLV of 1860) Act XLV of 1860 October 6th, 1860 Amended by: Protection of Women (Criminal Laws Amendment) Act, 2006,Criminal Laws (Amendment) Act, 2004 (I of 2005),Criminal Law (Amendment) Ordinance (LXXXV of 2002),Criminal Laws (Reforms) Ordinance (LXXXVI of 2002),etc.Whereas it is expedient to provide a general Penal Code for Pakistan:  It is enacted as follows:-   CHAPTER I INTRODUCTION 1.Title and extent of operation of the Code.This Act shall be called the Pakistan Penal Code, and shall take effect throughout Pakistan.2.Punishment of offences committed within Pakistan.Every person shall be liable to punishment under this Code and not otherwise for every act or omission contrary to the provisions thereof, of which he shall be guilty within Pakistan.3.Punishment of offences committed beyond, but which by law may be tried within Pakistan.Any person liable, by any Pakistan Law, to be tried for an offence committed beyond 

In [21]:
chunks_and_embeddings = pd.DataFrame(pages_and_chunks)
embeddings_save_path = "U:\\LEGALSENSE\\chunks_and_embeddings.csv"
chunks_and_embeddings.to_csv(embeddings_save_path, index=False)

In [22]:
chunks_and_embedding_load = pd.read_csv(embeddings_save_path)
chunks_and_embedding_load.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-4,Pakistan Penal Code (Act XLV of 1860) Act XLV ...,1136,180,284.0,[ 6.66384473e-02 -1.65656865e-01 5.20089492e-...
1,-4,4.Extension of Code to extra-territorial offen...,495,84,123.75,[ 3.14997286e-02 -1.35520577e-01 4.71762829e-...
2,-3,"Illustrations (a) A, a Pakistan subject, commi...",788,151,197.0,[ 7.03412145e-02 -5.17950505e-02 7.49710128e-...
3,-3,Definitions in the code to be understood subje...,1080,175,270.0,[ 6.31914884e-02 -1.68107554e-01 1.04110412e-...
4,-2,7.Sense of expression once explained.Every exp...,449,67,112.25,[ 4.37125079e-02 -1.20444097e-01 -4.21256348e-...


In [23]:
import random
import torch
import numpy as np
import pandas as pd

device = "cpu"

chunks_and_embeddings = pd.read_csv("U:\\LEGALSENSE\\chunks_and_embeddings.csv")


chunks_and_embeddings["embedding"] = chunks_and_embeddings["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

embeddings = torch.tensor(np.stack(chunks_and_embeddings["embedding"].tolist(), axis=0),dtype=torch.float32)

pages_and_chunks = chunks_and_embeddings.to_dict(orient="records")

chunks_and_embeddings


Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,-4,Pakistan Penal Code (Act XLV of 1860) Act XLV ...,1136,180,284.00,"[0.0666384473, -0.165656865, 0.0520089492, -0...."
1,-4,4.Extension of Code to extra-territorial offen...,495,84,123.75,"[0.0314997286, -0.135520577, 0.0471762829, -0...."
2,-3,"Illustrations (a) A, a Pakistan subject, commi...",788,151,197.00,"[0.0703412145, -0.0517950505, 0.0749710128, -0..."
3,-3,Definitions in the code to be understood subje...,1080,175,270.00,"[0.0631914884, -0.168107554, 0.104110412, -0.0..."
4,-2,7.Sense of expression once explained.Every exp...,449,67,112.25,"[0.0437125079, -0.120444097, -0.0421256348, 0...."
...,...,...,...,...,...,...
307,158,151 The following was omitted by Offence of ...,679,124,169.75,"[0.159574226, -0.0425061993, -0.0432828255, 0...."
308,158,"XXX of 1981, S. 2.160 Substituted by Crimina...",764,129,191.00,"[0.0954134166, -0.0862223208, 0.0442153588, -0..."
309,158,169 Substituted by Federal Laws (Revision an...,93,15,23.25,"[-0.00428213924, 0.0372241735, 0.0644299909, -..."
310,159,170 Sections 489-A to 489-D inserted by Curr...,1204,226,301.00,"[0.0386597216, 0.0375630111, -0.000216331377, ..."


In [24]:
embeddings.shape

torch.Size([312, 768])

In [25]:
from sentence_transformers import util, SentenceTransformer

model = SentenceTransformer(r'U:\LEGALSENSE\models\all-mpnet-base-v2', device='cpu')

No sentence-transformers model found with name U:\LEGALSENSE\models\all-mpnet-base-v2. Creating a new one with mean pooling.


In [26]:
query = "killing"
print(f"query: {query}")

query_embedding = model.encode(query, convert_to_tensor=True, device=device)


dot_score = util.dot_score(a = query_embedding, b = embeddings)[0]

top_result_dotproduct = torch.topk(dot_score, k = 5)
top_result_dotproduct


query: killing


torch.return_types.topk(
values=tensor([2.8542, 2.7402, 2.5780, 2.5176, 2.5054]),
indices=tensor([202, 198, 282, 220,  19]))

In [27]:
pages_and_chunks[19]

{'page_number': 4,
 'sentence_chunk': 'Illustration A attacks Z under such circumstances of grave provocation that his killing of Z would be only culpable homicide not amounting to murder.B having ill-will towards Z and intending to kill him, and not having been subject to the provocation, assist A in killing Z.Here, though A and B are both engaged in causing Z\'s death, B is guilty of murder, and A is guilty only of culpable homicide.39."Voluntarily": A person is said to cause an effect "voluntarily" when he causes it by means whereby he intended to cause it, or by means which, at the time of employing those means, he knew or had reason to believe to be likely to cause it.Illustration A sets fire, by night, to an inhabited house in a large town, for the purpose of facilitating robbery and thus causes the death of a person.Here, A may not have intended to cause death, and may even be sorry that death has been caused by his act; yet, if he knew that he was likely to cause death; he has 

In [28]:
import textwrap

def print_wrapped(text, wrap_length=80):
    wrapped_text = textwrap.fill(text, wrap_length)
    print(wrapped_text)

In [29]:
query = "killing"
print(f"query: {query}")
print("results:")

for score, idx in zip(top_result_dotproduct[0], top_result_dotproduct[1]):
    print(f"score: {score:.4f}")
    print("text")
    print_wrapped(pages_and_chunks[idx]["sentence_chunk"])
    print(f"page_number: {pages_and_chunks[idx]['page_number']}")
    print("\n")


query: killing
results:
score: 2.8542
text
Of Rape  151[] 151  152[
page_number: 106


score: 2.7402
text
146  367.Kidnapping or abducting in order to subject person to grievous hurt,
slavery, etc.:
page_number: 104


score: 2.5780
text
Ingredients: This section has the following essentials:---  1.Threatening a
person with any injury--  (i)  to this person, reputation, or property; or (ii)
to the person or reputation of any one in whom that person is
interested.2.Threat must be with intent---  (a) to cause harm to that person, or
(b) to cause that person to do any act which he is not legally bound to do as
the means of avoiding the execution of such threat, or  (c) to cause that person
to omit to do any act which that person is legally entitled to do as the means
of avoiding the execution of such threat.
page_number: 145


score: 2.5176
text
394.Voluntarily causing hurt in committing robbery: If any person, in committing
or in attempting to commit robbery, voluntarily causes hurt, such

In [32]:
def retrieve_resources(query: str,
                       embeddings: torch.tensor,
                       model: SentenceTransformer,
                       n_return: int = 5):
    query_embedding = model.encode(query, convert_to_tensor=True)

    dot_score = util.dot_score(query_embedding, embeddings)[0]

    score, indices = torch.topk(input=dot_score, k=n_return)

    return score, indices

def print_top_results_score(query: str,
                            embeddings: torch.tensor,
                            model: SentenceTransformer,
                            pages_and_chunks: list[dict] = pages_and_chunks,
                            n_return: int = 5):
    score, indices = retrieve_resources(query=query,
                                        embeddings=embeddings,
                                        model=model,
                                        n_return=n_return)
    for score, idx in zip(score, indices):
        print(f"score: {score:.4f}")
        print("text")
        print_wrapped(pages_and_chunks[idx]["sentence_chunk"])
        print(f"page_number: {pages_and_chunks[idx]['page_number']}")
        print("\n")
    

In [34]:
query = "killing a man"
print_top_results_score(query=query, embeddings=embeddings, model=model)

score: 3.5459
text
146  367.Kidnapping or abducting in order to subject person to grievous hurt,
slavery, etc.:
page_number: 104


score: 3.5260
text
Of Rape  151[] 151  152[
page_number: 106


score: 3.3608
text
394.Voluntarily causing hurt in committing robbery: If any person, in committing
or in attempting to commit robbery, voluntarily causes hurt, such person, and
any other person jointly concerned in committing or attempting to commit such
robbery, shall be punished with imprisonment for life, or with rigorous
imprisonment for a term  157[which shall not be less than four years nor more
than] 157 ten years, and shall also be liable to fine.395.Punishment for
dacoity: Whoever commits dacoity shall be punished with imprisonment for life,
or with rigorous imprisonment for a term which shall not be less than four years
nor more than ten years and shall also be liable to fine.396.Dacoity with
murder: If any one of five or more persons, who are conjointly committing
dacoity, commits mu

In [35]:

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch




model_id = "tiiuae/falcon-rw-1b"


tokenizer = AutoTokenizer.from_pretrained(model_id)
llm_model = AutoModelForCausalLM.from_pretrained(model_id,
                                                torch_dtype = torch.float16,
                                                low_cpu_mem_usage=False,)





RepositoryNotFoundError: 401 Client Error. (Request ID: Root=1-688d0d07-494e802b2b47ca4e2f2eff29;0c9dcd34-096d-4b3b-a131-f54b45ade9e4)

Repository Not Found for url: https://huggingface.co/api/models/tiiuae/falcon-rw-1b/tree/main/additional_chat_templates?recursive=False&expand=False.
Please make sure you specified the correct `repo_id` and `repo_type`.
If you are trying to access a private or gated repo, make sure you are authenticated. For more details, see https://huggingface.co/docs/huggingface_hub/authentication
User Access Token "gemma-access" is expired

In [None]:
llm_model

FalconForCausalLM(
  (transformer): FalconModel(
    (word_embeddings): Embedding(50304, 2048)
    (h): ModuleList(
      (0-23): 24 x FalconDecoderLayer(
        (self_attention): FalconAttention(
          (query_key_value): FalconLinear(in_features=2048, out_features=6144, bias=True)
          (dense): FalconLinear(in_features=2048, out_features=2048, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): FalconMLP(
          (dense_h_to_4h): FalconLinear(in_features=2048, out_features=8192, bias=True)
          (act): GELUActivation()
          (dense_4h_to_h): FalconLinear(in_features=8192, out_features=2048, bias=True)
        )
        (post_attention_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
      )
    )
    (ln_f): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
    (rotary_emb): FalconRotaryEmbedding()
  )
  (lm_head): Li

In [None]:
prompt = "what is the punishment for killing a"

inputs_idx = tokenizer(prompt, return_tensors="pt").to("cpu")

output = llm_model.generate(
    input_ids=inputs_idx["input_ids"],
    max_new_tokens=100
)

print(output)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


tensor([[10919,   318,   262,  9837,   329,  5170,   257,  1048,    30,   198,
           464,  9837,   329,  5170,   257,  1048,   318,  1918,    13,   198,
          2061,   318,   262,  9837,   329,  5170,   257,  1048,    30,   198,
           464,  9837,   329,  5170,   257,  1048,   318,  1918,    13,   198,
          2061,   318,   262,  9837,   329,  5170,   257,  1048,    30,   198,
           464,  9837,   329,  5170,   257,  1048,   318,  1918,    13,   198,
          2061,   318,   262,  9837,   329,  5170,   257,  1048,    30,   198,
           464,  9837,   329,  5170,   257,  1048,   318,  1918,    13,   198,
          2061,   318,   262,  9837,   329,  5170,   257,  1048,    30,   198,
           464,  9837,   329,  5170,   257,  1048,   318,  1918,    13,   198,
          2061,   318,   262,  9837,   329,  5170,   257]])


In [None]:
output_decoded = tokenizer.decode(output[0])

print(output_decoded)

what is the punishment for killing a person?
The punishment for killing a person is death.
What is the punishment for killing a person?
The punishment for killing a person is death.
What is the punishment for killing a person?
The punishment for killing a person is death.
What is the punishment for killing a person?
The punishment for killing a person is death.
What is the punishment for killing a person?
The punishment for killing a person is death.
What is the punishment for killing a


In [None]:
def prompt_formatter(query: str,
                     context_item: list[dict]) -> str:
    context = "- "+ "\n-".join([item["sentence_chunk"] for item in context_item])

    prompt = context
    return prompt

query = "what is the punishment for killing a"

score, indices = retrieve_resources(query=query,
                                    embeddings=embeddings)

context_items = [pages_and_chunks[i] for i in indices]

prompt = prompt_formatter(query=query, context_item=context_items)

print(prompt)

NameError: name 'retrieve_resources' is not defined