In [26]:
import pandas as pd
import logging

from groq import Groq
from dotenv import load_dotenv
import os

from tqdm import tqdm
load_dotenv()

True

In [20]:
df = pd.read_csv("./datasets/IPC.csv")
df.head(5)

Unnamed: 0.1,Unnamed: 0,IPC_Section,Description,Offense,Punishment,Cognizable,Bailable,Court
0,0,IPC Section 140,Description of IPC Section 140\nAccording to s...,Wearing the dress or carrying any token used b...,3 Months or Fine or Both,Cognizable,Bailable,Any Magistrate
1,1,IPC Section 127,Description of IPC Section 127\nAccording to s...,Receiving property taken by war or depredation...,7 Years + Fine + forfeiture of property,Cognizable,Non-Bailable,Court of Session
2,2,IPC Section 128,Description of IPC Section 128\nAccording to s...,Public servant voluntarily allowing prisoner o...,Imprisonment for Life or 10 Years + Fine,Cognizable,Non-Bailable,Court of Session
3,3,IPC Section 129,Description of IPC Section 129\nAccording to s...,Public servant negligently suffering prisoner ...,Simple Imprisonment 3 Years + Fine,Cognizable,Bailable,Magistrate First Class
4,4,IPC Section 130,Description of IPC Section 130\nAccording to s...,"Aiding escape of, rescuing or harbouring, such...",Imprisonment for Life or 10 Years + Fine,Cognizable,Non-Bailable,Court of Session


In [21]:
client = Groq(api_key = os.environ.get("GROQ_API_KEY"))

SUMMARIZER_PROMPT = "Summarize the following legal content in a crisp manner with the important details kept intact. Only give the summary, without the starting  line Here is a crisp summary"

SUMMARIZER_MODEL = "meta-llama/llama-4-scout-17b-16e-instruct"

logging.info("Model initialized with the instructed prompt")

INFO:root:Model initialized with the instructed prompt


In [27]:
summaries = []
for idx in tqdm(range(len(df['Description'])), desc="Summarizing the document"):
    completion = client.chat.completions.create(
    model=SUMMARIZER_MODEL,
    messages=[
      {
        "role": "user",
        "content": f"{SUMMARIZER_PROMPT}: {df['Description'][idx]}"
      }
    ],
    temperature=1,
    max_completion_tokens=1024,
    top_p=1,
    stream=True,
    stop=None,
    )


    full_summary = ""
    for chunk in completion:
        if chunk.choices[0].delta.content:
            full_summary += chunk.choices[0].delta.content

    summaries.append(full_summary)

Summarizing the document:   0%|          | 0/445 [00:00<?, ?it/s]INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
Summarizing the document:   0%|          | 1/445 [00:00<02:12,  3.35it/s]INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
Summarizing the document:   0%|          | 2/445 [00:00<01:40,  4.41it/s]INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
Summarizing the document:   1%|          | 3/445 [00:00<01:37,  4.52it/s]INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
Summarizing the document:   1%|          | 4/445 [00:00<01:44,  4.24it/s]INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
INFO:groq._base_client:Retrying request to /openai/v1/chat/completions in 2.000000 seconds
INFO:httpx:HTTP Request: POST https://api.groq.com/openai

In [28]:
df['summary'] = summaries

In [29]:
df['complete_desc'] = df['Description'] + "Offense" + df['Offense'] + "Punishment" + df['Punishment']
df.head(5)

Unnamed: 0.1,Unnamed: 0,IPC_Section,Description,Offense,Punishment,Cognizable,Bailable,Court,summary,complete_desc
0,0,IPC Section 140,Description of IPC Section 140\nAccording to s...,Wearing the dress or carrying any token used b...,3 Months or Fine or Both,Cognizable,Bailable,Any Magistrate,"Under Section 140 of the Indian Penal Code, im...",Description of IPC Section 140\nAccording to s...
1,1,IPC Section 127,Description of IPC Section 127\nAccording to s...,Receiving property taken by war or depredation...,7 Years + Fine + forfeiture of property,Cognizable,Non-Bailable,Court of Session,Whoever receives property knowing it was taken...,Description of IPC Section 127\nAccording to s...
2,2,IPC Section 128,Description of IPC Section 128\nAccording to s...,Public servant voluntarily allowing prisoner o...,Imprisonment for Life or 10 Years + Fine,Cognizable,Non-Bailable,Court of Session,"Under Section 128 of the Indian Penal Code, a ...",Description of IPC Section 128\nAccording to s...
3,3,IPC Section 129,Description of IPC Section 129\nAccording to s...,Public servant negligently suffering prisoner ...,Simple Imprisonment 3 Years + Fine,Cognizable,Bailable,Magistrate First Class,"Under Section 129 of IPC, a public servant in ...",Description of IPC Section 129\nAccording to s...
4,4,IPC Section 130,Description of IPC Section 130\nAccording to s...,"Aiding escape of, rescuing or harbouring, such...",Imprisonment for Life or 10 Years + Fine,Cognizable,Non-Bailable,Court of Session,Section 130 of the Indian Penal Code states th...,Description of IPC Section 130\nAccording to s...


In [40]:
df.dropna(inplace=True)

In [41]:
df.drop(labels=['IPC_Section','Description', 'Offense', 'Punishment', 'Cognizable', 'Court', 'Bailable'], axis = 1)

Unnamed: 0.1,Unnamed: 0,summary,complete_desc
0,0,"Under Section 140 of the Indian Penal Code, im...",Description of IPC Section 140\nAccording to s...
1,1,Whoever receives property knowing it was taken...,Description of IPC Section 127\nAccording to s...
2,2,"Under Section 128 of the Indian Penal Code, a ...",Description of IPC Section 128\nAccording to s...
3,3,"Under Section 129 of IPC, a public servant in ...",Description of IPC Section 129\nAccording to s...
4,4,Section 130 of the Indian Penal Code states th...,Description of IPC Section 130\nAccording to s...
...,...,...,...
439,439,Section 507 of the Indian Penal Code deals wit...,Description of IPC Section 507\nAccording to s...
440,440,Section 508 of the Indian Penal Code states th...,Description of IPC Section 508\nAccording to s...
441,441,Section 509 of the Indian Penal Code deals wit...,Description of IPC Section 509\nAccording to s...
442,442,"Section 510 of IPC penalizes anyone who, while...",Description of IPC Section 510\nAccording to s...


##### Finetuning of Flant5-small

In [42]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import Dataset

In [43]:
dataset = Dataset.from_pandas(df[['complete_desc','summary']])

In [44]:
model_id = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

In [45]:
max_input_length = 512
max_target_length = 512

def preprocess(example):
    inputs = "summarize the legal document with the legal terms intact: " + example['complete_desc']
    model_inputs = tokenizer(inputs, max_length = max_input_length, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(example['summary'], max_length=max_target_length,truncation=True)
    
    model_inputs["labels"] = labels['input_ids']
    return model_inputs


tokenized_dataset = dataset.map(preprocess, batched=False)

Map: 100%|██████████| 382/382 [00:00<00:00, 1268.34 examples/s]


In [49]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

training_args = Seq2SeqTrainingArguments(
    output_dir="./flan-t5-small-legal-finetuned",
    eval_strategy="epoch",
    learning_rate=2e-5,
    fp16=True,
    num_train_epochs=5,
    weight_decay=0.01,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2
)

In [50]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [51]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,1.021713
2,No log,0.952377
3,1.278100,0.918632
4,1.278100,0.894588
5,1.278100,0.889443




TrainOutput(global_step=955, training_loss=1.2034822633753273, metrics={'train_runtime': 2496.0666, 'train_samples_per_second': 0.765, 'train_steps_per_second': 0.383, 'total_flos': 199763309752320.0, 'train_loss': 1.2034822633753273, 'epoch': 5.0})

##### Use the fine-tuned model

In [53]:
trainer.save_model("./flan-t5-small-legal-finetuned")

input_text = "summarize: " + df['complete_desc'][13]
input_ids = tokenizer(input_text, return_tensors="pt", truncation=True).input_ids
output = model.generate(input_ids, max_length=512)
print("Generated Summary:", tokenizer.decode(output[0], skip_special_tokens=True))

Generated Summary: Section 138 of the Indian Penal Code states that anyone who abets an act of insubordination by an officer, soldier, sailor, or airman, in the Army, Navy or air Force, of the Government of India, can be punished with up to six months in jail, a fine, or both.
