In [13]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [14]:
from transformers import BitsAndBytesConfig
!pip install -U bitsandbytes



In [87]:
import json
import re
import torch
from datasets import Dataset, DatasetDict
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, DataCollatorWithPadding
from peft import LoraConfig, get_peft_model
from transformers import Trainer, TrainingArguments
from huggingface_hub import login
# login(token="")

In [16]:
dataset = load_dataset("DevToAI/indian_laws_llama2_supported")
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 24607
    })
})

In [17]:
dataset['train'][0]['text']

'<s>[INST] Instruction:\nWhat was the original wording of the clause in relation to the "Legislature of the State of Mizoram"? [/INST] Response:\nThe original wording of the clause in relation to the "Legislature of the State of Mizoram" was "fifteen years." </s>'

In [18]:
def convert_llama2_to_llama3_1(dataset):
    data = {}

    for split in dataset:
        formatted_texts = []
        
        for example in dataset[split]["text"]:
            if "[INST]" in example and "[/INST]" in example:
                user_content = example.split("[INST]")[1].split("[/INST]")[0].strip()
                assistant_content = example.split("[/INST]")[1].strip().replace("</s>", "").strip()

                user_content = re.sub(r"^Instruction:\s*", "", user_content)
                assistant_content = re.sub(r"^Response:\s*", "", assistant_content)

                formatted_texts.append(json.dumps([
                    {"role": "user", "content": user_content},
                    {"role": "assistant", "content": assistant_content}
                ], ensure_ascii=False))

        data[split] = Dataset.from_dict({"text": formatted_texts})

    return DatasetDict(data)

dataset = convert_llama2_to_llama3_1(dataset)


In [19]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 24607
    })
})

In [20]:
dataset["train"][3]["text"]

'[{"role": "user", "content": "How did the abolishment of the privy purse affect rulers and their successors financially?"}, {"role": "assistant", "content": "The abolishment of the privy purse affected rulers and their successors financially by ceasing to pay any sum as privy purse from the commencement of the Constitution (Twenty-sixth Amendment) Act, 1971. This effectively ended the financial support previously provided to rulers and their successors by the government."}]'

In [21]:
dataset = dataset['train'].train_test_split(test_size=0.1, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 22146
    })
    test: Dataset({
        features: ['text'],
        num_rows: 2461
    })
})

In [22]:
train_dataset = dataset['train']
validation_dataset = dataset['test']

In [23]:
model_name = "Qwen/Qwen2.5-3B-Instruct" 
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_compute_dtype = "float16",
    bnb_4bit_use_double_quant = True,
    bnb_4bit_quant_type = "nf4"
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config = bnb_config,
    device_map = "auto",
)

lora_config = LoraConfig(
    r = 4,
    lora_alpha = 8,
    lora_dropout = 0.05,
    target_modules = ["q_proj", "v_proj"],
    bias = "none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
print("QLoRA model loaded successfully!")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

QLoRA model loaded successfully!


In [24]:
def tokenize_function(dataset):
    return tokenizer(dataset["text"], padding="max_length", truncation=True, max_length=600)

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_validation_dataset = validation_dataset.map(tokenize_function, batched=True)

tokenized_train_dataset = tokenized_train_dataset.remove_columns(["text"])
tokenized_validation_dataset = tokenized_validation_dataset.remove_columns(["text"])

Map:   0%|          | 0/22146 [00:00<?, ? examples/s]

Map:   0%|          | 0/2461 [00:00<?, ? examples/s]

In [25]:
def format_for_causal_lm(example):
    example["labels"] = example["input_ids"].copy() 
    return example

tokenized_train_dataset = tokenized_train_dataset.map(format_for_causal_lm)
tokenized_validation_dataset = tokenized_validation_dataset.map(format_for_causal_lm)

Map:   0%|          | 0/22146 [00:00<?, ? examples/s]

Map:   0%|          | 0/2461 [00:00<?, ? examples/s]

In [26]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [28]:
training_args = TrainingArguments(
     output_dir = "./results",
        lr_scheduler_type = "cosine",
        learning_rate = 2e-5,
        weight_decay = 0.05,
        warmup_ratio = 0.03,
        per_device_train_batch_size = 4,
        per_device_eval_batch_size = 4,
        gradient_accumulation_steps = 16, 
        num_train_epochs = 2,
        eval_strategy = "steps",
        eval_steps = 100,
        logging_steps = 100,
        optim = "adamw_bnb_8bit",
        report_to = "none",
        fp16 = True,
        logging_dir="./logs"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,
    data_collator = data_collator,
)
train_result = trainer.train()

Step,Training Loss,Validation Loss
100,284.8732,15.438684
200,161.2467,2.383721
300,9.9001,0.346761
400,5.3872,0.31851
500,5.1796,0.307745
600,5.0244,0.303802


In [36]:
!zip -r checkpoint_v2_692.zip /kaggle/working/results/checkpoint-692

  adding: kaggle/working/results/checkpoint-692/ (stored 0%)
  adding: kaggle/working/results/checkpoint-692/README.md (deflated 66%)
  adding: kaggle/working/results/checkpoint-692/adapter_model.safetensors (deflated 8%)
  adding: kaggle/working/results/checkpoint-692/optimizer.pt (deflated 11%)
  adding: kaggle/working/results/checkpoint-692/adapter_config.json (deflated 53%)
  adding: kaggle/working/results/checkpoint-692/training_args.bin (deflated 51%)
  adding: kaggle/working/results/checkpoint-692/scheduler.pt (deflated 56%)
  adding: kaggle/working/results/checkpoint-692/rng_state.pth (deflated 25%)
  adding: kaggle/working/results/checkpoint-692/trainer_state.json (deflated 71%)


In [37]:
eval_results = trainer.evaluate()

In [42]:
eval_results

{'eval_loss': 0.30329880118370056,
 'eval_runtime': 677.1692,
 'eval_samples_per_second': 3.634,
 'eval_steps_per_second': 0.91,
 'epoch': 1.9969297453494672}

In [66]:
import math
perplexity = math.exp(eval_results['eval_loss'])
perplexity

1.3543190761331088

In [73]:
from peft import PeftModel

model_path = "/kaggle/working/results/checkpoint-692"
fine_tuned_model = PeftModel.from_pretrained(base_model, model_path, torch_dtype="auto")

In [74]:
device = torch.device("cuda")
def generate_response(prompt, model):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=100)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [86]:
prompt = json.dumps([
    {"role": "system", "content": "You are a Indian legal expert providing concise summaries."}, 
    {"role": "user", "content": "What is the penalty for using a forged document in India as genuine?"},  
    {"role": "assistant", "content": ""}  
], ensure_ascii=False)

response = generate_response(prompt, fine_tuned_model)

print("Response:", response)

Response: [{"role": "system", "content": "You are a Indian legal expert providing concise summaries."}, {"role": "user", "content": "What is the penalty for using a forged document in India as genuine?"}, {"role": "assistant", "content": ""}] In India, if you use a forged document as genuine, the penalties can vary depending on the nature of the document and the consequences. However, generally, the punishment can be imprisonment up to 3 years or fine up to ₹50,000 (Indian Rupees Fifty Thousand) or both. For specific cases, the court may impose additional penalties based on the severity of the offense. Therefore, it is important to consult with a legal professional for accurate guidance on this matter."}
