In [1]:
import torch
import bitsandbytes as bnb
print(bnb.__version__)
print(torch.__version__)
print(torch.cuda.is_available())
print(torch.version.cuda)
print(torch.cuda.current_device())
print(torch.cuda.get_device_name(0))

0.45.2
2.6.0+cu126
True
12.6
0
NVIDIA GeForce RTX 4050 Laptop GPU


In [2]:
from datasets import load_dataset

dataset = load_dataset("jizzu/llama2_indian_law_v3", split="train")
print(f"Dataset size: {len(dataset)}") 
print(dataset[0])

  from .autonotebook import tqdm as notebook_tqdm


Dataset size: 24607
{'text': '<s>[INST] What was the original wording of the clause in relation to the "Legislature of the State of Mizoram"? [/INST] The original wording of the clause in relation to the "Legislature of the State of Mizoram" was "fifteen years." </s>'}


In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1)

In [3]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import torch

model_name = "mistralai/Mistral-7B-v0.1"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, device_map="auto")

Loading checkpoint shards: 100%|██████████| 2/2 [00:42<00:00, 21.46s/it]


In [5]:
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

model = prepare_model_for_kbit_training(model)
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

In [6]:
from transformers import TrainingArguments
from trl import SFTTrainer

training_args = TrainingArguments(
    output_dir="./mistral-indian-law",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    max_steps=3076,  
    logging_steps=50,
    save_steps=1000,
    fp16=True,
    optim="paged_adamw_8bit",
    report_to="none"
)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer
)


  trainer = SFTTrainer(
Converting train dataset to ChatML: 100%|██████████| 22146/22146 [00:01<00:00, 12670.23 examples/s]
Applying chat template to train dataset: 100%|██████████| 22146/22146 [00:00<00:00, 24719.77 examples/s]
Applying chat template to train dataset: 100%|██████████| 22146/22146 [00:04<00:00, 5133.53 examples/s]
Converting eval dataset to ChatML: 100%|██████████| 2461/2461 [00:00<00:00, 15717.97 examples/s]
Applying chat template to eval dataset: 100%|██████████| 2461/2461 [00:00<00:00, 28180.05 examples/s]
Applying chat template to eval dataset: 100%|██████████| 2461/2461 [00:00<00:00, 4951.26 examples/s]
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [7]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
50,1.3724
100,1.2408
150,1.1278
200,1.0287
250,1.026
300,0.972
350,0.9894
400,0.9837
450,0.9243
500,0.9617


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=3076, training_loss=0.8566171233148662, metrics={'train_runtime': 87603.2312, 'train_samples_per_second': 0.281, 'train_steps_per_second': 0.035, 'total_flos': 1.28591791875072e+17, 'train_loss': 0.8566171233148662})

In [8]:
model.save_pretrained("mistral-indian-law-final")
tokenizer.save_pretrained("mistral-indian-law-final")

('mistral-indian-law-final\\tokenizer_config.json',
 'mistral-indian-law-final\\special_tokens_map.json',
 'mistral-indian-law-final\\tokenizer.json')