In [1]:
# ### Install Dependencies
!pip install transformers==4.56.1 peft==0.17.0 accelerate==1.10.0 trl==0.23.1 \
bitsandbytes==0.47.0 datasets==4.0.0 huggingface-hub==0.34.4 safetensors==0.6.2 \
pandas==2.2.2 matplotlib==3.10.0 numpy==2.0.2

Collecting transformers==4.56.1
  Downloading transformers-4.56.1-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft==0.17.0
  Downloading peft-0.17.0-py3-none-any.whl.metadata (14 kB)
Collecting accelerate==1.10.0
  Downloading accelerate-1.10.0-py3-none-any.whl.metadata (19 kB)
Collecting trl==0.23.1
  Downloading trl-0.23.1-py3-none-any.whl.metadata (11 kB)
Collecting bitsandbytes==0.47.0
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting huggingface-hub==0.34.4
  Downloading huggingface_hub-0.34.4-py3-none-any.whl.metadata (14 kB)
Downloading transformers-4.56.1-py3-none-any.whl (11.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m131.3 MB/s[0m eta [36m0:00:00

In [2]:

# %% import into the dev env
import os
import torch
from contextlib import nullcontext
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTConfig, SFTTrainer


In [3]:
# Load the Phi-3 Mini 4K Instruct model with its weights quantized on the fly.
# Quantize the Phi-3 Mini 4K model to 4-bit to save VRAM and improve training efficiency.
# The quantized weights occupy roughly 2 GB GPU memory.

# 4-bit NF4 quantization with double quantization enabled; float32 is the
# configured compute dtype.
bnb_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.float32
)

# Hugging Face Hub id of the base model; reused later to load the matching tokenizer.
repo_id = 'microsoft/Phi-3-mini-4k-instruct'

# Download (if needed) and load the checkpoint onto the first CUDA device,
# applying the quantization config above while loading.
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    device_map="cuda:0",
    quantization_config=bnb_config
)

# Report the loaded model's memory footprint in megabytes (bytes / 1e6).
print(f"Memory footprint (MB): {model.get_memory_footprint()/1e6:.2f}")


config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

Memory footprint (MB): 2206.34


In [4]:
# Low-Rank Adapters (LoRA) enable fine-tuning only a small subset of parameters.

# Prepare the quantized base model for training before attaching adapters.
# NOTE(review): this PEFT helper presumably also turns on gradient checkpointing
# by default, independent of the trainer's gradient_checkpointing flag — confirm
# against the PEFT docs.
model = prepare_model_for_kbit_training(model)

# LoRA hyperparameters: rank-8 adapters with alpha 16 and 5% dropout, no bias
# terms trained, configured for causal language modeling.
config = LoraConfig(
    r=8,
    lora_alpha=16,
    bias="none",
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
    # Names of the modules that receive adapters — presumably Phi-3's fused
    # attention and MLP projection layers; verify against the model's module names.
    target_modules=['o_proj', 'qkv_proj', 'gate_up_proj', 'down_proj'],
)

# Wrap the base model so the LoRA adapters are attached to it.
model = get_peft_model(model, config)

# Report trainable vs. total parameter counts, in millions.
trainable_params, total_params = model.get_nb_trainable_parameters()
print(f"Trainable parameters: {trainable_params/1e6:.2f}M / {total_params/1e6:.2f}M")

Trainable parameters: 12.58M / 3833.66M


In [25]:
# Load the training data:
# The IIT Bombay English–Hindi Parallel Corpus:
# https://huggingface.co/datasets/cfilt/iitb-english-hindi

dataset = load_dataset("cfilt/iitb-english-hindi", split="train[:1%]")  # use the first 1% of the train split for quick training
print(dataset)

# Show one raw record: a 'translation' dict with 'en' and 'hi' entries.
print(dataset[0])

Dataset({
    features: ['translation'],
    num_rows: 16591
})
{'translation': {'en': 'Give your application an accessibility workout', 'hi': 'अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें'}}


In [26]:
def extract_pairs(example):
    """Flatten one corpus record into a prompt/completion pair.

    Args:
        example: Mapping whose "translation" entry holds the "en" (English)
            and "hi" (Hindi) versions of the same sentence.

    Returns:
        dict: {"prompt": <English text>, "completion": <Hindi text>}.
    """
    pair = example["translation"]
    english, hindi = pair["en"], pair["hi"]
    return {"prompt": english, "completion": hindi}

# Add flat prompt/completion columns, drop the nested 'translation' column,
# then discard rows where either side is empty or whitespace-only.
dataset = dataset.map(extract_pairs)
dataset = dataset.remove_columns(["translation"])
dataset = dataset.filter(lambda x: len(x["prompt"].strip()) > 0 and len(x["completion"].strip()) > 0)

def format_dataset(examples):
    """Convert a prompt/completion row into a two-turn chat conversation.

    Args:
        examples: Mapping with "prompt" and "completion" entries. Despite the
            plural name, it is indexed here as a single row.

    Returns:
        dict: {"messages": [user turn, assistant turn]}, where each turn is a
        dict with "role" and "content" keys.
    """
    user_turn = {"role": "user", "content": examples["prompt"]}
    assistant_turn = {"role": "assistant", "content": examples["completion"]}
    return {"messages": [user_turn, assistant_turn]}

# Build the chat-style 'messages' column and drop the intermediate columns,
# leaving 'messages' as the data passed to the trainer.
dataset = dataset.map(format_dataset)
dataset = dataset.remove_columns(['prompt', 'completion'])
print(dataset[0]['messages'])
print(f"\n✅ Total samples: {len(dataset)}")
# Keep the first conversation around: the tokenizer cell uses it to preview
# the chat template.
messages=dataset[0]['messages']
messages


Map:   0%|          | 0/16591 [00:00<?, ? examples/s]

Filter:   0%|          | 0/16591 [00:00<?, ? examples/s]

Map:   0%|          | 0/16591 [00:00<?, ? examples/s]

[{'content': 'Give your application an accessibility workout', 'role': 'user'}, {'content': 'अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें', 'role': 'assistant'}]

✅ Total samples: 16591


[{'content': 'Give your application an accessibility workout', 'role': 'user'},
 {'content': 'अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें',
  'role': 'assistant'}]

In [27]:
# Set up the tokenizer that matches the base model (same Hub repo id).
tokenizer = AutoTokenizer.from_pretrained(repo_id)
# Use the unknown token as the padding token.
# NOTE(review): presumably chosen so padding is not confused with the
# end-of-sequence token during training — confirm the intent.
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.unk_token_id

# Render the sample conversation as text (no tokenization) to inspect the
# chat format that the model will be trained on.
print("Chat template preview:")
print(tokenizer.apply_chat_template(messages, tokenize=False))


Chat template preview:
<|user|>
Give your application an accessibility workout<|end|>
<|assistant|>
अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें<|end|>
<|endoftext|>


In [28]:
# Training configuration for the supervised fine-tuning run.
# SFTConfig is already imported in the notebook's import cell, so it is not
# re-imported here.
sft_config = SFTConfig(
    # Memory-speed balance: fixed batch of 8 per device, no gradient
    # accumulation, no automatic batch-size search.
    gradient_checkpointing=False,
    gradient_accumulation_steps=1,
    per_device_train_batch_size=8,
    auto_find_batch_size=False,

    # Dataset: sequences of at most 128 tokens, with packing enabled.
    # NOTE(review): with the "wrapped" strategy, examples are presumably
    # concatenated and cut at max_length boundaries — confirm against the TRL docs.
    max_length=128,
    packing=True,
    packing_strategy="wrapped",

    # Training: three passes over the data with the paged 8-bit AdamW optimizer.
    num_train_epochs=3,
    learning_rate=3e-4,
    optim="paged_adamw_8bit",

    # Precision and logging: fp16 mixed precision, log every 10 steps,
    # write checkpoints to output_dir, and disable external experiment trackers.
    fp16=True,
    logging_steps=10,
    logging_dir="./logs",
    output_dir="./phi3-mini-en-hi-adapter",
    report_to="none"
)


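Supervised Fine-Tuning (SFT)
1. This step runs the complete fine-tuning pass over the ~16K-sample dataset. Note that only the LoRA adapter weights are updated (about 12.58M of the 3,833.66M parameters), not the full model.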

In [29]:
# Fine-tune with the SFT trainer: the LoRA-wrapped model, the tokenizer as the
# processing class, the training configuration, and the chat-formatted dataset.
trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    args=sft_config,
    train_dataset=dataset,
)

# Run training; the returned TrainOutput (step count, loss, metrics) is
# displayed as the cell's result.
trainer.train()


Tokenizing train dataset:   0%|          | 0/16591 [00:00<?, ? examples/s]

Packing train dataset:   0%|          | 0/16591 [00:00<?, ? examples/s]

Step,Training Loss
10,0.9042
20,0.9378
30,0.9426
40,0.9328
50,0.8114
60,0.8348
70,0.8686
80,0.8608
90,0.819
100,0.7675


TrainOutput(global_step=1509, training_loss=0.39151088128112027, metrics={'train_runtime': 4100.9136, 'train_samples_per_second': 2.944, 'train_steps_per_second': 0.368, 'total_flos': 3.4629791174885376e+16, 'train_loss': 0.39151088128112027, 'entropy': 0.2112814337015152, 'num_tokens': 1542072.0, 'mean_token_accuracy': 0.9500218894746568, 'epoch': 3.0})

In [30]:
# prompts and generating translation from english to hindi
def gen_prompt(tokenizer, sentence):
    """Wrap a sentence as a single user turn and render it with the chat template.

    Args:
        tokenizer: Object providing `apply_chat_template`.
        sentence: Text to place in the user message.

    Returns:
        The templated prompt as returned by the tokenizer (requested as text,
        not token ids), with the generation prompt appended.
    """
    user_turn = {"role": "user", "content": sentence}
    return tokenizer.apply_chat_template(
        [user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )

def generate(model, tokenizer, prompt, max_new_tokens=64, skip_special_tokens=True):
    """Generate a continuation for `prompt` and return it as decoded text.

    Args:
        model: Model exposing `device`, `dtype`, `eval()` and `generate()`.
        tokenizer: Tokenizer used to encode the prompt and decode the output;
            its `eos_token_id` is passed to generation as the stop token.
        prompt: Already-templated prompt text (no special tokens are added
            when it is tokenized).
        max_new_tokens: Upper bound on the number of generated tokens.
        skip_special_tokens: Forwarded to `tokenizer.decode`.

    Returns:
        The decoded text of the first returned sequence.

    Side effects:
        Switches the model to evaluation mode via `model.eval()`.
    """
    inputs = tokenizer(prompt, add_special_tokens=False, return_tensors="pt").to(model.device)
    model.eval()

    # Autocast only when the model reports a half-precision dtype; otherwise
    # run generation without any autocast context.
    if model.dtype in (torch.float16, torch.bfloat16):
        autocast_ctx = torch.autocast(device_type=model.device.type, dtype=model.dtype)
    else:
        autocast_ctx = nullcontext()

    with autocast_ctx:
        output_ids = model.generate(
            **inputs,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=max_new_tokens,
        )

    first_sequence = output_ids[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=skip_special_tokens)


In [31]:
# Example-1: translate a simple declarative sentence with the fine-tuned model.
# The printed text is the full decoded sequence, so it starts with the input sentence.
sentence = "The weather is very pleasant today."
prompt = gen_prompt(tokenizer, sentence)
print(generate(model, tokenizer, prompt))

The weather is very pleasant today. विशेषता है दिन के साथ. 


In [41]:
# Example-2: translate a short question with the fine-tuned model.
# The printed text is the full decoded sequence, so it starts with the input sentence.
sentence = "Are you mad ?"
prompt = gen_prompt(tokenizer, sentence)
print(generate(model, tokenizer, prompt))

Are you mad ? क्या आपको बेगम में जारी रखने के लिए अत्यधिक मदद देंगे


In [33]:
# %%
# Save the trained model locally to the "local-phi3-mini-en-hi-adapter" directory.
trainer.save_model("local-phi3-mini-en-hi-adapter")

In [34]:
# Publish the trained adapter to the Hugging Face Hub.
from huggingface_hub import login
login()  # interactive login: shows a widget asking for a Hugging Face access token

# Upload the trainer's output to the Hub; the resulting CommitInfo is shown below.
trainer.push_to_hub()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...-mini-en-hi-adapter/tokenizer.model: 100%|##########|  500kB /  500kB            

  ...i-adapter/adapter_model.safetensors:   1%|1         |  558kB / 50.4MB            

  ...ini-en-hi-adapter/training_args.bin:   1%|1         |  67.0B / 6.10kB            

CommitInfo(commit_url='https://huggingface.co/ajeet9843/phi3-mini-en-hi-adapter/commit/f4d272041b458060049edef004279308fd32939a', commit_message='End of training', commit_description='', oid='f4d272041b458060049edef004279308fd32939a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ajeet9843/phi3-mini-en-hi-adapter', endpoint='https://huggingface.co', repo_type='model', repo_id='ajeet9843/phi3-mini-en-hi-adapter'), pr_revision=None, pr_num=None)

## References
1. https://huggingface.co/datasets/cfilt/iitb-english-hindi