# ALD-01 — TinyLlama 1.1B QLoRA Fine-tuning
**Indian Bilingual LLM | English + Hindi | Colab Notebook**

---
Pipeline: Install → Dataset → 4-bit Load → LoRA → Train → Merge → GGUF → Evaluate

## 1. Install Dependencies

In [None]:
# Install PyTorch, torchvision and torchaudio from the CUDA 11.8 wheel index.
%pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# Install the Hugging Face fine-tuning stack imported by the later cells.
# NOTE(review): no versions are pinned, so whatever releases are current get installed.
# The SFTTrainer keyword arguments used in section 6 differ between TRL releases —
# confirm against the installed version and pin if the notebook must be reproducible.
%pip install -q transformers datasets peft bitsandbytes trl accelerate sentencepiece

In [None]:
# Optional: clone your GitHub repo
# !git clone https://github.com/YOUR_USERNAME/ALD-01-LLM.git
# %cd ALD-01-LLM

import os

# Create the dataset folder and the LoRA output folder up front.
# exist_ok=True keeps this cell safe to re-run.
for _workdir in ("data", "outputs/ald01-lora"):
    os.makedirs(_workdir, exist_ok=True)

## 2. Load and Prepare Dataset

In [None]:
from datasets import load_dataset

# Location of the instruction dataset, relative to the notebook's working directory.
DATASET_PATH = "data/ald01_dataset.json"

# Read the JSON file through the `datasets` JSON loader and take its "train" split.
dataset = load_dataset("json", data_files=DATASET_PATH, split="train")

def format_prompt(sample):
    """Render one dataset record as an instruction/response training prompt.

    Args:
        sample: Mapping with "instruction" and "output" keys and an optional
            "input" key.

    Returns:
        A dict ``{"text": prompt}``. The prompt always has "### Instruction:"
        and "### Response:" sections; the "### Input:" section is included
        only when the record carries a non-blank input.

    Raises:
        KeyError: If "instruction" or "output" is missing from ``sample``.
    """
    instruction = sample["instruction"]
    output = sample["output"]
    inp = sample.get("input", "")

    # A missing, null, empty or whitespace-only input must not produce an
    # empty "### Input:" section; such records use the two-section layout.
    # Non-blank input is inserted unchanged (no stripping).
    if isinstance(inp, str):
        has_input = bool(inp.strip())
    else:
        has_input = bool(inp)

    sections = [f"### Instruction:\n{instruction}"]
    if has_input:
        sections.append(f"### Input:\n{inp}")
    sections.append(f"### Response:\n{output}")
    return {"text": "\n\n".join(sections)}

# Add the rendered "text" column to every record.
dataset = dataset.map(format_prompt)

# Sanity check: report the record count and show the start of the first prompt.
record_count = len(dataset)
print(f"Dataset size: {record_count}")
first_prompt = dataset[0]["text"]
print(first_prompt[:300])

## 3. Load Base Model in 4-bit Quantization

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training

BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Compute in bfloat16 when the GPU reports support for it, otherwise float16.
if torch.cuda.is_bf16_supported():
    compute_dtype = torch.bfloat16
else:
    compute_dtype = torch.float16

# 4-bit NF4 quantization with double quantization enabled.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)

# The EOS token doubles as the padding token; padding goes on the right.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# Load the quantized base model, then prepare it for k-bit training.
model = prepare_model_for_kbit_training(
    AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
    )
)
print("Model loaded in 4-bit.")

## 4. Configure LoRA Adapter

In [None]:
from peft import LoraConfig, get_peft_model

# Names of the projection modules that receive LoRA adapters.
lora_targets = [
    "q_proj",
    "v_proj",
    "k_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

# Rank-16 adapters with alpha 32 and 5% dropout; bias terms are not trained.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_targets,
)

# Wrap the model with the adapters and report the trainable parameter count.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

## 5. Configure Training Arguments

In [None]:
from transformers import TrainingArguments

OUTPUT_DIR = "outputs/ald01-lora"

# Probe bf16 support once; exactly one of bf16 / fp16 is switched on below.
use_bf16 = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    # Batching: 2 samples per device x 4 accumulation steps per optimizer update.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    # Schedule: 3 epochs, cosine decay, warmup over the first 5% of training.
    num_train_epochs=3,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    # Logging and checkpointing: keep only the two most recent checkpoints.
    logging_steps=10,
    save_steps=100,
    save_total_limit=2,
    report_to="none",
    # Precision and optimizer.
    bf16=use_bf16,
    fp16=not use_bf16,
    optim="paged_adamw_8bit",
)

## 6. Run QLoRA Fine-Tuning with SFTTrainer

In [None]:
from trl import SFTTrainer

# Build the supervised fine-tuning trainer over the dataset's "text" column,
# without packing and with a 1024 maximum sequence length.
# NOTE(review): `tokenizer`, `dataset_text_field`, `max_seq_length` and `packing`
# are passed straight to SFTTrainer; which of these keywords it accepts differs
# between TRL releases — confirm against the installed TRL version.
trainer = SFTTrainer(
    model              = model,
    train_dataset      = dataset,
    tokenizer          = tokenizer,
    args               = training_args,
    dataset_text_field = "text",
    max_seq_length     = 1024,
    packing            = False,
)

# Run the training loop, then report completion.
trainer.train()
print("Training complete.")

## 7. Save LoRA Adapter

In [None]:
# Save the trained model held by the trainer, plus the tokenizer, into OUTPUT_DIR.
# Section 8 reloads the adapter from this same directory.
trainer.model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"LoRA adapter saved to: {OUTPUT_DIR}")

# Save to Google Drive (uncomment in Colab)
# from google.colab import drive
# drive.mount('/content/drive')
# !cp -r {OUTPUT_DIR} "/content/drive/MyDrive/ALD-01/"
# print("Adapter copied to Google Drive.")

## 8. Merge LoRA Adapter into Base Model

In [None]:
from peft import PeftModel

MERGED_DIR = "outputs/ald01-merged"

# Reload the base model in float16 on the CPU (no 4-bit quantization here).
base_fp16 = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,
    device_map="cpu",
    trust_remote_code=True,
)

# Attach the adapter saved in OUTPUT_DIR and fold it into the base weights.
merge_model = PeftModel.from_pretrained(base_fp16, OUTPUT_DIR).merge_and_unload()

# Write the merged model (safetensors) and the tokenizer side by side.
merge_model.save_pretrained(MERGED_DIR, safe_serialization=True)
tokenizer.save_pretrained(MERGED_DIR)
print(f"Merged model saved to: {MERGED_DIR}")

## 9. Convert to GGUF Format (Q4_K_M)

In [None]:
%%bash
# Build llama.cpp's quantizer, convert the merged HF model to GGUF (f16),
# then quantize it to Q4_K_M.
#
# Abort on the first failing command so a broken clone, build or conversion
# is not followed by a misleading "complete" message.
set -euo pipefail

LLAMA_DIR=/content/llama.cpp

# Clone llama.cpp once; skip the clone when this cell is re-run.
if [ ! -d "$LLAMA_DIR/.git" ]; then
    git clone --depth 1 https://github.com/ggerganov/llama.cpp "$LLAMA_DIR"
fi
pip install -q -r "$LLAMA_DIR/requirements.txt"

# Build only the quantizer with CMake: upstream llama.cpp replaced its Makefile
# build with CMake, so `make llama-quantize` fails on a fresh clone.
# LLAMA_CURL=OFF avoids a libcurl dependency the quantizer does not need
# (CMake only warns if the checked-out revision has no such option).
cmake -S "$LLAMA_DIR" -B "$LLAMA_DIR/build" -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=OFF
cmake --build "$LLAMA_DIR/build" --target llama-quantize -j 2

# Step 1: Convert HF → GGUF (f16)
python "$LLAMA_DIR/convert_hf_to_gguf.py" outputs/ald01-merged \
    --outfile outputs/ald01-f16.gguf \
    --outtype f16

# Step 2: Quantize → Q4_K_M (~650 MB)
"$LLAMA_DIR/build/bin/llama-quantize" \
    outputs/ald01-f16.gguf \
    outputs/ald01-Q4_K_M.gguf \
    Q4_K_M

echo "GGUF quantization complete."
ls -lh outputs/*.gguf

## 10. Evaluate ALD-01 Responses

In [None]:
from transformers import pipeline as hf_pipeline

# Text-generation pipeline over the merged model saved in MERGED_DIR.
# Sampling is enabled, so outputs vary between runs.
eval_pipe = hf_pipeline(
    "text-generation",
    model=MERGED_DIR,
    torch_dtype=torch.float16,
    device_map="auto",
    max_new_tokens=256,
    do_sample=True,
    temperature=0.7,
    repetition_penalty=1.1,
)

# (label, instruction) pairs used as spot checks.
TEST_CASES = [
    (
        "Hindi Response",
        "Machine learning ko simple Hindi mein explain karo.",
    ),
    (
        "Office Email",
        "Write a professional email to inform a client about a 1-week project delay due to server migration.",
    ),
    (
        "React Component",
        "Write a minimal React functional component that displays a loading spinner.",
    ),
    (
        "Logical Reasoning",
        "If all A are B, and all B are C, are all A necessarily C? Explain step by step.",
    ),
]

for label, instruction in TEST_CASES:
    # Same no-input prompt layout that format_prompt produces, minus the answer.
    prompt = f"### Instruction:\n{instruction}\n\n### Response:\n"
    generations = eval_pipe(prompt)
    full_text = generations[0]["generated_text"]
    # Drop the leading prompt-length prefix so only the completion is shown.
    completion = full_text[len(prompt):].strip()

    print(f"\n{'='*60}")
    print(f"[{label}]")
    print("-" * 40)
    print(completion)

## Troubleshooting

| Error | Fix |
|---|---|
| `CUDA out of memory` | Reduce `per_device_train_batch_size` to 1, increase `gradient_accumulation_steps` to 8 |
| `bitsandbytes not available` | Run: `!pip install bitsandbytes --upgrade` and restart runtime |
| `ModuleNotFoundError: No module named 'trl'` | Run: `!pip install trl --upgrade` and restart the runtime |
| Model outputs gibberish | Check tokenizer `pad_token` is set to `eos_token` |
| Training loss not decreasing | Increase `num_train_epochs`; verify every dataset record has `instruction` and `output` fields (`input` is optional) |
| Colab session crashes | Enable GPU: Runtime → Change Runtime Type → T4 GPU |