<a href="https://colab.research.google.com/github/bravetux/FineTuning_AI/blob/main/Assignment1/finetune_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This notebook implements a Supervised Fine-Tuning (SFT) pipeline:

**The LLM:** It uses Mistral 7B (mistralai/Mistral-7B-v0.1), a powerful open-source Large Language Model.

We load it in 4-bit quantized mode so that it fits comfortably in GPU memory on the Colab A100 (High-RAM) runtime.

**The Dataset:** It takes your small, custom dataset (dataset.json) containing specific examples (Instructions → Outputs) for your chosen task.

**The Method:** It uses LoRA (Low-Rank Adaptation). Instead of updating all of the huge model's weights (which is impractical on Colab), it freezes the base model and trains small "adapter" layers to learn your specific task.

In [None]:
# Make Google Drive available inside the Colab filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install -q -U trl peft bitsandbytes transformers accelerate datasets

import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig
import sys

## 1. Configuration

In [None]:
# Identifier of the base model; passed to `from_pretrained` for both the model and the tokenizer.
MODEL_NAME = "mistralai/Mistral-7B-v0.1"
# Output location handed to `trainer.save_model` and `tokenizer.save_pretrained` after training.
NEW_MODEL = "mistral-customer-support-finetuned"
# File name of the local JSON training set (option "1" of the dataset menu points at the same file).
DATASET_FILE = "dataset.json"

## 2. Dataset Selection
Choose which dataset to use for training.

In [None]:
# Catalogue of selectable training datasets, keyed by menu number.
#   name  - label printed in the menu
#   path  - local file (type "local") or dataset id (type "hf") given to `load_dataset`
#   split - split requested from `load_dataset`
#   type  - "local" -> read through the JSON loader, anything else -> loaded by id
DATASETS = {
    "1": {
        # Uses the configuration constant so that changing DATASET_FILE actually
        # changes which file is loaded (the path used to be a duplicated literal).
        "name": f"Local {DATASET_FILE}",
        "path": DATASET_FILE,
        "split": "train",
        "type": "local"
    },
    "2": {
        "name": "Bitext Customer Support (Hugging Face)",
        "path": "bitext/Bitext-customer-support-llm-chatbot-training-dataset",
        "split": "train",
        "type": "hf"
    },
    "3": {
        "name": "Flytech Python Codes (Hugging Face)",
        "path": "flytech/python-codes-25k",
        "split": "train",
        "type": "hf"
    },
    "4": {
        "name": "Databricks Dolly 15k (Hugging Face)",
        "path": "databricks/databricks-dolly-15k",
        "split": "train",
        "type": "hf"
    }
}

print("\n--- Select Dataset ---")
for key, info in DATASETS.items():
    print(f"{key}. {info['name']}")

# Menu key of the dataset to train on. It is also read later by `format_instruction`
# to decide which column schema to expect, so it must match the dataset that is loaded.
choice = "1"
if choice not in DATASETS:
    # Fail with an actionable message instead of a bare KeyError.
    raise ValueError(
        f"Unknown dataset choice {choice!r}; expected one of {sorted(DATASETS)}"
    )
selected = DATASETS[choice]
print(f"Selected: {selected['name']}")


--- Select Dataset ---
1. Local dataset.json
2. Bitext Customer Support (Hugging Face)
3. Flytech Python Codes (Hugging Face)
4. Databricks Dolly 15k (Hugging Face)
Selected: Bitext Customer Support (Hugging Face)


## 3. Load Model & Tokenizer

In [None]:
print(f"Loading model: {MODEL_NAME}...")

cuda_available = torch.cuda.is_available()

# 4-bit NF4 quantization settings with float16 as the compute dtype; double
# quantization is switched off.
compute_dtype = torch.float16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=compute_dtype,
)

# Put every module on GPU 0 when CUDA is present; otherwise stay on the CPU.
if cuda_available:
    device_map = {"": 0}
else:
    device_map = "cpu"
print(f"Using device: {device_map}")

# Quantization is only requested when a GPU is available.
quantization_config = bnb_config if cuda_available else None

try:
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map=device_map,
        quantization_config=quantization_config,
        trust_remote_code=True,
    )
except Exception as e:
    # Best-effort fallback: any failure of the first attempt triggers a plain,
    # non-quantized load with default settings.
    print(f"Error loading model with quantization: {e}")
    print("Attempting to load without quantization...")
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map=device_map)

# `use_cache` is turned off for training.
model.config.use_cache = False
# NOTE(review): `pretraining_tp` looks like a Llama-style setting; confirm it has any effect for this model.
model.config.pretraining_tp = 1

# Tokenizer: the base tokenizer gets EOS assigned as its pad token, and padding is
# applied on the right.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

Loading model: mistralai/Mistral-7B-v0.1...
Using device: {'': 0}


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## 4. Load Dataset

In [None]:
print(f"Loading dataset {selected['name']}...")

# Local files are read through the generic "json" loader; every other entry is
# loaded directly by its dataset id.
is_local_file = selected["type"] == "local"
dataset = (
    load_dataset("json", data_files=selected["path"], split=selected["split"])
    if is_local_file
    else load_dataset(selected["path"], split=selected["split"])
)

def _optional_column_value(sample, column, i):
    """Return ``sample[column][i]``, or ``""`` if the column is absent or the value is empty/None."""
    if column not in sample:
        return ""
    return sample[column][i] or ""


def _format_row(sample, i, dataset_choice):
    """Build the prompt for row ``i``; raises KeyError/IndexError when the batch does not fit the schema."""
    instruction = sample['instruction'][i]

    # Bitext ("2") and Dolly ("4") store the answer under 'response'; the local
    # file ("1"), Flytech ("3") and the fallback schema use 'output'.
    answer_column = 'response' if dataset_choice in ("2", "4") else 'output'
    answer = sample[answer_column][i]

    if dataset_choice in ("1", "3"):
        # Optional free-text input goes on its own line, but only when it is
        # non-empty, so empty inputs do not leave a dangling newline in the prompt.
        extra_input = _optional_column_value(sample, 'input', i)
        if extra_input:
            instruction = f"{instruction}\n{extra_input}"
    elif dataset_choice == "4":
        context = _optional_column_value(sample, 'context', i)
        if context:
            instruction = f"{instruction}\nContext: {context}"

    return f"<s>[INST] {instruction} [/INST] {answer} </s>"


def format_instruction(sample, dataset_choice=None):
    """Render a batch of dataset rows as ``<s>[INST] ... [/INST] ... </s>`` training prompts.

    Args:
        sample: Batch in column layout (mapping of column name -> list of values),
            e.g. what ``Dataset.map(..., batched=True)`` passes to its function.
        dataset_choice: Menu key selecting the column schema ("1" local file,
            "2" Bitext, "3" Flytech, "4" Dolly; any other value uses the plain
            instruction/output schema). Defaults to the notebook-level ``choice``
            so existing one-argument calls keep working.

    Returns:
        list[str]: One prompt per row, in batch order. An empty batch gives ``[]``.
        A row that cannot be formatted (missing column, column shorter than the
        batch) yields ``""`` so the result stays aligned with the batch.

    Warns:
        UserWarning: once per call when at least one row could not be formatted.
        Previously these failures were swallowed silently, which let a schema
        mismatch turn the whole training set into empty strings unnoticed.
    """
    import warnings  # local import: keeps the cell independent of the import cell

    if dataset_choice is None:
        dataset_choice = choice  # notebook-level selection

    # Batch size is the length of the first column; an empty mapping has no rows.
    first_key = next(iter(sample), None)
    if first_key is None:
        return []
    batch_size = len(sample[first_key])

    output_texts = []
    failures = []
    for i in range(batch_size):
        try:
            prompt = _format_row(sample, i, dataset_choice)
        except (KeyError, IndexError) as err:
            failures.append(f"row {i}: {type(err).__name__}: {err}")
            prompt = ""
        output_texts.append(prompt)

    if failures:
        warnings.warn(
            f"format_instruction: {len(failures)} of {batch_size} rows could not be "
            f"formatted for dataset choice {dataset_choice!r} and were replaced by "
            f"empty prompts (first problem: {failures[0]}).",
            UserWarning,
            stacklevel=2,
        )
    return output_texts

Loading dataset Bitext Customer Support (Hugging Face)...


README.md: 0.00B [00:00, ?B/s]

Bitext_Sample_Customer_Support_Training_(…):   0%|          | 0.00/19.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/26872 [00:00<?, ? examples/s]

## 5. Training Configuration

In [None]:
# LoRA adapter settings: rank 64, alpha 16, dropout 0.1, no bias training,
# applied to the projection modules listed in `target_modules`.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

# Plain TrainingArguments are used for stability and compatibility.
training_arguments = TrainingArguments(
    output_dir="./results",
    report_to="none",
    # --- run length ---
    num_train_epochs=1,
    max_steps=50,  # the run stops after 50 optimizer steps
    # --- batching: 1 sequence per device x 4 accumulation steps per update ---
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    group_by_length=True,
    dataloader_num_workers=4,
    # --- optimizer / schedule ---
    optim="paged_adamw_32bit" if torch.cuda.is_available() else "adamw_torch",
    learning_rate=2e-4,
    weight_decay=0.001,
    max_grad_norm=0.3,
    lr_scheduler_type="constant",
    # NOTE(review): with the "constant" scheduler this warmup is presumably unused;
    # "constant_with_warmup" would be needed for it to apply — confirm intent.
    warmup_ratio=0.03,
    # --- precision / memory: fp16 mixed precision on GPU only, never bf16 ---
    fp16=torch.cuda.is_available(),
    bf16=False,
    gradient_checkpointing=True,
    # --- bookkeeping ---
    logging_steps=5,
    save_steps=25,
)

## 6. Train

In [None]:
import sys
import torch
import gc
from trl import SFTTrainer
from peft import PeftModel, prepare_model_for_kbit_training

# --- 1. Reset and Cleanup ---
# When this cell is re-run, the previous trainer still owns the PEFT-wrapped model.
# The notebook-level `model` is the object that was handed to SFTTrainer, so checking
# only `isinstance(model, PeftModel)` misses the wrapper that lives on the trainer and
# a second set of adapters gets stacked onto the first (PEFT then warns "You are
# trying to modify a model with PEFT for a second time"). Unload through the
# trainer's wrapper before discarding the trainer.
try:
    previous_model = trainer.model
except NameError:
    previous_model = None  # first run: no trainer exists yet
else:
    del trainer

if isinstance(previous_model, PeftModel):
    print("Unloading existing PEFT adapter...")
    try:
        # `unload()` removes the adapter layers without merging and returns the base model.
        model = previous_model.unload()
    except Exception as e:
        print(f"Note: cleanup check skipped: {e}")
del previous_model

# Release memory held by the old trainer to reduce the risk of OOM.
gc.collect()
torch.cuda.empty_cache()

# Also handle the case where `model` itself is a PEFT wrapper.
try:
    if isinstance(model, PeftModel):
        print("Unloading existing PEFT adapter...")
        model = model.unload()
except NameError:
    pass # model not defined yet
except Exception as e:
    print(f"Note: cleanup check skipped: {e}")

# --- 2. Cast BFloat16 parameters to Float32 ---
# Training runs with fp16 mixed precision and bf16 disabled, so any parameter that
# was loaded as BFloat16 is cast to Float32 here (originally added as a workaround
# for GPUs without BFloat16 support in gradient scaling, such as the T4).
print("Scanning model for incompatible BFloat16 parameters...")
count = 0
for name, param in model.named_parameters():
    if param.dtype == torch.bfloat16:
        param.data = param.data.to(torch.float32)
        count += 1
if count > 0:
    print(f"Fixed {count} parameters by casting to Float32.")

# Prepare model (this also helps cast LayerNorms)
model = prepare_model_for_kbit_training(model)

# --- 3. Data Processing ---
print("Processing dataset manually...")
def process_batch(batch):
    """Add a "text" column holding the formatted prompt for every row of the batch."""
    return {"text": format_instruction(batch)}

# Skipped when a "text" column already exists (e.g. on a re-run of this cell).
if "text" not in dataset.column_names:
    dataset = dataset.map(process_batch, batched=True)

# `format_instruction` yields "" for rows it could not format. Training on empty
# strings is never useful, so drop them, and stop early if nothing is left
# (which means the selected schema does not match the loaded dataset).
usable_rows = dataset.filter(lambda row: bool(row["text"]))
dropped = len(dataset) - len(usable_rows)
if len(usable_rows) == 0:
    raise ValueError(
        "Every row has an empty 'text' value; check that `choice` matches the "
        "columns of the loaded dataset."
    )
if dropped:
    print(f"Dropping {dropped} rows with empty text.")
    dataset = usable_rows

# --- 4. Initialize Trainer ---
# SFTTrainer receives the LoRA config and wraps `model` with the adapters itself.
print("Initializing Trainer...")
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    processing_class=tokenizer,
    args=training_arguments,
)

# --- 5. Final Safety Check ---
# The adapters created inside the trainer may have been initialised as BFloat16;
# repeat the cast on the wrapped model.
print("Final check for BFloat16 in Trainer model...")
for name, param in trainer.model.named_parameters():
    if param.dtype == torch.bfloat16:
        param.data = param.data.to(torch.float32)

# --- 6. Train ---
print("Starting training...")
trainer.train()

Scanning model for incompatible BFloat16 parameters...
Processing dataset manually...
Initializing Trainer...




Final check for BFloat16 in Trainer model...
Starting training...


Step,Training Loss
5,1.222
10,1.1531
15,1.102
20,1.1261
25,1.1569
30,1.0821
35,1.0622
40,1.04
45,1.0282
50,0.9162


TrainOutput(global_step=50, training_loss=1.088877182006836, metrics={'train_runtime': 124.3555, 'train_samples_per_second': 1.608, 'train_steps_per_second': 0.402, 'total_flos': 1429862573555712.0, 'train_loss': 1.088877182006836, 'epoch': 0.007442691277165823})

## 7. Save Model

In [None]:
# Write the trainer's model output and the tokenizer files into the same directory.
# NOTE(review): with a PEFT-wrapped model, `save_model` presumably stores only the
# adapter weights rather than a merged full model — confirm before deploying.
output_dir = NEW_MODEL
print("Saving model...")
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model and tokenizer saved to {output_dir}")

Saving model...
Model and tokenizer saved to mistral-customer-support-finetuned


**200 Steps**
```
Scanning model for incompatible BFloat16 parameters...
Processing dataset manually...
Initializing Trainer...
/usr/local/lib/python3.12/dist-packages/peft/mapping_func.py:72: UserWarning: You are trying to modify a model with PEFT for a second time. If you want to reload the model with a different config, make sure to call `.unload()` before.
  warnings.warn(
/usr/local/lib/python3.12/dist-packages/peft/tuners/tuners_utils.py:282: UserWarning: Already found a `peft_config` attribute in the model. This will lead to having multiple adapters in the model. Make sure to know what you are doing!
  warnings.warn(
Final check for BFloat16 in Trainer model...
Starting training...
 [200/200 08:12, Epoch 0/1]
Step	Training Loss
5	1.222100
10	1.153100
15	1.102100
20	1.126000
25	1.156700
30	1.082200
35	1.062100
40	1.040600
45	1.027600
50	0.916200
55	0.890600
60	0.759400
65	0.803300
70	0.968300
75	0.882700
80	0.863100
85	0.841000
90	0.941000
95	0.849500
100	0.879400
105	0.761600
110	0.715600
115	0.737900
120	0.823100
125	0.816400
130	0.866900
135	0.850000
140	0.857300
145	0.817600
150	0.935300
155	0.684800
160	0.709300
165	0.661400
170	0.852100
175	0.857200
180	0.801400
185	0.885900
190	0.825900
195	0.781500
200	0.861200
TrainOutput(global_step=200,
training_loss=0.8917294180393219, metrics={'train_runtime': 495.6587,
'train_samples_per_second': 1.614, 'train_steps_per_second': 0.404,
'total_flos': 5588132819460096.0,
'train_loss': 0.8917294180393219, 'epoch': 0.02977076510866329})

```





# 50 Steps
```
Scanning model for incompatible BFloat16 parameters...
Processing dataset manually...
Initializing Trainer...
/usr/local/lib/python3.12/dist-packages/peft/mapping_func.py:72: UserWarning: You are trying to modify a model with PEFT for a second time. If you want to reload the model with a different config, make sure to call `.unload()` before.
  warnings.warn(
/usr/local/lib/python3.12/dist-packages/peft/tuners/tuners_utils.py:282: UserWarning: Already found a `peft_config` attribute in the model. This will lead to having multiple adapters in the model. Make sure to know what you are doing!
  warnings.warn(
Final check for BFloat16 in Trainer model...
Starting training...
 [50/50 02:01, Epoch 0/1]
Step	Training Loss
5	1.222000
10	1.153100
15	1.102000
20	1.126100
25	1.156900
30	1.082100
35	1.062200
40	1.040000
45	1.028200
50	0.916200
TrainOutput(global_step=50, training_loss=1.088877182006836, metrics={'train_runtime': 124.3555, 'train_samples_per_second': 1.608, 'train_steps_per_second': 0.402, 'total_flos': 1429862573555712.0, 'train_loss': 1.088877182006836, 'epoch': 0.007442691277165823})


```

