### csv_to_jsonl.py

In [None]:

# import csv
# import json

# input_csv = "qa.csv"
# output_jsonl = "qa.jsonl"

# with open(input_csv, mode='r', encoding='utf-8') as csv_file, open(output_jsonl, mode='w', encoding='utf-8') as jsonl_file:
#     reader = csv.DictReader(csv_file)
#     for row in reader:
#         json.dump(row, jsonl_file, ensure_ascii=False)
#         jsonl_file.write('\n')


In [None]:
# LORA_ADAPTER_DIR = "./gemma-qa-lora-final"
# MERGED_MODEL_DIR = "./gemma-qa-merged-4bit"


# def format_example(example):
#     """Convert each sample to the instruction-tuning prompt format."""
#     return {
#         "text": (
#             "### دستور العمل:\n"
#             f"{example['instruction']}\n\n"
#             "### ورودی:\n"
#             f"{example['input']}\n\n"
#             "### خروجی:\n"
#             f"{example['output']}"
#         )
#     }


# def answer_the_question(text: str) -> str:
#     """
#     Returns an answer of the supplied question.
#     Deterministic greedy decoding is applied.
#     """
#     prompt = (
#         "### دستور العمل:\nجواب بده به سوال\n\n"
#         f"### ورودی:\n{text}\n\n"
#         "### پاسخ:\n"
#     )
#     inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)

#     with torch.no_grad():
#         outputs = model.generate(
#             **inputs,
#             max_new_tokens = MAX_NEW_TOKENS,
#             do_sample      = False,                 # greedy decoding
#             eos_token_id   = tokenizer.eos_token_id,
#         )

#     decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
#     # Extract text following the final '### پاسخ:' tag
#     return decoded.split("### پاسخ:")[-1].strip()



# print("\nQuestion → Answer  (Gemma-3.4B, merged).")
# print("Enter a question. Type 'q' or 'quit' to exit.\n")

# while True:
#     question = input("سوال>").strip()
#     if question.lower() in {"q", "quit"}:
#         print("Session terminated.")
#         break
#     if not question:
#         continue
#     answer = answer_the_question(question)
#     print(f"جواب> {answer}\n")

### fine_tune.py

In [None]:
import os
import torch
from unsloth import FastLanguageModel
from transformers import TrainingArguments
from trl import SFTTrainer
from datasets import load_dataset

# ================================
# 🔧 Configuration
# ================================

# Base pretrained model (QLoRA-compatible) to be fine-tuned
BASE_MODEL_PATH = "./models/gemma-3-4b-it"

# JSONL dataset containing fields: instruction, input, output
DATASET_PATH = "french_translations_1000.jsonl"

# Directory for the final, cleaned LoRA adapter
FINAL_MODEL_DIR = "./gemma-french-lora-final"

# Directory for temporary Trainer checkpoints
TEMP_CHECKPOINT_DIR = "./gemma-french-lora-tmp"

# Maximum sequence length accepted by the model
MAX_SEQ_LENGTH = 512

# Micro-batch size per GPU
BATCH_SIZE = 1

# Gradient accumulation to simulate a larger batch size
# (effective batch size = BATCH_SIZE * GRAD_ACCUM_STEPS)
GRAD_ACCUM_STEPS = 4

# Learning rate for LoRA updates
LEARNING_RATE = 2e-4

# Number of complete passes through the dataset
NUM_EPOCHS = 3

# Frequency (in steps) of checkpoint creation
SAVE_STEPS = 300

# Precision flags.
# Exactly one of the two is enabled, chosen from the hardware instead of being
# hard-coded: bfloat16 when the CUDA device reports support for it, float16
# otherwise. The same flags drive both the dtype the model is loaded with and
# the Trainer's mixed-precision mode, so they stay in agreement.
USE_BF16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
USE_FP16 = not USE_BF16

# Transformer sub-modules to be adapted by LoRA
TARGET_MODULES = [
    "q_proj", "k_proj", "v_proj", "o_proj",  # Attention projections
    "gate_proj", "up_proj", "down_proj"      # Feed-forward network
]

# Disable Weights & Biases logging (optional)
os.environ["WANDB_DISABLED"] = "true"

# ================================
# 🔢 Step 1 – Load the quantised base model
# ================================
# The dtype handed to the loader follows the precision flag from the
# configuration section: bfloat16 when USE_BF16 is set, float16 otherwise.
if USE_BF16:
    compute_dtype = torch.bfloat16
else:
    compute_dtype = torch.float16

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=BASE_MODEL_PATH,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=compute_dtype,
    load_in_4bit=True,
)

# ================================
# 🧠 Step 2 – Attach LoRA adapters
# ================================
# All adapter hyper-parameters are gathered in one mapping and unpacked into
# the call below.
lora_settings = {
    "r": 8,                              # LoRA rank
    "lora_alpha": 16,                    # Scaling factor
    "lora_dropout": 0.05,                # Dropout applied to the LoRA path
    "bias": "none",                      # No bias parameters are adapted
    "target_modules": TARGET_MODULES,    # Sub-modules that receive adapters
    "use_gradient_checkpointing": True,  # Memory optimisation
    "random_state": 42,
    "use_rslora": False,
    "loftq_config": None,
}
model = FastLanguageModel.get_peft_model(model, **lora_settings)

# ================================
# 📚 Step 3 – Load and format the dataset
# ================================
# The JSONL file is read as a single "train" split.
dataset = load_dataset("json", data_files=DATASET_PATH, split="train")

def format_example(example):
    """Render one dataset record as an instruction-tuning prompt.

    Args:
        example: Mapping with the keys ``instruction``, ``input`` and
            ``output``. A value of ``None`` (e.g. a JSON ``null``) is treated
            as an empty string so the literal text "None" never ends up in
            the training prompt.

    Returns:
        A dict with a single ``"text"`` key holding the prompt in the
        ``### Instruction / ### Input / ### Response`` layout.

    Raises:
        KeyError: If one of the three keys is absent from ``example``.
    """
    def _field(name):
        # Indexing (not .get) keeps the KeyError for a genuinely missing key.
        value = example[name]
        return "" if value is None else value

    # NOTE(review): no end-of-sequence token is appended here — confirm that
    # the trainer/tokenizer adds one, otherwise the model may not learn where
    # a response ends.
    return {
        "text": (
            "### Instruction:\n"
            f"{_field('instruction')}\n\n"
            "### Input:\n"
            f"{_field('input')}\n\n"
            "### Response:\n"
            f"{_field('output')}"
        )
    }

# Add the rendered "text" column that the trainer reads.
dataset = dataset.map(format_example)

# ================================
# 🏋️ Step 4 – Fine-tune with SFTTrainer
# ================================
# Trainer hyper-parameters, all taken from the configuration section.
training_args = TrainingArguments(
    output_dir=TEMP_CHECKPOINT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
    learning_rate=LEARNING_RATE,
    logging_steps=10,
    num_train_epochs=NUM_EPOCHS,
    save_steps=SAVE_STEPS,
    save_total_limit=1,
    fp16=USE_FP16,
    bf16=USE_BF16,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LENGTH,
    args=training_args,
)

trainer.train()

# ================================
# 💾 Step 5 – Save the final LoRA adapter
# ================================
# The model is written first, then the tokenizer, both into FINAL_MODEL_DIR.
for artefact in (model, tokenizer):
    artefact.save_pretrained(FINAL_MODEL_DIR)

print(f"\n✅ Fine-tuning completed. Adapter saved to: {FINAL_MODEL_DIR}")


### merge_and_save.py

In [None]:
# merge_and_save_4bit.py  – merge LoRA → keep 4-bit
import torch
from unsloth import FastLanguageModel
from transformers import AutoTokenizer
from peft import PeftModel

# ----------------------------- settings -----------------------------
BASE_MODEL_PATH = "./models/gemma-3-4b-it"          # base checkpoint
LORA_ADAPTER_DIR = "./gemma-french-lora-final"      # adapter written by fine_tune.py
MERGED_MODEL_DIR = "./gemma-french-merged-4bit"     # destination of the merged model
MAX_SEQ_LENGTH = 512
DTYPE = torch.bfloat16                              # dtype handed to the loader
# --------------------------------------------------------------------

# Step 1: load the base model with 4-bit loading enabled.
base_model, tok = FastLanguageModel.from_pretrained(
    model_name=BASE_MODEL_PATH,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=DTYPE,
    load_in_4bit=True,
)

# Step 2: attach the trained LoRA adapter on top of the base model.
peft_model = PeftModel.from_pretrained(base_model, LORA_ADAPTER_DIR)

# Step 3: fold the adapter weights into the backbone and drop the PEFT wrapper.
# NOTE(review): the original author states the result stays 4-bit — not
# verified from this file alone.
print("🔄  Merging (4-bit) …")
merged = peft_model.merge_and_unload()

# Step 4: persist model and tokenizer side by side. No dtype cast is applied
# before saving; weights are written in safetensors format.
merged.save_pretrained(MERGED_MODEL_DIR, safe_serialization=True)
tok.save_pretrained(MERGED_MODEL_DIR)
print("✅  4-bit merged model saved.")


### inference_model.py

In [None]:
"""
inference_translate.py
--------------------------------------------------
Loads the merged Gemma 3 4B checkpoint (LoRA already integrated) and
offers an interactive terminal loop:

    • An English sentence is entered.
    • A deterministic French translation is returned.
    • Typing “q” or “quit” terminates the session.
"""

import torch
from unsloth import FastLanguageModel
from transformers import AutoTokenizer

# ---------------------------- configuration ----------------------------
MERGED_MODEL_DIR  = "./gemma-french-merged-4bit"  # produced by merge_and_save.py
MAX_SEQ_LENGTH    = 512
MAX_NEW_TOKENS    = 60                        # cap on generated tokens per request
DEVICE            = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE             = torch.bfloat16            # same dtype merge_and_save.py loads with
# -----------------------------------------------------------------------

# Load the standalone merged checkpoint (no LoRA adapter required).
# merge_and_save.py loads the base model with load_in_4bit=True and saves the
# merged result without casting, so the checkpoint in MERGED_MODEL_DIR is a
# 4-bit one; it is loaded here with the matching flag.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name     = MERGED_MODEL_DIR,
    max_seq_length = MAX_SEQ_LENGTH,
    dtype          = DTYPE,
    load_in_4bit   = True,       # matches the 4-bit merged checkpoint
)


def translate_en_to_fr(text: str) -> str:
    """
    Return the model's French translation of the supplied English sentence.

    The sentence is wrapped in the Instruction / Input / Response prompt
    layout and decoded greedily (``do_sample=False``), capped at
    ``MAX_NEW_TOKENS`` new tokens.

    Args:
        text: English sentence to translate.

    Returns:
        The generated text with surrounding whitespace stripped, cut at the
        first prompt section header should the model start a new section.
    """
    prompt = (
        "### Instruction:\nTranslate to French\n\n"
        f"### Input:\n{text}\n\n"
        "### Response:\n"
    )
    # Keyword form: valid for a plain tokenizer, and presumably also needed if
    # the loaded object is a multimodal processor whose first positional
    # parameter is not the text — TODO confirm against the loaded class.
    inputs = tokenizer(text=prompt, return_tensors="pt").to(DEVICE)
    prompt_length = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,                 # greedy decoding
            eos_token_id=tokenizer.eos_token_id,
        )

    # The returned sequence begins with the prompt tokens. Decoding only what
    # follows them avoids searching the echoed prompt for "### Response:",
    # which picks the wrong segment when the user's text contains that marker.
    new_tokens = outputs[0][prompt_length:]
    answer = tokenizer.decode(new_tokens, skip_special_tokens=True)

    # If generation runs on into another prompt section, keep only the text
    # before the first section header.
    for header in ("### Instruction:", "### Input:", "### Response:"):
        answer = answer.partition(header)[0]
    return answer.strip()

# --------------------------- interactive loop --------------------------
# Reads one sentence per line and prints its translation until the user types
# 'q' / 'quit' or closes the input stream.
print("\nEnglish → French translator (Gemma-3.4B, merged).")
print("Enter an English sentence. Type 'q' or 'quit' to exit.\n")

while True:
    try:
        src = input("EN> ").strip()
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D, Ctrl-C or exhausted piped stdin: end the session cleanly
        # instead of surfacing a traceback.
        print("\nSession terminated.")
        break
    if src.lower() in {"q", "quit"}:
        print("Session terminated.")
        break
    if not src:
        # Blank line: prompt again without calling the model.
        continue
    fr = translate_en_to_fr(src)
    print(f"FR> {fr}\n")
