In [90]:
#!/usr/bin/env python
"""
LoRA fine‑tuning of a 4‑bit quantized causal LM,
using pre‑tokenization & Transformers Trainer + PEFT.
"""

import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
)

model_id      = "microsoft/Phi-3-mini-4k-instruct"  
dataset_path  = "dependency_dataset.jsonl" 
cuda_idx = torch.cuda.current_device()


# LoRA hyperparams
lora_r        = 16
lora_alpha    = 32
lora_dropout  = 0.05

target_modules = ['down_proj', 'gate_up_proj', 'o_proj', 'qkv_proj']

# Training hyperparams
num_epochs    = 100
per_device_bs = 4
grad_accum    = 8
learning_rate = 2e-4 
max_seq_length = 1024 

# 2. Load tokenizer & 4‑bit model
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    trust_remote_code=True,
    use_fast=True,
)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Loading 4‑bit quantized model...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={ "": cuda_idx },
    trust_remote_code=True,
)
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)


# 3. Attach LoRA adapters

print("Configuring LoRA adapters...")
lora_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    target_modules=target_modules,
    lora_dropout=lora_dropout,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Load dataset
print(f"Loading dataset from {dataset_path}...")
raw_ds = load_dataset("json", data_files=dataset_path, split="train")

# Format prompts
def formatting_prompts_func(ex):
    """
    Prompt formatter for microsoft/Phi-3-mini-4k-instruct.
    Uses tokenizer.apply_chat_template to build:
      [User:] prompt + input
      [Assistant:] output
    """
    # Combine the instruction and facts into one user message
    user_content = f"{ex['prompt']}\n{ex['input']}"

    # Apply Phi‑3’s chat template
    formatted = tokenizer.apply_chat_template(
        [
            {"role": "user",    "content": user_content},
            {"role": "assistant","content": ex["output"]},
        ],
        add_generation_prompt=False, 
        tokenize=False,
    )

    # Return as the single “text” field for downstream tokenization
    return {"text": formatted}

print("Applying prompt formatting...")
ds_text = raw_ds.map(
    formatting_prompts_func,
    remove_columns=list(raw_ds.features),
)


# pretokenize
def tokenize_for_causal(examples):
    # Tokenize & pad/truncate
    batch = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=max_seq_length,
    )
    # Causal LM: labels = input_ids
    batch["labels"] = batch["input_ids"].copy()
    return batch

print("Tokenizing dataset...")
tokenized_ds = ds_text.map(
    tokenize_for_causal,
    batched=True,
    remove_columns=["text"],
)

# │ 7. Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer,
    mlm=False,
    return_tensors="pt",
)




Loading tokenizer...
Loading 4‑bit quantized model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Configuring LoRA adapters...
trainable params: 25,165,824 || all params: 3,846,245,376 || trainable%: 0.6543
Loading dataset from dependency_dataset.jsonl...
Applying prompt formatting...
Tokenizing dataset...


In [91]:
# ╭───────────────────────────────╮
# │ 8. Training arguments         │
# ╰───────────────────────────────╯
train_args = TrainingArguments(
    output_dir="./lora_adapter",
    num_train_epochs=num_epochs,
    per_device_train_batch_size=per_device_bs,
    gradient_accumulation_steps=grad_accum,
    learning_rate=learning_rate,
    bf16=True,
    logging_steps=20,
    save_strategy="epoch",
    optim="paged_adamw_8bit",
    gradient_checkpointing=True,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    max_grad_norm=0.3,
)


In [92]:
uniq = set()
for n,_ in model.named_modules():
    if n.endswith("proj"):
        base_name = n.split(".")[-1] 
        uniq.add(base_name)
print(sorted(uniq))

['down_proj', 'gate_up_proj', 'o_proj', 'qkv_proj']


In [93]:

# ╭───────────────────────────────╮
# │ 9. Trainer & train            │
# ╰───────────────────────────────╯
print("Starting training with Trainer...")
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=tokenized_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()
trainer.save_model()

print("Done! LoRA adapter saved in ./lora_adapter")

  trainer = Trainer(
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Starting training with 🤗 Trainer...


Step,Training Loss
20,1.0046
40,0.279
60,0.0646
80,0.0288
100,0.023
120,0.0213
140,0.02
160,0.0197
180,0.0197
200,0.0195


Done! LoRA adapter saved in ./lora_adapter


In [96]:
import json, torch
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, DynamicCache
if not hasattr(DynamicCache, "get_max_length"):
    def _get_max_length(self):  # self.get_max_cache_shape() returns (batch, heads, seq_len, hidden)
        shape = getattr(self, "get_max_cache_shape", lambda: None)()
        return shape[2] if shape else None
    DynamicCache.get_max_length = _get_max_length

# ── paths 
adapter_dir   = "./lora_adapter" 
base_model_id = model_id
patterns_file = "./fact_patterns.jsonl"
device        = "cuda" if torch.cuda.is_available() else "cpu"

# ── load tokenizer & base model in 4‑bit
tok = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True, use_fast=True)
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

base = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_cfg,
    device_map="auto",
    trust_remote_code=True,
)
model = PeftModel.from_pretrained(base, adapter_dir, device_map="auto")
model.eval()


def format_chat(prompt, _input):
    user = f"{prompt}\n{_input}"
    return tok.apply_chat_template(
        [{"role": "user", "content": user},
         {"role": "assistant", "content": ""}],
        add_generation_prompt=True,
        tokenize=False,
    )

# ── load patterns
patterns = []
with open(patterns_file) as fp:
    for line in fp:
        patterns.append(json.loads(line))
results = []
# ── run inference 
for i, ex in enumerate(patterns, 1):
    chat_str = format_chat(ex["prompt"], ex["input"])
    inputs   = tok(chat_str, return_tensors="pt").to(device)
    out_ids  = model.generate(**inputs, max_new_tokens=8, do_sample=False)
    answer   = tok.decode(out_ids[0][inputs["input_ids"].shape[-1]:],
                          skip_special_tokens=True).strip()
    results.append({"pattern": ex["input"], "model_answer": answer})

print(results)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

[{'pattern': 'TP Marcus (48, U.S. citizen) has a daughter Lily (23, U.S. citizen). Lily is a full‑time graduate student away at school but lives with Marcus on breaks (>5 months). Marcus pays 85% of Lily’s total support, including tuition. Lily earned $3,200 from campus work and files single.', 'model_answer': 'Yes'}, {'pattern': 'TP Ellen (42, U.S. citizen) supports her nephew Zack (17, U.S. citizen). Zack lived with Ellen only 5 months; the rest of the year he lived with friends. Ellen provided 40% of Zack’s support; Zack’s part‑time job covered the rest. Zack files single and earned $7,500.', 'model_answer': 'Yes'}, {'pattern': 'TP Raj (34, U.S. citizen) and two siblings each pay roughly one‑third of their mother Meena’s (68, U.S. citizen) support. Meena’s own income is $1,500 (below $4,700). The children signed a written multiple‑support agreement designating Raj to claim Meena.', 'model_answer': 'Yes'}, {'pattern': 'TP Luis (55, U.S. citizen) lets his cousin Paco (35, U.S. citizen

In [97]:
import pandas as pd, pathlib, json

df_model = pd.DataFrame(results)
csv_path = pathlib.Path("./model_predictions.csv")
df_model.to_csv(csv_path, index=False)
print("Saved model outputs ➜", csv_path)

Saved model outputs ➜ data/model_predictions.csv
