In [12]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, TaskType
import torch, numpy as np, math

In [13]:
# !nvidia-smi --query-gpu=gpu_name,compute_cap --format=csv

In [14]:
# !pip show bitsandbytes

In [15]:
# %pip install bitsandbytes==0.43.3

In [16]:
# import bitsandbytes as bnb
# print("Bitsandbytes version:", bnb.__version__)

# from bitsandbytes.cuda_setup.main import get_compute_capabilities, get_cuda_lib_handle

# try:
#     print("CUDA handle:", get_cuda_lib_handle())
#     print("Compute capabilities:", get_compute_capabilities())
#     print("GPU kernels loaded OK ‚úÖ")
# except Exception as e:
#     print("‚ö†Ô∏è CUDA load error:", e)


In [17]:
# Loading the dataset

data_files = {
    "train": "fine_tune_data/bbt_train.jsonl",
    "validation": "fine_tune_data/bbt_val.jsonl", 
    # "test": "fine_tune_data/bbt_test.jsonl"
}
ds = load_dataset("json", data_files=data_files)
ds

DatasetDict({
    train: Dataset({
        features: ['ep', 'scene', 'turn_idx', 'target_speaker', 'prompt', 'target'],
        num_rows: 36701
    })
    validation: Dataset({
        features: ['ep', 'scene', 'turn_idx', 'target_speaker', 'prompt', 'target'],
        num_rows: 4738
    })
})

In [18]:
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"   # light and friendly to 8GB
tok = AutoTokenizer.from_pretrained(model_id)
base = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
# base.gradient_checkpointing_enable()  # saves RAM - throws error because gpu not new enough

config = LoraConfig(task_type=TaskType.CAUSAL_LM, r=8, lora_alpha=16, lora_dropout=0.05, target_modules=["q_proj","v_proj"])
model = get_peft_model(base, config)
model.enable_input_require_grads()
model.print_trainable_parameters()

trainable params: 1,126,400 || all params: 1,101,174,784 || trainable%: 0.1023


In [19]:
max_len = 512

def build_example(ex):
    prompt = ex.get("prompt","")
    target = ex.get("target","")
    x = prompt + target
    enc_full   = tok(x, max_length=max_len, truncation=True)
    enc_prompt = tok(prompt, max_length=max_len, truncation=True)

    input_ids = enc_full["input_ids"]
    labels    = input_ids.copy()

    # mask prompt tokens
    n_prompt = len(enc_prompt["input_ids"])
    labels[:n_prompt] = [-100]*min(n_prompt, len(labels))

    return {
        "input_ids": input_ids,
        "attention_mask": enc_full["attention_mask"],
        "labels": labels
    }

cols = ["input_ids","attention_mask","labels"]
train_tok = ds["train"].map(build_example, remove_columns=ds["train"].column_names)
val_tok   = ds["validation"].map(build_example, remove_columns=ds["validation"].column_names)
# test_tok = ds["test"].map(build_example, remove_columns=ds["test"].column_names)


# set format to torch
train_tok.set_format(type="torch", columns=cols)
val_tok.set_format(type="torch", columns=cols)
# test_tok.set_format(type="torch", columns=cols)


In [20]:
# args = TrainingArguments(
#     output_dir="./bbt-lora",
#     num_train_epochs=2,
#     per_device_train_batch_size=1,
#     per_device_eval_batch_size=1,
#     gradient_accumulation_steps=16,      # effective batch ~16
#     fp16=True,
#     learning_rate=2e-4,
#     logging_steps=50,
#     evaluation_strategy="steps",
#     eval_steps=200,
#     save_steps=200,
#     save_total_limit=2,
#     report_to="none",
#     gradient_checkpointing=True,
#     lr_scheduler_type="cosine",
#     warmup_ratio=0.03
# )


In [21]:
args = TrainingArguments(
    output_dir="./bbt-lora",
    num_train_epochs=2,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    fp16=True,
    learning_rate=2e-4,
    logging_steps=50,
    evaluation_strategy="steps",
    eval_steps=200,
    save_steps=200,
    save_total_limit=2,
    report_to="none",
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},  # üëà avoids the warning
    lr_scheduler_type="cosine",
    warmup_ratio=0.03
)

In [22]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # shift like in LM; logits is [bs, seq, vocab]
    shift_logits = logits[:, :-1, :]
    shift_labels = labels[:, 1:]
    loss_mask = (shift_labels != -100)
    # quick perplexity approx: cross-entropy on masked positions
    import torch
    shift_labels = torch.tensor(shift_labels)
    loss_mask = torch.tensor(loss_mask)
    shift_logits = torch.tensor(shift_logits)
    ce = torch.nn.functional.cross_entropy(
        shift_logits[loss_mask],
        shift_labels[loss_mask],
        reduction="mean"
    )
    ppl = float(math.exp(ce.item())) if ce.isfinite() else float("inf")
    return {"ppl": ppl}

from transformers import Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
)
trainer.train()


Step,Training Loss,Validation Loss
200,2.4232,2.359891
400,2.3599,2.337797
600,2.3743,2.331409
800,2.3849,2.328696
1000,2.3453,2.32207
1200,2.3071,2.318727
1400,2.2977,2.312997
1600,2.3403,2.309741
1800,2.3145,2.306627
2000,2.2718,2.303357




TrainOutput(global_step=4586, training_loss=2.3056248940602173, metrics={'train_runtime': 71480.9068, 'train_samples_per_second': 1.027, 'train_steps_per_second': 0.064, 'total_flos': 6.476606989686374e+16, 'train_loss': 2.3056248940602173, 'epoch': 1.9992915724367184})

In [23]:
def generate_next(prompt, max_new_tokens=80):
    inputs = tok(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True, top_p=0.9, temperature=0.7,
            eos_token_id=tok.eos_token_id
        )
    return tok.decode(out[0], skip_special_tokens=True)

test_prompt = "<Leonard>: Hey, you coming to lunch?\n<Howard>:"
print(generate_next(test_prompt))

<Leonard>: Hey, you coming to lunch?
<Howard>:No. I‚Äôm going to work. I‚Äôm doing this. The world will be better off. It‚Äôs the only way. I have a lot of work to do. I‚Äôm not going to talk to you. I‚Äôm sorry. I just wanted to make it clear. If you‚Äôre not going to talk to me, I‚Äôm not going to talk to


In [24]:
model.save_pretrained("./bbt-lora/adapter")
tok.save_pretrained("./bbt-lora/adapter")

# Optional: merge LoRA into base weights (creates a full-size model)
merged = model.merge_and_unload()
merged.save_pretrained("./bbt-lora/merged")
tok.save_pretrained("./bbt-lora/merged")



('./bbt-lora/merged/tokenizer_config.json',
 './bbt-lora/merged/special_tokens_map.json',
 './bbt-lora/merged/tokenizer.json')