In [None]:
pip install torch torchvision transformers accelerate datasets peft bitsandbytes trl==0.15.2 triton cut_cross_entropy unsloth_zoo sentencepiece huggingface_hub hf_transfer

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting trl==0.15.2
  Downloading trl-0.15.2-py3-none-any.whl.metadata (11 kB)
Collecting cut_cross_entropy
  Downloading cut_cross_entropy-25.1.1-py3-none-any.whl.metadata (9.3 kB)
Collecting unsloth_zoo
  Downloading unsloth_zoo-2025.4.4-py3-none-any.whl.metadata (8.0 kB)
Collecting hf_transfer
  Downloading hf_transfer-0.1.9-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_c

In [None]:
from google.colab import drive

# Mount Google Drive, forcing a fresh mount even if one already exists.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive


In [None]:
import os

# Work from the project folder on Drive so the relative data file names used
# later resolve, then list its contents as a sanity check.
PROJECT_DIR = "/content/drive/MyDrive/CS685 Final/"
os.chdir(PROJECT_DIR)
print(os.listdir())

['sft_data_train.jsonl', 'sft_data_test.jsonl']


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from datasets import load_dataset
import json

In [None]:
# Hub id of the base checkpoint; model and tokenizer are loaded from the same repo.
BASE_MODEL_ID = "NickyNicky/experimental-Mistral-1b-V00"

# bfloat16 is only native on GPUs with compute capability >= 8.0 (Ampere and newer).
# On older cards (e.g. a Colab T4) fall back to float16 so the de-quantised matmuls
# run in a dtype the hardware supports.
_native_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
compute_dtype = torch.bfloat16 if _native_bf16 else torch.float16

# 4-bit (NF4) quantization config
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    quantization_config=quant_config,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, padding_side="left")

# Fixes value error saying that there is no padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/568 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.48G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.48G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.74k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [None]:
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

# Prepare the quantised model for k-bit training before adapters are attached.
model = prepare_model_for_kbit_training(model)

# Hyper-parameters for the LoRA adapters.
lora_settings = dict(
    r=16,
    lora_alpha=8,
    lora_dropout=0.05,
    target_modules="all-linear",
    bias="none",
    task_type="CAUSAL_LM",
)
lora_config = LoraConfig(**lora_settings)

# Wrap the model; from here on `model` is the PEFT model carrying the adapters.
model = get_peft_model(model, lora_config)

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer


# Instruction prefix placed in front of every question; it opens the [INST] block.
SYSTEM_PROMPT = "[INST] You are a board-certified physician. Answer the following multiple choice question: \n"

# Read the raw JSONL files into a dataset with "train" and "test" splits.
DATA_FILES = {"train": "sft_data_train.jsonl", "test": "sft_data_test.jsonl"}
ds = load_dataset("json", data_files=DATA_FILES)

# Hold out 10% of the training split (fixed seed) for evaluation during training.
split_ds = ds["train"].train_test_split(test_size=0.1, seed=42)
train_set, eval_set = split_ds["train"], split_ds["test"]

def format_example(ex):
    """Render one raw record as a single training string.

    The result is SYSTEM_PROMPT + ex["prompt"] + "[/INST]", a newline, and
    then "Answer: " followed by ex["response"].
    """
    question = SYSTEM_PROMPT + ex["prompt"]
    return "{}[/INST]\nAnswer: {}".format(question, ex["response"])

def preprocess(batch):
    """Tokenise a batch of records into prompt-masked causal-LM features.

    `batch` is a dict of columns with "prompt" and "response" lists. Prompt and
    response texts each get the tokenizer's EOS string appended and are
    tokenised separately (prompt truncated to 1024 tokens, response to 128),
    then concatenated per example.

    Returns a dict of lists:
      "input_ids"      - prompt ids followed by response ids
      "attention_mask" - all ones, same length as input_ids
      "labels"         - -100 over the prompt span, response ids afterwards
    """
    eos = tokenizer.eos_token
    prompt_texts = [SYSTEM_PROMPT + question + "[/INST]" + eos for question in batch["prompt"]]
    response_texts = ["Answer: " + reply + eos for reply in batch["response"]]

    prompt_enc = tokenizer(prompt_texts, truncation=True, max_length=1024, padding=False)
    response_enc = tokenizer(response_texts, truncation=True, max_length=128, padding=False)

    features = {"input_ids": [], "attention_mask": [], "labels": []}
    for prompt_tokens, response_tokens in zip(prompt_enc["input_ids"], response_enc["input_ids"]):
        sequence = prompt_tokens + response_tokens
        features["input_ids"].append(sequence)
        features["attention_mask"].append([1] * len(sequence))
        # Prompt positions are masked out with -100; only the response ids are kept.
        features["labels"].append([-100] * len(prompt_tokens) + response_tokens)
    return features


# Tokenise every split with `preprocess` so downstream code can index
# `tokenized[...]`. This used to be wrapped in a string literal, which meant the
# `tokenized` dict was never actually created.
tokenized = {
    "train": train_set.map(preprocess, batched=True, remove_columns=["prompt", "response"]),
    "eval": eval_set.map(preprocess, batched=True, remove_columns=["prompt", "response"]),
    "test": ds["test"].map(preprocess, batched=True, remove_columns=["prompt", "response"]),
}

'\n\ntokenized = {}\ntokenized["train"] = train_set.map(preprocess, batched=True, remove_columns=["prompt", "response"])\ntokenized["eval"] = eval_set.map(preprocess, batched=True, remove_columns=["prompt", "response"])\ntokenized["test"] = ds["test"].map(preprocess, batched=True, remove_columns=["prompt", "response"])\n'

In [None]:
from transformers import TrainingArguments
from trl import SFTTrainer, SFTConfig

# Single source of truth for where checkpoints and the final adapter are written.
OUTPUT_DIR = "mistral-1b-qa-qlora2"

# Use bf16 mixed precision only where the GPU supports it natively
# (compute capability >= 8.0); otherwise keep fp16 as before.
use_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8

model.gradient_checkpointing_enable()

training_args = SFTConfig(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=6e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    max_grad_norm=1.0,
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=10,
    save_strategy="steps",
    save_steps=10,
    # A checkpoint is written every 10 steps; cap how many are kept on disk.
    # With load_best_model_at_end the best checkpoint is still retained.
    save_total_limit=2,
    fp16=not use_bf16,
    bf16=use_bf16,
    load_best_model_at_end=True,
    max_seq_length=2048,
    packing=False,
)

trainer = SFTTrainer(
    model=model,
    # `tokenizer=` is deprecated in TRL 0.15; `processing_class` is its replacement.
    processing_class=tokenizer,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=eval_set,
    formatting_func=format_example,
)

trainer.train()

# Save LoRA adapters and tokenizer
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

  trainer = SFTTrainer(


Applying formatting function to train dataset:   0%|          | 0/7916 [00:00<?, ? examples/s]

Converting train dataset to ChatML:   0%|          | 0/7916 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/7916 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/7916 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/7916 [00:00<?, ? examples/s]

Applying formatting function to eval dataset:   0%|          | 0/880 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/880 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/880 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/880 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/880 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
10,10.4711,9.508812
20,8.8693,8.350829
30,8.1587,7.925789
40,7.7515,7.531629
50,7.43,7.309631
60,7.1908,6.993793
70,6.7255,6.448228
80,6.2986,6.130108
90,5.9847,5.881272
100,5.8132,5.694041


In [None]:
def collate_prompt(batch):
    """Collate tokenized examples into left-padded generation prompts.

    Each example must provide "input_ids" and "labels" (lists of ints), where
    "labels" is -100 over the prompt and holds real ids from the first answer
    token onwards.

    Returns a dict with:
      "prompt_ids"   - LongTensor [B, L]: everything up to (not including) the
                       answer letter, LEFT-padded with the pad id
      "prompt_masks" - LongTensor [B, L]: 1 on real tokens, 0 on padding
      "ans_ids"      - list of 1-element LongTensors (the gold letter token)
      "expl_ids"     - list of 1-D LongTensors (tokens after the letter)

    Raises:
      ValueError: if an example has no unmasked label, or is too short to
        contain an answer letter.
    """
    # Imported locally: the cell that imports pad_sequence sits after this definition.
    from torch.nn.utils.rnn import pad_sequence

    # Distance from the first unmasked label to the answer-letter token.
    # NOTE(review): 3 matches the evaluation loop later in this notebook, which
    # reads the gold letter at first-unmasked-index + 3 and decodes valid letters.
    # Presumably the layout is <bos>, 'Answer', ':', letter - confirm for the tokenizer in use.
    letter_offset = 3

    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        # Tokenizers without a pad token: fall back to EOS, as the rest of the notebook does.
        pad_id = tokenizer.eos_token_id

    prompt_ids, prompt_masks = [], []
    ans_ids, expl_ids = [], []     # store for later perplexity

    for ex in batch:
        ids = ex["input_ids"]
        lbl = ex["labels"]

        # Index of the first label that is not masked out.
        split = next((i for i, tok in enumerate(lbl) if tok != -100), None)
        if split is None:
            raise ValueError("example has no unmasked label; cannot locate the answer")
        letter_pos = split + letter_offset
        if letter_pos >= len(ids):
            raise ValueError("example is too short to contain an answer letter")

        prompt = torch.tensor(ids[:letter_pos], dtype=torch.long)
        prompt_ids.append(prompt)
        prompt_masks.append(torch.ones_like(prompt))

        ans_ids.append(torch.tensor([ids[letter_pos]], dtype=torch.long))
        # Explicit dtype: an empty slice would otherwise become a float tensor.
        expl_ids.append(torch.tensor(ids[letter_pos + 1:], dtype=torch.long))

    def _left_pad(seqs, value):
        # Decoder-only generation needs the real tokens flush right, so pad on
        # the left: reverse each sequence, right-pad, then reverse the batch back.
        flipped = [seq.flip(0) for seq in seqs]
        return pad_sequence(flipped, batch_first=True, padding_value=value).flip(1)

    return {
        "prompt_ids":   _left_pad(prompt_ids, pad_id),
        "prompt_masks": _left_pad(prompt_masks, 0),
        "ans_ids":      ans_ids,   # list[1-D]  keep ragged
        "expl_ids":     expl_ids,  # list[1-D]
    }

In [None]:
import torch, re, math
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer)

device = "cuda" if torch.cuda.is_available() else "cpu"

model_id  = "mistral-1b-qa-qlora2"
model     = AutoModelForCausalLM.from_pretrained(model_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.padding_side = "left"
model.eval()

EOS_ID = tokenizer.eos_token_id
# Explicit None test: `pad_token_id or EOS_ID` would also discard a valid pad id of 0.
PAD_ID = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else EOS_ID

# collate_prompt reads "input_ids" and "labels", so the loader must be fed
# tokenized rows rather than the un-tokenized ds["test"] records.
test_tokenized = ds["test"].map(preprocess, batched=True, remove_columns=["prompt", "response"])

loader = DataLoader(test_tokenized,
                    batch_size=12,
                    shuffle=False,
                    collate_fn=collate_prompt)

# Compiled once instead of being evaluated twice per generated text.
PRED_LETTER_RE = re.compile(r"\b([A-D])(?=[\.\s])")
# Gold text may end right after the letter, so only require word boundaries.
GOLD_LETTER_RE = re.compile(r"\b([A-D])\b")

# Generation below samples; seed it so repeated runs give the same predictions.
torch.manual_seed(42)

pred_letters = []   # model’s chosen A/B/C/D
gold_letters = []   # ground-truth letters
ppl_vals     = []   # per-example perplexities

for batch in loader:
    prompt_ids  = batch["prompt_ids"].to(device)
    prompt_mask = batch["prompt_masks"].to(device)

    gen_ids = model.generate(
        input_ids      = prompt_ids,
        attention_mask = prompt_mask,
        max_new_tokens = 64,
        do_sample = True,
        temperature    = 0.7,
        top_p          = 0.9,
        repetition_penalty = 1.2,
        no_repeat_ngram_size = 4,
        eos_token_id   = tokenizer.eos_token_id,
        pad_token_id   = tokenizer.eos_token_id,
    )

    # decode NEW tokens only (batched)
    new_tokens = gen_ids[:, prompt_ids.size(1):]
    pred_texts = tokenizer.batch_decode(new_tokens, skip_special_tokens=True)

    for txt in pred_texts:
        hit = PRED_LETTER_RE.search(txt)
        pred_letters.append(hit.group(1) if hit else "")

    # Build one un-padded (context + explanation) sequence per example.
    seqs, lbls = [], []
    for row, row_mask, gold_letter, expl in zip(batch["prompt_ids"], batch["prompt_masks"],
                                                batch["ans_ids"], batch["expl_ids"]):
        expl = expl.long()                          # an empty explanation may arrive as float
        prompt = row[row_mask.bool()]               # strip padding before concatenating
        ctx = torch.cat([prompt, gold_letter])      # prompt + gold letter
        seq = torch.cat([ctx, expl])                # full sequence
        lbl = seq.clone()
        lbl[:ctx.size(0)] = -100                    # score explanation tokens only
        seqs.append(seq)
        lbls.append(lbl)

        # Gold letter = first A-D in the decoded answer text.
        gold_text = tokenizer.decode(torch.cat([gold_letter, expl]), skip_special_tokens=True)
        gold_hit = GOLD_LETTER_RE.search(gold_text)
        gold_letters.append(gold_hit.group(1) if gold_hit else "")

    input_batch = pad_sequence(seqs, batch_first=True, padding_value=PAD_ID).to(device)
    label_batch = pad_sequence(lbls, batch_first=True, padding_value=-100).to(device)
    # Mask built from lengths, not token values, so a real EOS is never treated as padding.
    attn_batch  = pad_sequence([torch.ones_like(s) for s in seqs],
                               batch_first=True, padding_value=0).to(device)

    with torch.no_grad():
        logits = model(input_ids=input_batch, attention_mask=attn_batch).logits

    # Per-example mean NLL. `.loss` is a single mean over the whole batch, so
    # repeating it would give every example in the batch the same perplexity.
    shift_logits = logits[:, :-1, :].float()
    shift_labels = label_batch[:, 1:]
    token_nll = torch.nn.functional.cross_entropy(
        shift_logits.transpose(1, 2), shift_labels,
        ignore_index=-100, reduction="none",
    )                                                # [B, T-1], zero at ignored positions
    n_scored = shift_labels.ne(-100).sum(dim=1)
    seq_ppl = torch.exp(token_nll.sum(dim=1) / n_scored.clamp(min=1))
    seq_ppl[n_scored == 0] = float("nan")            # nothing to score for this example
    ppl_vals.extend(seq_ppl.cpu().tolist())

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

KeyboardInterrupt: 

In [None]:
import torch, re, math
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer)

device = "cuda" if torch.cuda.is_available() else "cpu"

# Reload the fine-tuned model and its tokenizer from the training output directory.
model_id  = "mistral-1b-qa-qlora2"
model     = AutoModelForCausalLM.from_pretrained(model_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model.eval()

EOS_ID = tokenizer.eos_token_id
# Explicit None test: `pad_token_id or EOS_ID` would also discard a valid pad id of 0.
PAD_ID = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else EOS_ID

# Collator that pads input_ids, attention_mask, labels together

def collate_test(batch):
    """Right-pad a list of tokenized examples into batch tensors on `device`.

    Each example must provide "input_ids", "attention_mask" and "labels" as
    integer sequences. Returns a dict of LongTensors of shape [B, T]:
    "input_ids" padded with PAD_ID, "attention_mask" padded with 0 and
    "labels" padded with -100.
    """
    def _stack(key, fill):
        # One 1-D long tensor per example, padded to the longest in the batch.
        rows = [torch.tensor(ex[key], dtype=torch.long) for ex in batch]
        return pad_sequence(rows, batch_first=True, padding_value=fill).to(device)

    return {
        "input_ids":      _stack("input_ids", PAD_ID),
        "attention_mask": _stack("attention_mask", 0),
        "labels":         _stack("labels", -100),
    }

# Tokenise the held-out split here so this cell works from the raw dataset directly.
test_tokenized = ds["test"].map(preprocess, batched=True, remove_columns=["prompt", "response"])

loader = DataLoader(
    test_tokenized,
    batch_size=1,
    shuffle=False,
    collate_fn=collate_test
)

LETTER_RE = re.compile(r"\b([A-D])(?=[\.\s])")

# Distance from the first unmasked label to the answer-letter token.
# NOTE(review): kept at 3 as in the original code; presumably the layout is
# <bos>, 'Answer', ':', letter - confirm for the tokenizer in use.
LETTER_OFFSET = 3

# Generation below samples; seed it so the reported accuracy is reproducible.
torch.manual_seed(42)

pred_letters, gold_letters, ppl_vals = [], [], []

for batch in tqdm(loader, desc="eval"):
    inp_ids   = batch["input_ids"]
    attn_mask = batch["attention_mask"]
    labels    = batch["labels"]

    # Position of the first unmasked label in each row (where the answer starts).
    first_idx = labels.ne(-100).float().argmax(dim=1)    # shape [B]

    # Generation context = everything up to and including that position.
    prompt_lens = (first_idx + 1).tolist()
    max_prompt  = max(prompt_lens)

    # Left-pad, and build the mask from lengths rather than token values, so that
    # real tokens whose id happens to equal 0 or PAD_ID are never masked out.
    prompt_ids  = inp_ids.new_full((inp_ids.size(0), max_prompt), PAD_ID)
    prompt_mask = torch.zeros_like(prompt_ids)
    for i, n in enumerate(prompt_lens):
        prompt_ids[i, max_prompt - n:]  = inp_ids[i, :n]
        prompt_mask[i, max_prompt - n:] = 1

    # Generate predictions (`early_stopping` dropped: it only affects beam search).
    gen = model.generate(
        input_ids = prompt_ids,
        attention_mask = prompt_mask,
        do_sample = True,
        max_new_tokens = 64,
        pad_token_id = EOS_ID,
        top_p = 0.92,
        top_k = 50,
        repetition_penalty = 1.5,       # discourages copying
        no_repeat_ngram_size = 4,       # hard constraint
        eos_token_id = tokenizer.eos_token_id,
    )

    # Extract predicted letter. New tokens start after the padded prompt width
    # for every row, whatever each row's own prompt length is.
    new_tokens = gen[:, max_prompt:]
    for txt in tokenizer.batch_decode(new_tokens, skip_special_tokens=True):
        txt = txt.strip()
        print("Model output: " + txt)
        m = LETTER_RE.search(txt)
        pred_letters.append(m.group(1) if m else "")

    # Gold letter & explanation perplexity (no re-tokenising)
    for ids, mask, p_len in zip(inp_ids, attn_mask, first_idx.tolist()):
        ids = ids[mask.bool()]                      # drop batch padding
        letter_pos = p_len + LETTER_OFFSET

        gold_text = tokenizer.decode([ids[letter_pos].item()]).strip()
        gold_char = gold_text[0] if gold_text else ""
        gold_letters.append(gold_char)
        print("Correct answer: " + gold_char)

        # Score only the tokens AFTER the answer letter; the prompt, the
        # 'Answer:' prefix and the letter itself are context.
        scored_from = letter_pos + 1
        if scored_from >= ids.size(0):
            ppl_vals.append(float("nan"))           # no explanation tokens to score
            continue

        input_ids  = ids.unsqueeze(0)               # [1, T]
        ppl_labels = input_ids.clone()
        ppl_labels[:, :scored_from] = -100          # mask prompt + answer
        with torch.no_grad():
            loss = model(input_ids=input_ids, labels=ppl_labels).loss
        ppl_vals.append(math.exp(loss.item()))

# ------------------------------------------------------------------
# Metrics
# ------------------------------------------------------------------
acc = accuracy_score(gold_letters, pred_letters)
# Average over examples that actually had explanation tokens.
scored_ppl = [p for p in ppl_vals if not math.isnan(p)]
ppl = sum(scored_ppl) / len(scored_ppl) if scored_ppl else float("nan")

print(f"Letter accuracy : {acc*100:.2f}%")
print(f"Avg explain PPL : {ppl:.2f}")


eval:   0%|          | 0/1078 [00:00<?, ?it/s]



Model output: Answer: B0. Decrethrophol of a varoma due to the left ventilation from anuseum, can lead to increased by her hypotidase.<|im_end|>
Correct answer: B
Model output: Answer: D. Erophy
Explanation: ThisMyчкаylilicfiniteVP-negativeyroidism, the absence of hemoglobin and increased risk of anativeic pneumonia, leading to prevent infections of gastroesia and myococcus. Hteral
Correct answer: D
Model output: Answer: C. A. Oing and elevated Barb
Explanation: The presence of renal también DEFin is appropriate due to lacerb infections in the right;odies-thase likeك’s syndrome, commonly presents with her chronic leukdfelanoma or potentialstride ab
Correct answer: B
Model output: Answer: C. The presence for this condition, fever, where the child, erythrophadenopathycin. Intric acid or pancreat, as it is appropriate for this diagnosistextttQuick suggest anemia due to acute abdominal symptoms with impaired院 2P-
Correct answer: D
Model output: Answer: B. Increased malignal cystic in the p

In [None]:
from google.colab import drive

# Mount Google Drive (without forcing a remount).
DRIVE_ROOT = '/content/drive'
drive.mount(DRIVE_ROOT)

In [None]:
cp -r mistral-1b-qa-qlora2/ /content/drive/MyDrive