# Setup

In [1]:
# Mount Google Drive at /content/drive; the project files used by later cells live under MyDrive/DPO.
from google.colab import drive
drive.mount('/content/drive')
# Install the fine-tuning stack into the Colab runtime.
# NOTE(review): versions are unpinned, so a fresh runtime may resolve different releases — consider pinning.
!pip install unsloth
!pip install transformers peft accelerate safetensors


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import sys
import importlib
sys.path.append('/content/drive/MyDrive/DPO/DPO on Colab')
import med_dpo_loss

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import (
    prepare_model_for_kbit_training,
    PeftModel,
    LoraConfig
)
import torch
from torch.nn import Linear
import json
from torch.utils.data import Dataset, DataLoader

# Sanity check: report whether a CUDA device is visible, then the name of device index 0.
# Later cells move tensors to "cuda" explicitly, so a GPU runtime is required.
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))

True
NVIDIA A100-SXM4-40GB


# Define Model


In [44]:

# Tokenizer is read from the same Drive directory that the adapter is loaded from below.
tokenizer = AutoTokenizer.from_pretrained(
    "/content/drive/MyDrive/DPO/DPO on Colab/final_merged_model",
    use_fast=True,
)

# 4-bit NF4 quantisation settings: double quantisation enabled, float16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Quantised Qwen3-1.7B backbone; device placement is left to device_map="auto".
base_model = AutoModelForCausalLM.from_pretrained(
    "unsloth/Qwen3-1.7B-unsloth-bnb-4bit",
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

# Wrap the backbone with the previously saved PEFT weights.
# NOTE(review): the directory is called "final_merged_model" but is loaded here as a PEFT adapter — confirm.
model = PeftModel.from_pretrained(
    base_model,
    "/content/drive/MyDrive/DPO/DPO on Colab/final_merged_model",
    is_trainable=True,
)

# A second LoRA adapter (rank 8 on q_proj / v_proj) is registered and made the active one.
# LoraConfig is already in scope from the notebook's import cell.
new_lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model.add_adapter(adapter_name="task2_adapter", peft_config=new_lora_config)
model.set_adapter("task2_adapter")

# Parameter census: count the gradient-enabled weights once and reuse the figure for both reports.
trainable = sum(weight.numel() for weight in model.parameters() if weight.requires_grad)
print(f"Now trainable parameters: {trainable:,}")

trainable_params = trainable
total_params = sum(weight.numel() for weight in model.parameters())

print(f"Trainable parameters: {trainable_params:,}")
print(f"Total parameters:     {total_params:,}")
print(f"→ {100 * trainable_params/total_params:.2f}% of all parameters are trainable")

Now trainable parameters: 1,605,632
Trainable parameters: 1,605,632
Total parameters:     1,071,277,056
→ 0.15% of all parameters are trainable


# Load Dataset

In [4]:
class JSONLDataset(Dataset):
    """Preference-pair dataset backed by a JSON Lines file.

    Every non-blank line of the file is parsed as one JSON object and kept in
    memory. Each record is expected to provide ``prompt``, ``chosen_response``,
    ``rejected_response`` and the ``chosen_scores`` / ``rejected_scores``
    mappings, both keyed by the names in ``SCORE_KEYS``.
    """

    # Order in which the score components appear in the returned score tensors.
    SCORE_KEYS = ('accuracy', 'safety', 'explanation_depth')

    def __init__(self, filepath):
        """Eagerly read and parse ``filepath``.

        Blank or whitespace-only lines (e.g. a trailing newline at the end of
        the file) are skipped instead of being passed to ``json.loads``, which
        would raise ``json.JSONDecodeError`` on them.

        Args:
            filepath: Path to a UTF-8 encoded ``.jsonl`` file.
        """
        self.samples = []
        # JSON text is UTF-8; be explicit rather than depending on the locale default.
        with open(filepath, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:
                    self.samples.append(json.loads(line))

    def __len__(self):
        """Return the number of parsed records."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Build the training example for record ``idx``.

        Returns:
            A dict with
            ``prompt_chosen_response`` / ``prompt_rejected_response``: the prompt
            string concatenated directly with the respective response string, and
            ``chosen_scores`` / ``rejected_scores``: float tensors holding the
            scores in ``SCORE_KEYS`` order.

        Raises:
            KeyError: if the record lacks one of the expected fields.
        """
        sample = self.samples[idx]
        prompt = sample['prompt']

        def as_tensor(scores):
            # Fixed key order keeps the tensor layout identical across records.
            return torch.tensor([scores[k] for k in self.SCORE_KEYS], dtype=torch.float)

        return {
            'prompt_chosen_response': prompt + sample['chosen_response'],
            'prompt_rejected_response': prompt + sample['rejected_response'],
            'chosen_scores': as_tensor(sample['chosen_scores']),
            'rejected_scores': as_tensor(sample['rejected_scores']),
        }


# Build the training set from the scored preference-pair file on Drive.
train_data = JSONLDataset('/content/drive/MyDrive/DPO/DPO on Colab/qwen3_dpo_scored_data.jsonl')
# batch_size = 1: every step sees a single chosen/rejected pair; shuffle = True reorders the pairs on each pass.
train_dataloader = DataLoader(train_data, batch_size = 1, shuffle = True)

# Peek at one collated batch to inspect its fields, then stop iterating.
for sample in train_dataloader:
  print(sample)
  break

{'prompt_chosen_response': ['Question: A preterm neonate, born at 28 weeks of gestation, is in the neonatal intensive care unit as he developed respiratory distress during the 4th hour after birth. On the 2nd day of life, he required ventilator support. Today, on the 5th day of life, he developed generalized purpura and a hemorrhagic aspirate from the stomach. His laboratory workup is suggestive of thrombocytopenia, prolonged prothrombin time, and prolonged activated partial thromboplastin time. Which of the following statements is correct regarding the coagulation system of this patient?\n\nOptions:\nA. Serum levels of fibrinogen in a preterm infant born at 32 weeks of gestation are typically normal, as compared to an adult.\nB. An extremely premature infant has markedly elevated levels of protein C, as compared to an adult.\nC. There is a physiologic increase in levels of antithrombin III in neonates.\nD. Administration of vitamin K to the mother during labor results in a reduction i

# Test: generate from one training prompt

In [None]:
from datasets import load_dataset
import torch
import os
# Debugging aid: asks CUDA to launch kernels synchronously.
# NOTE(review): this is set after the model was already loaded in an earlier cell, so it may not
# take effect for the existing CUDA context — confirm, or move it before the first CUDA use.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Load the scored-pairs file and take the first record's prompt as a smoke-test input.
dataset = load_dataset('json', data_files={"train": "/content/drive/MyDrive/DPO/DPO on Colab/qwen3_dpo_scored_data.jsonl"})
ex = dataset['train'][0]

prompt = ex['prompt']
print(prompt)
print('------------')

# Tokenize the single prompt (truncated to at most 2048 tokens) and move the tensors to the GPU.
inputs = tokenizer(prompt,
                   return_tensors="pt",
                   padding=True,
                   truncation=True,
                   max_length=2048).to(device = "cuda")



# Greedy decoding (do_sample=False), capped at 128 newly generated tokens.
outputs = model.generate(
    **inputs,
    max_new_tokens=128,
    do_sample=False,
    pad_token_id=tokenizer.pad_token_id,
)

# The whole returned sequence is decoded; skip_special_tokens is not set here.
response = tokenizer.decode(outputs[0])

print(response)


In [None]:
# Reload the loss module before importing from it, presumably so that edits to
# med_dpo_loss.py are picked up without restarting the kernel.
importlib.reload(med_dpo_loss)
from med_dpo_loss import MedDPOLoss
from tqdm.auto import tqdm

# NOTE(review): `dtype` is only referenced by the commented-out print below.
dtype = next(model.parameters()).dtype
# Device reported by the model; printed below as a quick check.
device = model.device

print(device)
#print(dtype)


def _encode_on_device(tokenizer, texts, device, max_length):
    """Tokenize ``texts`` into padded/truncated PyTorch tensors and move them to ``device``."""
    encoded = tokenizer(texts, return_tensors='pt',
                        padding=True,
                        truncation=True,
                        max_length=max_length)
    return encoded.to(device=device)


def train(model, tokenizer, dataloader, optimizer, epochs = 1, max_length = 2048, log_every = 50):
    """Optimise ``model`` with ``MedDPOLoss`` over ``dataloader``.

    For every batch the chosen and rejected texts are tokenized, run through the
    model, and their logits are passed to the loss together with the score
    tensors from the batch. The per-example losses are averaged, back-propagated,
    and one optimizer step is taken.

    Args:
        model: Causal LM to optimise; must expose ``.device`` and return an
            object with ``.logits`` when called on tokenized inputs.
        tokenizer: Tokenizer matching ``model``.
        dataloader: Iterable of batches with the keys ``prompt_chosen_response``,
            ``prompt_rejected_response``, ``chosen_scores`` and ``rejected_scores``.
        optimizer: Optimizer already bound to the model's parameters.
        epochs: Number of passes over ``dataloader``.
        max_length: Truncation length (in tokens) applied to both texts.
        log_every: Print the running mean loss every this many steps
            (``0``/``None`` disables the print).

    Returns:
        The mean loss over all optimisation steps, or ``None`` if the dataloader
        yielded no batches.
    """
    model.train()

    loss_fn = MedDPOLoss()
    # Inputs and scores follow the model's own device instead of a hard-coded
    # 'cuda' string mixed with a notebook-level global.
    device = model.device

    total_loss = 0.0
    seen_so_far = 0

    for epoch in range(epochs):
        loop = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}", leave=False)

        # Iterate the tqdm wrapper itself: looping over the bare dataloader
        # leaves the progress bar stuck at zero.
        for batch in loop:
            optimizer.zero_grad()

            # Both sides of the pair go through exactly the same preparation.
            chosen_inputs = _encode_on_device(
                tokenizer, batch['prompt_chosen_response'], device, max_length)
            rejected_inputs = _encode_on_device(
                tokenizer, batch['prompt_rejected_response'], device, max_length)

            chosen_rewards = batch['chosen_scores'].to(device)
            rejected_rewards = batch['rejected_scores'].to(device)

            chosen_logits = model(**chosen_inputs).logits
            rejected_logits = model(**rejected_inputs).logits

            per_examples_loss = loss_fn(chosen_logits, rejected_logits,
                                        chosen_rewards, rejected_rewards)
            loss = per_examples_loss.mean()

            loss.backward()
            optimizer.step()

            step_loss = loss.item()
            total_loss += step_loss
            seen_so_far += 1
            loop.set_postfix(batch=seen_so_far, loss=step_loss, refresh=False)

            # Log after the step so the printed count matches the number of
            # steps that actually contribute to the running mean. The count is
            # in steps, which equals pairs when the batch size is 1.
            if log_every and seen_so_far % log_every == 0:
                print(f"through {seen_so_far} pairs: loss = {total_loss / seen_so_far}")

    return total_loss / seen_so_far if seen_so_far else None

# AdamW is handed every model parameter; per the count printed when the model was built,
# only about 1.6M of them require gradients.
optimizer = torch.optim.AdamW(model.parameters(), lr = 1e-4)

# Run training with the function's default of a single epoch.
train(model, tokenizer, train_dataloader, optimizer)

# Generate responses from the aligned model

In [None]:
import json

import torch
from tqdm import tqdm

def perform_inference(model, tokenizer, prompts, batch_size=16, max_new_tokens=128, verbose=True):
    """Greedily generate one response per prompt, in batches.

    Assumes a decoder-only causal LM whose ``generate`` returns the prompt
    tokens followed by the newly generated tokens.

    Args:
        model: Model exposing ``to``, ``eval`` and ``generate``.
        tokenizer: Matching tokenizer; its ``padding_side`` is switched to
            ``"left"`` for the duration of the call and restored afterwards.
        prompts: List of prompt strings.
        batch_size: Number of prompts generated together.
        max_new_tokens: Upper bound on generated tokens per prompt.
        verbose: When true, print the first decoded sequence of every batch.

    Returns:
        A list of ``{"prompt": ..., "response": ...}`` dicts in prompt order,
        where ``response`` contains only the generated continuation.
    """
    model.to(device="cuda")
    model.eval()
    results = []

    # Batched generation with a decoder-only model needs left padding: with right
    # padding the model would continue from pad tokens for every shorter prompt.
    original_padding_side = tokenizer.padding_side
    tokenizer.padding_side = "left"
    try:
        for i in tqdm(range(0, len(prompts), batch_size)):
            batch_prompts = prompts[i:i + batch_size]

            # Tokenize and pad to longest sequence in batch
            inputs = tokenizer(batch_prompts, return_tensors="pt", padding=True, truncation=True).to(device="cuda")
            prompt_length = inputs["input_ids"].shape[1]

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    do_sample=False,  # deterministic generation (change if you want randomness)
                    pad_token_id=tokenizer.pad_token_id
                )

            if verbose:
                print(tokenizer.decode(outputs[0], skip_special_tokens=True))

            # Drop exactly the prompt columns. Taking the last `max_new_tokens`
            # columns leaks prompt tokens into the response whenever the whole
            # batch stops before reaching the token budget.
            decoded_outputs = tokenizer.batch_decode(outputs[:, prompt_length:], skip_special_tokens=True)
            for prompt, response in zip(batch_prompts, decoded_outputs):
                results.append({"prompt": prompt, "response": response})
    finally:
        # Leave the shared tokenizer as we found it.
        tokenizer.padding_side = original_padding_side

    return results

def load_prompts_from_jsonl(file_path):
    """Return the ``prompt`` field of every record in a JSON Lines file.

    Blank and whitespace-only lines are ignored. The file is decoded as UTF-8
    (the encoding of JSON text) rather than with the platform's locale default.

    Args:
        file_path: Path to the ``.jsonl`` file.

    Returns:
        List of prompt strings in file order.

    Raises:
        KeyError: if a record has no ``prompt`` key.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        return [json.loads(line)['prompt'] for line in stripped_lines if line]

def save_results_to_json(results, output_file):
    """Write ``results`` to ``output_file`` as one JSON document, replacing any existing file."""
    with open(output_file, 'w') as sink:
        # Stream the encoder's chunks into the file using the default JSON settings.
        sink.writelines(json.JSONEncoder().iterencode(results))

# NOTE(review): model_name is not used by the statements below, and it names a Gemma checkpoint
# while the model loaded in this notebook is Qwen3 — looks like a leftover; confirm before removing.
model_name = "unsloth/gemma-3-1b-it-unsloth-bnb-4bit"
# Input JSONL that supplies the prompts, and the JSON file that receives the prompt/response pairs.
prompts_file = "/content/drive/MyDrive/DPO/DPO on Colab/sft_model_train_outputs_for_dpo.jsonl"
output_file = "/content/drive/MyDrive/DPO/DPO on Colab/qwen3_dpo_inference_results.json"

# Generate with the in-memory model and tokenizer, then persist the results.
prompts = load_prompts_from_jsonl(prompts_file)
results = perform_inference(model, tokenizer, prompts)
save_results_to_json(results, output_file)