In [2]:
from dotenv import load_dotenv
import os

load_dotenv()
huggingface_token = os.getenv("HF_TOKEN_read")

from huggingface_hub import login
login(token=huggingface_token)

In [3]:
from transformers import BitsAndBytesConfig
import torch

quantization_config=BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4'
)

In [4]:
from peft import LoraConfig
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=32,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj","v_proj","k_proj","o_proj","gate_proj","up_proj","down_proj"]
)

This can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.
If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=
If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH
For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<path_to_cuda_dir/lib64



In [5]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B-Instruct",
    torch_dtype=torch.bfloat16,
    quantization_config = quantization_config,
    device_map = "auto"
)

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
tokenizer.add_special_tokens({"pad_token": "<|finetune_right_pad_id|>"})
model.config.pad_token_id = tokenizer.pad_token_id # 128004

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [7]:
from datasets import load_dataset, concatenate_datasets

EQA_dataset = load_dataset("BaekSeungJu/Ophthalmology-EQA-v3", split="train")
EQA_dataset = EQA_dataset.shuffle(seed=42)

def EQA_format_chat_template(row):
    system_instruction = "You are an expert ophthalmologist. Please provide accurate and medically answers to the user's ophthalmology-related question."
    
    row_json = [{"role": "system", "content": system_instruction },
               {"role": "user", "content": row["question"]},
               {"role": "assistant", "content": row["answer"]}]
    
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

mapped_EQA_dataset = EQA_dataset.map(
    EQA_format_chat_template,
    num_proc= 4,
)

split_EQA_dataset = mapped_EQA_dataset.train_test_split(test_size=0.01, seed=42)
train_EQA_dataset = split_EQA_dataset['train']
test_EQA_dataset = split_EQA_dataset['test']

print(f"Train dataset size: {len(train_EQA_dataset)}")
print(f"Test dataset size: {len(test_EQA_dataset)}")

README.md:   0%|          | 0.00/317 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/10.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/49300 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/49300 [00:00<?, ? examples/s]

Train dataset size: 48807
Test dataset size: 493


In [8]:
MCQA_dataset = load_dataset("BaekSeungJu/Ophthalmology-MCQA-v3", split="train")
MCQA_dataset = MCQA_dataset.shuffle(seed=42)

def MCQA_format_chat_template(row):
    # System instruction
    system_instruction = (
        "You are an expert ophthalmologist. Please provide accurate and "
        "medically sound answers to the user's ophthalmology-related question."
    )
    
    # 3. Define helper functions
    def MCQA_Alphabet_Selection_func(answer, option_a, option_b, option_c, option_d, option_e):
        if answer == option_a:
            return "A"
        elif answer == option_b:
            return "B"
        elif answer == option_c:
            return "C"
        elif answer == option_d:
            return "D"
        elif answer == option_e:
            return "E"
        return ""  # fallback if none match
    
    def MCQA_Instruction_formatting_func(question, option_a, option_b, option_c, option_d, option_e):
        """Format the question and the available answer choices."""
        # Check how many valid options there are to display
        if option_c == "":
            # Only A and B
            return (f"Question:\n{question}\n\nOptions:\n"
                    f"A) {option_a}\nB) {option_b}")
        elif option_e == "":
            # Options A, B, C, D
            return (f"Question:\n{question}\n\nOptions:\n"
                    f"A) {option_a}\nB) {option_b}\nC) {option_c}\nD) {option_d}")
        else:
            # Options A, B, C, D, E
            return (f"Question:\n{question}\n\nOptions:\n"
                    f"A) {option_a}\nB) {option_b}\nC) {option_c}\nD) {option_d}\nE) {option_e}")
    
    def MCQA_Response_formatting_func(option_a, option_b, option_c, option_d, option_e, explanation, answer):
        """Format the explanation and the final answer."""
        letter = MCQA_Alphabet_Selection_func(answer, option_a, option_b, option_c, option_d, option_e)
        return f"Explanation:\n{explanation}\n\nAnswer:\n{letter}) {answer}"
    
    # 4. Build the chat template JSON
    row_json = [
        {"role": "system", "content": system_instruction},
        {
            "role": "user",
            "content": MCQA_Instruction_formatting_func(
                row["question"],
                row["option_a"],
                row["option_b"],
                row["option_c"],
                row["option_d"],
                row["option_e"]
            )
        },
        {
            "role": "assistant",
            "content": MCQA_Response_formatting_func(
                row["option_a"],
                row["option_b"],
                row["option_c"],
                row["option_d"],
                row["option_e"],
                row["explanation"],
                row["answer"]
            )
        }
    ]
    
    # 5. Convert or tokenize the row_json however you like
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    
    return row

mapped_MCQA_dataset = MCQA_dataset.map(
    MCQA_format_chat_template,
    num_proc= 4,
)
split_MCQA_dataset = mapped_MCQA_dataset.train_test_split(test_size=0.01, seed=42)
train_MCQA_dataset = split_MCQA_dataset['train']
test_MCQA_dataset = split_MCQA_dataset['test']

print(f"Train dataset size: {len(train_MCQA_dataset)}")
print(f"Test dataset size: {len(test_MCQA_dataset)}")

README.md:   0%|          | 0.00/542 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51745 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/51745 [00:00<?, ? examples/s]

Train dataset size: 51227
Test dataset size: 518


In [10]:
train_dataset = concatenate_datasets([train_EQA_dataset, train_MCQA_dataset])
test_dataset = concatenate_datasets([test_EQA_dataset, test_MCQA_dataset])
train_dataset=train_dataset.shuffle(seed=42)

print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

Train dataset size: 101371
Test dataset size: 1025


In [11]:
print(train_dataset[1]["text"])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 25 Jan 2025

You are an expert ophthalmologist. Please provide accurate and medically answers to the user's ophthalmology-related question.<|eot_id|><|start_header_id|>user<|end_header_id|>

What is the effect of damage to the oculomotor parasympathetic fibers on the pupil?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Damage to the oculomotor parasympathetic fibers results in miosis, which is the constriction of the pupil.<|eot_id|>


In [12]:
from trl import DataCollatorForCompletionOnlyLM

data_collator_param = {}
response_template = "<|start_header_id|>assistant<|end_header_id|>"
collator = DataCollatorForCompletionOnlyLM(response_template=response_template, tokenizer=tokenizer, mlm=False)
data_collator_param["data_collator"] = collator

In [13]:
local_output_dir = "../../Ophtimus_LoRA/Ophtimus_8B_Instruct_checkpoint"

import os
os.makedirs(local_output_dir, exist_ok=True)

In [14]:
%load_ext tensorboard
%tensorboard --logdir '{local_output_dir}/runs'

In [15]:
# Training setup
from trl import SFTTrainer,SFTConfig
from transformers import TrainingArguments

training_arguments = SFTConfig(
  output_dir=local_output_dir,
  report_to = "tensorboard",
  per_device_train_batch_size = 8,
  per_device_eval_batch_size = 8,
  gradient_accumulation_steps = 8,
  warmup_steps = 10,
  num_train_epochs=5,
  eval_steps=25,
#   save_steps=100,
  evaluation_strategy="steps",
  save_strategy="epoch",
  learning_rate = 2e-4,
  logging_steps = 1,
  optim = "adamw_torch",
  weight_decay = 0.01,
  max_seq_length = 1024,
  lr_scheduler_type = "constant_with_warmup",
  seed = 42,
  gradient_checkpointing = True,
  gradient_checkpointing_kwargs={'use_reentrant':True}
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    peft_config = peft_config,
    args = training_arguments,
    **data_collator_param
)

  trainer = SFTTrainer(


Map:   0%|          | 0/101371 [00:00<?, ? examples/s]

Map:   0%|          | 0/1025 [00:00<?, ? examples/s]

In [None]:
train_stats = trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss


In [None]:
import pandas as pd
lossPD = pd.DataFrame(trainer.state.log_history)
lossPD.to_csv(f"./{local_output_dir}/loss.csv",index=False)