In [10]:
%%capture
# Install/upgrade the libraries used by this notebook into the kernel's environment.
# %%capture (which must stay on the first line of the cell) hides pip's output.
# NOTE(review): every package is upgraded to its latest release with no version pin,
# so results may differ between runs — consider pinning versions.
%pip install -U transformers 
%pip install -U datasets 
%pip install -U accelerate 
%pip install -U peft 
%pip install -U trl 
%pip install -U bitsandbytes 
%pip install -U wandb

In [None]:
# NOTE(review): bitsandbytes is already upgraded by the install cell above; this
# cell looks redundant — confirm before removing.
%pip install -U bitsandbytes 


In [11]:
import importlib.util
import os
import random

import torch
import wandb
from datasets import load_dataset, concatenate_datasets
from huggingface_hub import login
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    logging,
    TrainerCallback
)
from peft import (
    LoraConfig,
    get_peft_model,
)
from trl import SFTTrainer, setup_chat_format


In [12]:
# Read the Hugging Face and WandB tokens from the environment.
# Secrets must never be stored in the notebook itself, so there is deliberately
# no hard-coded fallback: export HUGGINGFACE_TOKEN and WANDB_TOKEN before
# starting the kernel. Any token that was previously embedded in this cell has
# been exposed and should be revoked.
hf_token = os.getenv("HUGGINGFACE_TOKEN")
wandb_token = os.getenv("WANDB_TOKEN")

# Fail fast with an actionable message instead of a confusing login error later.
missing_tokens = [
    env_name
    for env_name, value in (("HUGGINGFACE_TOKEN", hf_token), ("WANDB_TOKEN", wandb_token))
    if not value
]
if missing_tokens:
    raise EnvironmentError(
        "Missing required environment variable(s): " + ", ".join(missing_tokens)
    )

# Login to Hugging Face Hub
login(token=hf_token)

# Initialize WandB
wandb.login(key=wandb_token)
run = wandb.init(
    project='Fine-tune Llama 3 8B on Medical Dataset', 
    job_type="training", 
    anonymous="allow"
)



The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [14]:
# Set base model and dataset
base_model = "HPAI-BSC/Llama3-Aloe-8B-Alpha"
new_model_base = "llama-3-8b-chat-doctor"

# Set torch dtype and attention implementation based on GPU capability.
# The CUDA availability check comes first so that a machine without a GPU takes
# the conservative branch instead of failing inside get_device_capability().
if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
    torch_dtype = torch.bfloat16
    # FlashAttention-2 lives in the separate `flash_attn` package, which the
    # install cell does not install; only request it when it is importable and
    # otherwise use PyTorch's built-in scaled-dot-product attention.
    if importlib.util.find_spec("flash_attn") is not None:
        attn_implementation = "flash_attention_2"
    else:
        attn_implementation = "sdpa"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"

print(torch_dtype)


torch.float16


In [15]:
# Quantize the base weights to 8 bits through bitsandbytes
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

# Keyword arguments used to load the quantized base model.
# NOTE(review): the `torch_dtype` chosen in the previous cell is not passed
# here — confirm whether that is intentional.
model_load_kwargs = {
    "quantization_config": bnb_config,
    "device_map": "auto",
    "attn_implementation": attn_implementation,
}
model = AutoModelForCausalLM.from_pretrained(base_model, **model_load_kwargs)

# Tokenizer belonging to the same base checkpoint
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [16]:
# Names of the modules that receive LoRA adapters
lora_target_modules = [
    'up_proj',
    'down_proj',
    'gate_proj',
    'k_proj',
    'q_proj',
    'v_proj',
    'o_proj',
]

# LoRA hyper-parameters for causal language modelling
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_modules,
)

# Apply the chat format first, then wrap the resulting model with the adapters.
# Both steps rebind `model`, so this cell is meant to run once per loaded model.
model, tokenizer = setup_chat_format(model, tokenizer)
model = get_peft_model(model, peft_config)


In [17]:
# Catalogue of the supported datasets.
# Each entry gives the Hub dataset name, an optional config name, and the
# columns that hold the user prompt ("instruction_key") and the reply ("output_key").
datasets_info = [
    {
        "number": 1,
        "name": "ruslanmv/ai-medical-chatbot",
        "instruction_key": "Patient",
        "output_key": "Doctor",
    },
    {
        "number": 2,
        "name": "lavita/medical-qa-datasets",
        "config": "all-processed",
        "instruction_key": "input",
        "output_key": "output",
    },
    {
        "number": 3,
        "name": "Malikeh1375/medical-question-answering-datasets",
        "config": "all-processed",
        "instruction_key": "input",
        "output_key": "output",
    },
]


In [18]:
# Specify the dataset number here
dataset_number = 1  # Change this number to train on a different dataset

# Find the dataset info based on the dataset number.
# A default of None is supplied so that an unknown number produces a readable
# error instead of a bare StopIteration with no message.
dataset_info = next(
    (item for item in datasets_info if item["number"] == dataset_number),
    None,
)
if dataset_info is None:
    valid_numbers = [item["number"] for item in datasets_info]
    raise ValueError(
        f"Unknown dataset_number {dataset_number!r}; expected one of {valid_numbers}"
    )

# Function to preprocess a single dataset
def preprocess_dataset(dataset_name, config, instruction_key, output_key):
    """Load a dataset and add a chat-formatted ``"text"`` column to every row.

    Args:
        dataset_name: Dataset identifier passed to ``load_dataset``.
        config: Config name passed to ``load_dataset`` (may be ``None``).
        instruction_key: Column whose value becomes the "user" message.
        output_key: Column whose value becomes the "assistant" message.

    Returns:
        The dataset (loaded with ``split="all"``) after mapping, with a
        ``"text"`` column produced by the notebook-level ``tokenizer``'s
        chat template.
    """

    def add_chat_text(row):
        # One user turn followed by one assistant turn, rendered as a string.
        conversation = [
            {"role": "user", "content": row[instruction_key]},
            {"role": "assistant", "content": row[output_key]},
        ]
        rendered = tokenizer.apply_chat_template(conversation, tokenize=False)
        return {"text": rendered}

    raw_dataset = load_dataset(dataset_name, config, split="all")
    return raw_dataset.map(add_chat_text, num_proc=4)

In [19]:
# Function to create a smaller evaluation subset from the validation set
def create_eval_subset(dataset, size=100):
    """Return a random subset of ``dataset`` containing at most ``size`` rows.

    Args:
        dataset: Object supporting ``len()`` and ``select(indices)``.
        size: Requested number of rows. It is capped at ``len(dataset)`` so a
            dataset smaller than ``size`` yields all of its rows instead of
            making ``random.sample`` raise ``ValueError``.

    Returns:
        Whatever ``dataset.select`` returns for the sampled indices.

    Raises:
        ValueError: If ``size`` is negative (raised by ``random.sample``).
    """
    total = len(dataset)
    sample_size = min(size, total)
    indices = random.sample(range(total), sample_size)
    return dataset.select(indices)


In [20]:
# Load the selected dataset and render each row as chat text.
# Entries without a "config" key yield None here.
config = dataset_info.get("config")
dataset = preprocess_dataset(
    dataset_name=dataset_info["name"],
    config=config,
    instruction_key=dataset_info["instruction_key"],
    output_key=dataset_info["output_key"],
)


In [21]:
# Split the dataset into training (90%) and validation (10%) sets.
# A fixed seed keeps the split identical across kernel restarts, so evaluation
# numbers from different runs are comparable.
# Note: `train_dataset` holds both splits; index it with "train" or "test".
train_dataset = dataset.train_test_split(test_size=0.1, seed=42)
validation_dataset = train_dataset["test"]

In [23]:
# Define training arguments
training_arguments = TrainingArguments(
    # Output directory is named after the base name and the dataset number
    output_dir=f"{new_model_base}-{dataset_info['number']}",
    # Batching
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    group_by_length=True,
    # Optimisation
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    warmup_steps=100,
    num_train_epochs=3,
    bf16=False,  # bf16 mixed precision is switched off
    # Evaluation and checkpointing both happen every 400 steps
    eval_strategy="steps",
    eval_steps=400,
    save_steps=400,
    # Logging: every 50 steps, reported to Weights & Biases
    logging_strategy="steps",
    logging_steps=50,
    report_to="wandb",
)

In [24]:
# Custom callback to sample different evaluation subsets
class RandomSubsetEvalCallback(TrainerCallback):
    """Run an evaluation on a fresh random subset every ``args.eval_steps`` steps.

    Args:
        eval_dataset: Dataset to draw subsets from; must support ``len()`` and
            ``select(indices)``.
        subset_size: Number of rows to draw per evaluation (capped at the
            dataset size).
        trainer: Optional object whose ``evaluate(eval_dataset=...)`` is called.
            When omitted, the notebook-level ``trainer`` variable is looked up
            at call time, so it must exist before training starts.

    NOTE(review): this evaluation runs in addition to anything the trainer
    schedules itself through ``eval_strategy``, and the subset is taken from
    ``eval_dataset`` exactly as it was passed in — confirm that it is already
    in the format ``evaluate`` expects.
    """

    def __init__(self, eval_dataset, subset_size, trainer=None):
        self.eval_dataset = eval_dataset
        self.subset_size = subset_size
        self.trainer = trainer

    def on_step_end(self, args, state, control, **kwargs):
        """Evaluate on a random subset when ``global_step`` is a multiple of ``eval_steps``."""
        # eval_steps may be unset (None); `step % None` would raise TypeError.
        if not args.eval_steps or state.global_step % args.eval_steps != 0:
            return control

        # Prefer the explicitly supplied trainer; fall back to the notebook global.
        active_trainer = self.trainer
        if active_trainer is None:
            active_trainer = globals().get("trainer")
        if active_trainer is None:
            raise RuntimeError(
                "RandomSubsetEvalCallback needs a trainer: pass trainer=... or "
                "define a global `trainer` before training starts."
            )

        # Cap the sample so a small dataset does not make random.sample raise.
        total = len(self.eval_dataset)
        sample_size = min(self.subset_size, total)
        if sample_size <= 0:
            return control

        indices = random.sample(range(total), sample_size)
        subset = self.eval_dataset.select(indices)
        active_trainer.evaluate(eval_dataset=subset)
        return control


In [None]:
# Callback that evaluates on a random 100-row slice of the validation data
subset_eval_callback = RandomSubsetEvalCallback(validation_dataset, 100)

# Setting up SFT trainer
trainer = SFTTrainer(
    # Model, tokenizer and training configuration
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    # Data: the "text" column holds the chat-formatted examples
    train_dataset=train_dataset["train"],
    eval_dataset=validation_dataset,  # full validation split
    dataset_text_field="text",
    max_seq_length=512,
    packing=False,
    callbacks=[subset_eval_callback],
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/231224 [00:00<?, ? examples/s]

In [None]:
# Start training
# Runs the training loop on the `trainer` built in the previous cell.
trainer.train()


In [None]:
# Save the fine-tuned model and the tokenizer to a local directory.
# `new_model` was never defined anywhere in the notebook (only `new_model_base`),
# so it is derived here using the same "<base>-<dataset number>" pattern as the
# training output directory.
new_model = f"{new_model_base}-{dataset_info['number']}"
trainer.save_model(new_model)
tokenizer.save_pretrained(new_model)


In [None]:
# Push the model and tokenizer to Hugging Face Hub.
# The repository name is derived here because `new_model` is not defined by any
# earlier cell; it follows the "<base>-<dataset number>" naming pattern.
new_model = f"{new_model_base}-{dataset_info['number']}"
# `token` replaces the deprecated `use_auth_token` keyword.
model.push_to_hub(new_model, token=hf_token)
tokenizer.push_to_hub(new_model, token=hf_token)
