In [None]:
# Install ipywidgets into the environment of the running kernel.
%pip install ipywidgets
# Enable the widgets notebook extension through the jupyter CLI.
# NOTE(review): the `nbextension` subcommand may be unavailable on newer
# Jupyter front ends (Notebook 7 / JupyterLab) — confirm this line is still needed.
!jupyter nbextension enable --py widgetsnbextension

# 1. Install required packages (run once)

In [None]:
%pip install transformers datasets peft accelerate
%pip install python-dotenv

# 2. Import libs

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model

# 3. Load dataset

In [None]:
# Identifier of the dataset passed to load_dataset.
DATASET_ID = "GBaker/MedQA-USMLE-4-options"

dataset = load_dataset(DATASET_ID)

# Print the loaded object to inspect its splits and columns.
print(dataset)

# 4. Preprocess dataset

In [None]:
def preprocess(example):
    """Build the prompt/target pair for one dataset record.

    Args:
        example: Mapping that contains at least the keys 'question' and 'answer'.

    Returns:
        A dict with 'input' set to the record's question and 'output' set to
        its answer. The record itself is not modified.

    Raises:
        KeyError: If 'question' or 'answer' is missing from the record.
    """
    return {"input": example["question"], "output": example["answer"]}

In [None]:
# Run preprocess over the training split; the result is what later cells tokenize.
train_split = dataset["train"]
train_dataset = train_split.map(preprocess)

In [None]:
# Print the mapped training split to inspect it.
print(train_dataset)

# 5. Hugging Face token for loading the model

In [None]:
from huggingface_hub import login
from dotenv import load_dotenv
import os

# Load variables from the local .env file into the process environment.
load_dotenv(dotenv_path='.env')

# Prefer the correctly spelled HF_TOKEN variable; fall back to the legacy
# misspelled HF_TOEKN so existing .env files keep working.
HF_TOKEN = os.getenv('HF_TOKEN') or os.getenv('HF_TOEKN')
HF_TOEKN = HF_TOKEN  # legacy alias for any code that still uses the old name

if not HF_TOKEN:
    # Do not abort here: the token is still handed to login() exactly as before.
    print("No Hugging Face token found in HF_TOKEN or HF_TOEKN; check your .env file.")

# Authenticate with the Hugging Face Hub using the token read above.
login(token=HF_TOKEN)

# 6. Load tokenizer and model

In [None]:
from transformers import AutoModelForCausalLM

# Checkpoint to fine-tune. The 1B variant is used to see whether training
# becomes faster; the larger alternative is "meta-llama/Llama-3.2-3B".
model_name = "meta-llama/Llama-3.2-1B"

# Tokenizer and model are both loaded from the checkpoint named above.
# They are not saved to a local directory afterwards.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# 7. Apply LoRA PEFT


In [None]:
# Names of the modules that receive LoRA adapters.
lora_target_modules = ["q_proj", "v_proj"]

# LoRA hyperparameters: rank 8, alpha 16, dropout 0.1, no bias adaptation.
# NOTE(review): task_type is not set here; PEFT examples for causal language
# models usually pass task_type="CAUSAL_LM" — confirm whether that is intended.
lora_config = LoraConfig(
    target_modules=lora_target_modules,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# Wrap the loaded model with the adapters; `model` now refers to the wrapped model.
model = get_peft_model(model, lora_config)

# 8. Tokenize inputs for training

In [None]:
# Reuse the end-of-sequence token for padding when the tokenizer has no pad
# token. Adding a brand-new '[PAD]' token would give it an id beyond the
# model's embedding table unless the embeddings are resized, which this
# notebook never does, so padded batches would index out of range.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def tokenize_func(examples):
    """Tokenize a batch of question/answer pairs for causal-LM training.

    Args:
        examples: Batch dict with parallel lists under the keys "input"
            (questions) and "output" (answers), as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output (every sequence padded/truncated to 512 tokens)
        with an added "labels" entry: a copy of "input_ids" in which padded
        positions are replaced by -100 so they are ignored by the loss.
    """
    # Prepare inputs (prompt + answer)
    prompts = [f"Question: {q}\nAnswer: {a}" for q, a in zip(examples["input"], examples["output"])]
    tokenized = tokenizer(
        prompts,
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors=None,
    )
    # For causal LM, labels = input_ids (model shifts internally). Positions
    # where attention_mask is 0 are padding and must not contribute to the
    # loss, so they get the ignore index -100.
    tokenized["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return tokenized


print("Before", train_dataset.column_names)
# Replace the text columns with the tokenized fields expected by the Trainer.
train_dataset = train_dataset.map(tokenize_func, batched=True, remove_columns=train_dataset.column_names)
print("After", train_dataset.column_names)


# 8. Setup training args

In [None]:
# Settings for the fine-tuning run, gathered in one mapping before being
# handed to TrainingArguments.
training_kwargs = dict(
    output_dir="./llama3b-lora-finetuned",  # where checkpoints are written
    per_device_train_batch_size=1,
    num_train_epochs=1,
    learning_rate=2e-4,
    logging_steps=1,                # log every step
    save_steps=30,                  # checkpoint every 30 steps
    save_total_limit=2,             # keep at most two checkpoints
    fp16=False,
    remove_unused_columns=False,
)

training_args = TrainingArguments(**training_kwargs)

# 9. Check whether you are running on CPU or GPU (for Apple Silicon)
If `MPS available` and `MPS built` are both `True` and `Model device` shows `mps`, the model is running on your Apple Silicon GPU.


In [None]:
import torch

# Report whether PyTorch's MPS backend is usable in this environment.
mps_backend = torch.backends.mps
print(f"MPS available: {mps_backend.is_available()}")
print(f"MPS built: {mps_backend.is_built()}")

# The device of the first parameter is taken as the model's device.
first_param = next(model.parameters())
model_device = first_param.device
print(f"Model device: {model_device}")

# 10. Create Trainer and train

In [None]:
import logging
# Emit INFO-level log records while training.
logging.basicConfig(level=logging.INFO)

print(f"fp16: {training_args.fp16}")

# Bundle the model, the run settings and the tokenized training data.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
}
trainer = Trainer(**trainer_kwargs)

# Start fine-tuning; the returned training summary is shown as the cell output.
trainer.train()

# 11. Generate gguf file
The GGUF file is what Ollama needs to create and run the model.

In [None]:
# Directory that receives the exported model files.
EXPORT_DIR = "chat_nedicine"

if hasattr(model, "save_pretrained_gguf"):
    # Available only when the model object provides this helper
    # (it appears to come from Unsloth, not from transformers/peft).
    model.save_pretrained_gguf(EXPORT_DIR, tokenizer, quantization_method="f16")
else:
    # A plain transformers + PEFT model has no GGUF exporter, so the original
    # call would raise AttributeError. Merge the LoRA weights into the base
    # model and save a regular Hugging Face checkpoint that llama.cpp can convert.
    merged_model = model.merge_and_unload()
    merged_model.save_pretrained(EXPORT_DIR)
    tokenizer.save_pretrained(EXPORT_DIR)
    print(
        f"Saved merged model to '{EXPORT_DIR}'. Convert it to GGUF with llama.cpp, e.g.:\n"
        f"  python llama.cpp/convert_hf_to_gguf.py {EXPORT_DIR} "
        f"--outfile {EXPORT_DIR}.F16.gguf --outtype f16"
    )

# 12. Download the model (when running on Colab)

In [None]:
# from google.colab import files
# files.download('/content/chat_doc_gpt_model/unsloth.F16.gguf')