In [None]:
!pip install -q --upgrade transformers datasets peft accelerate bitsandbytes safetensors huggingface_hub sentencepiece


In [None]:
import os, random, math
from pathlib import Path
import torch
import numpy as np


# One seed shared by every random number generator the notebook touches.
seed = 42
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(seed)

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)


Device: cuda


In [None]:

from getpass import getpass

# Look the token up under the HF_TOKEN environment variable; the token value itself
# must never be written into the notebook.
# NOTE(review): a literal token was previously embedded in this cell and in its saved
# output -- treat it as compromised and revoke it in the Hugging Face settings.
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    # getpass does not echo what is typed, so the secret stays out of the cell output.
    HF_TOKEN = getpass("Paste your HF token (it won't be shown): ").strip()
    os.environ["HF_TOKEN"] = HF_TOKEN



Paste your HF token (it won't be shown): [REDACTED]


In [None]:
from datasets import load_dataset

# Subset sizes used further down when carving out the training and evaluation sets.
train_samples, eval_samples = 1000, 100

# Fetch the instruction dataset from the Hugging Face Hub and show its structure.
print("Loading dataset...")
ds = load_dataset("lavita/AlpaCare-MedInstruct-52k")
print(ds)

def format_example(example):
    """Render one dataset row as a single prompt/response training string.

    Args:
        example: Mapping with optional ``instruction``, ``input`` and ``output``
            entries; missing or ``None`` values are treated as empty strings.

    Returns:
        ``{"text": ...}`` holding the instruction, the optional input context,
        the response, and a fixed educational disclaimer.
    """
    instr = (example.get("instruction") or "").strip()
    context = (example.get("input") or "").strip()
    resp = (example.get("output") or "").strip()

    # Rows without extra context keep the original two-section layout.
    # NOTE(review): "<noinput>" is presumably this dataset's empty-input placeholder --
    # confirm against the dataset card.
    if context.lower() in ("", "<noinput>"):
        prompt = f"### Instruction:\n{instr}"
    else:
        prompt = f"### Instruction:\n{instr}\n\n### Input:\n{context}"

    text = f"{prompt}\n\n### Response:\n{resp}\n\nDisclaimer: This is educational only — consult a qualified clinician."
    return {"text": text}

ds = ds.map(format_example, remove_columns=ds["train"].column_names)

# Shuffle once and carve two disjoint slices out of the same permutation, so no
# validation row is also used for training. Taking the validation rows from the
# already-selected training subset would evaluate on data the model was trained on.
shuffled = ds["train"].shuffle(seed=seed)
n_train = min(train_samples, len(shuffled))
# Bound the validation size by the rows actually left after the training slice.
n_val = min(eval_samples, len(shuffled) - n_train)
ds["train"] = shuffled.select(range(n_train))
val_ds = shuffled.select(range(n_train, n_train + n_val))
print("Train size:", len(ds["train"]), "Val size:", len(val_ds))



Loading dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/944 [00:00<?, ?B/s]

data/train-00000-of-00001-297892d5d4e8a0(…):   0%|          | 0.00/36.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/52002 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['output', 'input', 'instruction'],
        num_rows: 52002
    })
})


Map:   0%|          | 0/52002 [00:00<?, ? examples/s]

Train size: 1000 Val size: 100


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig
from peft import prepare_model_for_kbit_training  # IMPORTANT import

MODEL_NAME = "tiiuae/falcon-rw-1b"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
if tokenizer.pad_token is None:
    if tokenizer.eos_token is not None:
        # Reuse the existing EOS token for padding: no new vocabulary entry is created,
        # so the pretrained embedding matrix does not need to be resized.
        tokenizer.pad_token = tokenizer.eos_token
    else:
        tokenizer.add_special_tokens({"pad_token": "</s>"})

bnb_config = BitsAndBytesConfig(load_in_8bit=True)
print("Loading base model (8-bit)...")
# NOTE(review): trust_remote_code=True executes modeling code downloaded from the Hub
# repository; keep it only if the repository is trusted.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=True
)

# Grow the embedding matrix only when the tokenizer really has more entries than the
# model. An unconditional resize to len(tokenizer) can also shrink a matrix that is
# larger than the tokenizer vocabulary. Do it before the k-bit preparation step so the
# prepared model is not modified afterwards.
embedding_rows = model.get_input_embeddings().weight.shape[0]
if len(tokenizer) > embedding_rows:
    model.resize_token_embeddings(len(tokenizer))

model = prepare_model_for_kbit_training(model)
print("Model loaded.")


Loading base model (8-bit)...


You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.


Model loaded.


In [None]:

# Leaf-module names that usually denote attention / MLP projection layers.
projection_keys = (
    "q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "down_proj", "up_proj",
    "query_key_value", "wq", "wk", "wv", "wo", "dense",
)


def _is_projection_leaf(leaf):
    """Return True when ``leaf`` equals a key or extends it at a ``_`` boundary."""
    return any(
        leaf == key or leaf.startswith(key + "_") or leaf.endswith("_" + key)
        for key in projection_keys
    )


# Match on the last path component only. A plain substring test on the full dotted
# path produces false positives, e.g. "wo" matching "transformer.word_embeddings".
proj_names = [
    name for name, _ in model.named_modules()
    if _is_projection_leaf(name.rsplit(".", 1)[-1])
]

print("Found projection-like modules (sample):", proj_names[:8])
# The distinct leaf names are what a LoRA `target_modules` list is chosen from.
print("Distinct leaf names:", sorted({name.rsplit(".", 1)[-1] for name in proj_names}))
print("Total matches:", len(proj_names))


Found projection-like modules (sample): ['transformer.word_embeddings', 'transformer.h.0.self_attention.query_key_value', 'transformer.h.0.self_attention.dense', 'transformer.h.0.mlp.dense_h_to_4h', 'transformer.h.0.mlp.dense_4h_to_h', 'transformer.h.1.self_attention.query_key_value', 'transformer.h.1.self_attention.dense', 'transformer.h.1.mlp.dense_h_to_4h', 'transformer.h.1.mlp.dense_4h_to_h', 'transformer.h.2.self_attention.query_key_value', 'transformer.h.2.self_attention.dense', 'transformer.h.2.mlp.dense_h_to_4h', 'transformer.h.2.mlp.dense_4h_to_h', 'transformer.h.3.self_attention.query_key_value', 'transformer.h.3.self_attention.dense', 'transformer.h.3.mlp.dense_h_to_4h', 'transformer.h.3.mlp.dense_4h_to_h', 'transformer.h.4.self_attention.query_key_value', 'transformer.h.4.self_attention.dense', 'transformer.h.4.mlp.dense_h_to_4h', 'transformer.h.4.mlp.dense_4h_to_h', 'transformer.h.5.self_attention.query_key_value', 'transformer.h.5.self_attention.dense', 'transformer.h.5.m

In [None]:
max_length = 512
def tokenize_fn(examples):
    """Tokenize ``examples["text"]`` to ``max_length`` tokens and attach causal-LM labels.

    Args:
        examples: Mapping with a ``"text"`` entry: a list of strings when used with
            ``map(..., batched=True)``, or a single string.

    Returns:
        The tokenizer output with an added ``"labels"`` entry. Labels equal
        ``input_ids`` on real tokens and ``-100`` wherever ``attention_mask`` is 0,
        so padded positions are excluded from the loss instead of being targets.
    """
    out = tokenizer(examples["text"], truncation=True, max_length=max_length, padding="max_length")

    def _mask_padding(row_ids, row_mask):
        # -100 is the label value that the loss ignores.
        return [tok if keep else -100 for tok, keep in zip(row_ids, row_mask)]

    ids, mask = out["input_ids"], out["attention_mask"]
    if ids and isinstance(ids[0], (list, tuple)):
        # Batched call: one row of ids / mask per example.
        out["labels"] = [_mask_padding(r, m) for r, m in zip(ids, mask)]
    else:
        # Single example: ids / mask are flat sequences.
        out["labels"] = _mask_padding(ids, mask)
    return out

# Columns handed to the trainer as torch tensors.
_tensor_columns = ["input_ids","attention_mask","labels"]

# Tokenize the training split first, then the validation split, dropping the raw text.
tokenized_train, tokenized_val = (
    split.map(tokenize_fn, batched=True, remove_columns=["text"])
    for split in (ds["train"], val_ds)
)

for _tokenized in (tokenized_train, tokenized_val):
    _tokenized.set_format(type="torch", columns=_tensor_columns)
print("Tokenization done.")


Tokenization done.


In [None]:
from peft import LoraConfig, get_peft_model

# LoRA hyper-parameters: rank-8 adapters on the modules named `query_key_value`
# and `dense`, no bias training, causal language-modeling task.
lora_settings = dict(
    r=8,
    lora_alpha=32,
    target_modules=["query_key_value", "dense"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
lora_config = LoraConfig(**lora_settings)

# Wrap the base model with the adapters and report how many parameters will train.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 2,359,296 || all params: 1,313,890,304 || trainable%: 0.1796


In [None]:
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
import types

# 1. Switch the KV cache off on the config and, when it is exposed, on the inner transformer.
model.config.use_cache = False
_inner_transformer = getattr(model, "transformer", None)
if hasattr(_inner_transformer, "use_cache"):
    _inner_transformer.use_cache = False

# 2. Turn gradient checkpointing off when the model offers a way to do so;
#    a failure is reported but does not stop the notebook.
if hasattr(model, "gradient_checkpointing_disable"):
    try:
        model.gradient_checkpointing_disable()
    except Exception as err:
        print("Warning: could not disable checkpointing ->", err)
    else:
        print("🧩 Gradient checkpointing disabled (safe for Falcon).")

# ✅ 3. Monkey-patch forward to ignore cache arguments safely
def safe_forward(self, *args, **kwargs):
    """Call the saved original forward without any cache-related keyword arguments.

    ``past_key_values`` and ``use_cache`` are dropped when present; every other
    positional and keyword argument is passed through unchanged to
    ``self.__original_forward__``, whose result is returned.
    """
    cache_keys = ("past_key_values", "use_cache")
    forwarded = {key: value for key, value in kwargs.items() if key not in cache_keys}
    return self.__original_forward__(*args, **forwarded)

# Install the wrapper only once: the saved original forward doubles as the
# "already patched" marker, so re-running the cell does not wrap the wrapper.
_forward_is_patched = hasattr(model, "__original_forward__")
if not _forward_is_patched:
    setattr(model, "__original_forward__", model.forward)
    model.forward = types.MethodType(safe_forward, model)
    print("🔒 Patched Falcon forward() to skip cache safely.")

# 4. Training setup
training_args = TrainingArguments(
    output_dir="results",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=1,
    num_train_epochs=1,
    learning_rate=3e-4,
    # fp16 mixed precision needs a CUDA device; the notebook has a CPU fallback, so
    # enable it only when a GPU is actually available instead of unconditionally.
    fp16=torch.cuda.is_available(),
    # Reuse the notebook-wide seed so changing it in one place affects training too.
    seed=seed,
    logging_steps=10,
    eval_steps=100,
    save_total_limit=1,
    eval_strategy="steps",   # evaluate every `eval_steps` optimizer steps
    save_strategy="epoch",   # checkpoint once per epoch
    report_to="none"         # no external experiment tracker
)

# mlm=False selects causal (next-token) language modeling batches.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
)

print("🚀 Starting Falcon LoRA training (patched safe mode)...")
# Left as the last expression so the notebook displays the returned training summary.
trainer.train()


You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.


🧩 Gradient checkpointing disabled (safe for Falcon).
🚀 Starting Falcon LoRA training (patched safe mode)...




Step,Training Loss,Validation Loss
100,1.3171,1.351796
200,1.1957,1.297185
300,1.2443,1.251138
400,1.3009,1.239361
500,1.2493,1.216155
600,1.3409,1.193759
700,1.2234,1.178784
800,1.2316,1.163176
900,1.2789,1.150776
1000,1.1649,1.145121




TrainOutput(global_step=1000, training_loss=1.3071149625778198, metrics={'train_runtime': 724.4512, 'train_samples_per_second': 1.38, 'train_steps_per_second': 1.38, 'total_flos': 3720075018240000.0, 'train_loss': 1.3071149625778198, 'epoch': 1.0})

In [None]:
# Directory that receives both the trained adapter and the tokenizer files.
OUTPUT_DIR = "lora_adapter"
print("Saving LoRA adapter to", OUTPUT_DIR)
# Model first, tokenizer second -- both are written to the same directory.
for _artifact in (model, tokenizer):
    _artifact.save_pretrained(OUTPUT_DIR)
print("Saved adapter and tokenizer in", OUTPUT_DIR)


Saving LoRA adapter to lora_adapter




Saved adapter and tokenizer in lora_adapter
