In [None]:
!pip install -U bitsandbytes
import os
os.kill(os.getpid(), 9)



In [None]:
pip install torch transformers datasets peft bitsandbytes accelerate tqdm

In [2]:
!pip install bitsandbytes>=0.41.0

In [1]:
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, TaskType, get_peft_model, PeftModel
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm
import shutil
from google.colab import files
import math

model_name = "roneneldan/TinyStories-1M"
block_size = 128
batch_size = 8
epochs = 20
lr = 3e-4

adapter_path = "tinystories-1m-qlora-adapters"
merged_path = "tinystories-1m-qlora-merged"

# grab shakespeare data
dataset = load_dataset("text", data_files="https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt")
dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

def tokenize_fn(examples):
    return tokenizer(examples["text"], add_special_tokens=False)

tokenized = dataset.map(tokenize_fn, batched=True, remove_columns=["text"])

def group_texts(examples):
    all_ids = sum(examples["input_ids"], [])
    all_masks = sum(examples["attention_mask"], [])
    total_len = (len(all_ids) // block_size) * block_size

    return {
        "input_ids": [all_ids[i:i+block_size] for i in range(0, total_len, block_size)],
        "attention_mask": [all_masks[i:i+block_size] for i in range(0, total_len, block_size)],
        "labels": [all_ids[i:i+block_size] for i in range(0, total_len, block_size)]
    }

lm_dataset = tokenized.map(group_texts, batched=True)

def collate_fn(batch):
    return {
        "input_ids": torch.tensor([x["input_ids"] for x in batch]),
        "attention_mask": torch.tensor([x["attention_mask"] for x in batch]),
        "labels": torch.tensor([x["labels"] for x in batch])
    }

train_loader = DataLoader(lm_dataset["train"], batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(lm_dataset["test"], batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# check perplexity
def calc_perplexity(model, loader):
    model.eval()
    total_loss = 0
    count = 0

    with torch.no_grad():
        for batch in loader:
            ids = batch["input_ids"].to("cuda")
            mask = batch["attention_mask"].to("cuda")
            labels = batch["labels"].to("cuda")

            out = model(input_ids=ids, attention_mask=mask, labels=labels)
            total_loss += out.loss.item() * ids.size(0)
            count += ids.size(0)

    avg_loss = total_loss / count
    return math.exp(avg_loss), avg_loss

# baseline check
print("\ngetting baseline scores...")
base_model_check = AutoModelForCausalLM.from_pretrained(model_name).to("cuda")
base_ppl, base_loss = calc_perplexity(base_model_check, test_loader)

print(f"baseline - loss: {base_loss:.4f}, perplexity: {base_ppl:.2f}")

del base_model_check
torch.cuda.empty_cache()

# setup 4bit
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, device_map="auto")

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"],
    bias="none"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

optimizer = AdamW(model.parameters(), lr=lr)

# train
print(f"\ntraining for {epochs} epoch(s)...")
model.train()

for epoch in range(epochs):
    print(f"\nepoch {epoch+1}")
    running_loss = 0

    for batch in tqdm(train_loader):
        ids = batch["input_ids"].to("cuda")
        mask = batch["attention_mask"].to("cuda")
        labels = batch["labels"].to("cuda")

        out = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss = out.loss
        running_loss += loss.item()

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    print(f"avg loss: {running_loss/len(train_loader):.4f}")

model.save_pretrained(adapter_path)
tokenizer.save_pretrained(adapter_path)

print("\nmerging model...")
base = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to("cuda")
model = PeftModel.from_pretrained(base, adapter_path)
model = model.merge_and_unload()

model.save_pretrained(merged_path)
tokenizer.save_pretrained(merged_path)

# eval after training
print("\nchecking finetuned model...")
ft_ppl, ft_loss = calc_perplexity(model, test_loader)

print(f"finetuned - loss: {ft_loss:.4f}, perplexity: {ft_ppl:.2f}")

# compare
loss_change = ((base_loss - ft_loss) / base_loss) * 100
ppl_change = ((base_ppl - ft_ppl) / base_ppl) * 100

print(f"\n--- results ---")
print(f"loss: {base_loss:.4f} -> {ft_loss:.4f} ({loss_change:+.1f}%)")
print(f"perplexity: {base_ppl:.2f} -> {ft_ppl:.2f} ({ppl_change:+.1f}%)")

# test generation
print("\n--- sample generations ---")
base_gen = AutoModelForCausalLM.from_pretrained(model_name).to("cuda")

prompts = ["Once upon a time", "To be or not to be"]

for p in prompts:
    print(f"\nprompt: {p}")

    inp = tokenizer(p, return_tensors="pt").to("cuda")
    with torch.no_grad():
        base_out = base_gen.generate(**inp, max_new_tokens=50, temperature=0.9, do_sample=True)
        ft_out = model.generate(**inp, max_new_tokens=50, temperature=0.9, do_sample=True)

    print(f"before: {tokenizer.decode(base_out[0], skip_special_tokens=True)}")
    print(f"after: {tokenizer.decode(ft_out[0], skip_special_tokens=True)}")

del base_gen
torch.cuda.empty_cache()

# download
shutil.make_archive(merged_path, "zip", merged_path)
files.download(f"{merged_path}.zip")
print("\ndone!")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/36000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/36000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]


getting baseline scores...
baseline - loss: 9.3344, perplexity: 11320.67
trainable params: 16,384 || all params: 3,762,368 || trainable%: 0.4355

training for 20 epoch(s)...

epoch 1


100%|██████████| 260/260 [00:17<00:00, 14.82it/s]


avg loss: 7.8677

epoch 2


100%|██████████| 260/260 [00:17<00:00, 14.68it/s]


avg loss: 7.5784

epoch 3


100%|██████████| 260/260 [00:17<00:00, 15.14it/s]


avg loss: 7.4836

epoch 4


100%|██████████| 260/260 [00:25<00:00, 10.38it/s]


avg loss: 7.3916

epoch 5


100%|██████████| 260/260 [00:17<00:00, 14.73it/s]


avg loss: 7.3061

epoch 6


100%|██████████| 260/260 [00:17<00:00, 14.61it/s]


avg loss: 7.2646

epoch 7


100%|██████████| 260/260 [00:17<00:00, 15.05it/s]


avg loss: 7.2366

epoch 8


100%|██████████| 260/260 [00:17<00:00, 14.67it/s]


avg loss: 7.2144

epoch 9


100%|██████████| 260/260 [00:17<00:00, 15.02it/s]


avg loss: 7.1961

epoch 10


100%|██████████| 260/260 [00:17<00:00, 14.71it/s]


avg loss: 7.1802

epoch 11


100%|██████████| 260/260 [00:17<00:00, 15.23it/s]


avg loss: 7.1649

epoch 12


100%|██████████| 260/260 [00:17<00:00, 14.66it/s]


avg loss: 7.1549

epoch 13


100%|██████████| 260/260 [00:17<00:00, 15.17it/s]


avg loss: 7.1437

epoch 14


100%|██████████| 260/260 [00:17<00:00, 14.68it/s]


avg loss: 7.1348

epoch 15


100%|██████████| 260/260 [00:17<00:00, 14.99it/s]


avg loss: 7.1233

epoch 16


100%|██████████| 260/260 [00:17<00:00, 15.02it/s]


avg loss: 7.1185

epoch 17


100%|██████████| 260/260 [00:17<00:00, 14.70it/s]


avg loss: 7.1116

epoch 18


100%|██████████| 260/260 [00:17<00:00, 15.02it/s]


avg loss: 7.1028

epoch 19


100%|██████████| 260/260 [00:17<00:00, 14.58it/s]


avg loss: 7.0977

epoch 20


100%|██████████| 260/260 [00:17<00:00, 15.07it/s]


avg loss: 7.0909

merging model...


`torch_dtype` is deprecated! Use `dtype` instead!



checking finetuned model...
finetuned - loss: 7.4547, perplexity: 1728.04

--- results ---
loss: 9.3344 -> 7.4547 (+20.1%)
perplexity: 11320.67 -> 1728.04 (+84.7%)

--- sample generations ---


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



prompt: Once upon a time


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


before: Once upon a time, in a big room, there were two sisters. The sisters in the kitchen were very happy, they were always playing with their toys.

after: Once upon a time, was and and leave his dad,Not mind, let his promise his chest, he can hurt and not and prevent himself that stay your mind.You must be?But, because do be stupid:But my son, no dollar: let your

prompt: To be or not to be


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


before: To be or not to be. I am sorry and she won't be mad at me. I am sorry, Max. I will find Max and True. Please, Max. I will stop you and hug them."

Max and Max are happy. They hug Max.
after: To be or not to be very important than he he be knew well before,"?Sure, he always weigh no - no his boat is not his cub, if you will keep his mind and promise to his dad, that and that?"And he do no people who because of


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


done!


In [5]:
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, TaskType, get_peft_model, PeftModel
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm
import shutil
from google.colab import files

# config stuff
model_name = "roneneldan/TinyStories-1M"
block_size = 128
batch_size = 8
epochs = 20
lr = 3e-4

adapter_path = "tinystories-1m-qlora-adapters"
merged_path = "tinystories-1m-qlora-merged"

# load tiny shakespeare
dataset = load_dataset("text", data_files="https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt")
dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)

# tokenizer setup
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# tokenize the data
def tokenize_fn(examples):
    return tokenizer(examples["text"], add_special_tokens=False)

tokenized = dataset.map(tokenize_fn, batched=True, remove_columns=["text"])

# chunk into blocks
def group_texts(examples):
    all_ids = sum(examples["input_ids"], [])
    all_masks = sum(examples["attention_mask"], [])

    total_len = (len(all_ids) // block_size) * block_size

    result = {
        "input_ids": [all_ids[i:i+block_size] for i in range(0, total_len, block_size)],
        "attention_mask": [all_masks[i:i+block_size] for i in range(0, total_len, block_size)],
        "labels": [all_ids[i:i+block_size] for i in range(0, total_len, block_size)]
    }
    return result

lm_dataset = tokenized.map(group_texts, batched=True)

# dataloader
def collate_fn(batch):
    return {
        "input_ids": torch.tensor([x["input_ids"] for x in batch]),
        "attention_mask": torch.tensor([x["attention_mask"] for x in batch]),
        "labels": torch.tensor([x["labels"] for x in batch])
    }

train_loader = DataLoader(lm_dataset["train"], batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# load model in 4bit
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, device_map="auto")

# add lora adapters
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"],
    bias="none"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

optimizer = AdamW(model.parameters(), lr=lr)

# training
model.train()
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")

    for batch in tqdm(train_loader):
        input_ids = batch["input_ids"].to("cuda")
        attention_mask = batch["attention_mask"].to("cuda")
        labels = batch["labels"].to("cuda")

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

# save the lora adapters
model.save_pretrained(adapter_path)
tokenizer.save_pretrained(adapter_path)

print("Merging adapters with base model...")

# reload base in fp16 and merge
base_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to("cuda")
model = PeftModel.from_pretrained(base_model, adapter_path)
model = model.merge_and_unload()

# save merged model
model.save_pretrained(merged_path)
tokenizer.save_pretrained(merged_path)

# zip and download
shutil.make_archive(merged_path, "zip", merged_path)
files.download(f"{merged_path}.zip")

# quick test
model.eval()
prompt = "Sun rises in the east"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

with torch.no_grad():
    output = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=80,
        temperature=0.9,
        do_sample=True,
        top_p=0.95
    )

print("\nGenerated text:")
print(tokenizer.decode(output[0], skip_special_tokens=True))

Map:   0%|          | 0/36000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/36000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

trainable params: 16,384 || all params: 3,762,368 || trainable%: 0.4355
Epoch 1/20


100%|██████████| 260/260 [00:16<00:00, 16.05it/s]


Epoch 2/20


100%|██████████| 260/260 [00:17<00:00, 14.56it/s]


Epoch 3/20


100%|██████████| 260/260 [00:17<00:00, 15.09it/s]


Epoch 4/20


100%|██████████| 260/260 [00:16<00:00, 15.61it/s]


Epoch 5/20


100%|██████████| 260/260 [00:16<00:00, 16.21it/s]


Epoch 6/20


100%|██████████| 260/260 [00:17<00:00, 15.09it/s]


Epoch 7/20


100%|██████████| 260/260 [00:18<00:00, 14.05it/s]


Epoch 8/20


100%|██████████| 260/260 [00:21<00:00, 12.24it/s]


Epoch 9/20


100%|██████████| 260/260 [00:18<00:00, 14.03it/s]


Epoch 10/20


100%|██████████| 260/260 [00:16<00:00, 15.57it/s]


Epoch 11/20


100%|██████████| 260/260 [00:16<00:00, 16.00it/s]


Epoch 12/20


100%|██████████| 260/260 [00:16<00:00, 16.13it/s]


Epoch 13/20


100%|██████████| 260/260 [00:16<00:00, 15.57it/s]


Epoch 14/20


100%|██████████| 260/260 [00:16<00:00, 16.18it/s]


Epoch 15/20


100%|██████████| 260/260 [00:16<00:00, 16.03it/s]


Epoch 16/20


100%|██████████| 260/260 [00:16<00:00, 15.47it/s]


Epoch 17/20


100%|██████████| 260/260 [00:16<00:00, 16.12it/s]


Epoch 18/20


100%|██████████| 260/260 [00:16<00:00, 15.96it/s]


Epoch 19/20


100%|██████████| 260/260 [00:16<00:00, 15.56it/s]


Epoch 20/20


100%|██████████| 260/260 [00:16<00:00, 16.05it/s]


Merging adapters with base model...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Generated text:
Sun rises in the east side,Sure, please be be be happy - when he knew that?Relupid and promise you will stay him well that that were no,"Poor worry because are terrible.A be foolish but in and stop.I can lose his fate to?Sure please no worry but surrender for this punishment –No good his coin with the prison,Of worry, no, please to be happy –He be
