<a href="https://colab.research.google.com/github/aalizelau/Clone-Yourself/blob/main/notebooks/whatsapp_finetune_with_bnb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip -q install uv

In [None]:
!uv pip -q install peft
!uv pip -q install transformers
!uv pip -q install torch
!uv pip -q install bitsandbytes
!uv pip -q install datasets
!uv pip -q install evaluate

# Load Model and Tokenizer

In [None]:
import transformers
from transformers import (AutoTokenizer,BitsAndBytesConfig)
import torch
from datasets import (load_dataset, Dataset)
import bitsandbytes as bnb
from transformers import AutoModelForCausalLM
import evaluate

In [None]:
# bitsandbytes quantization settings used when loading the base model.
bnb_config = BitsAndBytesConfig(
    # Store the weights in 4 bits using the "nf4" quantization type.
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    # Enable double quantization.
    bnb_4bit_use_double_quant=True,
    # Dtype used for computation on the quantized layers.
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
from transformers import AutoModelForCausalLM

# Hub id of the checkpoint that gets fine-tuned; also used to load the tokenizer.
model_name = "Qwen/Qwen2.5-7B-Instruct"

# Load the causal LM with the quantization settings from `bnb_config`;
# device_map="auto" leaves weight placement to the loader.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)

model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

In [None]:
from peft import LoraConfig, TaskType

# LoRA adapter configuration for causal-LM fine-tuning.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    # Adapters are attached only to the attention query and value projections.
    target_modules=["q_proj", "v_proj"],
    # Adapter rank, scaling factor and dropout.
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    # The adapters will be trained, not just used for inference.
    inference_mode=False,
)

In [None]:
from peft import get_peft_model, prepare_model_for_kbit_training

# The base model was loaded 4-bit quantized; PEFT's documented workflow is to
# pass such a model through prepare_model_for_kbit_training() before adding
# adapters so that it is set up for stable training.
model = prepare_model_for_kbit_training(model)

# Attach the LoRA adapters described by `peft_config` and report how many
# parameters are actually trainable.
# NOTE: this cell rebinds `model` to its wrapped version, so running it twice
# wraps an already-wrapped model; reload the base model before re-executing.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 2,523,136 || all params: 7,618,139,648 || trainable%: 0.0331


In [None]:
# Tokenizer that belongs to the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Show the padding token and the end-of-sequence token, one per line.
for special_token in (tokenizer.pad_token, tokenizer.eos_token):
    print(special_token)

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

<|endoftext|>
<|im_end|>


# Test original model

In [None]:
# Single-turn prompt in the same <|im_start|>/<|im_end|> layout that the
# training examples use. The newline after `<|im_start|>assistant` is part of
# the assistant header, so the string must NOT be stripped: without it the
# prompt no longer matches the training layout and the reply gets glued to the
# word "assistant".
prompt = """<|im_start|>user
寶貝，你在做什麼?<|im_end|>
<|im_start|>assistant
"""

In [None]:
%%time
device = "cuda:0"

encoding = tokenizer(prompt, return_tensors="pt").to(device)
with torch.inference_mode():
  outputs = model.generate(
      input_ids = encoding.input_ids,
      attention_mask = encoding.attention_mask,
      max_new_tokens = 50,
      temperature = 0.5,
      top_p = 0.65,
      repetition_penalty =1.2,
  )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

user
寶貝，你在做什麼?
assistant 我在这里等待您的指令或问题，您可以和我聊天、提问或者告诉我您想要了解的内容。
CPU times: user 2.58 s, sys: 116 ms, total: 2.7 s
Wall time: 3.62 s


# Dataset

In [None]:
# Colab-only step: prompts for a file upload into the runtime. The following
# cells expect the uploaded file to be available as /content/chat_dataset.jsonl.
from google.colab import files
uploaded = files.upload()

Saving chat_dataset.jsonl to chat_dataset.jsonl


In [None]:
!head /content/chat_dataset.jsonl

In [None]:
import pandas as pd

# JSON Lines input: every line of the file is one record.
df = pd.read_json(path_or_buf="/content/chat_dataset.jsonl", lines=True)
# For quick experiments, restrict to a subset first: df = df.head(500)
print(df.head(5))

In [None]:
# Convert the pandas frame into a Hugging Face `Dataset` and print its summary
# (feature names and number of rows).
from datasets import Dataset
full_dataset = Dataset.from_pandas(df)
print(full_dataset)

Dataset({
    features: ['instruction', 'response'],
    num_rows: 1793
})


In [None]:
def format_chat_example(data_point):
    """Render one instruction/response pair as a two-turn chat transcript.

    Args:
        data_point: Mapping with an ``"instruction"`` entry (the user turn)
            and a ``"response"`` entry (the assistant turn).

    Returns:
        The user turn followed by the assistant turn, each wrapped in
        ``<|im_start|>`` / ``<|im_end|>`` markers, with no leading or trailing
        newline around the transcript.
    """
    user_turn = f'<|im_start|>user\n{data_point["instruction"]}<|im_end|>'
    assistant_turn = f'<|im_start|>assistant\n{data_point["response"]}<|im_end|>'
    # The transcript begins and ends with a marker, so nothing needs trimming.
    return "\n".join((user_turn, assistant_turn))

def preprocess(data_point):
    """Tokenize the chat-formatted text of one dataset row (truncation enabled)."""
    return tokenizer(format_chat_example(data_point), truncation=True)


# Run `preprocess` over every row of the dataset.
tokenized_dataset = full_dataset.map(function=preprocess)

Map:   0%|          | 0/1793 [00:00<?, ? examples/s]

In [None]:
# Hold out 10% of the tokenized examples for evaluation; the fixed seed keeps
# the split the same from run to run.
split_dataset = tokenized_dataset.train_test_split(seed=42, test_size=0.1)
train_dataset, eval_dataset = split_dataset["train"], split_dataset["test"]

print(f"Train size: {len(train_dataset)}, Eval size: {len(eval_dataset)}")

Train size: 1613, Eval size: 180


# Training

In [None]:
bleu = evaluate.load("bleu")
acc = evaluate.load("accuracy")


def preprocess_logits_for_metrics(logits, labels):
    """Reduce raw logits to predicted token ids before metrics are computed.

    Passed to ``Trainer`` as ``preprocess_logits_for_metrics`` so that
    ``compute_metrics`` receives token ids rather than full logits.

    Args:
        logits: Logits tensor, or a tuple whose first element is the logits.
        labels: Label ids for the batch; unused, but part of the hook signature.

    Returns:
        Tensor of token ids obtained by argmax over the last dimension.
    """
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)


def compute_metrics(eval_preds):
    """Compute BLEU and next-token accuracy for an evaluation run.

    Args:
        eval_preds: ``(preds, labels)`` pair. ``preds`` has the same shape as
            ``labels`` because the argmax was already taken by
            ``preprocess_logits_for_metrics``.

    Returns:
        Dict merging the BLEU result with the accuracy result. The BLEU result
        includes a list-valued ``precisions`` entry.
    """
    preds, labels = eval_preds
    # The prediction at position i is for the token at position i + 1, so drop
    # the first label and the last prediction to align them. Copy the slices so
    # the in-place writes below do not alter the arrays owned by the caller.
    labels = labels[:, 1:].copy()
    preds = preds[:, :-1].copy()

    # -100 marks label positions that are ignored by the loss.
    mask = labels == -100
    # Replace those positions with a token id the tokenizer can decode.
    labels[mask] = tokenizer.pad_token_id
    preds[mask] = tokenizer.pad_token_id

    # BLEU works on text, so translate token ids back to strings.
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    bleu_score = bleu.compute(predictions=decoded_preds, references=decoded_labels)

    # Accuracy works on integer ids and must only cover non-ignored positions,
    # hence the negated mask.
    accuracy = acc.compute(predictions=preds[~mask], references=labels[~mask])

    return {**bleu_score, **accuracy}

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # Directory that receives the run's outputs.
    output_dir="experiments",
    num_train_epochs=1,
    # Batching: 4 examples per device, accumulated over 4 steps.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    # Optimizer and learning-rate schedule.
    optim="paged_adamw_8bit",
    learning_rate=1e-4,
    weight_decay=0.1,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    # Mixed-precision training.
    fp16=True,
    # Log every 4 steps.
    logging_steps=4,
    label_names=["labels"],
    # Periodic evaluation during training is currently switched off:
    # evaluation_strategy="steps",
    # eval_steps=4,
)

In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # Recent Transformers releases deprecate the `tokenizer=` argument of
    # Trainer in favour of `processing_class=`; since the install cell takes
    # the latest release, use the new name.
    # NOTE(review): requires a Transformers version that knows `processing_class`.
    processing_class=tokenizer,
    # Causal-LM collation (mlm=False).
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
    # Metrics are computed on argmax token ids produced by the logits hook.
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

  trainer = Trainer(


In [None]:
# Run fine-tuning with the arguments and datasets that `trainer` was built with.
trainer.train()

Step,Training Loss
4,4.7397
8,4.7057


KeyboardInterrupt: 

In [None]:
# Evaluate on the held-out split; the returned dict (eval loss plus the BLEU
# and accuracy entries from compute_metrics) is shown as the cell output.
trainer.evaluate()

Step,Training Loss,Validation Loss,Bleu,Precisions,Brevity Penalty,Length Ratio,Translation Length,Reference Length,Accuracy
4,4.7397,,,,,,,,
8,4.7057,,,,,,,,
9,4.7057,4.691307,0.064501,"[0.24781572676727562, 0.06950880444856349, 0.05339265850945495, 0.0375]",0.841677,0.852981,1259.0,1476.0,0.332414


Trainer is attempting to log a value of "[0.24781572676727562, 0.06950880444856349, 0.05339265850945495, 0.0375]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 4.691307067871094,
 'eval_bleu': 0.06450090450322706,
 'eval_precisions': [0.24781572676727562,
  0.06950880444856349,
  0.05339265850945495,
  0.0375],
 'eval_brevity_penalty': 0.8416769441943641,
 'eval_length_ratio': 0.8529810298102981,
 'eval_translation_length': 1259,
 'eval_reference_length': 1476,
 'eval_accuracy': 0.33241399332268834}