In [1]:
# Install the libraries
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U git+https://github.com/huggingface/trl.git
!pip install -q datasets
!pip install -q wandb --upgrade

from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

import os

os.environ["WANDB_API_KEY"] = user_secrets.get_secret("wandb")
os.environ["WANDB_PROJECT"] = "SmolLM-135M-magpie-ultra-v1"
os.environ["WANDB_LOG_MODEL"] = "checkpoint"

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
kfp 2.5.0 requires google-cloud-storage<3,>=2.2.1, but you have google-cloud-storage 1.44.0 which is incompatible.[0m[31m
[0m

In [2]:
# setting up the config for 4-bit quantization of Qlora
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_id = "HuggingFaceTB/SmolLM-135M-Instruct"
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_use_double_quant = True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config = bnb_config,
    device_map = "auto",
)

tokenizer_config.json:   0%|          | 0.00/3.59k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [3]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [4]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [5]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r = 8,
    lora_alpha = 32,
    target_modules = [
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],  # parameters specific to llama
    lora_dropout = 0.05,
    bias = "none",
    task_type = "CAUSAL_LM",
    use_dora = True,
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 2597760 || all params: 84028608 || trainable%: 3.09151854568387


In [6]:
# Load the dataset from HF
from datasets import load_dataset

def formatting_prompts_func(examples):
    convos = examples["messages"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts, }
pass

dataset = load_dataset("argilla/magpie-ultra-v0.1", split = "train")
dataset = dataset.map(formatting_prompts_func, batched = True,)

Downloading readme:   0%|          | 0.00/50.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/263M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/264M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [7]:
print(dataset["text"][0])

<|im_start|>user
Cloudflare hosts a popular web page that can be attacked. A botnet containing 100 nodes is launched to attack the server. Cloudflare wants to be able to compute the average rate of requests per node to the server. In order to do this, they record the following data over a 1 hour time period:

Node 1 makes 50 requests
Node 2 makes 10 requests
Node 5 makes 1500 requests
Node 13 makes 20 requests
Node 100 makes 50 requests

The rest of the nodes don't record any requests during this time. Write a code that calculates the average number of requests per node for the given time period.

## Step 1: Define the problem and the given data
We are given the number of requests made by certain nodes in a botnet over a 1 hour time period. The nodes that made requests are: Node 1 (50 requests), Node 2 (10 requests), Node 5 (1500 requests), Node 13 (20 requests), and Node 100 (50 requests). The rest of the nodes (95 nodes) did not make any requests.

## Step 2: Calculate the total numb

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
import transformers

tokenizer.pad_token = tokenizer.eos_token
max_seq_length = 2048

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    args=TrainingArguments(
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 8,
        gradient_checkpointing_kwargs = {"use_reentrant": False},
        gradient_checkpointing = True,
        num_train_epochs = 3,
        warmup_ratio = 0.02,
        learning_rate = 2e-4,
        fp16 = True,
        logging_steps = 1,
        optim = "paged_adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "wandb",
        output_dir = "outputs",
        save_strategy = "steps",
        save_steps = 500,
        save_total_limit = 5,
        run_name = "run-1",
        ddp_find_unused_parameters = False,
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Currently logged in as: [33mliuhongyuan3000[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
1,1.4959
2,1.3031
3,1.4171
4,1.2987
5,1.5287
