In [3]:
import os
import torch
from datasets import load_dataset, Dataset
from torch.utils.data import DataLoader
import transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel,PeftConfig
import os
import gc
gc.collect()
torch.cuda.empty_cache()



In [5]:
os.environ['HF_HOME'] = 'cache/'
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ['TORCH_USE_CUDA_DSA']= "1"
os.environ['TRANSFORMERS_CACHE'] = 'cache/'

free_in_GB = int(torch.cuda.mem_get_info()[0] / 1024**3)
max_memory = f"{free_in_GB-2}GB"
print(max_memory)

n_gpus = torch.cuda.device_count()
max_memory = {i: max_memory for i in range(n_gpus)}
print(max_memory)

4GB
{0: '4GB'}


In [6]:
train_dataset = Dataset.from_json('data/mmlu/five_shot_mmlu_test.json').select(range(50))
val_dataset = Dataset.from_json('data/mmlu/five_shot_mmlu_val.json').select(range(50))

In [7]:
print(torch.cuda.device_count())
for i in range(torch.cuda.device_count()):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

1
GPU 0: NVIDIA GeForce RTX 4070 Laptop GPU


In [8]:
PROMPT_TEMPLATE = """

### Question:
{question}

### Response:
{output}"""


def apply_prompt_template(row):
    prompt = PROMPT_TEMPLATE.format(
        question=row["input"],
        output=row["output"],
    )
    return {"prompt": prompt}

train_dataset = train_dataset.map(apply_prompt_template)
val_dataset = val_dataset.map(apply_prompt_template)


In [9]:
base_model_id = "HuggingFaceTB/SmolLM2-1.7B"

# You can use a different max length if your custom dataset has shorter/longer input sequences.
MAX_LENGTH = 256

tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    model_max_length=MAX_LENGTH,
    padding_side="left",
    add_eos_token=True,
    max_memory=max_memory
)
tokenizer.pad_token = tokenizer.eos_token


def tokenize_and_pad_to_fixed_length(sample):
    result = tokenizer(
        sample["prompt"],
        truncation=True,
        max_length=MAX_LENGTH,
        padding="max_length",
    )
    result["labels"] = result["input_ids"].copy()
    return result


tokenized_train_dataset = train_dataset.map(tokenize_and_pad_to_fixed_length)
val_dataset = val_dataset.map(tokenize_and_pad_to_fixed_length)

assert all(len(x["input_ids"]) == MAX_LENGTH for x in tokenized_train_dataset)


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [7]:
quantization_config = BitsAndBytesConfig(
    # Load the model with 4-bit quantization
    load_in_4bit=True,
    # Use double quantization
    bnb_4bit_use_double_quant=True,
    # Use 4-bit Normal Float for storing the base model weights in GPU memory
    bnb_4bit_quant_type="nf4",
    # De-quantize the weights to 16-bit (Brain) float before the forward/backward pass
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=quantization_config,max_memory=max_memory)

`low_cpu_mem_usage` was None, now default to True since model is quantized.


In [8]:
# Enabling gradient checkpointing, to make the training further efficient
model.gradient_checkpointing_enable()
# Set up the model for quantization-aware training e.g. casting layers, parameter freezing, etc.
model = prepare_model_for_kbit_training(model)

peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    # This is the rank of the decomposed matrices A and B to be learned during fine-tuning. A smaller number will save more GPU memory but might result in worse performance.
    r=32,
    # This is the coefficient for the learned ΔW factor, so the larger number will typically result in a larger behavior change after fine-tuning.
    lora_alpha=64,
    # Drop out ratio for the layers in LoRA adaptors A and B.
    lora_dropout=0.1,
    # We fine-tune all linear layers in the model. It might sound a bit large, but the trainable adapter size is still only **1.16%** of the whole model.
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    # Bias parameters to train. 'none' is recommended to keep the original model performing equally when turning off the adapter.
    bias="none",
)

peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()


  @custom_fwd
  @custom_bwd
  @custom_fwd(cast_inputs=torch.float16)


trainable params: 37,814,272 || all params: 1,749,190,656 || trainable%: 2.1618153441588017


In [None]:
training_args = TrainingArguments(
    # Replace with your output destination
    output_dir="models",
    # For the following arguments, refer to https://huggingface.co/docs/transformers/main_classes/trainer
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    optim="paged_adamw_8bit",
    bf16=True,
    learning_rate=2e-5,
    lr_scheduler_type="constant",
    max_steps=50,
    save_steps=10,
    logging_steps=10,
    warmup_steps=5,
    ddp_find_unused_parameters=False,
    do_eval=True,
    eval_strategy='steps',
    eval_on_start=True,
    num_train_epochs=1
)

trainer = transformers.Trainer(
    model=peft_model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=val_dataset,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
    args=training_args,
)

# use_cache=True is incompatible with gradient checkpointing.
peft_model.config.use_cache = False

max_steps is given, it will override any value given in num_train_epochs


In [12]:
trainer.train()

  0%|          | 0/50 [00:00<?, ?it/s]

2025/03/04 15:59:27 ERROR mlflow.utils.async_logging.async_logging_queue: Run Id c2e1a0bc3d7a45c18644bbb1876def6f: Failed to log run data: Exception: Changing param values is not allowed. Param with key='logging_dir' was already logged with value='models\runs\Mar04_15-58-29_MSI' for run ID='c2e1a0bc3d7a45c18644bbb1876def6f'. Attempted logging new value 'models\runs\Mar04_15-59-25_MSI'.
2025/03/04 15:59:27 ERROR mlflow.utils.async_logging.async_logging_queue: Run Id c2e1a0bc3d7a45c18644bbb1876def6f: Failed to log run data: Exception: Changing param values is not allowed. Param with key='max_steps' was already logged with value='500' for run ID='c2e1a0bc3d7a45c18644bbb1876def6f'. Attempted logging new value '50'.


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 1.7959359884262085, 'eval_runtime': 5.5054, 'eval_samples_per_second': 9.082, 'eval_steps_per_second': 1.271, 'epoch': 0}
{'loss': 1.3191, 'grad_norm': 0.7870001196861267, 'learning_rate': 2e-05, 'epoch': 1.6}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 1.7183959484100342, 'eval_runtime': 4.8791, 'eval_samples_per_second': 10.248, 'eval_steps_per_second': 1.435, 'epoch': 1.6}
{'loss': 1.1246, 'grad_norm': 0.8748283386230469, 'learning_rate': 2e-05, 'epoch': 3.2}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 1.634220838546753, 'eval_runtime': 4.7832, 'eval_samples_per_second': 10.453, 'eval_steps_per_second': 1.463, 'epoch': 3.2}
{'loss': 0.763, 'grad_norm': 0.9286556243896484, 'learning_rate': 2e-05, 'epoch': 4.8}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 1.5648105144500732, 'eval_runtime': 4.9451, 'eval_samples_per_second': 10.111, 'eval_steps_per_second': 1.416, 'epoch': 4.8}
{'loss': 0.5348, 'grad_norm': 1.581502914428711, 'learning_rate': 2e-05, 'epoch': 6.4}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 1.5104739665985107, 'eval_runtime': 4.8274, 'eval_samples_per_second': 10.357, 'eval_steps_per_second': 1.45, 'epoch': 6.4}
{'loss': 0.3279, 'grad_norm': 7.132233619689941, 'learning_rate': 2e-05, 'epoch': 8.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 1.4905585050582886, 'eval_runtime': 4.9006, 'eval_samples_per_second': 10.203, 'eval_steps_per_second': 1.428, 'epoch': 8.0}
{'train_runtime': 267.6622, 'train_samples_per_second': 1.494, 'train_steps_per_second': 0.187, 'train_loss': 0.8139023447036743, 'epoch': 8.0}


TrainOutput(global_step=50, training_loss=0.8139023447036743, metrics={'train_runtime': 267.6622, 'train_samples_per_second': 1.494, 'train_steps_per_second': 0.187, 'total_flos': 1012855209984000.0, 'train_loss': 0.8139023447036743, 'epoch': 8.0})

In [13]:
trainer.model.save_pretrained('models/SmolLM2-1.7B-custom')



In [10]:
base_model = AutoModelForCausalLM.from_pretrained(base_model_id,tie_word_embeddings=False)
# Load the PEFT configuration
peft_model_dir = "models/SmolLM2-1.7B-custom"  # Directory containing PEFT adapter files
peft_model_merged_dir = "models/SmolLM2-1.7B-custom-merged"
peft_config = PeftConfig.from_pretrained(peft_model_dir)
# Initialize the PEFT model
peft_model = PeftModel(base_model, peft_config)
# Merge the adapters into the base model
merged_model = peft_model.merge_and_unload()
# Save the merged model
merged_model.save_pretrained(peft_model_merged_dir)


Some weights of LlamaForCausalLM were not initialized from the model checkpoint at HuggingFaceTB/SmolLM2-1.7B and are newly initialized: ['lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  @custom_fwd
  @custom_bwd
  @custom_fwd(cast_inputs=torch.float16)


In [11]:
model = pipeline('text-generation',
                  peft_model_merged_dir,
                  tokenizer=tokenizer,
                  device='cuda',
                  max_length=15)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
model('hello')

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[{'generated_text': 'hello OF fighter�markdown Raise holog Idea resort flap controvers flippingorialinks�'}]