## select GPU

In [1]:
import torch
import os

os.environ["WANDB_NOTEBOOK_NAME"] = "training_student.ipynb"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

## Load the distilled dataset from disk

In [2]:
import pandas as pd

df = pd.read_csv('sample/merged_distilled_dataset.csv')

In [3]:
from datasets import Dataset

distilled_dataset = Dataset.from_pandas(df)
distilled_dataset

Dataset({
    features: ['prompt', 'response'],
    num_rows: 10000
})

## Load student model

In [4]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoTokenizer, AutoModelForCausalLM

STUDENT_MODEL = 'Qwen/Qwen2-0.5B-Instruct'

student_tokenizer = AutoTokenizer.from_pretrained(STUDENT_MODEL, trust_remote_code=True)

student_model = AutoModelForCausalLM.from_pretrained(STUDENT_MODEL, torch_dtype=torch.float16, device_map="auto")

student_model = prepare_model_for_kbit_training(student_model)
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=['q_proj', 'v_proj'],
    lora_dropout=0.1,
    bias='none',
    task_type='CAUSAL_LM'
)
student_model = get_peft_model(student_model, lora_config)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


### Tokenize Distilled Dataset

In [5]:
def tokenize_fn(example):
    prompt = example.get('prompt') or ''
    response = example.get('response') or ''
    full_text = prompt + ' ' + response
    return student_tokenizer(full_text, truncation=True, padding='max_length', max_length=512)

# tokenized_dataset = distilled_dataset.map(tokenize_fn)
tokenized_dataset = distilled_dataset.map(tokenize_fn)
tokenized_dataset = tokenized_dataset.map(lambda x: {"labels": x["input_ids"]})

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [6]:
#tokenized_dataset[69]

In [7]:
split_dataset = tokenized_dataset.train_test_split(test_size=0.3, seed=42)
val_test_split = split_dataset['test'].train_test_split(test_size=0.5, seed=42)
train_dataset = split_dataset['train']
valid_dataset = val_test_split['train']
test_dataset = val_test_split['test']

## Storing metrics & Memory profiling

In [8]:
import psutil

history = {'eval_loss': [], 'eval_ppl': [], 'gpu_mem': [], 'cpu_mem': []}

def compute_metrics(eval_pred):
    loss = eval_pred.metrics["eval_loss"]
    ppl = torch.exp(torch.tensor(loss)).item()
    history["eval_loss"].append(loss)
    history["eval_ppl"].append(ppl)

    gpu_mem = torch.cuda.memory_allocated() / (1024 ** 2)
    history["gpu_mem"].append(gpu_mem)
    history["cpu_mem"].append(psutil.virtual_memory().percent)

    return {"perplexity": ppl}


## Training Arguments & Trainer

In [9]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

training_args = TrainingArguments(
    output_dir="models/qwen2-lora-distilled",
    run_name='student-qwen2-0.5B-lora-distilled',
    per_device_train_batch_size=4,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    # num_train_epochs=5,
    num_train_epochs=1,
    learning_rate=2e-4,
    fp16=True,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    # save_strategy="steps",
    # eval_strategy="steps",
    # eval_steps=100,
    eval_strategy="no",
    save_safetensors=True,
    # load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)

# data_collator = DataCollatorForLanguageModeling(
#     tokenizer=student_tokenizer,
#     mlm=False
# )

trainer = Trainer(
    model=student_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    # data_collator=data_collator
)


[2025-04-22 13:16:48,432] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/opt/tljh/user/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/opt/tljh/user/compiler_compat/ld: cannot find -lcufile: No such file or directory
collect2: error: ld returned 1 exit status
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


## Start Training

In [10]:
train_result = trainer.train()
trainer.save_model("models/qwen2-lora-distilled-final")
student_tokenizer.save_pretrained("models/qwen2-lora-distilled-final")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mst124974[0m ([33mbinit-ait[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,9.4865
20,2.7773
30,0.4422
40,0.3994
50,0.3801
60,0.3921
70,0.3829
80,0.3611
90,0.3711
100,0.3648


('models/qwen2-lora-distilled-final/tokenizer_config.json',
 'models/qwen2-lora-distilled-final/special_tokens_map.json',
 'models/qwen2-lora-distilled-final/vocab.json',
 'models/qwen2-lora-distilled-final/merges.txt',
 'models/qwen2-lora-distilled-final/added_tokens.json',
 'models/qwen2-lora-distilled-final/tokenizer.json')

In [15]:
# from matplotlib import pyplot as plt

# test_results = trainer.evaluate(test_dataset)

# plt.figure(figsize=(8,6))
# plt.plot(history["eval_loss"], label="Validation Loss", linewidth=2, color='green')
# plt.plot(history["eval_ppl"], label="Validation Perplexity", linewidth=2, color='orange')
# plt.xlabel("Epoch")
# plt.ylabel("Value")
# plt.title("Validation Loss & Perplexity")
# plt.legend()
# plt.savefig("validation_metrics.png")

# plt.figure(figsize=(8,6))
# plt.plot(history["gpu_mem"], label="GPU Memory (MB)", linewidth=2, color='green')
# plt.plot(history["cpu_mem"], label="CPU Memory (%)", linewidth=2, color='orange')
# plt.xlabel("Epoch")
# plt.ylabel("Usage")
# plt.title("Memory Usage During Training")
# plt.legend()
# plt.savefig("memory_usage.png")

In [13]:

# training_args = TrainingArguments(
#     output_dir='models/qwen2-lora-distilled-checkpoints/',
#     per_device_train_batch_size=2,
#     per_device_eval_batch_size=2,
#     gradient_accumulation_steps=8,
#     num_train_epochs=10,
#     learning_rate=2e-4,
#     fp16=True,
#     logging_dir='./logs',
#     logging_steps=10,
#     save_total_limit=2,
#     save_strategy='epoch',
#     evaluation_strategy='epoch',
#     save_safetensors=True,
#     load_best_model_at_end=True,
#     metric_for_best_model='eval_loss',
#     greater_is_better=False
# )

# trainer = Trainer(
#     model=student_model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=valid_dataset,
#     tokenizer=student_tokenizer,
#     compute_metrics=compute_metrics,
#     callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
# )