In [9]:
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training
)

In [None]:
model_id = "./models/Qwen3-8B-AWQ-MMLU" 
output_dir = "./models/Qwen3-8B-AWQ-MMLU-lora"


dataset = load_dataset("cais/mmlu", "all", split="auxiliary_train")
tokenizer = AutoTokenizer.from_pretrained(model_id)


def format_and_tokenize(example):
    options = ["A", "B", "C", "D"]
    
    prompt = f"{example['question']}\n\nChoices:\n"
    for i, choice in enumerate(example['choices']):
        prompt += f"{options[i]}. {choice}\n"
    prompt += "\nAnswer:"
    
    response = f" {options[example['answer']]}"
    
    messages = [
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": response}
    ]

    full_text = tokenizer.apply_chat_template(messages, tokenize=False)
    
    tokenized = tokenizer(
        full_text,
        truncation=True,
        max_length=512,
        padding=False,
        add_special_tokens=False
    )
    
    tokenized["labels"] = tokenized["input_ids"].copy()
    
    return tokenized


In [11]:
dataset = dataset.select(range(10000))
tokenized_dataset = dataset.map(format_and_tokenize, remove_columns=dataset.column_names)

Map: 100%|██████████| 10000/10000 [00:20<00:00, 484.48 examples/s]


In [12]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16,
    max_memory={0: "22GiB", 1: "18GiB"}, 
)

model = prepare_model_for_kbit_training(model)

peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.24it/s]


trainable params: 43,646,976 || all params: 1,288,614,912 || trainable%: 3.3871


In [13]:
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=6,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    learning_rate=1e-5,
    fp16=True,
    logging_steps=10,
    save_strategy="steps",
    save_steps=50,
    optim="adamw_torch",
    report_to="tensorboard",
    remove_unused_columns=False,
    
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
)


In [14]:
data_collator = DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8, return_tensors="pt")

In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

trainer.train()

Detected kernel version 5.4.210, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
The model is already on multiple devices. Skipping the move to device specified in `args`.
  return data.pin_memory(device)
  return data.pin_memory(device)


Step,Training Loss
10,2.9422
20,2.7026
30,2.5761
40,2.4057
50,2.3169
60,2.2404
70,2.1991
80,2.1353
90,2.1189
100,2.0628


  return data.pin_memory(device)
  return data.pin_memory(device)
  return data.pin_memory(device)
  return data.pin_memory(device)
  return data.pin_memory(device)
  return data.pin_memory(device)
  return data.pin_memory(device)
  return data.pin_memory(device)


TrainOutput(global_step=209, training_loss=2.194456483758808, metrics={'train_runtime': 5544.0136, 'train_samples_per_second': 1.804, 'train_steps_per_second': 0.038, 'total_flos': 2.0185942625550336e+16, 'train_loss': 2.194456483758808, 'epoch': 1.0})

In [16]:
trainer.model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('./models/Qwen3-8B-AWQ-MMLU-lora/tokenizer_config.json',
 './models/Qwen3-8B-AWQ-MMLU-lora/special_tokens_map.json',
 './models/Qwen3-8B-AWQ-MMLU-lora/chat_template.jinja',
 './models/Qwen3-8B-AWQ-MMLU-lora/vocab.json',
 './models/Qwen3-8B-AWQ-MMLU-lora/merges.txt',
 './models/Qwen3-8B-AWQ-MMLU-lora/added_tokens.json',
 './models/Qwen3-8B-AWQ-MMLU-lora/tokenizer.json')

In [4]:
tokenizer.push_to_hub("akon1te/qwen3-8b-awq-mmlu-lora")
model.push_to_hub("akon1te/qwen3-8b-awq-mmlu-lora")

Processing Files (1 / 1): 100%|██████████| 11.4MB / 11.4MB,  0.00B/s  
New Data Upload: |          |  0.00B /  0.00B,  0.00B/s  
Processing Files (1 / 1): 100%|██████████|  175MB /  175MB,  101MB/s  
New Data Upload: |          |  0.00B /  0.00B,  0.00B/s  


CommitInfo(commit_url='https://huggingface.co/akon1te/qwen3-8b-awq-mmlu-lora/commit/4c3d9beccf8fce2d9211e26fa245b60ee7950927', commit_message='Upload model', commit_description='', oid='4c3d9beccf8fce2d9211e26fa245b60ee7950927', pr_url=None, repo_url=RepoUrl('https://huggingface.co/akon1te/qwen3-8b-awq-mmlu-lora', endpoint='https://huggingface.co', repo_type='model', repo_id='akon1te/qwen3-8b-awq-mmlu-lora'), pr_revision=None, pr_num=None)

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


base_model_path = "akon1te/qwen3-8b-awq"  # Путь к базовой AWQ модели
adapter_path = "akon1te/qwen3-8b-awq-mmlu-lora"   # Путь к сохраненным адаптерам

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16
)
tokenizer = AutoTokenizer.from_pretrained(base_model_path, trust_remote_code=True)


model = PeftModel.from_pretrained(
    base_model,
    adapter_path,
    is_trainable=False # Мы только используем, не учим
)

  from .autonotebook import tqdm as notebook_tqdm
`torch_dtype` is deprecated! Use `dtype` instead!
I have left this message as the final dev message to help you transition.

Important Notice:
- AutoAWQ is officially deprecated and will no longer be maintained.
- The last tested configuration used Torch 2.6.0 and Transformers 4.51.3.
- If future versions of Transformers break AutoAWQ compatibility, please report the issue to the Transformers project.

Alternative:
- AutoAWQ has been adopted by the vLLM Project: https://github.com/vllm-project/llm-compressor

For further inquiries, feel free to reach out:
- X: https://x.com/casper_hansen_
- LinkedIn: https://www.linkedin.com/in/casper-hansen-804005170/

Loading checkpoint shards: 100%|██████████| 2/2 [01:24<00:00, 42.22s/it]


In [3]:
from mmlu_benchmark import MMLUEvaluator

evaluator = MMLUEvaluator(
    model=model, tokenizer=tokenizer, device="cuda",
    split="dev", per_subject_samples=10, seed=42, model_name="awq_mmlu"
)

_ = evaluator.evaluate()

  Загружена dev выборка
  Всего вопросов в dev выборке: 285
  Количество предметов: 57
Инициализация завершена. Эксперимент: awq_mmlu_dev_20251223_130308

Эксперимент: awq_mmlu_dev_20251223_130308
Модель: awq_mmlu
Всего вопросов в dev: 285
Количество предметов: 57
Промпт стиль: zero-shot


57it [01:28,  1.55s/it]

ОБЩАЯ ТОЧНОСТЬ: 0.7368 (73.68%)
Правильных ответов: 210 из 285
Оценено предметов: 57
Пиковое потребление VRAM: 6161.36 MB



