In [1]:
!pip install transformers accelerate datasets torch torchvision peft pillow



In [32]:
from datasets import load_dataset
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
Trainer,
TrainingArguments
)
import torch
from peft import (
LoraConfig,
get_peft_model,
TaskType,
PeftConfig,
PeftModel
)
from huggingface_hub import notebook_login

In [3]:
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

In [4]:
ds = load_dataset("Hamid-reza/Adv-small-persian-QA")
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 1261
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130
    })
})

In [5]:
ds = ds.remove_columns(["id", "title"])
ds

DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'answers'],
        num_rows: 1261
    })
    validation: Dataset({
        features: ['context', 'question', 'answers'],
        num_rows: 130
    })
})

In [6]:
ds["train"][0]["answers"]["text"][0]

'کار را با گردآوری شهد گل\u200cها در کندو انجام می\u200cدهد'

In [7]:
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B-Instruct")

def preprocess(sample):
    sample = sample["question"] + "\n" + sample["answers"]["text"][0]
    tokenized = tokenizer(
        sample,
        max_length = 128,
        truncation = True,
        padding = "max_length"
    )

    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

In [8]:
data = ds.map(preprocess)
data

Map:   0%|          | 0/130 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'answers', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1261
    })
    validation: Dataset({
        features: ['context', 'question', 'answers', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 130
    })
})

In [9]:
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-3B-Instruct",
    device_map = "auto",
    offload_folder = "offload",
    torch_dtype = torch.float16
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
model.gradient_checkpointing_enable()

In [11]:
lora_config = LoraConfig(
    task_type = TaskType.CAUSAL_LM,
    target_modules = ["q_proj", "k_proj", "v_proj"]
)

In [12]:
model = get_peft_model(model, lora_config)

In [13]:
training_args = TrainingArguments(
    num_train_epochs = 27,
    learning_rate = 0.001,
    logging_steps = 100,
    #per_device_train_batch_size = 1,
    report_to = "tensorboard"
)

In [14]:
trainer = Trainer(
    model = model,
    train_dataset = data["train"],
    args = training_args
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [15]:
print("Start training...")
trainer.train()
print("Training finished...")

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Start training...


Step,Training Loss
100,1.2003
200,0.7666
300,0.736
400,0.6748
500,0.6615
600,0.605
700,0.5676
800,0.5413
900,0.478
1000,0.4673


Training finished...


In [16]:
trainer.save_model("/kaggle/working/")
tokenizer.save_pretrained("/kaggle/working/")

('/kaggle/working/tokenizer_config.json',
 '/kaggle/working/special_tokens_map.json',
 '/kaggle/working/chat_template.jinja',
 '/kaggle/working/vocab.json',
 '/kaggle/working/merges.txt',
 '/kaggle/working/added_tokens.json',
 '/kaggle/working/tokenizer.json')

In [17]:
path = "/kaggle/working/"

In [18]:
config = PeftConfig.from_pretrained(path)
base = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, trust_remote_code = True)
model = PeftModel.from_pretrained(base, path)
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code = True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [21]:
def generate_response(query):
    inputs = tokenizer(query, return_tensors = "pt").to(model.device)
    output = model.generate(
        input_ids = inputs["input_ids"],
        attention_mask = inputs["attention_mask"],
        max_new_tokens = 100
    )
    return tokenizer.decode(output[0], skip_special_tokens = True)

In [23]:
print(generate_response("زنبور های عسل چگونه عسل تولید می کنند؟"))

زنبور های عسل چگونه عسل تولید می کنند؟
کار را با گردآوری شهد گل‌ها در کندو انجام می‌دهد


In [24]:
print(generate_response("سریال قیام مختار رو کی کارگردانی کرد؟"))

سریال قیام مختار رو کی کارگردانی کرد؟
داوود میرباقری


In [25]:
print(generate_response("در انسان اولین دلیل ترشح عرق چیست؟"))

در انسان اولین دلیل ترشح عرق چیست؟
عملی برای تنظیم درجهٔ حرارت بدن


In [26]:
print(generate_response("مواد غذایی گندم کدامند؟"))

مواد غذایی گندم کدامند؟
گندم منبع مهمی از کربوهیدرات است. مصرف گندم کامل باعث دریافت مواد مغذی مختلف و فیبر غذایی می‌شود.


In [30]:
print(generate_response("کلمه ربات چه معنی دارد؟"))

کلمه ربات چه معنی دارد؟
در زبان چک به‌معنی برده و کارگر است


In [33]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [35]:
model.push_to_hub("alikhademi98/finetuned_Qwen2.5_on_Hamid_reza_Adv_small_persian_QA")
tokenizer.push_to_hub("alikhademi98/finetuned_Qwen2.5_on_Hamid_reza_Adv_small_persian_QA")

Uploading...:   0%|          | 0.00/10.1M [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

Uploading...:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/alikhademi98/finetuned_Qwen2.5_on_Hamid_reza_Adv_small_persian_QA/commit/81c6176c85ed36b1abe88fd810adb1668fb67d12', commit_message='Upload tokenizer', commit_description='', oid='81c6176c85ed36b1abe88fd810adb1668fb67d12', pr_url=None, repo_url=RepoUrl('https://huggingface.co/alikhademi98/finetuned_Qwen2.5_on_Hamid_reza_Adv_small_persian_QA', endpoint='https://huggingface.co', repo_type='model', repo_id='alikhademi98/finetuned_Qwen2.5_on_Hamid_reza_Adv_small_persian_QA'), pr_revision=None, pr_num=None)