In [1]:
from unsloth import FastLanguageModel
import torch
from datasets import load_dataset
import json
import numpy as np
from collections import Counter
import string
import re

# Model setup
max_seq_length = 2048
dtype = None
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-1B-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

model = FastLanguageModel.get_peft_model(
    model,
    r=32,  # Increased from 16 to 32
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=32,  # Increased to match r
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

# Load and prepare the Persian QA dataset
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template="llama-3.1",
)

# Load the dataset
dataset = load_dataset("SajjadAyoubi/persian_qa")

# Filter out samples with empty answers BEFORE any processing
def filter_valid_answers(example):
    """Filter out samples with empty or invalid answers"""
    answers = example['answers']
    if isinstance(answers, dict) and 'text' in answers:
        if isinstance(answers['text'], list):
            return len(answers['text']) > 0 and answers['text'][0].strip() != ""
        elif isinstance(answers['text'], str):
            return answers['text'].strip() != ""
    return False

print("Original dataset sizes:")
print(f"Train: {len(dataset['train'])}")
if 'validation' in dataset:
    print(f"Validation: {len(dataset['validation'])}")

# Filter the datasets
dataset['train'] = dataset['train'].filter(filter_valid_answers)
if 'validation' in dataset:
    dataset['validation'] = dataset['validation'].filter(filter_valid_answers)

print("\nAfter filtering empty answers:")
print(f"Train: {len(dataset['train'])}")
if 'validation' in dataset:
    print(f"Validation: {len(dataset['validation'])}")

# Let's first examine the dataset structure and test our conversion function
print("Dataset info:", dataset)

# Test the conversion function on a small subset first
test_subset = dataset['train'].select(range(min(10, len(dataset['train']))))
print("\nTesting conversion on first 10 samples...")

# Test conversion
try:
    test_converted = test_subset.map(convert_qa_to_conversation, batched=True)
    print(f"Conversion successful! Converted {len(test_converted)} samples")
    if len(test_converted) > 0 and len(test_converted[0]['conversations']) > 0:
        print("Sample conversation:", test_converted[0]['conversations'][0])
    else:
        print("No valid conversations found")
except Exception as e:
    print(f"Conversion failed: {e}")


def convert_qa_to_conversation(examples):
    """Convert Q&A format to conversation format"""
    conversations = []

    for i in range(len(examples['question'])):
        context = examples['context'][i]
        question = examples['question'][i]
        answers = examples['answers'][i]

        # Extract the answer text with proper error handling
        answer_text = ""
        if isinstance(answers, dict) and 'text' in answers:
            if isinstance(answers['text'], list) and len(answers['text']) > 0:
                answer_text = answers['text'][0]
            elif isinstance(answers['text'], str):
                answer_text = answers['text']
            else:
                print(f"Warning: Empty or invalid answer at index {i}: {answers}")
                continue  # Skip this sample if no valid answer
        else:
            print(f"Warning: Invalid answer format at index {i}: {answers}")
            continue  # Skip this sample if invalid format

        # Only create conversation if we have a valid answer
        if answer_text.strip():
            conversation = [
                {
                    "role": "user",
                    "content": f"بر اساس متن زیر به سوال پاسخ دهید:\n\nمتن: {context}\n\nسوال: {question}"
                },
                {
                    "role": "assistant",
                    "content": answer_text.strip()
                }
            ]
            conversations.append(conversation)

    return {"conversations": conversations}

def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) for convo in convos]
    return {"text": texts}

# Convert dataset (only proceed if test was successful)
print("\nConverting full dataset...")
train_dataset = dataset['train'].map(convert_qa_to_conversation, batched=True, remove_columns=dataset['train'].column_names)

print(f"Training samples after conversion: {len(train_dataset)}")
train_dataset = train_dataset.map(formatting_prompts_func, batched=True)

# Also prepare validation set for evaluation
if 'validation' in dataset:
    val_dataset = dataset['validation'].map(convert_qa_to_conversation, batched=True, remove_columns=dataset['validation'].column_names)
    val_dataset = val_dataset.map(formatting_prompts_func, batched=True)
    print(f"Validation samples after conversion: {len(val_dataset)}")
else:
    # Split train set if no validation set exists
    split_dataset = train_dataset.train_test_split(test_size=0.1, seed=42)
    train_dataset = split_dataset['train']
    val_dataset = split_dataset['test']
    print(f"Created validation split: {len(val_dataset)} samples")

print("Sample conversation:")
print(train_dataset[0]["conversations"])
print("\nFormatted text sample:")
print(train_dataset[0]["text"][:500] + "...")

# Training setup
from trl import SFTConfig, SFTTrainer
from transformers import DataCollatorForSeq2Seq

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    packing=False,
    args=SFTConfig(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=100,
        num_train_epochs=4,  # Increased from 1 to 3
        learning_rate=3e-4, # 1e-4
        logging_steps=10,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none",
        save_strategy="epoch",
    ),
)

# Train on responses only
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
    response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
)

# Memory stats before training
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

# Train the model
trainer_stats = trainer.train()

# Memory stats after training
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

# Evaluation functions
def normalize_answer_persian(s):
    # حذف علائم نگارشی فارسی و انگلیسی
    persian_punctuation = '،؛؟«»'
    all_punctuation = string.punctuation + persian_punctuation
    s = ''.join(ch for ch in s if ch not in all_punctuation)

    # یکسان‌سازی کاراکترهای عربی و فارسی
    s = re.sub(r'[يى]', 'ی', s)
    s = re.sub(r'[ك]', 'ک', s)

    # حذف فاصله‌های اضافی
    s = ' '.join(s.split())

    return s

# توابع ارزیابی را برای استفاده از تابع جدید به‌روز کنید
def f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer_persian(prediction).split()
    ground_truth_tokens = normalize_answer_persian(ground_truth).split()

    # ... بقیه کد f1_score بدون تغییر باقی می‌ماند ...
    if len(prediction_tokens) == 0 or len(ground_truth_tokens) == 0:
        return int(prediction_tokens == ground_truth_tokens)

    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())

    if num_same == 0:
        return 0

    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)

    return f1


def exact_match_score(prediction, ground_truth):
    return int(normalize_answer_persian(prediction) == normalize_answer_persian(ground_truth))

def evaluate_model(model, tokenizer, eval_dataset, num_samples=100):
    """Evaluate the model on the dataset."""
    FastLanguageModel.for_inference(model)

    f1_scores = []
    em_scores = []

    # Take a subset for evaluation (to speed up)
    eval_subset = eval_dataset.select(range(min(num_samples, len(eval_dataset))))

    # Get original validation data with proper error handling
    if 'validation' in dataset:
        original_data = dataset['validation'].select(range(min(num_samples, len(dataset['validation']))))
    else:
        # If no validation set, we need to reconstruct from our processed data
        print("Warning: Using processed data for evaluation - results may be less accurate")
        original_data = eval_subset

    for i, example in enumerate(eval_subset):
        if i % 10 == 0:
            print(f"Evaluating example {i+1}/{len(eval_subset)}")

        try:
            # Get the original data if available
            if 'validation' in dataset and i < len(original_data):
                orig = original_data[i]
                context = orig['context']
                question = orig['question']

                # Safe answer extraction
                answers = orig['answers']
                if isinstance(answers, dict) and 'text' in answers:
                    if isinstance(answers['text'], list) and len(answers['text']) > 0:
                        ground_truth = answers['text'][0]
                    elif isinstance(answers['text'], str):
                        ground_truth = answers['text']
                    else:
                        print(f"Skipping sample {i}: empty answer")
                        continue
                else:
                    print(f"Skipping sample {i}: invalid answer format")
                    continue
            else:
                # Extract from conversation format
                conversation = example['conversations'][0]
                user_content = conversation[0]['content']
                ground_truth = conversation[1]['content']

                # Parse context and question from user content
                parts = user_content.split('\n\n')
                context = parts[1].replace('متن: ', '') if len(parts) > 1 else ""
                question = parts[2].replace('سوال: ', '') if len(parts) > 2 else ""

            # Create the prompt
            messages = [
                {
                    "role": "user",
                    # "content": f"بر اساس متن زیر به سوال پاسخ دهید:\n\nمتن: {context}\n\nسوال: {question}"
                    "content": f"با توجه به متن زیر، فقط پاسخ کوتاه و دقیق را برای سوال استخراج کن:\n\nمتن: {context}\n\nسوال: {question}"
                }
            ]

            inputs = tokenizer.apply_chat_template(
                messages,
                tokenize=True,
                add_generation_prompt=True,
                return_tensors="pt",
            ).to("cuda")

            # Create attention mask to avoid the warning
            attention_mask = torch.ones_like(inputs)

            # Generate answer with better parameters
            with torch.no_grad():
                outputs = model.generate(
                    input_ids=inputs,
                    attention_mask=attention_mask,
                    max_new_tokens=50,  # Reduced to avoid repetitive output
                    use_cache=True,
                    temperature=0.3,  # Slightly higher but still conservative
                    do_sample=True,
                    top_p=0.9,
                    repetition_penalty=1.1,  # Prevent repetition
                    pad_token_id=tokenizer.eos_token_id,
                    eos_token_id=tokenizer.eos_token_id
                )

            # Extract only the new tokens (response)
            response_tokens = outputs[0][len(inputs[0]):]
            generated_text = tokenizer.decode(response_tokens, skip_special_tokens=True)

            # Clean up the prediction
            prediction = generated_text.strip()

            # Remove any remaining special tokens or artifacts
            prediction = re.sub(r'<\|.*?\|>', '', prediction)
            prediction = re.sub(r'\s+', ' ', prediction).strip()

            # Take only the first sentence if there are multiple
            if '.' in prediction:
                prediction = prediction.split('.')[0].strip()

            # Calculate metrics
            f1 = f1_score(prediction, ground_truth)
            em = exact_match_score(prediction, ground_truth)

            f1_scores.append(f1)
            em_scores.append(em)

            if i < 3:  # Show first few examples
                print(f"\nExample {i+1}:")
                print(f"Question: {question}")
                print(f"Ground Truth: {ground_truth}")
                print(f"Prediction: {prediction}")
                print(f"F1: {f1:.3f}, EM: {em}")

        except Exception as e:
            print(f"Error evaluating sample {i}: {e}")
            continue

    if len(f1_scores) == 0:
        print("No samples were successfully evaluated!")
        return 0.0, 0.0

    avg_f1 = np.mean(f1_scores)
    avg_em = np.mean(em_scores)

    print(f"\n=== EVALUATION RESULTS ===")
    print(f"Average F1 Score: {avg_f1:.4f}")
    print(f"Average Exact Match: {avg_em:.4f}")
    print(f"Samples evaluated: {len(f1_scores)}")

    return avg_f1, avg_em

# Run evaluation
print("Starting evaluation...")
f1, em = evaluate_model(model, tokenizer, val_dataset, num_samples=100)

# Save the model
model.save_pretrained("persian_qa_model")
tokenizer.save_pretrained("persian_qa_model")

print("\nTraining and evaluation complete!")
print(f"Final Results:")
print(f"F1 Score: {f1:.4f}")
print(f"Exact Match: {em:.4f}")

# Example inference
print("\n=== INFERENCE EXAMPLE ===")
FastLanguageModel.for_inference(model)

sample_context = "شرکت فولاد مبارکۀ اصفهان، بزرگ‌ترین واحد صنعتی خصوصی در ایران و بزرگ‌ترین مجتمع تولید فولاد در کشور ایران است، که در شرق شهر مبارکه قرار دارد."
sample_question = "شرکت فولاد مبارکه در کجا واقع شده است؟"

messages = [
    {
        "role": "user",
        "content": f"بر اساس متن زیر به سوال پاسخ دهید:\n\nمتن: {sample_context}\n\nسوال: {sample_question}"
    }
]

inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to("cuda")

# Create attention mask
attention_mask = torch.ones_like(inputs)

outputs = model.generate(
    input_ids=inputs,
    attention_mask=attention_mask,
    max_new_tokens=32,
    use_cache=True,
    temperature=0.3,
    do_sample=True,
    top_p=0.9,
    repetition_penalty=1.1,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id
)

# Extract only the response part
response_tokens = outputs[0][len(inputs[0]):]
response = tokenizer.decode(response_tokens, skip_special_tokens=True)
response = re.sub(r'<\|.*?\|>', '', response).strip()

print(f"Context: {sample_context}")
print(f"Question: {sample_question}")
print(f"Model Answer: {response}")
print(f"Expected: در شرق شهر مبارکه")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.8.10: Fast Llama patching. Transformers: 4.55.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.8.10 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


Original dataset sizes:
Train: 9008
Validation: 930


Filter:   0%|          | 0/9008 [00:00<?, ? examples/s]

Filter:   0%|          | 0/930 [00:00<?, ? examples/s]


After filtering empty answers:
Train: 6306
Validation: 651
Dataset info: DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 6306
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 651
    })
})

Testing conversion on first 10 samples...
Conversion failed: name 'convert_qa_to_conversation' is not defined

Converting full dataset...


Map:   0%|          | 0/6306 [00:00<?, ? examples/s]

Training samples after conversion: 6306


Map:   0%|          | 0/6306 [00:00<?, ? examples/s]

Map:   0%|          | 0/651 [00:00<?, ? examples/s]

Map:   0%|          | 0/651 [00:00<?, ? examples/s]

Validation samples after conversion: 651
Sample conversation:
[{'content': 'بر اساس متن زیر به سوال پاسخ دهید:\n\nمتن: شرکت فولاد مبارکۀ اصفهان، بزرگ\u200cترین واحد صنعتی خصوصی در ایران و بزرگ\u200cترین مجتمع تولید فولاد در کشور ایران است، که در شرق شهر مبارکه قرار دارد. فولاد مبارکه هم\u200cاکنون محرک بسیاری از صنایع بالادستی و پایین\u200cدستی است. فولاد مبارکه در ۱۱ دوره جایزۀ ملی تعالی سازمانی و ۶ دوره جایزۀ شرکت دانشی در کشور رتبۀ نخست را بدست آورده\u200cاست و همچنین این شرکت در سال ۱۳۹۱ برای نخستین\u200cبار به عنوان تنها شرکت ایرانی با کسب امتیاز ۶۵۴ تندیس زرین جایزۀ ملی تعالی سازمانی را از آن خود کند. شرکت فولاد مبارکۀ اصفهان در ۲۳ دی ماه ۱۳۷۱ احداث شد و اکنون بزرگ\u200cترین واحدهای صنعتی و بزرگترین مجتمع تولید فولاد در ایران است. این شرکت در زمینی به مساحت ۳۵ کیلومتر مربع در نزدیکی شهر مبارکه و در ۷۵ کیلومتری جنوب غربی شهر اصفهان واقع شده\u200cاست. مصرف آب این کارخانه در کمترین میزان خود، ۱٫۵٪ از دبی زاینده\u200cرود برابر سالانه ۲۳ میلیون متر مکعب در سال است و خود یکی از عوامل ک

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/6306 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/6306 [00:00<?, ? examples/s]

GPU = Tesla T4. Max memory = 14.741 GB.
1.203 GB of memory reserved.


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 6,306 | Num Epochs = 4 | Total steps = 3,156
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 22,544,384 of 1,258,358,784 (1.79% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,2.0758
20,1.6
30,0.9924
40,0.944
50,1.016
60,0.8352
70,0.7579
80,0.7808
90,0.9674
100,0.8123


Step,Training Loss
10,2.0758
20,1.6
30,0.9924
40,0.944
50,1.016
60,0.8352
70,0.7579
80,0.7808
90,0.9674
100,0.8123


5604.3113 seconds used for training.
93.41 minutes used for training.
Peak reserved memory = 2.541 GB.
Peak reserved memory for training = 1.338 GB.
Peak reserved memory % of max memory = 17.238 %.
Peak reserved memory for training % of max memory = 9.077 %.
Starting evaluation...
Evaluating example 1/100

Example 1:
Question: پایتخت اسپانیا کجاست؟
Ground Truth: مادرید
Prediction: مادریدİTESİ CLIIIKаракт
F1: 0.000, EM: 0

Example 2:
Question: بر چه اساسی رئال موفق ترین تیم در تاریخ فوتبال اروپا است؟
Ground Truth: فیفا
Prediction: بر اساس رکوردİTESİилася
F1: 0.000, EM: 0

Example 3:
Question: رئال مادرید چند بار در لیگ قهرمانان اروپا به عنوان قهرمانی رسیده؟
Ground Truth: ۱۳
Prediction: ۱۳ قهرمانی?> lásilыџNеристи
F1: 0.500, EM: 0
Evaluating example 11/100
Evaluating example 21/100
Evaluating example 31/100
Evaluating example 41/100
Evaluating example 51/100
Evaluating example 61/100
Evaluating example 71/100
Evaluating example 81/100
Evaluating example 91/100

=== EVALUATION RESULTS ===

### Save LoRA

In [3]:
model.save_pretrained("lora_model")  # Local saving
tokenizer.save_pretrained("lora_model")

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/chat_template.jinja',
 'lora_model/tokenizer.json')

### Save GGUF for llama.cpp

In [4]:
model.save_pretrained_gguf("Llama-3.2-1B-bnb-4bit-persian-qa", tokenizer,)

Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 1.0G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 6.69 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 16/16 [00:00<00:00, 23.37it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving Llama-3.2-1B-bnb-4bit-persian-qa/pytorch_model.bin...
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q8_0'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: CMAKE detected. Finalizing some steps for installation.
Unsloth: [1] Converting model at Llama-3.2-1B-bnb-4bit-persian-qa into q8_0 GGUF format.
The output location will be /content/Llama-3.2-1B-bnb-4bit-persian-qa/unsloth.Q8_0.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: Llama-3.2-1B-bnb-4bit-persian-qa
INFO:hf-to-gguf:Model architecture: LlamaForCausalLM
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {32}
INFO:hf-to-gguf:gguf: l

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [24]:
!du -hs /content/Llama-3.2-1B-bnb-4bit-persian-qa/unsloth.Q8_0.gguf

1.3G	/content/Llama-3.2-1B-bnb-4bit-persian-qa/unsloth.Q8_0.gguf


In [20]:
!cp /content/Llama-3.2-1B-bnb-4bit-persian-qa/unsloth.Q8_0.gguf /content/drive/MyDrive/llama3.2-persianqa

## Improve generation

In [18]:
import torch
import numpy as np
from collections import Counter
import string
import re

def clean_prediction(text):
    """Clean model prediction from artifacts and extra text."""
    # Remove special tokens
    text = re.sub(r'<\|[^|]*\|>', '', text)
    text = re.sub(r'user[a-zA-Z]*', '', text)
    text = re.sub(r'assistant[a-zA-Z]*', '', text)
    text = re.sub(r'<[^>]*>', '', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # If prediction is too long, take first meaningful part
    sentences = text.split('.')
    if len(sentences) > 1:
        # Take first sentence if it's not empty
        first_sentence = sentences[0].strip()
        if first_sentence:
            text = first_sentence

    # If still too long, take first 50 characters
    if len(text) > 50:
        text = text[:50].strip()

    return text

def generate_answer(model, tokenizer, context, question, temperature=0.1, max_new_tokens=30):
    """Generate answer with improved parameters."""
    messages = [
        {
            "role": "user",
            "content": f"بر اساس متن زیر به سوال پاسخ دهید:\n\nمتن: {context}\n\nسوال: {question}"
        }
    ]

    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda")

    attention_mask = torch.ones_like(inputs)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            use_cache=True,
            temperature=temperature,
            do_sample=True if temperature > 0 else False,
            top_p=0.8,
            top_k=50,
            repetition_penalty=1.2,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            early_stopping=True
        )

    # Extract only the response part
    response_tokens = outputs[0][len(inputs[0]):]
    response = tokenizer.decode(response_tokens, skip_special_tokens=True)

    return clean_prediction(response)

def evaluate_with_different_params(model, tokenizer, eval_dataset, num_samples=50):
    """Evaluate with different generation parameters."""
    FastLanguageModel.for_inference(model)

    # Different parameter combinations to test
    param_configs = [
        {"temperature": 1.0, "max_new_tokens": 20, "name": "Normal"},
        {"temperature": 0.1, "max_new_tokens": 20, "name": "Conservative"},
        {"temperature": 0.01, "max_new_tokens": 30, "name": "Very Conservative"},
        {"temperature": 0.3, "max_new_tokens": 25, "name": "Moderate"},
        {"temperature": 0.0, "max_new_tokens": 25, "name": "Greedy"}  # Greedy decoding
    ]

    results = {}

    # Prepare data
    eval_subset = eval_dataset.select(range(min(num_samples, len(eval_dataset))))

    if 'validation' in dataset:
        original_data = dataset['validation'].select(range(min(num_samples, len(dataset['validation']))))
    else:
        original_data = eval_subset

    print(f"Testing {len(param_configs)} different generation configurations on {num_samples} samples...")

    for config in param_configs:
        print(f"\n🧪 Testing {config['name']} configuration...")
        print(f"   Temperature: {config['temperature']}, Max tokens: {config['max_new_tokens']}")

        f1_scores = []
        em_scores = []

        for i in range(min(num_samples, len(eval_subset))):
            if i % 20 == 0:
                print(f"   Progress: {i+1}/{num_samples}")

            try:
                if 'validation' in dataset and i < len(original_data):
                    orig = original_data[i]
                    context = orig['context']
                    question = orig['question']

                    answers = orig['answers']
                    if isinstance(answers['text'], list) and len(answers['text']) > 0:
                        ground_truth = answers['text'][0]
                    else:
                        continue
                else:
                    conversation = eval_subset[i]['conversations'][0]
                    user_content = conversation[0]['content']
                    ground_truth = conversation[1]['content']

                    parts = user_content.split('\n\n')
                    context = parts[1].replace('متن: ', '') if len(parts) > 1 else ""
                    question = parts[2].replace('سوال: ', '') if len(parts) > 2 else ""

                # Generate answer
                prediction = generate_answer(
                    model, tokenizer, context, question,
                    temperature=config['temperature'],
                    max_new_tokens=config['max_new_tokens']
                )

                # Calculate metrics
                f1 = f1_score(prediction, ground_truth)
                em = exact_match_score(prediction, ground_truth)

                f1_scores.append(f1)
                em_scores.append(em)

                # Show first few examples for this config
                if i < 2:
                    print(f"   Example {i+1}:")
                    print(f"     Q: {question[:50]}...")
                    print(f"     GT: {ground_truth}")
                    print(f"     Pred: {prediction}")
                    print(f"     F1: {f1:.3f}, EM: {em}")

            except Exception as e:
                print(f"   Error in sample {i}: {e}")
                continue

        if len(f1_scores) > 0:
            avg_f1 = np.mean(f1_scores)
            avg_em = np.mean(em_scores)
            std_f1 = np.std(f1_scores)
            std_em = np.std(em_scores)

            results[config['name']] = {
                'f1_mean': avg_f1,
                'f1_std': std_f1,
                'em_mean': avg_em,
                'em_std': std_em,
                'samples': len(f1_scores)
            }

            print(f"   Results: F1={avg_f1:.4f}±{std_f1:.4f}, EM={avg_em:.4f}±{std_em:.4f}")
        else:
            print(f"   No valid samples for {config['name']}")

    return results

# Run comprehensive evaluation
print("🚀 Starting comprehensive evaluation...")

# Step 1: Test different generation parameters
print("\n" + "="*60)
print("STEP 1: Testing Different Generation Parameters")
print("="*60)

param_results = evaluate_with_different_params(model, tokenizer, val_dataset, num_samples=30)

print("\n📊 Parameter Comparison Summary:")
for config_name, results in param_results.items():
    print(f"{config_name:15}: F1={results['f1_mean']:.4f}±{results['f1_std']:.4f}, EM={results['em_mean']:.4f}±{results['em_std']:.4f}")

# Find best configuration
if param_results:
    best_config = max(param_results.items(), key=lambda x: x[1]['f1_mean'])
    print(f"\n🏆 Best configuration: {best_config[0]} (F1: {best_config[1]['f1_mean']:.4f})")

# Step 2: Final inference examples
print("\n" + "="*60)
print("STEP 2: Final Inference Examples")
print("="*60)

FastLanguageModel.for_inference(model)

test_examples = [
    {
        "context": "شرکت فولاد مبارکۀ اصفهان، بزرگ‌ترین واحد صنعتی خصوصی در ایران و بزرگ‌ترین مجتمع تولید فولاد در کشور ایران است، که در شرق شهر مبارکه قرار دارد.",
        "question": "شرکت فولاد مبارکه در کجا واقع شده است؟",
        "expected": "در شرق شهر مبارکه"
    },
    {
        "context": "تهران پایتخت ایران و مرکز استان تهران است. این شهر در شمال ایران واقع شده است.",
        "question": "پایتخت ایران کدام شهر است؟",
        "expected": "تهران"
    },
    {
        "context": "رئال مادرید یکی از موفق‌ترین تیم‌های فوتبال جهان است که ۱۴ بار قهرمان لیگ قهرمانان اروپا شده است.",
        "question": "رئال مادرید چند بار قهرمان لیگ قهرمانان شده؟",
        "expected": "۱۴ بار"
    }
]

print("\n🎯 Testing Final Model Performance:")
for i, example in enumerate(test_examples, 1):
    prediction = generate_answer(
        model, tokenizer,
        example["context"],
        example["question"],
        temperature=0.01,
        max_new_tokens=25
    )

    f1 = f1_score(prediction, example["expected"])
    em = exact_match_score(prediction, example["expected"])

    print(f"\nExample {i}:")
    print(f"Question: {example['question']}")
    print(f"Expected: {example['expected']}")
    print(f"Generated: {prediction}")
    print(f"F1: {f1:.3f}, EM: {em}")
    print("-" * 50)

print("\n✅ Comprehensive evaluation completed!")

🚀 Starting comprehensive evaluation...

STEP 1: Testing Different Generation Parameters
Testing 5 different generation configurations on 30 samples...

🧪 Testing Normal configuration...
   Temperature: 1.0, Max tokens: 20
   Progress: 1/30
   Example 1:
     Q: پایتخت اسپانیا کجاست؟...
     GT: مادرید
     Pred: مامارید
     F1: 0.000, EM: 0
   Example 2:
     Q: بر چه اساسی رئال موفق ترین تیم در تاریخ فوتبال ارو...
     GT: فیفا
     Pred: موفق‌ترین تیم تاریخ فوتبال Європای
     F1: 0.000, EM: 0
   Progress: 21/30
   Results: F1=0.5515±0.3762, EM=0.2667±0.4422

🧪 Testing Conservative configuration...
   Temperature: 0.1, Max tokens: 20
   Progress: 1/30
   Example 1:
     Q: پایتخت اسپانیا کجاست؟...
     GT: مادرید
     Pred: مامارید
     F1: 0.000, EM: 0
   Example 2:
     Q: بر چه اساسی رئال موفق ترین تیم در تاریخ فوتبال ارو...
     GT: فیفا
     Pred: به صورت رسمی، به صورت رسمی، به صورت رسمی، به صورت
     F1: 0.000, EM: 0
   Progress: 21/30
   Results: F1=0.6287±0.3906, EM=0.3333±0