In [13]:
! pip install transformers datasets  evaluate rouge_score


Collecting rouge_score
  Using cached rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py): started
  Building wheel for rouge_score (setup.py): finished with status 'done'
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=25025 sha256=dfe5d94334fa7ad4d4e7b4eb333490b87aa70b20c77460e139d6fb7d7a2236bf
  Stored in directory: c:\users\abdoa\appdata\local\pip\cache\wheels\1e\19\43\8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [20]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# تحميل النموذج المحفوظ
model_path = "AraT5_finetuned_model"
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)


In [21]:
from datasets import Dataset
import numpy as np
import evaluate

dataset = {"input_text": [], "target_text": []}

with open("unique_all.txt", "r") as f:
  lines = f.readlines()
  for i in range(0, len(lines), 2):  # Step by 2 to get input and target
    dataset["input_text"].append(lines[i].strip())  
    dataset["target_text"].append(lines[i+1].strip())

# تحويلها إلى Dataset خاص بـ Hugging Face
train_dataset = Dataset.from_dict(dataset)

# تحويل النصوص إلى Tokens
def preprocess_function(examples):
    inputs = tokenizer(examples["input_text"], padding="max_length", truncation=True, max_length=128)
    targets = tokenizer(examples["target_text"], padding="max_length", truncation=True, max_length=128)
    inputs["labels"] = targets["input_ids"]
    return inputs

train_dataset = train_dataset.map(preprocess_function, batched=True)


Map: 100%|██████████| 2633/2633 [00:00<00:00, 5790.41 examples/s]


In [22]:
import random
from arabert.preprocess import ArabertPreprocessor

arabert_prep = ArabertPreprocessor("aubmindlab/bert-base-arabertv02")

# Function to replace words with synonyms (example for Arabic)
def synonym_replacement(text):
    words = text.split()
    if len(words) < 2:
        return text
    index = random.randint(0, len(words) - 1)
    words[index] = arabert_prep.preprocess(words[index])  # Apply some transformation
    return " ".join(words)

# Apply augmentation
augmented_input = [synonym_replacement(txt) for txt in dataset["input_text"]]
augmented_target = [synonym_replacement(txt) for txt in dataset["target_text"]]

# Add original + augmented data
dataset["input_text"].extend(augmented_input)
dataset["target_text"].extend(augmented_target)

In [23]:
# Assume train_dataset is already loaded
test_size = 0.1  # Allocate 10% for testing
train_size = len(train_dataset) - int(test_size * len(train_dataset))

# Split into train and test
train_dataset, test_dataset = train_dataset.train_test_split(
    train_size=train_size, test_size=int(test_size * len(train_dataset)), seed=42
).values()

# Now split the remaining train dataset into training and validation
train_size = int(0.8 * len(train_dataset))
eval_size = len(train_dataset) - train_size

train_dataset, eval_dataset = train_dataset.train_test_split(
    train_size=train_size, test_size=eval_size, seed=42
).values()


In [24]:

# اختبار النموذج مع نص جديد
input_text =  "ايه اللى اتسرق"
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)

# توليد النص باستخدام النموذج المدرب
output_tokens = model.generate(**inputs)
output_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)

print( output_text)


الفلوس اللي كنت شايلها في الدولاب، واللابتوب بتاع ابني.


In [25]:
from transformers import pipeline

qa_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=1)  # استخدم GPU إن أمكن

predictions = []
references = []

for idx, example in enumerate(test_dataset):
    input_text = f"question: {example['input_text']} context: {example['target_text']}"
    output = qa_pipeline(input_text, max_length=100, clean_up_tokenization_spaces=True)[0]["generated_text"]
    
    predictions.append({
        "id": str(idx),
        "prediction_text": output
    })
    
    references.append({
        "id": str(idx),
        "answers": [
            {
                "text": example["target_text"],
                "answer_start": 0  # مؤقتًا لو مش عندك مكان الإجابة في الـ context
            }
        ]
    })

Device set to use cpu


In [26]:
from evaluate import load

metric = load("squad")
results = metric.compute(predictions=predictions, references=references)
print(results)


{'exact_match': 8.745247148288973, 'f1': 38.56830961769148}


In [27]:
print(test_dataset[0])

{'input_text': 'هل عندك أي كاميرات مراقبة؟', 'target_text': 'للأسف لا.', 'input_ids': [1661, 3341, 918, 47089, 19976, 109673, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [10990, 126, 109566, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [28]:
import torch
from evaluate import load

# تحميل مقياس ROUGE
rouge = load("rouge")

# قائمة للتخزين
predictions = []
references = []

# توليد التوقعات
for sample in test_dataset:
    with torch.no_grad():
        # استخدم المفتاح الصحيح: "input_text"
        inputs = tokenizer(sample["input_text"], return_tensors="pt", padding=True, truncation=True).to(model.device)
        outputs = model.generate(**inputs, max_new_tokens=100)

        # فك الترميز
        pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
        ref = sample["target_text"]

        predictions.append(pred)
        references.append(ref)

# حساب ROUGE
results = rouge.compute(predictions=predictions, references=references)

# عرض النتائج
print("ROUGE-1:", results["rouge1"])
print("ROUGE-2:", results["rouge2"])
print("ROUGE-L:", results["rougeL"])


ROUGE-1: 0.09378960709759188
ROUGE-2: 0.06463878326996197
ROUGE-L: 0.09378960709759188


ROUGE-1 (~11.7%) بتقيس تطابق الكلمات الفردية → مش سيئة كبداية.

ROUGE-2 (~8.3%) بتقيس تطابق الأزواج (bi-grams) → ده أصعب، فالنسبة أقل.

ROUGE-L (~11.6%) بتقيس أطول تسلسل مشترك → مؤشر كويس على المحافظة على الترتيب والمعنى العام.