# تدريب نموذج T5 العربي على قاموس عكسي
### باستخدام بيانات: riotu-lab/arabic_reverse_dictionary

In [None]:
# Install transformers, datasets and accelerate into the notebook environment (-q reduces pip's output).
!pip install transformers datasets accelerate -q

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import matplotlib.pyplot as plt

# Load the dataset from the Hugging Face Hub; later cells read its "train" split.
dataset = load_dataset("riotu-lab/arabic_reverse_dictionary")
# Bare expression: the notebook displays the first training example as the cell output.
dataset["train"][0]

In [None]:
# Read each column in one shot instead of iterating the whole split row by
# row twice (every row access decodes a full example dict).
definitions = list(dataset['train']['definition'])
words = list(dataset['train']['word'])

print("عدد الأمثلة:", len(definitions))
print("أول مثال:", dataset['train'][0])

# Whitespace-separated token count per definition, used as a rough length
# measure for the histogram below.
def_lengths = [len(defn.split()) for defn in definitions]
plt.hist(def_lengths, bins=30)
plt.title("توزيع طول التعاريف")
plt.xlabel("عدد الكلمات")
plt.ylabel("عدد الأمثلة")
plt.show()

In [None]:
def preprocess(example):
    """Build the seq2seq text pair for a single dataset row.

    Args:
        example: Mapping that provides the 'word' and 'definition' entries.

    Returns:
        A dict with "input_text" (the word prefixed with "reverse: ") and
        "target_text" (the definition, passed through unchanged).

    NOTE(review): the model input is the word and the target is the
    definition; a reverse dictionary is usually definition -> word, so
    confirm this direction is the intended one.
    """
    prompt = "reverse: {}".format(example['word'])
    return dict(input_text=prompt, target_text=example['definition'])

# Apply preprocess to every example, adding the "input_text" / "target_text" columns it returns.
processed_dataset = dataset.map(preprocess)

In [None]:
from transformers import AutoTokenizer

# Tokenizer of the same checkpoint that is loaded as the model further down.
tokenizer = AutoTokenizer.from_pretrained("cahya/t5-small-arabic")

# Hold out 10% of the training split for evaluation. The seed is fixed so the
# train/eval partition (and therefore the eval loss) is reproducible between
# notebook runs instead of changing on every execution.
split_dataset = processed_dataset["train"].train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

def tokenize(batch):
    """Tokenize a batch of input/target texts for seq2seq training.

    Args:
        batch: Batched examples with 'input_text' and 'target_text' lists.

    Returns:
        The input encodings (padded/truncated to 64 tokens) with an added
        "labels" entry holding the target token ids, where padding positions
        are replaced by -100.
    """
    input_encodings = tokenizer(batch['input_text'], truncation=True, padding="max_length", max_length=64)
    target_encodings = tokenizer(batch['target_text'], truncation=True, padding="max_length", max_length=64)

    # Targets are padded to a fixed length; without masking, the loss would
    # also be computed on pad tokens. -100 is the index the cross-entropy
    # loss ignores.
    pad_id = tokenizer.pad_token_id
    input_encodings["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in target_encodings["input_ids"]
    ]
    return input_encodings

# Run the batched tokenizer over the training split first, then the evaluation split.
tokenized_train, tokenized_eval = (
    split.map(tokenize, batched=True)
    for split in (train_dataset, eval_dataset)
)

In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Pretrained Arabic T5 checkpoint to fine-tune (same name as the tokenizer).
model = AutoModelForSeq2SeqLM.from_pretrained("cahya/t5-small-arabic")

training_args = Seq2SeqTrainingArguments(
    output_dir="./t5_arabic_reverse",
    # `evaluation_strategy` was renamed to `eval_strategy` (transformers 4.41);
    # current releases reject the old keyword. On an older install, switch
    # this back to `evaluation_strategy`.
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    # Keep at most two checkpoints in output_dir.
    save_total_limit=2,
    # Use generate() during evaluation/prediction.
    predict_with_generate=True,
    logging_dir='./logs',
    logging_steps=100,
)

In [None]:
# Wire the model, training arguments and tokenized splits into the seq2seq trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    # NOTE(review): recent transformers releases appear to deprecate `tokenizer=`
    # in favour of `processing_class=` — confirm against the installed version.
    tokenizer=tokenizer,
)

# Start fine-tuning with the configuration above.
trainer.train()

In [None]:
def predict_definition(word):
    """Generate text for *word* with the fine-tuned model.

    The prompt uses the same "reverse: " prefix as the training inputs.

    Args:
        word: The word to feed to the model.

    Returns:
        The decoded generation (at most 64 tokens) with special tokens removed.
    """
    input_text = f"reverse: {word}"
    # After training the model may live on an accelerator; the inputs must be
    # on the same device or generate() fails with a device mismatch.
    encoded = tokenizer(input_text, return_tensors="pt").to(model.device)
    # Make sure dropout is disabled for inference.
    model.eval()
    output_ids = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=64,
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Quick smoke test: print the model output for a few sample words, in order.
for sample_word in ("قلم", "كتاب", "حاسوب"):
    print(predict_definition(sample_word))

In [None]:
# Save the model and tokenizer into the same directory so they can be reloaded together.
model.save_pretrained("reverse_dict_t5_arabic")
tokenizer.save_pretrained("reverse_dict_t5_arabic")