In [1]:
!pip -q install transformers datasets accelerate evaluate scikit-learn

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m81.9/84.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import torch
print("CUDA:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))

CUDA: False


In [3]:
label2id = {"hamza": 0, "ta_ha": 1, "none": 2}
id2label = {v: k for k, v in label2id.items()}
NUM_LABELS = 3

In [4]:
import pandas as pd
from collections import Counter

df = pd.read_csv("sawab_dataset.csv")

print(df.head())
print("Shape:", df.shape)
print("Labels:", Counter(df["label"]))

                              text  label
0       سال الطالب عن موعد التسجيل  hamza
1       تم الغاء الاختبار بسبب عطل  hamza
2     ارجو الإفادة عن نتائج المقرر  hamza
3  ناقش التقرير اثر التدريب العملي  hamza
4      يرجى التاكد من اعتماد الطلب  hamza
Shape: (100, 2)
Labels: Counter({'hamza': 40, 'ta_ha': 40, 'none': 20})


In [5]:
X = df["text"].astype(str).str.strip().tolist()
y = df["label"].map(label2id).tolist()

In [6]:
from sklearn.model_selection import train_test_split

X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y
)

X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval,
    test_size=0.1765,
    random_state=42,
    stratify=y_trainval
)

print("Train:", len(X_train), "Val:", len(X_val), "Test:", len(X_test))

Train: 69 Val: 16 Test: 15


In [7]:
from datasets import Dataset

train_ds = Dataset.from_dict({"text": X_train, "labels": y_train})
val_ds   = Dataset.from_dict({"text": X_val,   "labels": y_val})
test_ds  = Dataset.from_dict({"text": X_test,  "labels": y_test})

train_ds, val_ds, test_ds

(Dataset({
     features: ['text', 'labels'],
     num_rows: 69
 }),
 Dataset({
     features: ['text', 'labels'],
     num_rows: 16
 }),
 Dataset({
     features: ['text', 'labels'],
     num_rows: 15
 }))

In [8]:
from transformers import AutoTokenizer, DataCollatorWithPadding

MODEL_NAME = "CAMeL-Lab/bert-base-arabic-camelbert-msa"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

MAX_LEN = 128

def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, max_length=MAX_LEN)

train_tok = train_ds.map(tokenize, batched=True)
val_tok   = val_ds.map(tokenize, batched=True)
test_tok  = test_ds.map(tokenize, batched=True)

data_collator = DataCollatorWithPadding(tokenizer)

cols = ["input_ids","attention_mask","labels"]
train_tok = train_tok.select_columns(cols)
val_tok   = val_tok.select_columns(cols)
test_tok  = test_tok.select_columns(cols)

train_tok[0]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/468 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/69 [00:00<?, ? examples/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

{'input_ids': [2, 16932, 20835, 6419, 11189, 3],
 'attention_mask': [1, 1, 1, 1, 1, 1],
 'labels': 2}

In [9]:
import numpy as np, torch
from transformers import AutoModelForSequenceClassification

class_counts = np.bincount(np.array(y_train), minlength=NUM_LABELS)
class_weights = class_counts.sum() / (NUM_LABELS * np.maximum(class_counts, 1))
class_weights = torch.tensor(class_weights, dtype=torch.float)

print("Class counts:", class_counts)
print("Class weights:", class_weights)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    id2label=id2label,
    label2id=label2id
)

Class counts: [27 28 14]
Class weights: tensor([0.8519, 0.8214, 1.6429])


pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: CAMeL-Lab/bert-base-arabic-camelbert-msa
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.decoder.bias               | UNEXPECTED | 
bert.embeddings.position_ids               | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.decoder.weight             | UNEXPECTED | 
classifier.bias                            | MISSING    | 
classifier.weight                          | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; n

In [12]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_macro": f1_score(labels, preds, average="macro"),
    }

class WeightedLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=0):
        labels = inputs.get("labels")
        outputs = model(**{k:v for k,v in inputs.items() if k != "labels"})
        logits = outputs.get("logits")
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fct(logits.view(-1, NUM_LABELS), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

args = TrainingArguments(
    output_dir="sawab_task3_runs",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=2,
    num_train_epochs=10,
    warmup_ratio=0.06,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),
    report_to="none",
)

trainer = WeightedLossTrainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[],
)

trainer

warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.


<__main__.WeightedLossTrainer at 0x7ac1d8787a10>

In [13]:
trainer.train()



Step,Training Loss




Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

TrainOutput(global_step=50, training_loss=1.1932201385498047, metrics={'train_runtime': 120.659, 'train_samples_per_second': 5.719, 'train_steps_per_second': 0.414, 'total_flos': 2807913504816.0, 'train_loss': 1.1932201385498047, 'epoch': 10.0})

In [14]:
from sklearn.metrics import classification_report, confusion_matrix

pred = trainer.predict(test_tok)
logits = pred.predictions
labels = pred.label_ids
preds = np.argmax(logits, axis=-1)

print(classification_report(labels, preds, target_names=[id2label[i] for i in range(NUM_LABELS)]))
print("Confusion matrix:\n", confusion_matrix(labels, preds))



              precision    recall  f1-score   support

       hamza       0.86      1.00      0.92         6
       ta_ha       0.71      0.83      0.77         6
        none       0.00      0.00      0.00         3

    accuracy                           0.73        15
   macro avg       0.52      0.61      0.56        15
weighted avg       0.63      0.73      0.68        15

Confusion matrix:
 [[6 0 0]
 [0 5 1]
 [1 2 0]]


In [15]:
SAVE_PATH = "CAMeLBERT_Finetuned_T4"
trainer.save_model(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)
print("Saved:", SAVE_PATH)

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Saved: CAMeLBERT_Finetuned_T4


In [19]:
from transformers import pipeline
import torch

clf = pipeline(
    "text-classification",
    model=SAVE_PATH,
    tokenizer=SAVE_PATH,
    return_all_scores=True,
    device=0 if torch.cuda.is_available() else -1
)

def classify_error_type(sentence):
    scores = clf(sentence)

    if isinstance(scores, list) and len(scores) == 1 and isinstance(scores[0], list):
        scores = scores[0]

    best = max(scores, key=lambda x: x["score"])
    return {
        "label": best["label"],
        "confidence": float(best["score"]),
        "all_scores": {s["label"]: float(s["score"]) for s in scores}
    }

print(classify_error_type("ذهبت الى المدرسه اليوم"))

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

{'label': 'ta_ha', 'confidence': 0.6819700002670288, 'all_scores': {'ta_ha': 0.6819700002670288}}


In [20]:
CONF_THRESHOLD = 0.70

def heuristic_label(sentence: str) -> str:
    HAMZA_KEYS = [
        "انشاء", "انشاء الله", "الغاء", "ارجو",
        "اثر", "تاجيل", "التاكد", "ارسال", "افادة"
    ]

    for k in HAMZA_KEYS:
        if k in sentence:
            return "hamza"

    for w in sentence.split():
        w_clean = w.strip("،.؛:!?")
        if w_clean.endswith("ه"):
            return "ta_ha"

    return "none"

Hamza Correction

In [21]:
import pandas as pd
import re

In [22]:
def detect_hamza_candidates(sentence):
    """
    تُرجع قائمة بالكلمات المحتملة التي قد تحتوي
    على أخطاء في الهمزة (قطع، وصل، متوسطة، متطرفة).
    """

    words = sentence.split()


    exclude_words = {
        "إلى", "أن", "إن", "أو"
    }

    candidates = []

    for word in words:

        hamza_patterns = ("سال", "التاكد", "تاجيل", "الغاء", "ارجو", "اثر", "اوضح")

        if (
            word.startswith(hamza_patterns) or
            "التاكد" in word or
            "تاجيل" in word
        ) and word not in exclude_words:
            candidates.append(word)

    return candidates

In [23]:
sentence_test = "سال الطالب عن موعد تاجيل الاختبار"

hamza_candidates = detect_hamza_candidates(sentence_test)

print(f"الجملة الأصلية: {sentence_test}")
print(f"الكلمات المرشحة للتصحيح (الهمزات): {hamza_candidates}")

الجملة الأصلية: سال الطالب عن موعد تاجيل الاختبار
الكلمات المرشحة للتصحيح (الهمزات): ['سال', 'تاجيل']


In [24]:
def decide_hamza_error(candidates):
    """
    تختار كلمة واحدة من الكلمات المرشّحة
    يُرجّح أنها تحتوي على خطأ في الهمزة بناءً على الأولوية.
    """

    for word in candidates:
        if word.startswith(("ا", "ال")):

            if not word.startswith(("الاختبار", "الدراسة")):
                return word


    if candidates:
        return candidates[0]


    return None

In [25]:

sentence_test = "سال الطالب عن موعد تاجيل الاختبار"
candidates = detect_hamza_candidates(sentence_test)


error_word = decide_hamza_error(candidates)

print(f"الكلمات المرشحة: {candidates}")
print(f"الكلمة المختارة للتصحيح: {error_word}")

الكلمات المرشحة: ['سال', 'تاجيل']
الكلمة المختارة للتصحيح: سال


In [26]:
def correct_hamza(sentence):

    sentence = sentence.replace("انشاء الله", "إن شاء الله")
    sentence = sentence.replace("ان شاء الله", "إن شاء الله")


    candidates = detect_hamza_candidates(sentence)
    error_word = decide_hamza_error(candidates)

    if error_word is None:
        return sentence

    hamza_lookup = {
        "سال": "سأل",
        "تاكيد": "تأكيد",
        "اجل": "أجل",
        "ارجو": "أرجو",
        "ابد": "أبد",
        "وضح": "أوضح",
    }

    corrected_word = hamza_lookup.get(error_word, error_word)
    corrected_sentence = sentence.replace(error_word, corrected_word, 1)
    return corrected_sentence

In [27]:
print(correct_hamza("انشاء الله ننجح"))

إن شاء الله ننجح


In [28]:
sentence_example = "سال الطالب عن موعد تاجيل الاختبار"

result_sentence = correct_hamza(sentence_example)

print(f"الجملة قبل التصحيح: {sentence_example}")
print(f"الجملة بعد التصحيح: {result_sentence}")

الجملة قبل التصحيح: سال الطالب عن موعد تاجيل الاختبار
الجملة بعد التصحيح: سأل الطالب عن موعد تاجيل الاختبار


Ta Marbuta (ة) vs Ha (ه) Correction

In [29]:
import pandas as pd
import re

In [30]:
def detect_ta_ha_candidates(sentence):
    """
    تُرجع قائمة بالكلمات المحتملة التي قد تحتوي
    على خلط بين التاء المربوطة (ة) والهاء (ه).
    """


    words = sentence.split()


    exclude_words = {
        "له", "به", "فيه", "منه", "إليه", "عليه",
        "هذه"
    }

    candidates = []

    for word in words:

        if (
            word.endswith(("ه", "ة")) and
            len(word) >= 4 and
            word not in exclude_words
        ):
            candidates.append(word)

    return candidates

In [31]:
detect_ta_ha_candidates("هذه مدرسه كبيره")

['مدرسه', 'كبيره']

In [32]:
def decide_ta_ha_error(candidates):
    """
    تختار كلمة واحدة من الكلمات المرشّحة
    يُرجّح أنها تحتوي على خطأ (ة / ه)
     """

    for word in candidates:
        if word.endswith("ه"):
            return word
    return None

In [33]:
candidates = detect_ta_ha_candidates("هذه مدرسه كبيره")
error_word = decide_ta_ha_error(candidates)
error_word

'مدرسه'

In [34]:
def correct_ta_ha(sentence):
    """
    تُصحّح خطأ التاء المربوطة (ة) والهاء (ه)
    في جملة واحدة، بافتراض وجود خطأ واحد فقط.
    """


    candidates = detect_ta_ha_candidates(sentence)


    error_word = decide_ta_ha_error(candidates)


    if error_word is None:
        return sentence

    corrected_word = error_word[:-1] + "ة"


    corrected_sentence = sentence.replace(error_word, corrected_word, 1)

    return corrected_sentence

In [35]:
correct_ta_ha("هذه مدرسه كبيرة")

'هذه مدرسة كبيرة'

Educational Explanation module

In [36]:
from difflib import SequenceMatcher

def _changed_spans(a: str, b: str):
    sm = SequenceMatcher(None, a, b)
    changes = []
    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        if tag in ("replace", "delete", "insert"):
            old = a[i1:i2]
            new = b[j1:j2]
            if old.strip() or new.strip():
                changes.append((old, new))
    return changes

def summarize_changes(original_sentence: str, corrected_sentence: str, max_items: int = 6) -> str:
    changes = _changed_spans(original_sentence, corrected_sentence)
    if not changes:
        return "لا يوجد اختلاف ظاهر بين الجملتين."
    lines = [f"- '{old}' → '{new}'" for old, new in changes[:max_items]]
    if len(changes) > max_items:
        lines.append(f"... (+{len(changes)-max_items} تغييرات إضافية)")
    return "\n".join(lines)

In [37]:
def explain_hamza(original_sentence: str, corrected_sentence: str) -> str:
    return (
        "سبب الخطأ:\n"
        "أخطاء الهمزة شائعة لأن شكل الهمزة يتغير حسب موقعها في الكلمة وحركات الحروف.\n\n"
        "القاعدة المختصرة:\n"
        "نكتب الهمزة على الحرف المناسب حسب أقوى الحركات (الكسرة ثم الضمة ثم الفتحة).\n\n"
        "التغييرات التي تمت:\n"
        f"{summarize_changes(original_sentence, corrected_sentence)}\n\n"
        "مثال:\n"
        f"قبل: {original_sentence}\n"
        f"بعد: {corrected_sentence}"
    )

def explain_ta_ha(original_sentence: str, corrected_sentence: str) -> str:
    return (
        "سبب الخطأ:\n"
        "يحدث خلط بين (ة) و(ه) في نهاية الكلمات، خاصة في الأسماء المؤنثة.\n\n"
        "القاعدة المختصرة:\n"
        "التاء المربوطة (ة) تأتي غالبًا في نهاية الاسم المؤنث، وتُنطق (ه) عند الوقف و(ت) عند الوصل.\n\n"
        "التغييرات التي تمت:\n"
        f"{summarize_changes(original_sentence, corrected_sentence)}\n\n"
        "مثال:\n"
        f"قبل: {original_sentence}\n"
        f"بعد: {corrected_sentence}"
    )

def explain_none(original_sentence: str) -> str:
    return (
        "النتيجة:\n"
        "الجملة صحيحة ولا تحتوي على أخطاء من الأنواع المدعومة حاليًا (همزة / تاء مربوطة-هاء).\n\n"
        "الجملة:\n"
        f"{original_sentence}"
    )

In [38]:
def explain_error(error_type: str, original_sentence: str, corrected_sentence: str = None) -> str:
    """Unified explanation API used by Task 5 integration."""
    t = (error_type or "").strip().lower()
    corrected_sentence = original_sentence if corrected_sentence is None else corrected_sentence

    if t == "hamza":
        return explain_hamza(original_sentence, corrected_sentence)
    if t == "ta_ha":
        return explain_ta_ha(original_sentence, corrected_sentence)
    return explain_none(original_sentence)

Full Pipeline (sawab) + Demo

In [39]:
def sawab(sentence: str):
    original = sentence.strip()

    pred = classify_error_type(original)
    label = pred["label"]
    conf = pred["confidence"]


    if conf < CONF_THRESHOLD:
        label = heuristic_label(original)

    if label == "hamza":
        corrected = correct_hamza(original)
        if corrected == original:
            label = "none"

    elif label == "ta_ha":
        corrected = correct_ta_ha(original)
        if corrected == original:
            label = "none"

    else:
        corrected = original
        label = "none"

    explanation = explain_error(label, original, corrected)

    return corrected, explanation, {
        "model_label": pred["label"],
        "model_confidence": conf,
        "all_scores": pred["all_scores"]
    }

In [40]:
examples = [
    "انشاء الله ننجح",
    "ذهبت الى المدرسه اليوم",
    "اللغة العربية جميلة"
]

for s in examples:
    out, exp, debug = sawab(s)
    print("="*60)
    print("INPUT:", s)
    print("OUTPUT:", out)
    print("EXPLANATION:")
    print(exp)
    print("DEBUG:", debug)

INPUT: انشاء الله ننجح
OUTPUT: إن شاء الله ننجح
EXPLANATION:
سبب الخطأ:
أخطاء الهمزة شائعة لأن شكل الهمزة يتغير حسب موقعها في الكلمة وحركات الحروف.

القاعدة المختصرة:
نكتب الهمزة على الحرف المناسب حسب أقوى الحركات (الكسرة ثم الضمة ثم الفتحة).

التغييرات التي تمت:
- 'ا' → 'إ'

مثال:
قبل: انشاء الله ننجح
بعد: إن شاء الله ننجح
DEBUG: {'model_label': 'hamza', 'model_confidence': 0.5208855867385864, 'all_scores': {'hamza': 0.5208855867385864}}
INPUT: ذهبت الى المدرسه اليوم
OUTPUT: ذهبت الى المدرسة اليوم
EXPLANATION:
سبب الخطأ:
يحدث خلط بين (ة) و(ه) في نهاية الكلمات، خاصة في الأسماء المؤنثة.

القاعدة المختصرة:
التاء المربوطة (ة) تأتي غالبًا في نهاية الاسم المؤنث، وتُنطق (ه) عند الوقف و(ت) عند الوصل.

التغييرات التي تمت:
- 'ه' → 'ة'

مثال:
قبل: ذهبت الى المدرسه اليوم
بعد: ذهبت الى المدرسة اليوم
DEBUG: {'model_label': 'ta_ha', 'model_confidence': 0.6819700002670288, 'all_scores': {'ta_ha': 0.6819700002670288}}
INPUT: اللغة العربية جميلة
OUTPUT: اللغة العربية جميلة
EXPLANATION:
النتيجة:
الجملة 